# Fake/real news classification [tensorflow (keras)]
Binary classification of news being fake or real, based on their text.
Reference notebook: <Reference notebook: <https://www.kaggle.com/code/madz2000/nlp-using-glove-embeddings-99-87-accuracy/notebook>
Dataset: <https://www.kaggle.com/code/madz2000/nlp-using-glove-embeddings-99-87-accuracy/input>

## Import libraries

In [1]:
import os
import re, string

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from keras.optimizers import Adam
from keras.models import Sequential
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split

import giskard
from giskard import Dataset, Model

2023-05-15 18:13:48.588040: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Define constants

In [2]:
# Constants.
MAX_TOKENS = 10000
MAX_SEQUENCE_LENGTH = 300

STOPWORDS = stopwords.words('english')

# Paths.
DATA_DIRECTORY = os.path.join(".", "datasets", "fake_real_news_dataset")
EMBEDDING_FILE = os.path.join(DATA_DIRECTORY, "glove_100d.txt")

## Load data

In [3]:
def load_data(**kwargs):
    # Load and label real news data.
    real_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "True.csv"), **kwargs)
    real_df['category'] = 1

    # Load and label fake news data.
    fake_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "Fake.csv"), **kwargs)
    fake_df['category'] = 0

    # Create common df.
    full_df = pd.concat([real_df, fake_df])

    # Drop useless columns.
    full_df.drop(columns=["subject", "date"], inplace=True)

    return full_df

In [4]:
news_df = load_data(nrows=2000)
news_df.head()

Unnamed: 0,title,text,category
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


## Train-test split

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(news_df[["title", "text"]], news_df.category, random_state=0)

## Define text preprocessing logic

In [6]:
def clean_text(df: pd.DataFrame) -> np.ndarray:
    # Merge text data into single column.
    df.text = df.text + " " + df.title
    df.drop(columns=["title"], inplace=True)

    # Remove text inside square brackets.
    df.text = df.text.apply(lambda sentence: re.sub('\[[^]]*\]', '', sentence))

    # Remove punctuation.
    df.text = df.text.apply(lambda text: text.translate(str.maketrans('', '', string.punctuation)))

    # Remove stop words.
    df.text = df.text.apply(lambda sentence: ' '.join([_word for _word in sentence.split() if _word.lower() not in STOPWORDS]))

    return df.text

## Wrap data with giskard

In [7]:
raw_data = pd.concat([X_test.copy(), Y_test.copy()], axis=1)
wrapped_data = Dataset(raw_data, name="fake_and_real_news", target="category", column_types={"title": "text", "text": "text"})

Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.


In [8]:
X_train = clean_text(X_train)
X_test = clean_text(X_test)

In [9]:
# Fit tokenizer.
tokenizer = Tokenizer(num_words=MAX_TOKENS)
tokenizer.fit_on_texts(X_train)

# Tokenize train text.
tokenized_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(tokenized_train, maxlen=MAX_SEQUENCE_LENGTH)

# Tokenize test text.
tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(tokenized_test, maxlen=MAX_SEQUENCE_LENGTH)

## Define preprocessing function

In [10]:
def preprocessing_function(df: pd.DataFrame) -> np.ndarray:
    cleaned_text = clean_text(df)
    tokens = tokenizer.texts_to_sequences(cleaned_text)
    tokens_with_padding = pad_sequences(tokens, maxlen=MAX_SEQUENCE_LENGTH)

    return tokens_with_padding

## Create embeddings matrix

In [11]:
def parse_line(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def init_embeddings_matrix(embeddings_dict):
    num_embeddings = min(MAX_TOKENS, len(tokenizer.word_index))

    stacked_embeddings = np.stack(embeddings_dict.values())
    embeddings_mean, embeddings_std, embeddings_dimension = stacked_embeddings.mean(), stacked_embeddings.std(), stacked_embeddings.shape[1]

    embeddings_matrix = np.random.normal(embeddings_mean, embeddings_std, (num_embeddings, embeddings_dimension))

    return embeddings_matrix

def get_embeddings_matrix():
    embeddings_dict = dict(parse_line(*line.rstrip().rsplit(' ')) for line in open(EMBEDDING_FILE))
    embeddings_matrix = init_embeddings_matrix(embeddings_dict)

    for word, idx in tokenizer.word_index.items():
        if idx >= MAX_TOKENS:
            continue

        embedding_vector = embeddings_dict.get(word, None)

        if embedding_vector is not None:
            embeddings_matrix[idx] = embedding_vector

    return embeddings_matrix

embed_matrix = get_embeddings_matrix()

  embed_matrix = get_embeddings_matrix()


## Train model

In [12]:
# Define model container.
model = Sequential()

# Non-trainable embedding layer.
model.add(Embedding(MAX_TOKENS, output_dim=100, weights=[embed_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))

# LSTM stage.
model.add(LSTM(units=32 , return_sequences=True , recurrent_dropout=0.25, dropout=0.25))
model.add(LSTM(units=16 , recurrent_dropout=0.1 , dropout=0.1))

# Dense stage.
model.add(Dense(units=16 , activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

# Build model.
model.compile(optimizer=Adam(lr=0.01), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 100)          1000000   
                                                                 
 lstm (LSTM)                 (None, 300, 32)           17024     
                                                                 
 lstm_1 (LSTM)               (None, 16)                3136      
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1,020,449
Trainable params: 20,449
Non-trainable params: 1,000,000
_________________________________________________________________


  super().__init__(name, **kwargs)


In [13]:
# Define hyperparameters.
n_epochs = 1
batch_size = 256

# Fit model.
_ = model.fit(X_train, Y_train, batch_size=batch_size, validation_data=(X_test, Y_test), epochs=n_epochs)



## Calculate train and test accuracy

In [14]:
train_accuracy = model.evaluate(X_train, Y_train)[1]
test_accuracy = model.evaluate(X_test, Y_test)[1]



In [15]:
print(f"Train accuracy: {train_accuracy: .2f}")
print(f"Test accuracy: {test_accuracy: .2f}")

Train accuracy:  0.97
Test accuracy:  0.97


## Wrap data and model and run scanner

In [25]:
wrapped_model = Model(model,
                      model_type="classification",
                      data_preprocessing_function=preprocessing_function,
                      name="fake_real_news_classification",
                      feature_names=["title", "text"],
                      classification_threshold=0.5,
                      classification_labels=[1, 0]
                      )

Your 'model' is successfully wrapped by Giskard's 'TensorFlowModel' wrapper class.


In [26]:
wrapped_pred = wrapped_model.predict(wrapped_data)

Your binary classification model prediction is of the shape (1000, 1). 
In Giskard we expect the shape (1000, 2) for binary classification models. 
We automatically inferred the second class prediction but please make sure that 
the probability output of your model corresponds to the first label of the 
classification_labels ([1, 0]) you provided us with.
NoneType: None


In [18]:
scanning_results = giskard.scan(wrapped_model, wrapped_data)

2023-05-15 18:15:06.743152: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,32]
	 [[{{node while/Placeholder_2}}]]
2023-05-15 18:15:06.974192: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,32]
	 [[{{node while/Placeholder_2}}]]
2023-05-15 18:15:07.046122: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and sh

INFO:tensorflow:Assets written to: /var/folders/4q/3_bfyqnn7yv5jcjq98x2jf680000gn/T/giskard-model-4e98i2mb/data/model/assets
2023-05-15 18:15:16,534 pid:8347 MainThread tensorflow   INFO     Assets written to: /var/folders/4q/3_bfyqnn7yv5jcjq98x2jf680000gn/T/giskard-model-4e98i2mb/data/model/assets




Hint: "Your target variable values are numeric. It is recommended to have Human readable string as your target values to make results more understandable in Giskard."
Your 'pandas.DataFrame' is successfully wrapped by Giskard's 'Dataset' wrapper class.
Your binary classification model prediction is of the shape (10, 1). 
In Giskard we expect the shape (10, 2) for binary classification models. 
We automatically inferred the second class prediction but please make sure that 
the probability output of your model corresponds to the first label of the 
classification_labels ([1, 0]) you provided us with.
NoneType: None
Your binary classification model prediction is of the shape (10, 1). 
In Giskard we expect the shape (10, 2) for binary classification models. 
We automatically inferred the second class prediction but please make sure that 
the probability output of your model corresponds to the first label of the 
classification_labels ([1, 0]) you provided us with.
NoneType: None
Your mode

ValueError: Found input variables with inconsistent numbers of samples: [1000, 1278]