# Fake/real news classification [tensorflow (keras)]
* Binary classification of news being fake or real, based on their text.
* Reference notebook: <https://www.kaggle.com/code/madz2000/nlp-using-glove-embeddings-99-87-accuracy/notebook>
* Dataset: <https://www.kaggle.com/code/madz2000/nlp-using-glove-embeddings-99-87-accuracy/input>

## Import libraries

In [1]:
import os
import string
from typing import Tuple

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from keras.optimizers import Adam
from keras.models import Sequential
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split

import giskard
from giskard import Dataset, Model

2023-05-16 13:27:26.671092: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Define constants

In [2]:
# Constants.
MAX_TOKENS = 10000
MAX_SEQUENCE_LENGTH = 300

STOPWORDS = stopwords.words('english')

TEXT_COLUMN_NAME = "text"
TARGET_COLUMN_NAME = "category"

# Paths.
DATA_DIRECTORY = os.path.join(".", "datasets", "fake_real_news_dataset")
EMBEDDING_FILE = os.path.join(DATA_DIRECTORY, "glove_100d.txt")

## Load data

In [3]:
def load_data(**kwargs) -> pd.DataFrame:
    """Load data."""
    print(f"Loading data...")

    # Load and label real news data.
    real_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "True.csv"), **kwargs)
    real_df[TARGET_COLUMN_NAME] = 1

    # Load and label fake news data.
    fake_df = pd.read_csv(os.path.join(DATA_DIRECTORY, "Fake.csv"), **kwargs)
    fake_df[TARGET_COLUMN_NAME] = 0

    # Create common df.
    full_df = pd.concat([real_df, fake_df])

    # Drop useless columns.
    full_df.drop(columns=["subject", "date"], inplace=True)

    print(f"Finished loading data!")

    return full_df

In [4]:
news_df = load_data(nrows=2000)
news_df.head()

Loading data...
Finished loading data!


Unnamed: 0,title,text,category
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


## Train-test split

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(news_df[["title", TEXT_COLUMN_NAME]], news_df[TARGET_COLUMN_NAME], random_state=0)

## Define text preprocessing logic

In [6]:
def clean_text(df: pd.DataFrame) -> np.ndarray:
    """Perform text-data cleaning: punctuation and stop words removal."""
    # Merge text data into single column.
    df[TEXT_COLUMN_NAME] = df[TEXT_COLUMN_NAME] + " " + df.title
    df.drop(columns=["title"], inplace=True)

    # Remove punctuation.
    df[TEXT_COLUMN_NAME] = df[TEXT_COLUMN_NAME].apply(
        lambda text: text.translate(str.maketrans('', '', string.punctuation)))

    # Remove stop words.
    df[TEXT_COLUMN_NAME] = df[TEXT_COLUMN_NAME].apply(
        lambda sentence: ' '.join([_word for _word in sentence.split() if _word.lower() not in STOPWORDS]))

    return df[TEXT_COLUMN_NAME]

## Wrap data with giskard

In [7]:
raw_data = pd.concat([X_test.copy(), Y_test.copy()], axis=1)
wrapped_data = Dataset(raw_data, name="fake_and_real_news", target=TARGET_COLUMN_NAME, column_types={"title": "text", TEXT_COLUMN_NAME: "text"})

Hint: "Your target variable values are numeric. It is recommended to have Human readable string as your target values to make results more understandable in Giskard."


In [8]:
print(f"Cleaning train data...")
X_train = clean_text(X_train)

print(f"Cleaning test data...")
X_test = clean_text(X_test)

print(f"Finished cleaning!")

Cleaning train data...
Cleaning test data...
Finished cleaning!


In [9]:
# Fit tokenizer.
tokenizer = Tokenizer(num_words=MAX_TOKENS)
tokenizer.fit_on_texts(X_train)

# Tokenize train text.
tokenized_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(tokenized_train, maxlen=MAX_SEQUENCE_LENGTH)

# Tokenize test text.
tokenized_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(tokenized_test, maxlen=MAX_SEQUENCE_LENGTH)

## Define preprocessing function

In [10]:
def preprocessing_function(df: pd.DataFrame) -> np.ndarray:
    """Preprocessing function used by giskard."""
    # Clean text.
    cleaned_text = clean_text(df)

    # Tokenize text.
    tokens = tokenizer.texts_to_sequences(cleaned_text)
    tokens_with_padding = pad_sequences(tokens, maxlen=MAX_SEQUENCE_LENGTH)

    return tokens_with_padding

## Create embeddings matrix

In [11]:
def parse_line(word: str, *arr: list) -> Tuple[str, np.ndarray]:
    """Parse line from the file with embeddings.
    The first value of the line is the word and the rest values are related glove embedding: (<word>, 0.66, 0.23, ...)."""
    return word, np.asarray(arr, dtype='float32')

def init_embeddings_matrix(embeddings_dict: dict) -> np.ndarray:
    """Initialization of the matrix, where each row is a specific embedding vector."""
    num_embeddings = min(MAX_TOKENS, len(tokenizer.word_index))
    stacked_embeddings = np.stack(list(embeddings_dict.values()))
    embeddings_mean, embeddings_std, embeddings_dimension = stacked_embeddings.mean(), stacked_embeddings.std(), stacked_embeddings.shape[1]

    embeddings_matrix = np.random.normal(embeddings_mean, embeddings_std, (num_embeddings, embeddings_dimension))

    return embeddings_matrix

def get_embeddings_matrix() -> np.ndarray:
    """Create matrix, where each row is an embedding of a specific word."""
    print(f"Building embeddings matrix...")

    # Load glove embeddings.
    embeddings_dict = dict(parse_line(*line.rstrip().rsplit(' ')) for line in open(EMBEDDING_FILE))

    # Initialization of embeddings matrix.
    embeddings_matrix = init_embeddings_matrix(embeddings_dict)

    # Fill-in embeddings matrix with glove word vectors.
    for word, idx in tokenizer.word_index.items():
        if idx >= MAX_TOKENS:
            continue

        embedding_vector = embeddings_dict.get(word, None)

        if embedding_vector is not None:
            embeddings_matrix[idx] = embedding_vector

    print(f"Finished building embedding matrix!")

    return embeddings_matrix

embed_matrix = get_embeddings_matrix()

Building embeddings matrix...
Finished building embedding matrix!


## Train model

In [12]:
def init_model() -> Sequential:
    """Initialize new TF model."""
    # Define model container.
    _model = Sequential()

    # Non-trainable embedding layer.
    _model.add(Embedding(MAX_TOKENS, output_dim=100, weights=[embed_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))

    # LSTM stage.
    _model.add(LSTM(units=32 , return_sequences=True , recurrent_dropout=0.25, dropout=0.25))
    _model.add(LSTM(units=16 , recurrent_dropout=0.1 , dropout=0.1))

    # Dense stage.
    _model.add(Dense(units=16 , activation='relu'))
    _model.add(Dense(units=1, activation='sigmoid'))

    # Build model.
    _model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])
    return _model

model = init_model()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 100)          1000000   
                                                                 
 lstm (LSTM)                 (None, 300, 32)           17024     
                                                                 
 lstm_1 (LSTM)               (None, 16)                3136      
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1,020,449
Trainable params: 20,449
Non-trainable params: 1,000,000
_________________________________________________________________


In [13]:
# Define hyperparameters.
n_epochs = 1
batch_size = 256

# Fit model.
_ = model.fit(X_train, Y_train, batch_size=batch_size, validation_data=(X_test, Y_test), epochs=n_epochs)



## Calculate train and test accuracy

In [14]:
train_metric = model.evaluate(X_train, Y_train)[1]
test_metric = model.evaluate(X_test, Y_test)[1]



In [15]:
print(f"Train accuracy: {train_metric: .2f}")
print(f"Test accuracy: {test_metric: .2f}")

Train accuracy:  0.97
Test accuracy:  0.98


## Wrap data and model

In [16]:
wrapped_model = Model(model,
                      model_type="classification",
                      data_preprocessing_function=preprocessing_function,
                      name="fake_real_news_classification",
                      feature_names=["title", "text"],
                      classification_threshold=0.5,
                      classification_labels=[1, 0])

In [17]:
wrapped_model.predict(wrapped_data)

Your binary classification model prediction is of the shape (1000, 1). 
In Giskard we expect the shape (1000, 2) for binary classification models. 
We automatically inferred the second class prediction but please make sure that 
the probability output of your model corresponds to the first label of the 
classification_labels ([1, 0]) you provided us with.
NoneType: None


ModelPredictionResults(raw=array([[0.0838308 , 0.9161692 ],
       [0.98467845, 0.01532155],
       [0.03202904, 0.96797097],
       ...,
       [0.04147451, 0.9585255 ],
       [0.9852804 , 0.01471961],
       [0.9855013 , 0.01449871]], dtype=float32), prediction=array([0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
     

## Scan model

In [18]:
scanning_results = giskard.scan(wrapped_model, wrapped_data)

2023-05-16 13:28:45.379745: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,32]
	 [[{{node while/Placeholder_2}}]]
2023-05-16 13:28:45.539928: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'while/Placeholder_2' with dtype float and shape [?,32]
	 [[{{node while/Placeholder_2}}]]
2023-05-16 13:28:45.584897: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and sh

INFO:tensorflow:Assets written to: /var/folders/4q/3_bfyqnn7yv5jcjq98x2jf680000gn/T/giskard-model-z4hl_f8u/data/model/assets
2023-05-16 13:28:55,450 pid:9251 MainThread tensorflow   INFO     Assets written to: /var/folders/4q/3_bfyqnn7yv5jcjq98x2jf680000gn/T/giskard-model-z4hl_f8u/data/model/assets




Your binary classification model prediction is of the shape (10, 1). 
In Giskard we expect the shape (10, 2) for binary classification models. 
We automatically inferred the second class prediction but please make sure that 
the probability output of your model corresponds to the first label of the 
classification_labels ([1, 0]) you provided us with.
NoneType: None
Your binary classification model prediction is of the shape (10, 1). 
In Giskard we expect the shape (10, 2) for binary classification models. 
We automatically inferred the second class prediction but please make sure that 
the probability output of your model corresponds to the first label of the 
classification_labels ([1, 0]) you provided us with.
NoneType: None
Your model is successfully validated.
Running scan…
Your binary classification model prediction is of the shape (1000, 1). 
In Giskard we expect the shape (1000, 2) for binary classification models. 
We automatically inferred the second class prediction but plea

ValueError: Found input variables with inconsistent numbers of samples: [1000, 1278]