# Setting up to start the model

In [1]:
# Import the required libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics import confusion_matrix

# Load the already presplit data sets into train, validate and test

dataset_test = pd.read_csv('Data/test_df.csv')
dataset_val = pd.read_csv('Data/val_df.csv')
dataset_train = pd.read_csv('Data/train_df.csv')

# Processing and preparing the data

In [2]:
# Function to proccess the text by removing stop words and punctuation
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

def TextPreprocessing(text):
    text = str(text).lower()
    text = ''.join([c for c in text if c not in string.punctuation])
    word_tokens = word_tokenize(text)
    text = ' '.join([w for w in word_tokens if not w in stop_words])
    return text

# Apply the preprocessing to all datasets
dataset_train['text'] = dataset_train['text'].apply(TextPreprocessing)
dataset_val['text'] = dataset_val['text'].apply(TextPreprocessing)
dataset_test['text'] = dataset_test['text'].apply(TextPreprocessing)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fredf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fredf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Extarct data and turn them into arrays
train_texts = dataset_train['text'].values
val_texts = dataset_val['text'].values
test_texts = dataset_test['text'].values

# Extract the labels of the data into arrays and insure the proper int64 type
train_labels = dataset_train['label'].values.astype(np.int64)
val_labels = dataset_val['label'].values.astype(np.int64)
test_labels = dataset_test['label'].values.astype(np.int64)

# Vectorize the data with the 50 most common/repeated features
num_of_features = 50
vectorizer = TfidfVectorizer(max_features=num_of_features)
vectorizer.fit(train_texts)

# Define a function to take the text and its label then return a TF-IDF vector with its label
def encode_text(text, label):
    text = vectorizer.transform([text.numpy().decode('utf-8')]).toarray()
    return text[0], label

# Define function which using the previous function to convert the text from the vectors to tensor flow with a specific dimension
def tf_encode_text(text, label):
    encoded_text, label = tf.py_function(encode_text, inp=[text, label], Tout=[tf.float32, tf.int64])
    encoded_text.set_shape([num_of_features])
    label.set_shape([])
    return encoded_text, label

# Create the TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_texts, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_texts, val_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_texts, test_labels))

# Apply to each element the tf_encode_text function in the datasets with parallel processing
train_dataset = train_dataset.map(tf_encode_text, num_parallel_calls=tf.data.experimental.AUTOTUNE)
val_dataset = val_dataset.map(tf_encode_text, num_parallel_calls=tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.map(tf_encode_text, num_parallel_calls=tf.data.experimental.AUTOTUNE)

# Set batch size to 8 and epochs to 30
batch_size = 8
epochs = 30

# Shuffle the data in the training dataset to randomize it (to prevent overfitting), batch it, and prefetch it
train_dataset = train_dataset.shuffle(buffer_size=10000).batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Batch the validation and test data sets and prefetch them
val_dataset = val_dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
test_dataset = test_dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Function to reshape data to match LSTM (RNN Layer) requirements
def reshape_dataset(dataset):
    return dataset.map(lambda x, y: (tf.expand_dims(x, -1), y))

# Reshape data to fit the LSTM requirements
train_dataset = reshape_dataset(train_dataset)
val_dataset = reshape_dataset(val_dataset)
test_dataset = reshape_dataset(test_dataset)

# Model

In [4]:
# Define the model
model = tf.keras.Sequential([ # Stack layers in a sequence using keras API
    tf.keras.layers.Input(shape=(num_of_features, 1)),
    tf.keras.layers.LSTM(128, activation='relu'), # One LSTM (RNN) layer with a relu activation function with 256 neurons
    tf.keras.layers.Dense(64, activation='relu'), # One fully connected layer with a relu activation function with 64 neurons
    tf.keras.layers.Dense(1, activation='sigmoid') # Output layer with 1 neuron output as it is fake or real classification (binary)
])

# Optimizing the model using Adam optimizer and calculating accuracy and loss
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Function to reduce the learning rate each 5 epochs (step decrease)
def step_decay(epoch):
    initial_lr = 0.0005 # Set the initial learning rate to 0.0005 (same as Adam learning rate in our case)
    drop = 0.8 # Droping the learning rate by 20%
    epochs_drop = 5 # Drop the learning rate each 5 epochs
    lr = initial_lr * (drop ** np.floor((1 + epoch) / epochs_drop)) # Calculate new learning rate
    return lr

# Allow keras to chnage the learning rate based on the function above
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(step_decay)

# Train the model with the training and validation data sets
model.fit(train_dataset, validation_data=val_dataset, epochs=epochs, callbacks=[lr_scheduler])

Epoch 1/30
[1m10462/10462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 11ms/step - accuracy: 0.6301 - loss: 0.6374 - val_accuracy: 0.7020 - val_loss: 0.5752 - learning_rate: 5.0000e-04
Epoch 2/30
[1m10462/10462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 10ms/step - accuracy: 0.7150 - loss: 0.5610 - val_accuracy: 0.7409 - val_loss: 0.5330 - learning_rate: 5.0000e-04
Epoch 3/30
[1m10462/10462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 10ms/step - accuracy: 0.7437 - loss: 0.5291 - val_accuracy: 0.7475 - val_loss: 0.5194 - learning_rate: 5.0000e-04
Epoch 4/30
[1m10462/10462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 10ms/step - accuracy: 0.7537 - loss: 0.5152 - val_accuracy: 0.7599 - val_loss: 0.5020 - learning_rate: 5.0000e-04
Epoch 5/30
[1m10462/10462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 10ms/step - accuracy: 0.7650 - loss: 0.4981 - val_accuracy: 0.7665 - val_loss: 0.4947 - learning_rate: 4.0000e-04
Epoch 6/30
[1m

<keras.src.callbacks.history.History at 0x1ae807c5a60>

# Evaluating the Model

In [6]:
# Evaluate the model using the testing data set
loss, accuracy = model.evaluate(test_dataset)

# Confusion matrix analysis of results on the testing data set
y_true = []
y_pred = []

for x, y in test_dataset:
    pred = model.predict(x)
    pred= (pred > 0.5).astype(int)
    y_true.extend(y.numpy())
    y_pred.extend(pred.flatten())

# Convert to numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Calculate the confusion matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

# Calculate False Positive Rate and False Negative Rates
fpr = fp / (fp + tn)
fnr = fn / (fn + tp)
tpr = tp / (tp + fn)
tnr = tn / (tn + fp)

# Print the results
print(f'Test Accuracy: {accuracy:.2f}')
print(f'Test Loss: {loss:.2f}')
print(f"True Positives (TP): {tp}")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")

[1m2242/2242[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.8122 - loss: 0.4188
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━

# Trial Run with Output

In [12]:
# Load the news article
dataset_try_path = 'Data/try_df.csv'
dataset_try = pd.read_csv(dataset_try_path)

In [13]:
# Process and fix the format of the news article as done above
dataset_try['text'] = dataset_try['text'].apply(TextPreprocessing)
try_texts = dataset_try['text'].values
try_labels = dataset_try['label'].values.astype(np.int64)
try_dataset = tf.data.Dataset.from_tensor_slices((try_texts, try_labels))
try_dataset = try_dataset.map(tf_encode_text, num_parallel_calls=tf.data.experimental.AUTOTUNE)
try_dataset = try_dataset.batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
try_dataset = reshape_dataset(try_dataset)

# Confusion matrix analysis of results on the testing data set
y_true = []
y_pred = []

for x, y in try_dataset:
    pred = model.predict(x)
    pred= (pred > 0.5).astype(int)
    y_true.extend(y.numpy())
    y_pred.extend(pred.flatten())

# Convert to numpy arrays
y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Calculate the confusion matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

# Calculate False Positive Rate and False Negative Rates
fpr = fp / (fp + tn)
fnr = fn / (fn + tp)
tpr = tp / (tp + fn)
tnr = tn / (tn + fp)

# Print the results
print(f'Test Accuracy: {accuracy:.2f}')
print(f'Test Loss: {loss:.2f}')
print(f"True Positives (TP): {tp}")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"False Positive Rate (FPR): {fpr:.2f}")
print(f"False Negative Rate (FNR): {fnr:.2f}")
print(f"True Positive Rate (TPR): {tpr:.2f}")
print(f"True Negative Rate (TNR): {tnr:.2f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Test Accuracy: 0.81
Test Loss: 0.42
True Positives (TP): 1
True Negatives (TN): 1
False Positives (FP): 0
False Negatives (FN): 0
False Positive Rate (FPR): 0.00
False Negative Rate (FNR): 0.00
True Positive Rate (TPR): 1.00
True Negative Rate (TNR): 1.00
