<a href="https://colab.research.google.com/github/LeeMinQi-25/Fake-News-Detection-with-Deep-Learning/blob/main/NLP_Group_Project_Group16_(LSTM).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.31.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import gradio as gr

# ----- DATA LOADING & PREPROCESSING -----
# Fixed seed so the train/test partition is the same on every run; without it
# the held-out set changes each time and reported metrics are not comparable.
SEED = 42

# Build the modelling frame in one pass:
#   1. drop rows missing a title, body text or label,
#   2. concatenate title and body into a single `Text` column,
#   3. keep only `Text` plus the label (renamed to `Label`, cast to int).
df = (
    pd.read_csv("WELFake_Dataset.csv")  # Replace with your actual CSV filename
    .dropna(subset=['title', 'text', 'label'])
    .reset_index(drop=True)
    .assign(Text=lambda frame: frame['title'] + " " + frame['text'])
    .loc[:, ['Text', 'label']]
    .rename(columns={'label': 'Label'})
    .astype({'Label': int})
)

print("Dataset shape after cleaning:", df.shape)

# ----- TRAIN-TEST SPLIT -----
# 80/20 split. `stratify` keeps the class ratio identical in both partitions and
# `random_state` makes the partition reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Text'].tolist(),
    df['Label'].tolist(),
    test_size=0.2,
    random_state=SEED,
    stratify=df['Label'],
)

Dataset shape after cleaning: (71537, 2)


In [None]:
# Write the cleaned Text/Label frame to disk (without the index column) so it
# can be reloaded later without repeating the preprocessing.
cleaned_csv_path = "cleaned_dataset.csv"
df.to_csv(cleaned_csv_path, index=False)

In [None]:
import tensorflow as tf
# Report how many GPU devices TensorFlow can see in this runtime.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(gpu_devices))

Num GPUs Available: 1


In [None]:
# ---- LSTM Model Setup ----
# Tokenizes the train/test texts, builds an Embedding -> LSTM -> Dense
# classifier with a 2-way softmax output, and trains it for 3 epochs.
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable between runs.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

max_length = 200   # every sequence is padded/truncated to this many tokens
vocab_size = 10000  # tokenizer keeps only the most frequent words; rest map to <OOV>

# Fit the vocabulary on the TRAINING texts only, so no test-set words leak in.
lstm_tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
lstm_tokenizer.fit_on_texts(train_texts)

# Persist the fitted vocabulary immediately. The Embedding weights learned below
# are tied to this exact word -> index mapping, so any later inference or
# evaluation must reuse it rather than fit a new tokenizer.
with open("lstm_tokenizer.json", "w") as tokenizer_file:
    tokenizer_file.write(lstm_tokenizer.to_json())

# Convert texts to integer sequences, then pad/truncate at the end ('post') so
# every row has exactly `max_length` tokens.
train_sequences = pad_sequences(
    lstm_tokenizer.texts_to_sequences(train_texts),
    maxlen=max_length, padding='post', truncating='post',
)
test_sequences = pad_sequences(
    lstm_tokenizer.texts_to_sequences(test_texts),
    maxlen=max_length, padding='post', truncating='post',
)

# Define the LSTM model architecture.
# `input_length` is no longer passed to Embedding: it is deprecated in current
# Keras and the sequence length is inferred from the data at fit time.
# NOTE(review): recurrent_dropout > 0 appears to rule out the fused cuDNN LSTM
# kernel, which would explain the ~650 ms/step seen on GPU -- confirm against
# the Keras LSTM docs before changing it, since removing it alters regularisation.
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')
])

# Labels are integer class ids (not one-hot), hence the sparse loss.
lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])

# Keras expects array-like integer labels; the split produced Python lists.
train_labels = np.array(train_labels).astype("int")
test_labels = np.array(test_labels).astype("int")

# Train the LSTM model.
# NOTE(review): the test split doubles as validation data here, so val_* metrics
# are not an independent estimate if they are used to pick hyperparameters.
lstm_history = lstm_model.fit(train_sequences, train_labels,
                              epochs=3,
                              batch_size=16,
                              validation_data=(test_sequences, test_labels))

Epoch 1/3
[1m3577/3577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2356s[0m 656ms/step - accuracy: 0.6839 - loss: 0.5836 - val_accuracy: 0.9421 - val_loss: 0.1494
Epoch 2/3
[1m3577/3577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2354s[0m 655ms/step - accuracy: 0.9583 - loss: 0.1244 - val_accuracy: 0.9745 - val_loss: 0.0667
Epoch 3/3
[1m3577/3577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2338s[0m 648ms/step - accuracy: 0.9828 - loss: 0.0496 - val_accuracy: 0.9774 - val_loss: 0.0650


In [None]:
# Serialize the trained LSTM model to a single .keras file.
lstm_model_path = "saved_lstm_model.keras"
lstm_model.save(lstm_model_path)


## Evaluate the saved LSTM and BERT models


In [None]:
# ---- Import Required Modules ----
import tensorflow as tf
from tensorflow.keras.models import load_model
from transformers import TFBertForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json

# ---- Load Test Data ----
# cleaned_dataset.csv contains EVERY cleaned row (training + test), so scoring
# on it directly would evaluate the LSTM on data it was trained on.
df_cleaned = pd.read_csv("cleaned_dataset.csv")

if "train_texts" in globals():
    # Same kernel session as the train/test split cell: keep its held-out 20%
    # (`test_texts` / `test_labels`) instead of overwriting it with the full set.
    test_texts = list(test_texts)
    test_labels = np.asarray(test_labels).astype(int)
else:
    # Fresh kernel: the split itself was never written to disk, so only the full
    # cleaned dataset is available. Metrics from this path are optimistic.
    print("WARNING: held-out split not found in this session; evaluating on the "
          "full cleaned dataset, which includes rows the LSTM was trained on.")
    test_texts = df_cleaned['Text'].tolist()
    test_labels = df_cleaned['Label'].tolist()

# NOTE(review): saved_bert_model is produced outside the cells visible here; its
# training split is unknown, so its score may still include seen rows -- confirm.

# ---- Load Pretrained Models ----
# Load LSTM model
lstm_model = load_model("saved_lstm_model.keras")

# Load BERT model and tokenizer
bert_model = TFBertForSequenceClassification.from_pretrained("saved_bert_model")
bert_tokenizer = BertTokenizer.from_pretrained("saved_bert_model")

# ---- Preprocess Test Data ----

# Preprocess for LSTM
# These must match the values used when the LSTM was trained.
vocab_size = 10000
max_length = 200

# The LSTM's Embedding weights are tied to the word -> index mapping of the
# tokenizer fitted on the TRAINING texts. Fitting a new Tokenizer on the
# evaluation texts assigns different indices to the same words and feeds the
# model scrambled input, so the training-time tokenizer is reused as-is.
if "lstm_tokenizer" not in globals():
    # Fresh kernel: restore the persisted tokenizer. The JSON file must have
    # been written from the training-fitted tokenizer for results to be valid.
    with open("lstm_tokenizer.json") as tokenizer_file:
        lstm_tokenizer = tokenizer_from_json(tokenizer_file.read())

test_sequences = lstm_tokenizer.texts_to_sequences(test_texts)
test_sequences = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

# Preprocess for BERT
test_encodings = bert_tokenizer(test_texts, truncation=True, padding=True, max_length=256, return_tensors='tf')

# ---- Model Predictions ----
# BERT predictions: the model returns logits; argmax over axis 1 picks the class.
bert_outputs = bert_model.predict(dict(test_encodings), batch_size=8)
bert_predicted_classes = tf.argmax(bert_outputs.logits, axis=1).numpy()

# LSTM predictions: softmax probabilities per class; argmax picks the class.
lstm_outputs = lstm_model.predict(test_sequences, batch_size=8)
lstm_predicted_classes = tf.argmax(lstm_outputs, axis=1).numpy()

# ---- Evaluation Metrics for Both Models ----
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_model(predicted_classes, true_labels):
    """Score predicted class ids against the ground-truth labels.

    Args:
        predicted_classes: Predicted class id for each sample.
        true_labels: Ground-truth class id for each sample, in the same order.

    Returns:
        A 5-tuple ``(accuracy, precision, recall, f1, confusion_matrix)``
        computed with the scikit-learn metric functions of the same names,
        each called as ``metric(true_labels, predicted_classes)``.
    """
    metric_functions = (
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        confusion_matrix,
    )
    return tuple(metric(true_labels, predicted_classes) for metric in metric_functions)

# Evaluate BERT
bert_scores = evaluate_model(bert_predicted_classes, test_labels)
bert_accuracy, bert_precision, bert_recall, bert_f1, bert_conf_matrix = bert_scores

# Evaluate LSTM
lstm_scores = evaluate_model(lstm_predicted_classes, test_labels)
lstm_accuracy, lstm_precision, lstm_recall, lstm_f1, lstm_conf_matrix = lstm_scores

# ---- Print Evaluation Results ----
# Both reports share one layout; the LSTM heading carries a leading newline so
# a blank line separates the two reports.
for report_heading, model_scores in (
    ("BERT Model Evaluation:", bert_scores),
    ("\nLSTM Model Evaluation:", lstm_scores),
):
    acc, prec, rec, f1_value, matrix = model_scores
    print(report_heading)
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1_value:.4f}")
    print(f"Confusion Matrix:\n{matrix}")


Some layers from the model checkpoint at saved_bert_model were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at saved_bert_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


[1m8943/8943[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m987s[0m 110ms/step
BERT Model Evaluation:
Accuracy: 0.9990
Precision: 0.9992
Recall: 0.9988
F1 Score: 0.9990
Confusion Matrix:
[[34999    29]
 [   45 36464]]

LSTM Model Evaluation:
Accuracy: 0.8049
Precision: 0.8430
Recall: 0.7591
F1 Score: 0.7989
Confusion Matrix:
[[29868  5160]
 [ 8794 27715]]


In [None]:
# Dump the tokenizer (configuration and vocabulary) to a JSON file on disk.
tokenizer_json = lstm_tokenizer.to_json()
with open("lstm_tokenizer.json", mode="w") as json_file:
    json_file.write(tokenizer_json)
