In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
import pandas as pd

df = pd.read_csv("data/processed/combined_fake_news_dataset.csv")
df.head()


Unnamed: 0,text,label,source
0,IRAN MAKES MAJOR Announcement About How They P...,fake,kaggle
1,Britain seeks new ways to detect explosives in...,real,kaggle
2,Fox News Host Calls GOP Out On Voter ID Laws ...,fake,kaggle
3,AUSTRIAN JUSTICE SYSTEM Gives Teen With Homema...,fake,kaggle
4,What Katy Perry Did With This Gift John Mayer ...,fake,gossipcop


In [3]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df["text_clean"] = df["text"].apply(clean_text)


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["text_clean"])

sequences = tokenizer.texts_to_sequences(df["text_clean"])
padded = pad_sequences(sequences, maxlen=300, padding='post', truncating='post')


In [5]:
from sklearn.model_selection import train_test_split

labels = df["label"].map({"real": 0, "fake": 1}).values
X_train, X_test, y_train, y_test = train_test_split(
    padded, labels, test_size=0.2, stratify=labels, random_state=42
)


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

model_lstm = Sequential([
    Embedding(10000, 64, input_length=300),
    Bidirectional(LSTM(64)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.fit(X_train, y_train, validation_split=0.2, epochs=5, batch_size=64)




Epoch 1/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 309ms/step - accuracy: 0.8654 - loss: 0.2700 - val_accuracy: 0.9585 - val_loss: 0.1006
Epoch 2/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 296ms/step - accuracy: 0.9696 - loss: 0.0760 - val_accuracy: 0.9557 - val_loss: 0.1062
Epoch 3/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 279ms/step - accuracy: 0.9775 - loss: 0.0561 - val_accuracy: 0.9565 - val_loss: 0.1156
Epoch 4/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 300ms/step - accuracy: 0.9879 - loss: 0.0319 - val_accuracy: 0.9545 - val_loss: 0.1740
Epoch 5/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 280ms/step - accuracy: 0.9923 - loss: 0.0192 - val_accuracy: 0.9509 - val_loss: 0.1842


<keras.src.callbacks.history.History at 0x1c910ebcaf0>

In [7]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D

model_cnn = Sequential([
    Embedding(10000, 64, input_length=300),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn.fit(X_train, y_train, validation_split=0.2, epochs=5, batch_size=64)


Epoch 1/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 81ms/step - accuracy: 0.8383 - loss: 0.3182 - val_accuracy: 0.9565 - val_loss: 0.1090
Epoch 2/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 83ms/step - accuracy: 0.9674 - loss: 0.0867 - val_accuracy: 0.9604 - val_loss: 0.0967
Epoch 3/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 80ms/step - accuracy: 0.9844 - loss: 0.0463 - val_accuracy: 0.9581 - val_loss: 0.1097
Epoch 4/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 80ms/step - accuracy: 0.9952 - loss: 0.0167 - val_accuracy: 0.9496 - val_loss: 0.1657
Epoch 5/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 84ms/step - accuracy: 0.9992 - loss: 0.0042 - val_accuracy: 0.9497 - val_loss: 0.1874


<keras.src.callbacks.history.History at 0x1c92a1a18a0>

In [8]:
from tensorflow.keras.layers import GRU

model_gru = Sequential([
    Embedding(10000, 64, input_length=300),
    GRU(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_gru.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_gru.fit(X_train, y_train, validation_split=0.2, epochs=5, batch_size=64)


Epoch 1/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 174ms/step - accuracy: 0.6922 - loss: 0.5633 - val_accuracy: 0.8831 - val_loss: 0.3557
Epoch 2/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 175ms/step - accuracy: 0.8749 - loss: 0.3572 - val_accuracy: 0.9044 - val_loss: 0.2737
Epoch 3/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 176ms/step - accuracy: 0.8505 - loss: 0.3461 - val_accuracy: 0.9174 - val_loss: 0.2023
Epoch 4/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 176ms/step - accuracy: 0.9168 - loss: 0.1862 - val_accuracy: 0.9212 - val_loss: 0.2337
Epoch 5/5
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 176ms/step - accuracy: 0.9385 - loss: 0.1511 - val_accuracy: 0.9208 - val_loss: 0.1674


<keras.src.callbacks.history.History at 0x1c952375c30>

In [9]:
from sklearn.metrics import classification_report

def evaluate(model, name):
    y_pred = (model.predict(X_test) > 0.5).astype("int32")
    print(f"=== {name} ===")
    print(classification_report(y_test, y_pred))

evaluate(model_lstm, "BiLSTM")
evaluate(model_cnn, "CNN")
evaluate(model_gru, "GRU")


[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 54ms/step
=== BiLSTM ===
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      5750
           1       0.95      0.96      0.95      5206

    accuracy                           0.96     10956
   macro avg       0.96      0.96      0.96     10956
weighted avg       0.96      0.96      0.96     10956

[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step
=== CNN ===
              precision    recall  f1-score   support

           0       0.97      0.94      0.95      5750
           1       0.94      0.97      0.95      5206

    accuracy                           0.95     10956
   macro avg       0.95      0.95      0.95     10956
weighted avg       0.95      0.95      0.95     10956

[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 42ms/step
=== GRU ===
              precision    recall  f1-score   support

           0       0.96

In [11]:
# 1. Get predictions (probabilities) from both models
bilstm_probs = model_lstm.predict(X_test)
cnn_probs = model_cnn.predict(X_test)

# 2. Average the predicted probabilities (soft voting)
ensemble_probs = (bilstm_probs + cnn_probs) / 2

# 3. Convert probabilities to final class predictions
ensemble_preds = (ensemble_probs > 0.5).astype(int)

# 4. Evaluate ensemble performance
from sklearn.metrics import classification_report, confusion_matrix

print("=== Ensemble (BiLSTM + CNN) ===")
print(classification_report(y_test, ensemble_preds))



[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 46ms/step
[1m343/343[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step
=== Ensemble (BiLSTM + CNN) ===
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      5750
           1       0.95      0.97      0.96      5206

    accuracy                           0.96     10956
   macro avg       0.96      0.96      0.96     10956
weighted avg       0.96      0.96      0.96     10956



In [12]:
model_lstm.save("models/bilstm_model.h5")
model_cnn.save("models/cnn_model.h5")


