In [None]:
import pandas as pd
import numpy as np
import re
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Ensure reproducibility: seed both RNGs that are in scope. Previously only
# TensorFlow was seeded, leaving NumPy's global generator unseeded.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Output directories for the model, metrics, and figures written by later cells.
for out_dir in ("output/models", "output/results", "output/visualizations"):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Directory holding the per-source CSV exports.
base_dir = Path("data/processed")

# CSV filename -> class name shared by every row in that file.
files = {
    "gossipcop_fake.csv": "fake",
    "gossipcop_real.csv": "real",
    "politifact_fake.csv": "fake",
    "politifact_real.csv": "real",
}

# Read each file, tag its rows with the numeric label (0=fake, 1=real), and
# keep only the two columns used downstream.
dfs = []
for csv_name, class_name in files.items():
    frame = pd.read_csv(base_dir / csv_name)
    frame["label"] = int(class_name != "fake")
    dfs.append(frame.loc[:, ["title", "label"]])

# Stack all sources into one frame with a fresh 0..n-1 index.
full_df = pd.concat(dfs, ignore_index=True)
print("✅ Combined dataset:", full_df.shape)
full_df.head()

In [None]:
def clean_text(text):
    """Normalize a headline before tokenization.

    Lowercases the input, removes URLs, drops every character that is not an
    ASCII lowercase letter, digit, or whitespace, and collapses whitespace
    runs into single spaces.

    Args:
        text: Raw title. Non-string values are converted with ``str()``.
            Missing values (``None`` or a float NaN) are treated as empty.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # A missing title would otherwise be stringified into the spurious word
    # "none"/"nan" and reach the tokenizer as if it were real content.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)        # strip URLs
    text = re.sub(r"[^a-z0-9\s]", "", text)    # keep letters, digits, whitespace
    return re.sub(r"\s+", " ", text).strip()   # collapse whitespace runs

# Store a normalized copy of every title next to the raw column.
full_df["clean_title"] = full_df["title"].map(clean_text)
full_df.head()

In [None]:
# Stratified split: hold out 30% of the rows, then divide that holdout evenly
# into validation and test sets. Both splits preserve the label ratio.
train_df, temp_df = train_test_split(
    full_df,
    test_size=0.3,
    stratify=full_df["label"],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["label"],
    random_state=42,
)

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

In [None]:
MAX_VOCAB = 20_000  # vocabulary cap passed to the tokenizer
MAX_LEN = 50        # fixed sequence length after padding/truncation

# Fit the vocabulary on the training titles only, so validation and test
# text does not influence it.
train_titles = train_df["clean_title"]
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(train_titles)

def tokenize_texts(texts):
    """Turn cleaned texts into fixed-length integer sequences.

    Encodes ``texts`` with the module-level ``tokenizer`` and then pads or
    truncates every sequence at its end so each row has ``MAX_LEN`` entries.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=MAX_LEN,
        padding="post",
        truncating="post",
    )

# Encode the titles of each split, in train / val / test order.
X_train, X_val, X_test = (
    tokenize_texts(split["clean_title"]) for split in (train_df, val_df, test_df)
)

# Matching label vectors as NumPy arrays.
y_train, y_val, y_test = (
    np.array(split["label"]) for split in (train_df, val_df, test_df)
)

print("✅ Tokenization done:", X_train.shape, X_val.shape, X_test.shape)

In [None]:
EMBED_DIM = 128   # embedding vector size
FILTERS = 64      # number of Conv1D filters
LSTM_UNITS = 64   # units per LSTM direction

# Build the network layer by layer: embedding -> 1D convolution + pooling ->
# bidirectional LSTM -> dense head with a single sigmoid output.
model = Sequential()
model.add(Embedding(MAX_VOCAB, EMBED_DIM, input_length=MAX_LEN))
model.add(Conv1D(FILTERS, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(LSTM(LSTM_UNITS, dropout=0.2, recurrent_dropout=0.2)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
checkpoint_path = "output/models/cnn_lstm_model.h5"

# Both callbacks track the same metric. Previously early stopping followed
# val_loss while the checkpoint followed val_accuracy, so the weights restored
# in memory (and evaluated later) could come from a different epoch than the
# model file written to disk.
monitor_metric = "val_loss"

# Stop after 3 epochs without improvement and roll back to the best weights.
es = EarlyStopping(monitor=monitor_metric, patience=3, restore_best_weights=True)

# Overwrite the checkpoint only when the monitored metric improves.
mc = ModelCheckpoint(checkpoint_path, monitor=monitor_metric, save_best_only=True, verbose=1)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
    callbacks=[es, mc],
    verbose=1
)

In [None]:
# One figure per training curve: (history key, legend suffix, title/axis word,
# output file). Accuracy is drawn first, then loss.
curve_specs = [
    ("accuracy", "Acc", "Accuracy", "output/visualizations/accuracy_curve.png"),
    ("loss", "Loss", "Loss", "output/visualizations/loss_curve.png"),
]

for metric_key, legend_word, axis_word, out_path in curve_specs:
    plt.figure(figsize=(10, 4))
    plt.plot(history.history[metric_key], label=f"Train {legend_word}")
    plt.plot(history.history[f"val_{metric_key}"], label=f"Val {legend_word}")
    plt.title(f"CNN-LSTM {axis_word}")
    plt.xlabel("Epoch")
    plt.ylabel(axis_word)
    plt.legend()
    plt.savefig(out_path)  # save before show() so the written figure is not blank
    plt.show()

In [None]:
# The model ends in a single sigmoid unit, so predict() returns one column;
# flatten it so predictions have the same 1-D shape as y_test.
y_prob = model.predict(X_test).ravel()

# Use the same decision rule as predict_text (score >= 0.5 -> class 1 / Real)
# so a score of exactly 0.5 is labelled the same way in both places.
y_pred = (y_prob >= 0.5).astype("int32")

acc = accuracy_score(y_test, y_pred)
print(f"✅ Test Accuracy: {acc:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Fake", "Real"]))

In [None]:
# Count test predictions per (true, predicted) class pair.
cm = confusion_matrix(y_test, y_pred)

# Tick labels in label order: 0 -> Fake, 1 -> Real.
class_names = ["Fake", "Real"]

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix")
fig.savefig("output/visualizations/confusion_matrix.png")
plt.show()

In [None]:
import pickle

# Nothing to write for the model here: this cell only reports the path that
# the ModelCheckpoint callback was configured with.
print("✅ Saved model to:", checkpoint_path)

# Serialize the fitted tokenizer to a pickle file.
tokenizer_file = Path("output/models/tokenizer.pkl")
tokenizer_file.write_bytes(pickle.dumps(tokenizer))
print("✅ Saved tokenizer.")

In [None]:
def predict_text(text):
    """Classify a single raw headline with the trained model.

    Args:
        text: Raw headline; it is normalized with ``clean_text`` first.

    Returns:
        A ``(label, score)`` tuple where ``score`` is the model's sigmoid
        output as a float and ``label`` is ``"Real"`` when ``score >= 0.5``,
        otherwise ``"Fake"``.
    """
    cleaned = clean_text(text)
    # Reuse the training-time encoder. The previous inline pad_sequences call
    # omitted truncating="post", so inputs longer than MAX_LEN kept their
    # last tokens while training kept the first ones.
    padded = tokenize_texts([cleaned])
    pred = float(model.predict(padded)[0][0])
    label = "Real" if pred >= 0.5 else "Fake"
    return label, pred

# Smoke test: classify one made-up headline and show its label and score.
sample = "Breaking news: scientists found water on Mars!"
prediction = predict_text(sample)
label, score = prediction
print(f"Prediction: {label} ({score:.3f})")