In [1]:
import sys
import os

# Make the helper modules that live in ../scripts importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not pile up
# duplicate entries on sys.path.
scripts_dir = os.path.abspath("../scripts")
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
import joblib
from sklearn.metrics import precision_recall_curve

from data_loader import load_data
from model_builder import build_model

In [3]:
# Pull the dataset in through the project's data_loader helper.
df = load_data()

# Model input is the 'text' column; the prediction target is the 'label' column.
# NOTE(review): presumably label 0 = fake and 1 = real — confirm against data_loader.
X, y = df['text'], df['label']

# Print the number of rows per class to check the class balance.
print(y.value_counts())

label
0    23481
1    21417
Name: count, dtype: int64


In [4]:
# Hold out 10% of the data as the final test set; the remaining 90% is split
# into train and dev afterwards. Stratifying on y keeps the class ratio the same
# in both parts, and the fixed random_state makes the split repeatable.
X_train_dev, X_test, y_train_dev, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
    stratify=y,
)

In [5]:
# Split the 90% train+dev pool into train and dev.
# test_size=0.475 sends 47.5% of the pool to dev, i.e. roughly 47% of the full
# dataset ends up in train and roughly 43% in dev (the last 10% is the test set).
# Stratified on the labels with a fixed seed, like the test split.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_dev,
    y_train_dev,
    test_size=0.475,
    random_state=42,
    stratify=y_train_dev,
)

In [6]:
# TF-IDF features over word 1- to 3-grams.
# The vocabulary is capped at 5000 terms; terms seen in fewer than 3 documents or
# in more than 80% of documents are dropped, English stop words are removed, and
# term frequencies are scaled sub-linearly.
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.8,
    max_features=5000,
    sublinear_tf=True,
)

# Learn the vocabulary/IDF weights on the training split only, then reuse the
# fitted vectorizer for dev and test. Each sparse result is converted to a dense
# array before being handed to the model.
X_train_vect = vectorizer.fit_transform(X_train).toarray()
X_dev_vect, X_test_vect = (
    vectorizer.transform(split).toarray() for split in (X_dev, X_test)
)

In [7]:
# Create the network via the project's model_builder helper; its input width is
# the number of TF-IDF feature columns.
model = build_model(
    input_dim=X_train_vect.shape[1],
)

In [8]:
# Stop training once the validation loss has gone 2 epochs without improving,
# and roll the model back to the weights from its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

In [15]:
# Fit on the training split and validate on the dev split after every epoch.
# Up to 150 epochs are allowed; the `early_stopping` callback may end training sooner.
# Class 1 gets a slightly larger loss weight (1.1) than class 0 (1.0).
# NOTE: Keras `fit` resumes from the model's current weights, so re-running this
# cell without rebuilding the model continues training instead of starting over.
model.fit(
    x=X_train_vect,
    y=y_train,
    batch_size=32,
    epochs=150,
    class_weight={0: 1.0, 1: 1.1},
    validation_data=(X_dev_vect, y_dev),
    callbacks=[early_stopping],
)

Epoch 1/150
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9762 - loss: 0.2810 - val_accuracy: 0.9849 - val_loss: 0.2555
Epoch 2/150
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9780 - loss: 0.2773 - val_accuracy: 0.9853 - val_loss: 0.2524
Epoch 3/150
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9769 - loss: 0.2768 - val_accuracy: 0.9858 - val_loss: 0.2499
Epoch 4/150
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9789 - loss: 0.2698 - val_accuracy: 0.9861 - val_loss: 0.2464
Epoch 5/150
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9806 - loss: 0.2689 - val_accuracy: 0.9853 - val_loss: 0.2432
Epoch 6/150
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9808 - loss: 0.2621 - val_accuracy: 0.9853 - val_loss: 0.2425
Epoch 7/150
[1m663/66

<keras.src.callbacks.history.History at 0x22d0f09b880>

In [10]:
# Final evaluation on the held-out test split.
print("\nFinal Evaluation on Test Set:")

# Predicted probabilities from the network for every test row.
y_pred_prob = model.predict(X_test_vect)

# Turn probabilities into hard 0/1 labels: strictly above 0.5 counts as class 1.
y_pred = (y_pred_prob > 0.5).astype(int)

# Per-class precision / recall / F1 against the true test labels.
print(
    classification_report(y_test, y_pred)
)


Final Evaluation on Test Set:
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 900us/step
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      2348
           1       0.97      0.99      0.98      2142

    accuracy                           0.98      4490
   macro avg       0.98      0.98      0.98      4490
weighted avg       0.98      0.98      0.98      4490



In [11]:
# Save model and vectorizer
# Create the output directory first so saving also works on a fresh checkout
# where ../models does not exist yet.
os.makedirs("../models", exist_ok=True)

# Persist the trained network in the native Keras format ...
model.save("../models/model.keras")
# ... and the fitted TF-IDF vectorizer next to it, so inference can reproduce
# exactly the same feature mapping. joblib.dump returns the list of written paths.
joblib.dump(vectorizer, "../models/vectorizer.pkl")

['../models/vectorizer.pkl']

In [12]:
# Reload both saved artifacts from disk, replacing the in-memory objects, so the
# cells below run against exactly what was persisted.
# NOTE: joblib.load unpickles the file — only load vectorizer files you trust.
vectorizer = joblib.load("../models/vectorizer.pkl")
model = load_model("../models/model.keras")

In [13]:
# Real-world examples
print("\nInference on Real-world Samples:")

# (headline, subject) pairs; the trailing comments are the author's expectation
# for each headline, not ground-truth labels.
examples = [
    ("Breaking: Prime Minister announces new economic reforms.", "politicsNews"),  # Real
    ("Aliens landed in Ohio according to anonymous sources.", "worldnews"),        # Fake
    ("The COVID-19 vaccine rollout continues across Europe.", "healthNews"),       # Real
    ("NASA confirms water on the Moon.", "scienceNews"),                           # Real
    ("Donald Trump wins the presidential election again.", "politicsNews"),        # Fake/Unlikely
    ("India is a country.", "worldnews"),                                          # Generic/ambiguous
    ("White House, Congress prepare for talks on spending, immigration", "politicsNews"),  # Real
    ("Scientists discover a way to live forever using jellyfish DNA.", "scienceNews"),     # Likely Fake
    ("Stocks crash after rumors of global economic collapse.", "businessNews"),            # Possibly Fake
    ("New study shows chocolate improves brain function.", "healthNews"),                  # Real-sounding
    ("Vaccine causes telepathic powers in 1% of recipients, claims study.", "healthNews"), # Fake
    ("UN convenes emergency session on climate change crisis.", "worldnews"),             # Real
    ("Elon Musk launches reusable rocket that lands on Mars.", "scienceNews"),            # Partially Fake (as of now)
    ("Government passes bill banning all smartphones by 2025.", "politicsNews"),          # Likely Fake
    ("Earthquake hits Tokyo, no casualties reported.", "worldnews")                       # Real
]

# Decision threshold for calling a sample "Real".
# NOTE: this value is hard-coded, not tuned on the dev set, and it differs from
# the 0.5 cut-off used for the classification reports in this notebook.
threshold = 0.4

# Build the model input for every example up front.
# NOTE(review): assumes load_data() builds the training text in this same
# "title [SEP] subject" shape — confirm against data_loader.
combined_texts = [f"{title} [SEP] {subject}" for title, subject in examples]

# Vectorize and score all examples in one batch instead of one predict() call
# per sample; column 0 holds the single output probability for each row.
X_ex = vectorizer.transform(combined_texts)
pred_probs = model.predict(X_ex.toarray())[:, 0]

for (title, subject), pred_prob in zip(examples, pred_probs):
    # Scores at or above the threshold are reported as "Real", the rest as "Fake"
    # (presumably class 1 = real news — verify against the label encoding).
    label = "Real" if pred_prob >= threshold else "Fake"

    print(f"{title} -> {label} ({pred_prob:.2f})")

# Check vocab info
feature_names = vectorizer.get_feature_names_out()
print("\nSample features:")
print(feature_names[:20])
print("Total features in vocab:", len(feature_names))


Inference on Real-world Samples:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Breaking: Prime Minister announces new economic reforms. -> Real (0.70)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Aliens landed in Ohio according to anonymous sources. -> Fake (0.17)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
The COVID-19 vaccine rollout continues across Europe. -> Fake (0.22)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
NASA confirms water on the Moon. -> Fake (0.26)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Donald Trump wins the presidential election again. -> Real (0.55)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
India is a country. -> Fake (0.28)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
White House, Congress prepare for talks on spending, immigration -> Real (0.48)
[1m1/1[0m [32m━━━━━━

In [14]:
from sklearn.metrics import classification_report

def _predict_and_report(split_name, features, targets):
    """Predict hard 0/1 labels for one split and print its classification report.

    Probabilities strictly above 0.5 are mapped to class 1. The predicted label
    array is returned so it stays available after the report is printed.
    """
    predicted = (model.predict(features) > 0.5).astype(int)
    print(f"\n{split_name} Classification Report:")
    print(classification_report(targets, predicted))
    return predicted


# Same evaluation for all three splits, in train -> dev -> test order.
y_train_pred = _predict_and_report("Train", X_train_vect, y_train)
y_dev_pred = _predict_and_report("Dev", X_dev_vect, y_dev)
y_test_pred = _predict_and_report("Test", X_test_vect, y_test)

[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 711us/step

Train Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     11095
           1       0.98      0.99      0.98     10119

    accuracy                           0.99     21214
   macro avg       0.98      0.99      0.99     21214
weighted avg       0.99      0.99      0.99     21214

[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 698us/step

Dev Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99     10038
           1       0.97      0.99      0.98      9156

    accuracy                           0.98     19194
   macro avg       0.98      0.98      0.98     19194
weighted avg       0.98      0.98      0.98     19194

[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 814us/step

Test Classification Report:
              precision    r