# Fake News Detection System (Without BERT)

In [1]:
# Path setup: register the sibling "../scripts" directory on sys.path so the
# local modules imported further down (presumably data_loader and
# model_builder — confirm they live there) can be found.
import sys
import os

# The path is resolved relative to the current working directory.
# Only append it once, so re-running this cell does not pile up duplicate
# sys.path entries.
scripts_dir = os.path.abspath("../scripts")
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [2]:
# Core packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model
import joblib
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report

from data_loader import load_data
from model_builder import build_model

## Load and Explore the Dataset

label 0 = Fake, label 1 = Real. The dataset is nearly balanced — this is good for training a classifier.

In [3]:
# Read the dataset through the project's loader, then show how many rows
# carry each label value.
df = load_data()

label_counts = df["label"].value_counts()
print(label_counts)

label
0    23481
1    21417
Name: count, dtype: int64


## Train-Validation-Test Split

The final splits are approximately:

- 47% Train (21,214 rows)

- 43% Dev (19,194 rows)

- 10% Test (4,490 rows)

In [4]:
# Stratified Train / Dev / Test split, done in two stages.
X = df['text']
y = df['label']

# Fractions handed to train_test_split.
# DEV_FRACTION is relative to the 90% train+dev pool, NOT to the full dataset:
# 0.475 * 0.90 ≈ 0.43 of all rows end up in Dev and the remaining ≈ 0.47 in Train.
# For a 70% / 20% / 10% split of the total, DEV_FRACTION would be 0.222 instead.
TEST_FRACTION = 0.10
DEV_FRACTION = 0.475
SPLIT_SEED = 42

# Stage 1: hold out 10% of all rows as the Test set, keeping the label ratio.
X_train_dev, X_test, y_train_dev, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, stratify=y, random_state=SPLIT_SEED
)

# Stage 2: divide the remaining 90% into Train and Dev, again stratified by label.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_dev,
    y_train_dev,
    test_size=DEV_FRACTION,
    stratify=y_train_dev,
    random_state=SPLIT_SEED,
)

## TF-IDF Vectorization

TF-IDF vectorization turns raw text into meaningful numerical features using unigrams, bigrams, and trigrams.

In [5]:
# TF-IDF Vectorization
# Vectorizer settings, collected in one place:
#   max_features  - keep at most 5000 vocabulary entries
#   ngram_range   - unigrams, bigrams and trigrams
#   min_df/max_df - drop terms seen in fewer than 3 documents or in more than 80% of them
#   stop_words    - remove scikit-learn's built-in English stop words
#   sublinear_tf  - use 1 + log(tf) instead of the raw term frequency
tfidf_settings = dict(
    max_features=5000,
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.8,
    stop_words='english',
    sublinear_tf=True,
)
vectorizer = TfidfVectorizer(**tfidf_settings)

# The vocabulary and IDF weights are learned from the training split only;
# Dev and Test are merely transformed with the fitted vectorizer.
# .toarray() turns each sparse matrix into a dense NumPy array.
X_train_vect = vectorizer.fit_transform(X_train).toarray()
X_dev_vect, X_test_vect = (
    vectorizer.transform(split_text).toarray() for split_text in (X_dev, X_test)
)

## Build and Train the Neural Network

The model is a simple feedforward neural network with regularization and dropout. The `class_weight` argument gives the slightly smaller Real class (label 1) a 10% higher weight in the loss to offset the mild label imbalance.

In [6]:
# Seed the Python, NumPy and backend random generators before the model is
# built, so that weight initialisation and batch shuffling are repeatable
# across "Restart & Run All" (as far as the hardware/backend allows).
keras.utils.set_random_seed(42)

# Build the model; the input width equals the number of TF-IDF features.
model = build_model(input_dim=X_train_vect.shape[1])

# Stop once the dev loss has not improved for 2 consecutive epochs and roll the
# weights back to the best epoch seen, so epochs=150 is only an upper bound.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train the model.
# class_weight up-weights label 1, the slightly smaller class in this dataset.
# The returned History object is kept in `history` for later inspection, which
# also prevents its repr from being echoed as the cell output.
history = model.fit(
    X_train_vect, y_train,
    validation_data=(X_dev_vect, y_dev),
    epochs=150,
    batch_size=32,
    callbacks=[early_stopping],
    class_weight={0: 1.0, 1: 1.1},
)

Epoch 1/150
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7128 - loss: 3.1646 - val_accuracy: 0.9385 - val_loss: 0.6715
Epoch 2/150
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9150 - loss: 0.6960 - val_accuracy: 0.9511 - val_loss: 0.6553
Epoch 3/150
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9190 - loss: 0.6812 - val_accuracy: 0.9541 - val_loss: 0.6401
Epoch 4/150
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9228 - loss: 0.6660 - val_accuracy: 0.9533 - val_loss: 0.6245
Epoch 5/150
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9267 - loss: 0.6477 - val_accuracy: 0.9550 - val_loss: 0.6087
Epoch 6/150
[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9295 - loss: 0.6343 - val_accuracy: 0.9576 - val_loss: 0.5921
Epoch 7/150
[1m663/66

<keras.src.callbacks.history.History at 0x1eac7f05f30>

## Evaluate on Test Set

Final test accuracy is ~99%, indicating strong generalization on in-distribution test data.

In [7]:
# Evaluation on the held-out Test set.
print("\nFinal Evaluation on Test Set:")

# Raw model outputs for the Test split
# (presumably one probability of label 1 per row — build_model is not shown here).
y_pred_prob = model.predict(X_test_vect)

# Outputs strictly above 0.5 become label 1, everything else label 0.
above_cutoff = y_pred_prob > 0.5
y_pred = above_cutoff.astype(int)

test_report = classification_report(y_test, y_pred)
print(test_report)


Final Evaluation on Test Set:
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 927us/step
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2348
           1       0.98      0.99      0.99      2142

    accuracy                           0.99      4490
   macro avg       0.99      0.99      0.99      4490
weighted avg       0.99      0.99      0.99      4490



## Save and Reload Model + Vectorizer

In [8]:
# Save model and vectorizer.
# Create the target directory first: on a fresh checkout "../models" may not
# exist yet, and the writes below would then fail.
os.makedirs("../models", exist_ok=True)

# The trained network goes to a .keras file, the fitted TF-IDF vectorizer to a
# joblib pickle. Both are needed together at inference time.
model.save("../models/model.keras")
joblib.dump(vectorizer, "../models/vectorizer.pkl")

['../models/vectorizer.pkl']

In [9]:
# Reload both artifacts from disk so the cells below exercise exactly what was
# persisted, rather than the in-memory training objects.
model_path = "../models/model.keras"
vectorizer_path = "../models/vectorizer.pkl"

model = load_model(model_path)

# joblib files are pickles: only load files that come from a trusted source.
vectorizer = joblib.load(vectorizer_path)

## Real-world Inference Examples

The model sometimes mislabels real-looking headlines as fake. This is a known limitation of shallow bag-of-words representations such as TF-IDF, which capture word and n-gram frequencies rather than meaning.

In [13]:
# Real-world examples: score a handful of hand-written (headline, subject)
# pairs with the reloaded model and vectorizer.
print("\nInference on Real-world Samples:")

examples = [
    ("Breaking: Prime Minister announces new economic reforms.", "politicsNews"),  # Real
    ("Aliens landed in Ohio according to anonymous sources.", "worldnews"),        # Fake
    ("The COVID-19 vaccine rollout continues across Europe.", "healthNews"),       # Real
    ("NASA confirms water on the Moon.", "scienceNews"),                           # Real
    ("Donald Trump wins the presidential election again.", "politicsNews"),        # Fake/Unlikely
    ("India is a country.", "worldnews"),                                          # Generic/ambiguous
    ("White House, Congress prepare for talks on spending, immigration", "politicsNews"),  # Real
    ("Scientists discover a way to live forever using jellyfish DNA.", "scienceNews"),     # Likely Fake
    ("Stocks crash after rumors of global economic collapse.", "businessNews"),            # Possibly Fake
    ("New study shows chocolate improves brain function.", "healthNews"),                  # Real-sounding
    ("Vaccine causes telepathic powers in 1% of recipients, claims study.", "healthNews"), # Fake
    ("UN convenes emergency session on climate change crisis.", "worldnews"),             # Real
    ("Elon Musk launches reusable rocket that lands on Mars.", "scienceNews"),            # Partially Fake (as of now)
    ("Government passes bill banning all smartphones by 2025.", "politicsNews"),          # Likely Fake
    ("Earthquake hits Tokyo, no casualties reported.", "worldnews")                       # Real
]

# Decision threshold: outputs >= threshold are reported as "Real".
# This is a hand-picked value, not one tuned on the dev set. It is lower than
# the 0.5 cut-off used for the classification reports in this notebook, so
# those metrics do not describe this decision rule exactly.
threshold = 0.4

# Build every input string up front and score them in one batch: a single
# transform/predict call instead of one per example. verbose=0 keeps the
# per-call Keras progress bar out of the cell output.
combined_texts = [f"{title} [SEP] {subject}" for title, subject in examples]
X_ex = vectorizer.transform(combined_texts)
pred_probs = model.predict(X_ex.toarray(), verbose=0)[:, 0]

for (title, _subject), pred_prob in zip(examples, pred_probs):
    label = "Real" if pred_prob >= threshold else "Fake"
    print(f"{title} -> {label} ({pred_prob:.2f})")


Inference on Real-world Samples:
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Breaking: Prime Minister announces new economic reforms. -> Real (0.55)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Aliens landed in Ohio according to anonymous sources. -> Fake (0.05)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
The COVID-19 vaccine rollout continues across Europe. -> Fake (0.07)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
NASA confirms water on the Moon. -> Fake (0.12)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Donald Trump wins the presidential election again. -> Real (0.57)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
India is a country. -> Fake (0.13)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
White House, Congress prepare for talks on spending, immigration -> Fake (0.33)
[1m1/1[0m [32m━━━━━━

## Classification Report for All Splits

All splits report ~99% precision, recall, and F1 — suggesting very strong consistency across data splits.

In [14]:
# Print a classification report for every split, using the same 0.5 cut-off
# each time. One loop replaces three copy-pasted stanzas, so the cut-off and
# the report call are defined in a single place.
report_inputs = (
    ("\nTrain Classification Report:", X_train_vect, y_train),
    ("\nDev Classification Report:", X_dev_vect, y_dev),
    ("\nTest Classification Report:", X_test_vect, y_test),
)

split_predictions = []
for heading, split_features, split_labels in report_inputs:
    # Predict first, then print, so the output order matches the progress bar,
    # heading, report sequence of the individual stanzas.
    split_pred = (model.predict(split_features) > 0.5).astype(int)
    split_predictions.append(split_pred)
    print(heading)
    print(classification_report(split_labels, split_pred))

# Keep the per-split prediction arrays available under their usual names.
y_train_pred, y_dev_pred, y_test_pred = split_predictions

[1m663/663[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 755us/step

Train Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     11095
           1       0.99      0.99      0.99     10119

    accuracy                           0.99     21214
   macro avg       0.99      0.99      0.99     21214
weighted avg       0.99      0.99      0.99     21214

[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 721us/step

Dev Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     10038
           1       0.99      0.99      0.99      9156

    accuracy                           0.99     19194
   macro avg       0.99      0.99      0.99     19194
weighted avg       0.99      0.99      0.99     19194

[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 832us/step

Test Classification Report:
              precision    r

## Feature Exploration

Inspecting TF-IDF vocabulary

In [15]:
# Peek at the fitted TF-IDF vocabulary: the first 20 feature names and the
# total vocabulary size. The names are fetched once and reused.
feature_names = vectorizer.get_feature_names_out()

print("\nSample features:")
print(feature_names[:20])
print("Total features in vocab:", len(feature_names))


Sample features:
['00' '000' '000 people' '10' '10 000' '10 percent' '10 years' '100'
 '100 000' '11' '12' '120' '13' '14' '15' '150' '16' '17' '18' '19']
Total features in vocab: 5000
