## Imports

In [1]:
import sys
import os

sys.path.append(os.path.abspath("../scripts"))

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from tensorflow import keras
import joblib

from data_loader import load_data
from model_builder import build_model

## Load and prepare data

In [3]:
df = load_data()

# Split into features and labels
X = df['text']
y = df['label']

In [4]:
df['label'].value_counts()

label
0    23481
1    21417
Name: count, dtype: int64

In [5]:
# Split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Vectorize text using TF-IDF
# vectorizer = TfidfVectorizer(max_features=5000)
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')

X_train_vect = vectorizer.fit_transform(X_train).toarray()
X_test_vect = vectorizer.transform(X_test).toarray()

In [7]:
# Build the model
model = build_model(input_dim=X_train_vect.shape[1])

In [8]:
# Train the model
model.fit(X_train_vect, y_train, epochs=10, batch_size=32, validation_split=0.1)

Epoch 1/10
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8814 - loss: 0.3586 - val_accuracy: 0.9894 - val_loss: 0.0363
Epoch 2/10
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9884 - loss: 0.0448 - val_accuracy: 0.9922 - val_loss: 0.0245
Epoch 3/10
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.9926 - loss: 0.0271 - val_accuracy: 0.9925 - val_loss: 0.0234
Epoch 4/10
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9956 - loss: 0.0155 - val_accuracy: 0.9936 - val_loss: 0.0267
Epoch 5/10
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9961 - loss: 0.0108 - val_accuracy: 0.9936 - val_loss: 0.0285
Epoch 6/10
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9965 - loss: 0.0095 - val_accuracy: 0.9942 - val_loss: 0.0306
Epoch 7/10
[1m1

<keras.src.callbacks.history.History at 0x22f25ec2ce0>

In [9]:
# Evaluate on test set
y_pred_prob = model.predict(X_test_vect)
y_pred = (y_pred_prob > 0.5).astype(int)

print(classification_report(y_test, y_pred))

[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4710
           1       0.99      0.99      0.99      4270

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [10]:
# Save model in native Keras format
model.save("../models/model.keras")

In [11]:
# Save the vectorizer
joblib.dump(vectorizer, "../models/vectorizer.pkl")

['../models/vectorizer.pkl']

In [12]:
from tensorflow.keras.models import load_model

In [13]:
# Load the model
model = load_model("../models/model.keras")

# Load the vectorizer
vectorizer = joblib.load("../models/vectorizer.pkl")

In [14]:
examples = [
    "Breaking: Prime Minister announces new economic reforms.",
    "Aliens landed in Ohio according to anonymous sources.",
    "The COVID-19 vaccine rollout continues across Europe.",
    "NASA confirms water on the Moon.",
    "Donald Trump wins the presidential election again.",
    "India is a country."
]

for text in examples:
    X_ex = vectorizer.transform([text])
    pred = model.predict(X_ex.toarray())
    label = "Real" if pred[0][0] >= 0.5 else "Fake"
    print(f"{text} -> {label} ({pred[0][0]:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Breaking: Prime Minister announces new economic reforms. -> Fake (0.15)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Aliens landed in Ohio according to anonymous sources. -> Fake (0.00)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
The COVID-19 vaccine rollout continues across Europe. -> Fake (0.00)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
NASA confirms water on the Moon. -> Fake (0.00)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Donald Trump wins the presidential election again. -> Real (0.65)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
India is a country. -> Fake (0.01)


In [15]:
print(vectorizer.get_feature_names_out()[:20])  # See first 20
print(len(vectorizer.get_feature_names_out()))  # Size of vocab

['00' '00 pm' '000' '000 people' '10' '10 000' '10 percent' '10 years'
 '100' '100 000' '11' '12' '120' '13' '14' '15' '150' '16' '17' '18']
5000
