In [1]:
# Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import resample
import pickle, gzip


In [2]:
# Load datasets (update your local paths if needed)
fake_data = pd.read_csv(r"C:\Users\ishik\Downloads\Fake news detection System\Fake (1).csv")
true_data = pd.read_csv(r"C:\Users\ishik\Downloads\Fake news detection System\True (1).csv")

print("Fake samples:", len(fake_data))
print("True samples:", len(true_data))


Fake samples: 23481
True samples: 21417


In [3]:
# Label data
fake_data["label"] = 1  # 1 = Fake
true_data["label"] = 0  # 0 = True

# Combine both datasets and shuffle
df = pd.concat([fake_data, true_data], axis=0).sample(frac=1, random_state=42).reset_index(drop=True)

print(df.head())
print(df["label"].value_counts())


                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      1  
1       April 5, 2017       0  
2  September 27, 2017       0  
3         May 22, 2017      1  
4       June 24, 2016       0  
label
1    23481
0    21417
Name: count, dtype: in

In [4]:
# Split into separate dataframes
fake_df = df[df["label"] == 1]
true_df = df[df["label"] == 0]

# Balance dataset (upsample smaller class)
fake_balanced = resample(fake_df, n_samples=len(true_df), random_state=42)
balanced_df = pd.concat([fake_balanced, true_df]).sample(frac=1, random_state=42)

print("After balancing:")
print(balanced_df["label"].value_counts())


After balancing:
label
1    21417
0    21417
Name: count, dtype: int64


In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    balanced_df["text"], balanced_df["label"], test_size=0.2, random_state=42
)
print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 34267
Testing samples: 8567


In [6]:
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7, max_features=10000, ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Vectorization complete!")


Vectorization complete!


In [7]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = model.predict(X_test_tfidf)
print("âœ… Model trained successfully!")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


âœ… Model trained successfully!
Accuracy: 0.9877436675615735

Confusion Matrix:
 [[4161   44]
 [  61 4301]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4205
           1       0.99      0.99      0.99      4362

    accuracy                           0.99      8567
   macro avg       0.99      0.99      0.99      8567
weighted avg       0.99      0.99      0.99      8567



In [8]:
# Try manual predictions
sample_news = [
    "Government announces new education reform policy for 2025.",
    "Aliens spotted dancing on the Eiffel Tower.",
    "NASA confirms successful Mars rover operation.",
    "Actor claims to have met time traveler in Mumbai."
]

sample_tfidf = vectorizer.transform(sample_news)
predictions = model.predict(sample_tfidf)

for news, pred in zip(sample_news, predictions):
    label = "ðŸŸ¢ Real" if pred == 0 else "ðŸ”´ Fake"
    print(f"\nNews: {news}\nPrediction: {label}")



News: Government announces new education reform policy for 2025.
Prediction: ðŸŸ¢ Real

News: Aliens spotted dancing on the Eiffel Tower.
Prediction: ðŸ”´ Fake

News: NASA confirms successful Mars rover operation.
Prediction: ðŸ”´ Fake

News: Actor claims to have met time traveler in Mumbai.
Prediction: ðŸ”´ Fake


In [9]:
# Save compressed model and vectorizer for Streamlit deployment
os.makedirs("models", exist_ok=True)

with gzip.open("model.pkl.gz", "wb") as f:
    pickle.dump(model, f)

with gzip.open("vectorizer.pkl.gz", "wb") as f:
    pickle.dump(vectorizer, f)

print("âœ… Model and vectorizer saved successfully as .pkl.gz files!")


âœ… Model and vectorizer saved successfully as .pkl.gz files!


In [10]:
# Verify the saved files work correctly
with gzip.open("model.pkl.gz", "rb") as f:
    loaded_model = pickle.load(f)
with gzip.open("vectorizer.pkl.gz", "rb") as f:
    loaded_vectorizer = pickle.load(f)

test_input = ["Breaking: New vaccine approved for COVID-25 in India."]
test_tfidf = loaded_vectorizer.transform(test_input)
test_pred = loaded_model.predict(test_tfidf)[0]

print("Prediction:", "ðŸŸ¢ Real" if test_pred == 0 else "ðŸ”´ Fake")


Prediction: ðŸ”´ Fake
