In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [3]:
# Load the two source CSVs: one holds fake articles, the other real ones.
fake_df = pd.read_csv('dataset/Fake.csv')
true_df = pd.read_csv('dataset/True.csv')

# Tag every row with its class label before the frames are merged.
fake_df['label'] = 'FAKE'
true_df['label'] = 'REAL'

# Combine and shuffle. The shuffle is seeded so that a fresh
# "Restart Kernel -> Run All" yields the same row order, and therefore the
# same train/test split and metrics further down.
df = pd.concat([fake_df, true_df], axis=0)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Combine title and text into one field. Missing values are replaced by ''
# first: otherwise a single missing title/body turns the whole concatenation
# into NaN, which is not a string and breaks the text cleaning step.
df['text'] = df['title'].fillna('') + " " + df['text'].fillna('')

# Check sample
df[['text', 'label']].head()


Unnamed: 0,text,label
0,Hillary Clinton Snags Coveted Endorsement Of ...,FAKE
1,Trump says hopes to avoid use of military acti...,REAL
2,China's premier says U.S. ties will develop no...,REAL
3,Romney under 'active consideration' for Secret...,REAL
4,CHECK OUT NEW BEN & JERRY’S FLAVOR: Touting Am...,FAKE


In [5]:
def clean_text(text):
    """Normalize raw article text into lowercase, letters-only tokens.

    Steps, in order: lowercase, strip URLs, strip HTML-like tags, replace
    every non-letter character with a space, collapse whitespace runs.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing value in an object column) are treated as empty text;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string. Runs of whitespace are collapsed to a single
        space but the result is not stripped, so it may start or end with
        one space.
    """
    # Missing values would otherwise fail on .lower() with AttributeError.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)                 # Remove HTML
    text = re.sub(r'[^a-zA-Z]', ' ', text)            # Remove punctuation & numbers
    text = re.sub(r'\s+', ' ', text)                  # Remove extra whitespace
    return text

# Run the cleaner over every article, one element at a time, and keep the
# result in its own column next to the original text.
df['clean_text'] = df['text'].map(clean_text)


In [6]:
# Features are the cleaned article text; targets are the FAKE/REAL labels.
X, y = df['clean_text'], df['label']

# Hold out 20% of the rows for evaluation, using a seeded split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
# TF-IDF features: English stop words are dropped, and max_df=0.7 ignores
# terms that appear in more than 70% of the documents.
tfidf = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [8]:
# Train a Passive-Aggressive linear classifier on the TF-IDF features.
# random_state is fixed because the estimator shuffles the training data
# between epochs by default; without a seed the fitted weights, and the
# metrics reported below, differ from run to run.
model = PassiveAggressiveClassifier(max_iter=50, random_state=42)
model.fit(X_train_tfidf, y_train)


In [9]:
# Predict labels for the held-out split.
y_pred = model.predict(X_test_tfidf)

# Compute every metric up front, then report them together.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
conf_mat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy_pct, "%")
print("\nConfusion Matrix:\n", conf_mat)
print("\nClassification Report:\n", report)


Accuracy: 99.44 %

Confusion Matrix:
 [[4646   33]
 [  17 4284]]

Classification Report:
               precision    recall  f1-score   support

        FAKE       1.00      0.99      0.99      4679
        REAL       0.99      1.00      0.99      4301

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [10]:
def predict_news(news_text):
    """Classify a single piece of news text with the fitted model.

    The text is cleaned with ``clean_text``, turned into a one-row feature
    matrix by the fitted ``tfidf`` vectorizer, and passed to
    ``model.predict``.

    Args:
        news_text: The raw news text to classify.

    Returns:
        The first (and only) element of the prediction array.
    """
    features = tfidf.transform([clean_text(news_text)])
    return model.predict(features)[0]

# Quick smoke test of the end-to-end prediction helper on one headline.
print(
    predict_news(
        news_text="COVID-19 vaccine found to contain tracking chip!"
    )
)


FAKE
