<a href="https://colab.research.google.com/github/Mani-sh24/FakeNewsDetection/blob/main/fake_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [206]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import kagglehub

# Download dataset
print("Downloading dataset...")
path = kagglehub.dataset_download("algord/fake-news")
print("Path to dataset files:", path)

Downloading dataset...
Path to dataset files: /root/.cache/kagglehub/datasets/algord/fake-news/versions/1


loading and checking missing values freom data


In [207]:
df_news = pd.read_csv(f"{path}/FakeNewsNet.csv")
print(f"\nDataset loaded: {len(df_news)} rows")
print(f"Columns: {df_news.columns.tolist()}")
print(f"\nMissing values:\n{df_news.isnull().sum()}")



Dataset loaded: 23196 rows
Columns: ['title', 'news_url', 'source_domain', 'tweet_num', 'real']

Missing values:
title              0
news_url         330
source_domain    330
tweet_num          0
real               0
dtype: int64


In [208]:
df_news['title'] = df_news['title'].fillna("") # filling

In [209]:
print("\nDownloading NLTK resources...")
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("omw-1.4", quiet=True)


Downloading NLTK resources...


True

importing stop words and cleaning the data and tokenisation


In [210]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Clean and preprocess text"""
    if not isinstance(text, str):
        return ""

    # Remove non-alphabetic characters
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # Convert to lowercase
    text = text.lower()
    # Tokenization
    words = text.split()
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 2]
    return " ".join(words)


removing rows with empty cleaned titles and checkind distribution of dataa

In [211]:
print("\nCleaning text data...")
df_news["cleaned_title"] = df_news["title"].apply(clean_text)
df_news = df_news[df_news["cleaned_title"].str.strip() != ""]
print(f"After cleaning: {len(df_news)} rows")
print(f"\nClass distribution:")
print(df_news['real'].value_counts())



Cleaning text data...
After cleaning: 23188 rows

Class distribution:
real
1    17434
0     5754
Name: count, dtype: int64


Preparing data and features

In [212]:

print("\nVectorizing text...")
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.8,           # Ignore terms that appear in >80% of documents
    min_df=4,             # Ignore terms that appear in <5 documents
    ngram_range=(1, 3),
    max_features=10000,
    sublinear_tf=True       # Logarithmic scaling
)

X = vectorizer.fit_transform(df_news["cleaned_title"])
y = df_news["real"]

print(f"Feature matrix shape: {X.shape}")



Vectorizing text...
Feature matrix shape: (23188, 10000)


In [213]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
# Train model
print("\nTraining PassiveAggressiveClassifier...")
model = PassiveAggressiveClassifier(max_iter=1000, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
print("\nEvaluating model...")
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"\n{'='*50}")
print(f"Accuracy: {accuracy*100:.2f}%")
print(f"{'='*50}")


Training set: 18550 samples
Test set: 4638 samples

Training PassiveAggressiveClassifier...

Evaluating model...

Accuracy: 79.99%


In [214]:
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(f"\nConfusion Matrix:")
print(f"                Predicted")
print(f"              Fake    Real")
print(f"Actual Fake   {cm[0][0]:4d}   {cm[0][1]:4d}")
print(f"       Real   {cm[1][0]:4d}   {cm[1][1]:4d}")


Confusion Matrix:
                Predicted
              Fake    Real
Actual Fake    710    441
       Real    487   3000


In [215]:
# Classification report
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Fake', 'Real']))


Classification Report:
              precision    recall  f1-score   support

        Fake       0.59      0.62      0.60      1151
        Real       0.87      0.86      0.87      3487

    accuracy                           0.80      4638
   macro avg       0.73      0.74      0.74      4638
weighted avg       0.80      0.80      0.80      4638



In [216]:
def predict_news(text):
    """Predict if news is real or fake"""
    cleaned = clean_text(text)
    if not cleaned:
        return "Cannot classify empty text"

    text_tfidf = vectorizer.transform([cleaned])
    prediction = model.predict(text_tfidf)[0]
    confidence = model.decision_function(text_tfidf)[0]

    if prediction == 1:
        return f"✅ REAL NEWS (confidence: {abs(confidence):.2f})"
    else:
        return f"🚨 FAKE NEWS (confidence: {abs(confidence):.2f})"



In [217]:
print(f"\n{'='*50}")
print("Testing Model:")
print(f"{'='*50}")

test_examples = [
    "A guy woke up from his grave on his own shocking",
    "A univerity named 'svvv' in gram baroli bombed 300 dead",
    "100 dead in a massive explosion left ppl crying in svvv uni",
    "Scientists develop new vaccine for disease for hiv aids",
    "Trump just sent a bill to michelle obama that she will never be able to pay in her lifetime",
    "'End this drama': Rahul Gandhi meets Haryana IPS officer's wife, daughters; 'act against accused officers,' opposition leader tells PM Narendra Modi"
]

for example in test_examples:
    print(f"\nText: '{example}'")
    print(predict_news(example))

print(f"\n{'='*50}")
print("Model ready for predictions!")
print(f"{'='*50}")


Testing Model:

Text: 'A guy woke up from his grave on his own shocking'
🚨 FAKE NEWS (confidence: 1.85)

Text: 'A univerity named 'svvv' in gram baroli bombed 300 dead'
✅ REAL NEWS (confidence: 2.18)

Text: '100 dead in a massive explosion left ppl crying in svvv uni'
🚨 FAKE NEWS (confidence: 2.32)

Text: 'Scientists develop new vaccine for disease for hiv aids'
🚨 FAKE NEWS (confidence: 2.27)

Text: 'Trump just sent a bill to michelle obama that she will never be able to pay in her lifetime'
🚨 FAKE NEWS (confidence: 4.40)

Text: ''End this drama': Rahul Gandhi meets Haryana IPS officer's wife, daughters; 'act against accused officers,' opposition leader tells PM Narendra Modi'
✅ REAL NEWS (confidence: 5.04)

Model ready for predictions!
