In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load Data
fake = pd.read_csv("Fake.csv", encoding='latin-1')
true = pd.read_csv("True.csv", encoding='latin-1')
fake['label'] = 0
true['label'] = 1
data = pd.concat([fake, true], axis=0)
data = data.sample(frac=1).reset_index(drop=True)

# Download stopwords
nltk.download('stopwords')
stop_word = set(stopwords.words('english'))

# Clean function
def clean(text):
    text = re.sub(r'[^a-zA-Z]', ' ', str(text))
    text = text.lower().split()
    text = [word for word in text if word not in stop_word]
    return ' '.join(text)

data['clean_data'] = data['text'].apply(clean)

# TF-IDF
vec = TfidfVectorizer(max_features=3000, stop_words='english', max_df=0.7)
x = vec.fit_transform(data['clean_data']).toarray()
y = data['label']

# Split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Model options
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100)
}

# Training & Evaluation
for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(f"\nModel: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Model: Naive Bayes
Accuracy: 0.9217149220489977
[[4412  299]
 [ 404 3865]]
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      4711
           1       0.93      0.91      0.92      4269

    accuracy                           0.92      8980
   macro avg       0.92      0.92      0.92      8980
weighted avg       0.92      0.92      0.92      8980


Model: Logistic Regression
Accuracy: 0.9853006681514477
[[4635   76]
 [  56 4213]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4711
           1       0.98      0.99      0.98      4269

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Model: Random Forest
Accuracy: 0.9981069042316258
[[4702    9]
 [   8 4261]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4711