In [None]:
import pandas as pd
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download the NLTK stop-word corpus, then expose the English list as a set
# so the per-token membership test in the cleaning step is a hash lookup.
nltk.download("stopwords")
from nltk.corpus import stopwords

stop_words = {word for word in stopwords.words("english")}


In [None]:
# Load the two source files; rows from True.csv are labelled 1 and rows from
# Fake.csv are labelled 0.
true = pd.read_csv("True.csv")
fake = pd.read_csv("Fake.csv")

true["label"] = 1
fake["label"] = 0

# Stack both frames, shuffle every row with a fixed seed, and renumber the index.
df = (
    pd.concat([true, fake])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

df.head()


In [None]:
# Report the frame's dimensions and how many rows carry each label.
print("Dataset shape:", df.shape)
print(df["label"].value_counts())

# Bar chart of the same per-label counts.
ax = sns.countplot(data=df, x="label", palette="Set2")
ax.set_title("Distribution of Real vs Fake News")
plt.show()


In [None]:
def clean(text):
    """Normalise one document into a space-separated string of kept tokens.

    Steps, in order:
      1. Coerce the input to ``str`` and strip anything that looks like a URL
         (runs starting with ``http`` or ``www``).
      2. Delete every character that is not an ASCII letter or whitespace.
      3. Lower-case and split on whitespace.
      4. Drop tokens found in the module-level ``stop_words`` set.

    Returns the surviving tokens joined by single spaces (possibly ``""``).
    """
    without_urls = re.sub(r"http\S+|www\S+", "", str(text))
    letters_only = re.sub(r"[^a-zA-Z\s]", "", without_urls)
    tokens = letters_only.lower().split()
    kept = filter(lambda token: token not in stop_words, tokens)
    return " ".join(kept)

# Build the model input from title + body.
#
# The untouched body is stashed in "text_raw" the first time this cell runs.
# Rebuilding "text" from that copy makes the cell idempotent: re-executing it
# no longer prepends the title to text that has already been cleaned.
if "text_raw" not in df.columns:
    df["text_raw"] = df["text"]

# fillna("") keeps a missing title/body from being stringified into the
# literal token "nan", which astype(str) on its own would produce.
df["text"] = (
    df["title"].fillna("").astype(str)
    + " "
    + df["text_raw"].fillna("").astype(str)
).apply(clean)
df.head()


In [None]:
# Split the documents BEFORE fitting the vectorizer. Fitting TF-IDF on the
# full corpus lets test documents influence the vocabulary and IDF weights,
# which leaks test information into training and inflates the reported scores.
y = df["label"]
text_train, text_test, y_train, y_test = train_test_split(
    df["text"], y, test_size=0.2, random_state=42
)

# Learn the vocabulary / IDF statistics from the training documents only,
# then project the held-out documents into that same feature space.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus matrix, kept only so that code still referring to `X` keeps
# working. It is produced by the train-fitted vectorizer; do not use it to
# evaluate models, since it contains the test rows.
X = vectorizer.transform(df["text"])


In [None]:
# Candidate classifiers. The tree-based estimators are stochastic (feature /
# sample sub-selection), so each gets a fixed random_state; without it the
# accuracies and confusion matrices change on every run of the notebook.
models = {
    "Logistic Regression": LogisticRegression(max_iter=300),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=200, random_state=42)
}

# Test-set accuracy per model name, consumed by the comparison chart.
results = {}

# Label encoding assigned when the data was loaded: 0 = fake, 1 = real.
class_ids = [0, 1]
class_names = ["Fake", "Real"]

for name, model in models.items():
    print("\n" + "="*50)
    print(f" Training {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    # Pin the row/column order so the tick names below always line up with
    # the matrix, even if a class were absent from the predictions.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f"Confusion Matrix - {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()


In [None]:
# One bar per model, bar height = the accuracy stored in `results`.
model_names = list(results)
accuracies = list(results.values())

plt.bar(model_names, accuracies, color="skyblue")
plt.title("Model Comparison")
plt.ylabel("Accuracy")
plt.xticks(rotation=30)
plt.show()

results
