In [8]:
# ===============================
# 1. Import Libraries
# ===============================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# ===============================
# 2. Load Dataset
# ===============================

df = pd.read_csv("train1.csv")

# Coerce labels to numbers (unparseable values become NaN), then keep only
# the rows whose label is one of the two expected classes.
df = df.assign(label=pd.to_numeric(df['label'], errors='coerce'))
has_valid_label = df['label'].isin([0, 1])

# Replace remaining missing cells with empty strings so the text columns
# can be concatenated below.
df = df.loc[has_valid_label].fillna('')

# Merge title, author and body into a single text field per row
df['content'] = (
    df['title'] + " " + df['author'] + " " + df['text']
)

# Model input (combined text) and target (label)
X, y = df['content'], df['label']


# ===============================
# 3. Split Data
# ===============================

# Hold out 20% of the rows for evaluation; the fixed seed makes the
# shuffle (and therefore the split) reproducible between runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


# ===============================
# 4. Convert Text to TF-IDF
# ===============================

# Ignore English stop words and terms present in more than 70% of documents
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Fit the vocabulary and IDF weights on the training split only, then apply
# the same fitted transform to the test split.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


# ===============================
# 5-8. Train, Evaluate and Compare Models
# ===============================

def _fit_and_report(title, model):
    """Fit *model* on the TF-IDF training split and print its test metrics.

    Args:
        title: Model name shown in the printed section header.
        model: Unfitted scikit-learn classifier; it is fitted in place.

    Returns:
        A ``(predictions, accuracy)`` tuple computed on the test split.
    """
    model.fit(X_train_tfidf, y_train)
    predictions = model.predict(X_test_tfidf)

    # Compute accuracy once and hand it back so the comparison section
    # below does not have to recompute it.
    accuracy = accuracy_score(y_test, predictions)

    print("\n===== " + title + " =====")
    print("Accuracy:", accuracy)
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    print("Classification Report:\n", classification_report(y_test, predictions))

    return predictions, accuracy


# The three candidate classifiers (the same hyper-parameters as before)
lr = LogisticRegression(max_iter=1000)
nb = MultinomialNB()
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train and report each model in turn; `lr` is reused by predict_news below
y_pred_lr, acc_lr = _fit_and_report("Logistic Regression", lr)
y_pred_nb, acc_nb = _fit_and_report("Naive Bayes", nb)
y_pred_rf, acc_rf = _fit_and_report("Random Forest", rf)

# Side-by-side accuracy summary
print("\n===== Model Comparison =====")
print("Logistic Regression Accuracy:", acc_lr)
print("Naive Bayes Accuracy:", acc_nb)
print("Random Forest Accuracy:", acc_rf)


# ===============================
# 9. Predict New News
# ===============================

def predict_news(news_text):
    """Classify one piece of news text with the fitted logistic-regression model.

    The text is vectorized with the module-level ``vectorizer`` and passed
    to the module-level ``lr`` classifier.

    Args:
        news_text: Raw text of the news item to classify.

    Returns:
        ``"Fake News"`` when the predicted label equals 1, otherwise
        ``"Real News"``.
    """
    # transform() expects an iterable of documents, hence the one-item list
    features = vectorizer.transform([news_text])
    predicted_label = lr.predict(features)[0]
    return "Fake News" if predicted_label == 1 else "Real News"


# Try the classifier on a single hand-written headline
sample_news = "Government secretly planning to control weather through satellites"

print("\nPrediction for sample news:")
sample_verdict = predict_news(sample_news)
print(sample_verdict)



===== Logistic Regression =====
Accuracy: 0.91
Confusion Matrix:
 [[104  10]
 [  8  78]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.93      0.91      0.92       114
         1.0       0.89      0.91      0.90        86

    accuracy                           0.91       200
   macro avg       0.91      0.91      0.91       200
weighted avg       0.91      0.91      0.91       200


===== Naive Bayes =====
Accuracy: 0.83
Confusion Matrix:
 [[113   1]
 [ 33  53]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.77      0.99      0.87       114
         1.0       0.98      0.62      0.76        86

    accuracy                           0.83       200
   macro avg       0.88      0.80      0.81       200
weighted avg       0.86      0.83      0.82       200


===== Random Forest =====
Accuracy: 0.895
Confusion Matrix:
 [[107   7]
 [ 14  72]]
Classification Report:
               pre