In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import joblib

# Read each CSV, keep only the headline and body columns, discard any row
# where either of them is missing, and tag every remaining row with its
# class label: 0 = true news, 1 = fake news.
true_news = (
    pd.read_csv("True.csv", low_memory=False)
    .loc[:, ["title", "text"]]
    .dropna()
    .assign(label=0)
)
fake_news = (
    pd.read_csv("Fake.csv", low_memory=False)
    .loc[:, ["title", "text"]]
    .dropna()
    .assign(label=1)
)

# Clean text function
def clean_text(text):
    """Normalise raw text into lowercase, digit-free, single-spaced words.

    Parameters
    ----------
    text : object
        Value to clean. It is coerced with ``str()`` first, so non-string
        input is cleaned via its string representation.

    Returns
    -------
    str
        The lowercased text with all digits deleted and every run of
        non-word characters (punctuation, whitespace, symbols) replaced by
        a single space, with no leading or trailing whitespace. Underscores
        count as word characters and are kept.
    """
    text = str(text).lower()
    # Delete digits BEFORE collapsing separators. Doing it afterwards leaves
    # the spaces on both sides of a standalone number next to each other,
    # e.g. "a 123 b" -> "a  b" instead of "a b".
    text = re.sub(r"\d+", "", text)
    # Any run of non-word characters becomes exactly one space.
    text = re.sub(r"\W+", " ", text)
    return text.strip()

# Add a cleaned copy of the headline and of the body to both source frames
# (columns "title_clean" and "text_clean").
for news_frame in (true_news, fake_news):
    for raw_column in ("title", "text"):
        news_frame[f"{raw_column}_clean"] = news_frame[raw_column].apply(clean_text)

# One (cleaned column, label) frame per source and per field, so titles and
# bodies can be modelled independently.
true_titles = true_news.loc[:, ["title_clean", "label"]]
true_texts = true_news.loc[:, ["text_clean", "label"]]
fake_titles = fake_news.loc[:, ["title_clean", "label"]]
fake_texts = fake_news.loc[:, ["text_clean", "label"]]

# Stack the true and fake rows for each field, shuffle all rows with a fixed
# seed, and renumber the index from zero.
titles_stacked = pd.concat([true_titles, fake_titles], axis=0)
df_titles = titles_stacked.sample(frac=1, random_state=42).reset_index(drop=True)

texts_stacked = pd.concat([true_texts, fake_texts], axis=0)
df_texts = texts_stacked.sample(frac=1, random_state=42).reset_index(drop=True)

# Hold out 20% of each shuffled dataset for evaluation; the seed is fixed so
# the split is repeatable.
title_split = train_test_split(
    df_titles["title_clean"],
    df_titles["label"],
    test_size=0.2,
    random_state=42,
)
X_train_title, X_test_title, y_train_title, y_test_title = title_split

text_split = train_test_split(
    df_texts["text_clean"],
    df_texts["label"],
    test_size=0.2,
    random_state=42,
)
X_train_text, X_test_text, y_train_text, y_test_text = text_split

# TF-IDF features: unigrams and bigrams, English stop words dropped, capped
# at 5000 features. Titles and bodies share the settings but each gets its
# own vectorizer, and therefore its own vocabulary.
tfidf_settings = dict(stop_words="english", max_features=5000, ngram_range=(1, 2))
vectorizer_title = TfidfVectorizer(**tfidf_settings)
vectorizer_text = TfidfVectorizer(**tfidf_settings)

# Learn each vocabulary from the training split only, then project the
# held-out split onto it.
X_train_title_tfidf = vectorizer_title.fit_transform(X_train_title)
X_test_title_tfidf = vectorizer_title.transform(X_test_title)

X_train_text_tfidf = vectorizer_text.fit_transform(X_train_text)
X_test_text_tfidf = vectorizer_text.transform(X_test_text)

# Oversample with SMOTE on the training features only; the test features are
# left as they are.
smote = SMOTE(random_state=42)
X_train_title_tfidf, y_train_title = smote.fit_resample(
    X_train_title_tfidf, y_train_title
)
X_train_text_tfidf, y_train_text = smote.fit_resample(
    X_train_text_tfidf, y_train_text
)

# Fit one multinomial Naive Bayes classifier per field. fit() returns the
# estimator itself, so each name is bound to the trained model.
model_title = MultinomialNB().fit(X_train_title_tfidf, y_train_title)
model_text = MultinomialNB().fit(X_train_text_tfidf, y_train_text)

# Score each model on its held-out split.
y_pred_title = model_title.predict(X_test_title_tfidf)
accuracy_title = accuracy_score(y_test_title, y_pred_title)
report_title = classification_report(y_test_title, y_pred_title)

y_pred_text = model_text.predict(X_test_text_tfidf)
accuracy_text = accuracy_score(y_test_text, y_pred_text)
report_text = classification_report(y_test_text, y_pred_text)

# Headline accuracies first, then the per-class precision/recall breakdowns.
print(f"Title Model Accuracy: {accuracy_title:.4f}")
print(f"Text Model Accuracy: {accuracy_text:.4f}")

print("\nTitle Classification Report:\n", report_title)
print("\nText Classification Report:\n", report_text)

# Persist each fitted model and its matching vectorizer to its own file in
# the working directory. The mapping is written in insertion order.
artifacts = {
    "naive_bayes_title_model.pkl": model_title,
    "naive_bayes_text_model.pkl": model_text,
    "tfidf_vectorizer_title.pkl": vectorizer_title,
    "tfidf_vectorizer_text.pkl": vectorizer_text,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)

print("Models trained and saved successfully!")


Title Model Accuracy: 0.9322
Text Model Accuracy: 0.9280

Title Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93      4318
           1       0.93      0.94      0.93      4666

    accuracy                           0.93      8984
   macro avg       0.93      0.93      0.93      8984
weighted avg       0.93      0.93      0.93      8984


Text Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.94      0.93      4318
           1       0.95      0.91      0.93      4666

    accuracy                           0.93      8984
   macro avg       0.93      0.93      0.93      8984
weighted avg       0.93      0.93      0.93      8984

Models trained and saved successfully!
