In [1]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

In [2]:
# Step 2: Load the Dataset
# NOTE: this is a machine-specific absolute path; point DATA_PATH at your own
# copy of data.csv before running the notebook.
DATA_PATH = r'C:\Users\laiba\Downloads\archive\data.csv'
df = pd.read_csv(DATA_PATH)

# Show which columns the CSV provides
print("Column Names:", df.columns)

Column Names: Index(['URLs', 'Headline', 'Body', 'Label'], dtype='object')


In [3]:
# Step 3: Download NLTK Data
# Each call fetches one NLTK resource (or reports that it is already up to date).
# The final call is the cell's last expression, so its return value is echoed.
nltk.download('stopwords')  # word lists read by stopwords.words('english')
nltk.download('wordnet')    # corpus used by WordNetLemmatizer
nltk.download('punkt')      # tokenizer models; NOTE(review): no tokenizer call is visible in this notebook

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\laiba\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\laiba\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\laiba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Step 4: Initialize NLTK Tools
# These three objects are created once here and read as globals by
# preprocess_text, so they are not rebuilt for every document.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

In [5]:
# Count the rows whose 'Body' is NaN before deciding how to handle them
missing_body_count = df['Body'].isnull().sum()
print("Missing values in 'Body':", missing_body_count)

Missing values in 'Body': 21


In [6]:
# Drop rows with missing values in the 'Body' column.
# Only 'Body' is inspected (subset=['Body']); NaNs in other columns do not cause a row to be dropped.
# `df` is rebound to the filtered frame, so the unfiltered data is no longer
# reachable under this name without re-running the load cell.
df = df.dropna(subset=['Body'])

In [7]:
# Cast every 'Body' value to str so later string methods are safe to call.
# assign() returns a new frame with the converted column instead of writing
# into the frame produced by dropna().
df = df.assign(Body=df['Body'].astype(str))

In [8]:
# Step 5: Define Text Preprocessing Function
def preprocess_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Pipeline: lowercase -> replace every non-word character and digit with a
    space -> split on whitespace -> drop English stopwords and one-character
    tokens -> Porter-stem -> WordNet-lemmatize each remaining token.

    Reads the notebook-level ``stop_words``, ``stemmer`` and ``lemmatizer``
    objects, which must be defined before this function is called.

    Parameters
    ----------
    text : str
        Raw document text. Non-string values (e.g. None or NaN) and
        empty/whitespace-only strings are treated as empty input.

    Returns
    -------
    str
        Processed tokens joined by single spaces; ``''`` when nothing survives.
    """
    # Guard against None/NaN as well as blank strings instead of failing on .strip()
    if not isinstance(text, str) or not text.strip():
        return ''

    # Replace runs of non-word characters and digits with one space.
    # \W does not match '_', so underscores are kept as part of tokens.
    cleaned = re.sub(r'[\W\d]+', ' ', text.lower())

    # Filter by token length rather than with a regex such as r'\s+[a-z]\s+':
    # that pattern consumes the surrounding whitespace, so it skips every other
    # letter in runs like "b c d" and never matches a single letter at the
    # start or end of the string.
    tokens = [
        word for word in cleaned.split()
        if len(word) > 1 and word not in stop_words
    ]

    # Stem first, then lemmatize the stemmed form (same order as before).
    # NOTE(review): lemmatizing an already-stemmed token may rarely change it;
    # kept so the output vocabulary stays compatible with the saved models.
    return ' '.join(lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens)

In [9]:
# Step 6: Apply Preprocessing to the 'Body' Column
# Run preprocess_text on each article body and store the result in a new column.
body_text = df['Body']
df['processed_text'] = body_text.apply(preprocess_text)

# Show raw and cleaned text side by side for the first rows
sample_columns = ['Body', 'processed_text']
print("Processed Data Sample:")
print(df[sample_columns].head())

Processed Data Sample:
                                                Body  \
0  Image copyright Getty Images\nOn Sunday mornin...   
1  LONDON (Reuters) - “Last Flag Flying”, a comed...   
2  The feud broke into public view last week when...   
3  MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...   
4  Country singer Jason Aldean, who was performin...   

                                      processed_text  
0  imag copyright getti imag sunday morn donald t...  
1  london reuter last flag fli comedi drama vietn...  
2  feud broke public view last week mr corker sai...  
3  mexico citi reuter egypt cheiron hold limit ri...  
4  countri singer jason aldean perform la vega sh...  


In [10]:
# Step 7: Split the Dataset into Features and Labels
# X is the cleaned text; y is the 'Label' column.
# NOTE(review): the original author's note says 0 = fake news and 1 = real news;
# confirm this against the dataset documentation.
X, y = df['processed_text'], df['Label']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Step 8: Feature Extraction using TF-IDF
# max_features=5000 caps the size of the learned vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
# The vectorizer is fitted on the training split only ...
X_train_tfidf = vectorizer.fit_transform(X_train)
# ... and the test split is transformed with that already-fitted vectorizer,
# so no statistics from the test texts enter the features.
X_test_tfidf = vectorizer.transform(X_test)

In [12]:
# Step 9: Train the Naïve Bayes Model
# MultinomialNB is created with its default hyperparameters.
nb_model = MultinomialNB()
# Fit on the TF-IDF features and labels of the training split only.
nb_model.fit(X_train_tfidf, y_train)

In [13]:
# Step 10: Make Predictions and Evaluate the Naïve Bayes Model
# Predict on the held-out TF-IDF features, then score against the true labels.
y_pred_nb = nb_model.predict(X_test_tfidf)

nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb)
nb_confusion = confusion_matrix(y_test, y_pred_nb)

print("Naïve Bayes Results:")
print("Accuracy:", nb_accuracy)
print("Classification Report:\n", nb_report)
print("Confusion Matrix:\n", nb_confusion)

Naïve Bayes Results:
Accuracy: 0.924812030075188
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.91      0.93       450
           1       0.89      0.94      0.92       348

    accuracy                           0.92       798
   macro avg       0.92      0.93      0.92       798
weighted avg       0.93      0.92      0.92       798

Confusion Matrix:
 [[411  39]
 [ 21 327]]


In [14]:
# Step 11: Train the Random Forest Model
# 100 trees; random_state=42 fixes the seed so repeated runs build the same forest.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # Initialize Random Forest
# Fit on the same TF-IDF training features and labels used for the Naïve Bayes model.
rf_model.fit(X_train_tfidf, y_train)  # Train the model

In [15]:
# Step 12: Make Predictions and Evaluate the Random Forest Model
# Predict on the held-out TF-IDF features, then score against the true labels.
y_pred_rf = rf_model.predict(X_test_tfidf)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print("\nRandom Forest Results:")
print("Accuracy:", rf_accuracy)
print("Classification Report:\n", rf_report)
print("Confusion Matrix:\n", rf_confusion)


Random Forest Results:
Accuracy: 0.9711779448621554
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97       450
           1       0.94      1.00      0.97       348

    accuracy                           0.97       798
   macro avg       0.97      0.97      0.97       798
weighted avg       0.97      0.97      0.97       798

Confusion Matrix:
 [[428  22]
 [  1 347]]


In [16]:
# Step 13: Save the Models and Vectorizer
# Map each output filename to the fitted object it should contain. The
# vectorizer is saved with the models because new text must be transformed by
# the same fitted vectorizer before either model can score it.
artifacts = {
    'naive_bayes_model.pkl': nb_model,
    'random_forest_model.pkl': rf_model,
    'tfidf_vectorizer.pkl': vectorizer,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

print("Models and vectorizer saved successfully!")

Models and vectorizer saved successfully!
