# Fake vs Real News Classification using Naive Bayes
## ishat noor mahi

## Step 1: Import Required Libraries

In [None]:
# Import necessary libraries
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc
import warnings
warnings.filterwarnings('ignore')

# Set visualization style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

In [2]:
df = pd.read_csv('fake_or_real_news.csv')
df.head()

Unnamed: 0,text,label,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 129,Unnamed: 130,Unnamed: 131,Unnamed: 132,Unnamed: 133,Unnamed: 134,Unnamed: 135,Unnamed: 136,Unnamed: 137,Unnamed: 138
0,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,,,,,,,,,...,,,,,,,,,,
1,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,,,,,,,,,...,,,,,,,,,,
2,U.S. Secretary of State John F. Kerry said Mon...,REAL,,,,,,,,,...,,,,,,,,,,
3,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,,,,,,,,,...,,,,,,,,,,
4,It's primary day in New York and front-runners...,REAL,,,,,,,,,...,,,,,,,,,,


In [4]:
df.info()
print("Missing Values:")
print(df.isnull().sum())
print("Duplicate Rows:", df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7795 entries, 0 to 7794
Columns: 139 entries, text to Unnamed: 138
dtypes: object(139)
memory usage: 8.3+ MB
Missing Values:
text             866
label           1040
Unnamed: 2      7477
Unnamed: 3      7554
Unnamed: 4      7616
                ... 
Unnamed: 134    7794
Unnamed: 135    7794
Unnamed: 136    7794
Unnamed: 137    7794
Unnamed: 138    7794
Length: 139, dtype: int64
Duplicate Rows: 1142


In [5]:
print("Class Distribution:")
print(df['label'].value_counts())
print("Class Percentages:")
print(df['label'].value_counts(normalize=True) * 100)

Class Distribution:
label
REAL                                                                                                                                                                                                           3161
FAKE                                                                                                                                                                                                           3154
 or naturalization or by jus sanguinis – inherited through ancestors/parents                                                                                                                                      3
 etc.                                                                                                                                                                                                             2
 Philippines                                                                                                                  

In [7]:
# Check text statistics
df['text_length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()
print("Text Statistics:")
print(df[['text_length', 'word_count']].describe())

Text Statistics:
        text_length   word_count
count   6929.000000  6929.000000
mean    4251.583345   700.783663
std     4412.048384   731.276753
min        1.000000     0.000000
25%     1228.000000   203.000000
50%     3247.000000   533.000000
75%     5861.000000   970.000000
max    32727.000000  5913.000000


## Step 3: Data Preprocessing

In [None]:
# Handle missing values
print("Handling Missing Values...")
print("="*50)

# Check for missing values in text column
missing_text = df['text'].isnull().sum()
print(f"Missing values in 'text': {missing_text}")

# Drop rows with missing text if any
if missing_text > 0:
    df = df.dropna(subset=['text'])
    print(f"Dropped {missing_text} rows with missing text")
else:
    print("No missing values in text column")

missing_labels = df['label'].isnull().sum()
print(f"Missing values in 'label': {missing_labels}")

if missing_labels > 0:
    df = df.dropna(subset=['label'])
    print(f"Dropped {missing_labels} rows with missing labels")
else:
    print("No missing values in label column")

print("\nFinal dataset shape:", df.shape)

In [None]:
duplicates_before = df.duplicated().sum()
if duplicates_before > 0:
    df = df.drop_duplicates()
    print(f"Removed {duplicates_before} duplicate rows")
    print(f"Dataset shape after removing duplicates: {df.shape}")
else:
    print("No duplicate rows found")

In [None]:
# Prepare features (X) and target (y)
X = df['text']
y = df['label']
print(y.value_counts())

label
REAL                                                                                                                                                                                                           3161
FAKE                                                                                                                                                                                                           3154
 or naturalization or by jus sanguinis – inherited through ancestors/parents                                                                                                                                      3
 etc.                                                                                                                                                                                                             2
 Philippines                                                                                                                                      

## Step 4: Split Data into Training and Testing Sets

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


ValueError: Input contains NaN

## Step 5: Text Vectorization (Feature Engineering)

In [None]:
# Convert text to numerical features using TF-IDF
print("Converting text to TF-IDF features...")

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,  # Limit to top 5000 features
    stop_words='english',  # Remove common English stop words
    max_df=0.7,  # Ignore terms that appear in more than 70% of documents
    min_df=5  # Ignore terms that appear in fewer than 5 documents
)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"Training features shape: {X_train_tfidf.shape}")
print(f"Testing features shape: {X_test_tfidf.shape}")
print(f"Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")

## Step 6: Build Naive Bayes Model

In [None]:
print("Training Naive Bayes Model...")

nb_model = MultinomialNB(alpha=1.0)  
nb_model.fit(X_train_tfidf, y_train)

print("Model training completed!")
print(f"Model type: {type(nb_model).__name__}")

## Step 7: Make Predictions

In [None]:
# Make predictions on training and testing sets
y_train_pred = nb_model.predict(X_train_tfidf)
y_test_pred = nb_model.predict(X_test_tfidf)

# Get prediction probabilities
y_test_pred_proba = nb_model.predict_proba(X_test_tfidf)

print("Predictions completed!")
print(f"Training predictions shape: {y_train_pred.shape}")
print(f"Testing predictions shape: {y_test_pred.shape}")

## Step 8: Evaluate Model Performance

In [None]:
# Calculate accuracy scores
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("MODEL ACCURACY SCORES")
print("="*50)
print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Testing Accuracy:  {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
print("="*50)

In [None]:
# Generate confusion matrix
cm = confusion_matrix(y_test, y_test_pred)

print("\nCONFUSION MATRIX")
print("="*50)
print(cm)

# Visualize confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['FAKE', 'REAL'], 
            yticklabels=['FAKE', 'REAL'],
            cbar_kws={'label': 'Count'})
plt.title('Confusion Matrix - Naive Bayes Model', fontsize=16, fontweight='bold')
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Generate classification report
print("\nCLASSIFICATION REPORT")
print("="*50)
print(classification_report(y_test, y_test_pred, target_names=['FAKE', 'REAL']))

## Step 8: Model Interpretation - Feature Importance

In [None]:
# Get feature log probabilities
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())

# Get top features for each class
n_top_features = 20

# For FAKE news (class 0)
fake_log_prob = nb_model.feature_log_prob_[0]
top_fake_indices = np.argsort(fake_log_prob)[-n_top_features:][::-1]
top_fake_features = feature_names[top_fake_indices]
top_fake_values = fake_log_prob[top_fake_indices]

# For REAL news (class 1)
real_log_prob = nb_model.feature_log_prob_[1]
top_real_indices = np.argsort(real_log_prob)[-n_top_features:][::-1]
top_real_features = feature_names[top_real_indices]
top_real_values = real_log_prob[top_real_indices]

print("TOP 20 FEATURES FOR FAKE NEWS")
print("="*50)
for i, (feature, value) in enumerate(zip(top_fake_features, top_fake_values), 1):
    print(f"{i:2d}. {feature:20s} (log prob: {value:.4f})")

print("\n" + "="*50)
print("TOP 20 FEATURES FOR REAL NEWS")
print("="*50)
for i, (feature, value) in enumerate(zip(top_real_features, top_real_values), 1):
    print(f"{i:2d}. {feature:20s} (log prob: {value:.4f})")

## Step 10: Test the Model with Sample Predictions

In [None]:
# Test with sample articles from the test set
sample_indices = np.random.choice(X_test.index, size=5, replace=False)

print("SAMPLE PREDICTIONS")
print("="*70)

for idx in sample_indices:
    text = X_test.loc[idx]
    true_label = y_test.loc[idx]
    
    # Transform and predict
    text_tfidf = tfidf_vectorizer.transform([text])
    prediction = nb_model.predict(text_tfidf)[0]
    probability = nb_model.predict_proba(text_tfidf)[0]
    
    print(f"\nArticle Text (first 200 chars):")
    print(f"{text[:200]}...")
    print(f"\nTrue Label: {true_label}")
    print(f"Predicted Label: {prediction}")
    print(f"Confidence: FAKE={probability[0]:.4f}, REAL={probability[1]:.4f}")
    print(f"Result: {'✓ CORRECT' if prediction == true_label else '✗ INCORRECT'}")
    print("="*70)

## Step 11: Summary of Findings

### Key Results:

The Naive Bayes classification model has been successfully built and evaluated for fake vs real news detection.

#### Model Performance:
- **Algorithm Used**: Multinomial Naive Bayes
- **Feature Extraction**: TF-IDF Vectorization with 5000 max features
- **Train-Test Split**: 80-20 split with stratification

