# Enhanced Fake News Detection with NLP, ACO, and BERT

This notebook implements an advanced fake news detection pipeline on a synthetic dataset of 20,000 news articles. Features:
- **EDA**: Missing values, label/category distribution, word clouds, sentiment analysis.
- **Preprocessing**: Text cleaning, TF-IDF, feature engineering (text length, sentiment).
- **Feature Selection**: Embedded-Filter Ant Colony Optimization (ACO) with clustering-based mutual information.
- **Models**: Logistic Regression, XGBoost, and BERT (DistilBERT for efficiency).
- **Outputs**: Plotly figures exported as JSON (for embedding in interactive reading cards), plus the saved models, TF-IDF vectorizer, and selected-feature list.

**Dataset**: `/kaggle/input/fake-news-dataset/fake_news_dataset.csv` (20,000 rows, 7 columns, ~5% missing values).

**JNTUK AI Course**: Unit 3 (ML Basics), CO2 (apply ML/NLP, PO3), CO4 (evaluate models, PO4).

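**Abstract**: Implements an Embedded-Filter ACO for text feature selection: clustering-based mutual information (MI) scores act as the filter heuristic that biases each ant's feature sampling, and a Logistic Regression classifier scores every candidate subset.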

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_classif
from xgboost import XGBClassifier
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from wordcloud import WordCloud
import re
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import json
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data.
# 'punkt' serves older NLTK releases; newer releases look up 'punkt_tab' when
# word_tokenize is called, so both are requested. quiet=True keeps repeated
# runs of this cell from flooding the output with download logs.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

# English stop words as a set for O(1) membership tests during text cleaning.
stop_words = set(stopwords.words('english'))

## 2. Load and Explore Dataset

In [None]:
# Read the raw CSV into a DataFrame
df = pd.read_csv('/kaggle/input/fake-news-dataset/fake_news_dataset.csv')

# Structural overview: shape, dtypes and a peek at the first rows
print("Dataset Shape:", df.shape)
print("\nColumns and Data Types:")
print(df.dtypes)
print("\nFirst 5 Rows:")
print(df.head())

# Share of missing entries per column, expressed as a percentage
missing = df.isnull().sum().div(len(df)).mul(100)
print("\nMissing Values (%):")
print(missing)

# Bar chart of the missing-value percentages
fig = px.bar(
    x=missing.index,
    y=missing.values,
    title="Missing Values by Column (%)",
    labels={'x': 'Column', 'y': 'Percentage'},
    color_discrete_sequence=['#2563eb'],
)
fig.update_layout(showlegend=False)
fig.show()

# Pie chart of how many articles carry each label
label_counts = df['label'].value_counts()
fig = px.pie(
    values=label_counts.values,
    names=label_counts.index,
    title="Label Distribution (Real vs. Fake)",
    color_discrete_sequence=['#2563eb', '#ff007a'],
)
fig.show()

# Bar chart of article counts per category
category_counts = df['category'].value_counts()
fig = px.bar(
    x=category_counts.index,
    y=category_counts.values,
    title="Article Categories",
    labels={'x': 'Category', 'y': 'Count'},
    color_discrete_sequence=['#2563eb'],
)
fig.show()


def _polarity(value):
    """Return the TextBlob polarity of ``value`` after coercing it to ``str``."""
    return TextBlob(str(value)).sentiment.polarity


# Per-article sentiment polarity, overlaid by label
df['sentiment'] = df['text'].apply(_polarity)
fig = px.histogram(
    df,
    x='sentiment',
    color='label',
    nbins=50,
    title="Sentiment Distribution by Label",
    barmode='overlay',
    color_discrete_sequence=['#2563eb', '#ff007a'],
)
fig.show()

# Concatenate all non-missing article bodies per label for the word clouds
real_text = ' '.join(df.loc[df['label'] == 'real', 'text'].dropna())
fake_text = ' '.join(df.loc[df['label'] == 'fake', 'text'].dropna())

# Two side-by-side panels: real news on the left, fake news on the right
plt.figure(figsize=(12, 5))
cloud_panels = [(real_text, 'Real News Word Cloud'), (fake_text, 'Fake News Word Cloud')]
for panel_position, (corpus, panel_title) in enumerate(cloud_panels, start=1):
    plt.subplot(1, 2, panel_position)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(panel_title)
    plt.axis('off')
plt.show()

## 3. Preprocess Data and Feature Engineering

In [None]:
# Fill gaps: a placeholder for the provenance columns, an empty string for the text columns
fill_defaults = {'source': 'Unknown', 'author': 'Unknown', 'text': '', 'title': ''}
for column_name, default_value in fill_defaults.items():
    df[column_name] = df[column_name].fillna(default_value)

# One text field per article: the title, a space, then the body
df['content'] = df['title'] + ' ' + df['text']

# Text cleaning function
def clean_text(text):
    """Normalise one document for TF-IDF.

    Lowercases ``text``, deletes every character that is neither an ASCII
    letter nor whitespace, tokenizes the result with NLTK's ``word_tokenize``
    and drops tokens present in the module-level ``stop_words`` set.

    Returns:
        The surviving tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    kept_tokens = filter(lambda token: token not in stop_words, word_tokenize(letters_only))
    return ' '.join(kept_tokens)

# Apply text cleaning
df['clean_content'] = df['content'].apply(clean_text)

# Feature engineering: whitespace-delimited word counts and TextBlob polarity of the body
df['text_length'] = df['text'].apply(lambda x: len(x.split()))
df['title_length'] = df['title'].apply(lambda x: len(x.split()))
df['sentiment'] = df['text'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

# Encode labels (real -> 1, fake -> 0).
# Labels are stripped and lower-cased first so case/whitespace variants still map.
# Anything that remains unmapped would become NaN and only fail much later inside
# model fitting, so it is reported here instead.
label_map = {'real': 1, 'fake': 0}
normalized_label = df['label'].map(lambda v: v.strip().lower() if isinstance(v, str) else v)
df['label_encoded'] = normalized_label.map(label_map)
unmapped_rows = df['label_encoded'].isna()
if unmapped_rows.any():
    example_values = list(df.loc[unmapped_rows, 'label'].unique()[:5])
    raise ValueError(
        f"{int(unmapped_rows.sum())} rows have a label outside {sorted(label_map)}; "
        f"examples: {example_values}. Drop or relabel these rows before encoding."
    )

# Correlation analysis between the engineered features and the encoded label
corr = df[['text_length', 'title_length', 'sentiment', 'label_encoded']].corr()
fig = px.imshow(corr, text_auto=True, title="Correlation Matrix",
                color_continuous_scale='Blues')
fig.show()

# Verify preprocessing
print("\nSample Cleaned Content:")
print(df['clean_content'].iloc[0][:200], '...')
print("\nFeature Engineered Columns:")
print(df[['text_length', 'title_length', 'sentiment']].head())

## 4. TF-IDF Vectorization and Feature Combination

In [None]:
# Fit TF-IDF on the cleaned text: unigrams and bigrams, capped at 5000 terms,
# materialised as a dense array
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_text = vectorizer.fit_transform(df['clean_content']).toarray()

# Append the engineered numeric columns to the right of the TF-IDF block and
# extend the feature-name array in the same order
engineered_columns = ['text_length', 'title_length', 'sentiment']
X_engineered = df[engineered_columns].to_numpy()
X = np.concatenate([X_text, X_engineered], axis=1)
feature_names = np.append(vectorizer.get_feature_names_out(), engineered_columns)
y = df['label_encoded']

print("\nFeature Matrix Shape:", X.shape)
print("Sample Features:", feature_names[:10])

## 5. Embedded-Filter ACO Feature Selection

In [None]:
# Clustering-based mutual information
def clustering_based_mi(X, y, n_clusters=3):
    """Score each feature by the mutual information between its clusters and ``y``.

    Every column of ``X`` is discretised independently with 1-D KMeans, and the
    mutual information between the resulting cluster ids and the labels is
    used as that feature's score.

    Args:
        X: Dense 2-D array indexed as ``X[:, [i]]`` (samples x features).
        y: Class labels, one per row of ``X``.
        n_clusters: Upper bound on the number of KMeans clusters per feature.

    Returns:
        1-D float array with one MI score per column of ``X``.
    """
    n_features = X.shape[1]
    mi_scores = np.zeros(n_features, dtype=float)
    for i in range(n_features):
        column = X[:, [i]]
        # KMeans cannot form more distinct clusters than there are distinct values.
        k = min(n_clusters, len(np.unique(column)))
        if k < 2:
            # A constant column yields a single cluster, whose MI with y is 0.
            continue
        clusters = KMeans(n_clusters=k, random_state=42).fit_predict(column)
        # Cluster ids are categorical. Without discrete_features=True a dense
        # input is treated as continuous by mutual_info_classif.
        mi_scores[i] = mutual_info_classif(
            clusters.reshape(-1, 1), y, discrete_features=True, random_state=42
        )[0]
    return mi_scores

# Enhanced ACO for feature selection
def aco_feature_selection(X, y, mi_scores, n_ants=15, n_iterations=30, evaporation_rate=0.4,
                          max_features=200, min_features=50, random_state=42):
    """Select a feature subset with Ant Colony Optimization guided by MI scores.

    In each iteration every ant samples a subset of columns without
    replacement, with probability proportional to ``pheromone * mi_score``.
    Each subset is scored by the accuracy of a Logistic Regression trained on
    an 80/20 split of the rows (``random_state=42``). After all ants have been
    scored, pheromones evaporate once and every ant deposits pheromone on its
    features in proportion to its accuracy.

    Args:
        X: Dense 2-D array (samples x features).
        y: Class labels, one per row of ``X``.
        mi_scores: One non-negative heuristic score per column of ``X``.
        n_ants: Number of subsets sampled per iteration.
        n_iterations: Number of ACO iterations.
        evaporation_rate: Fraction of pheromone removed per iteration.
        max_features: Largest subset size an ant may draw.
        min_features: Smallest subset size an ant may draw.
        random_state: Seed for subset sampling; ``None`` gives unseeded sampling.

    Returns:
        ``(best_subset, best_accuracy)`` where ``best_subset`` is an array of
        column indices (``None`` if no subset was evaluated) and
        ``best_accuracy`` is its accuracy on the internal held-out split.

    Raises:
        ValueError: If ``X`` has no columns.
    """
    n_features = X.shape[1]
    if n_features == 0:
        raise ValueError("X has no features to select from")

    rng = np.random.default_rng(random_state)
    pheromones = np.ones(n_features) * 0.1
    best_subset = None
    best_accuracy = 0

    # Sanitise the heuristic: NaN or negative scores cannot be used as sampling weights.
    heuristic = np.clip(np.nan_to_num(np.asarray(mi_scores, dtype=float), nan=0.0), 0.0, None)
    if not heuristic.any():
        # No feature carries a positive score: fall back to uniform sampling.
        heuristic = np.ones(n_features)

    # Sampling without replacement can only return features with non-zero
    # probability, so the subset size is capped by how many of those exist.
    n_candidates = int(np.count_nonzero(heuristic))
    upper = max(1, min(max_features, n_candidates))
    lower = max(1, min(min_features, upper))

    # The row split does not depend on the chosen columns, so compute it once.
    train_idx, test_idx = train_test_split(np.arange(X.shape[0]), test_size=0.2, random_state=42)
    y_array = np.asarray(y)
    y_train, y_test = y_array[train_idx], y_array[test_idx]

    for _ in range(n_iterations):
        # Pheromones are fixed while the ants build their subsets, so the
        # sampling distribution is shared by every ant of this iteration.
        prob = pheromones * heuristic
        prob = prob / prob.sum()
        subsets = [
            rng.choice(n_features, size=int(rng.integers(lower, upper + 1)), replace=False, p=prob)
            for _ in range(n_ants)
        ]

        scored_subsets = []
        for subset in subsets:
            model = LogisticRegression(max_iter=1000, random_state=42)
            model.fit(X[np.ix_(train_idx, subset)], y_train)
            accuracy = accuracy_score(y_test, model.predict(X[np.ix_(test_idx, subset)]))

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_subset = subset
            scored_subsets.append((subset, accuracy))

        # Evaporate once per iteration. Evaporating once per ant would shrink the
        # trail by (1 - rate) ** n_ants and erase what earlier iterations learned.
        pheromones *= (1 - evaporation_rate)
        for subset, accuracy in scored_subsets:
            # Smaller subsets receive a larger per-feature deposit.
            pheromones[subset] += accuracy * (max_features / len(subset))

    return best_subset, best_accuracy

# Restrict feature selection to the training rows.
# train_test_split shuffles row positions from (n_rows, random_state) alone, so
# this reproduces the row partition that the model-evaluation cell obtains with
# test_size=0.2 and random_state=42. Scoring and selecting features on those
# held-out rows would leak test information into the reported accuracies.
fs_train_idx, _ = train_test_split(np.arange(X.shape[0]), test_size=0.2, random_state=42)
X_fs = X[fs_train_idx]
y_fs = y.iloc[fs_train_idx]

# Compute MI scores (training rows only)
mi_scores = clustering_based_mi(X_fs, y_fs)

# Run ACO (training rows only)
selected_indices, aco_accuracy = aco_feature_selection(X_fs, y_fs, mi_scores, max_features=200)
selected_features = feature_names[selected_indices]
print("\nACO Selected Features (Top 10):")
print(selected_features[:10])
print("ACO Preliminary Accuracy:", aco_accuracy)

# Update X with selected features (all rows; the split happens downstream)
X_selected = X[:, selected_indices]

# Save MI scores plot as JSON.
# The selected features are ranked by MI so the chart really shows the top 20.
mi_selected = mi_scores[selected_indices]
top_order = np.argsort(mi_selected)[::-1][:20]
fig = px.bar(x=selected_features[top_order], y=mi_selected[top_order],
             title="Top 20 Selected Features by MI Score",
             labels={'x': 'Feature', 'y': 'MI Score'}, color_discrete_sequence=['#2563eb'])
# fig.to_json() is Plotly's own serialiser. Dumping trace.__dict__ / layout.__dict__
# exposes internal objects that the json module cannot encode.
with open('/kaggle/working/mi_scores.json', 'w') as f:
    f.write(fig.to_json())

## 6. Train and Evaluate Models (Logistic Regression, XGBoost)

In [None]:
# Hold out 20% of the ACO-selected feature matrix for evaluation
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)


def _fit_and_report(estimator, heading):
    """Fit ``estimator`` on the training split and print its test-set metrics.

    Returns:
        ``(predictions, accuracy, report)`` for the held-out split.
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, target_names=['Fake', 'Real'])
    print(heading)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)
    return predictions, accuracy, report


# Logistic Regression baseline
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_pred, lr_accuracy, lr_report = _fit_and_report(lr_model, "\nLogistic Regression Performance:")

# Gradient-boosted trees
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_pred, xgb_accuracy, xgb_report = _fit_and_report(xgb_model, "\nXGBoost Performance:")

## 7. BERT Model (DistilBERT)

In [None]:
# Custom Dataset for BERT
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one article per ``__getitem__`` call.

    Args:
        texts: Iterable of documents (list, array or pandas Series).
        labels: Iterable of integer class labels, aligned with ``texts``.
        tokenizer: Callable tokenizer that accepts the keyword arguments used
            in ``__getitem__`` and returns a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Sequence length every document is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialise as lists so positional indexing works for any iterable,
        # not only for objects that provide pandas' ``.iloc``.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels differ in length: {len(self.texts)} != {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        # Calling the tokenizer directly replaces the deprecated encode_plus.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch axis; flatten it away so the
        # data loader can stack individual items into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Prepare BERT data
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['clean_content'], df['label_encoded'], test_size=0.2, random_state=42
)

train_dataset = NewsDataset(train_texts, train_labels, tokenizer)
test_dataset = NewsDataset(test_texts, test_labels, tokenizer)

# BERT model
bert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)


def compute_metrics(eval_pred):
    """Return accuracy for the Trainer.

    Without a ``compute_metrics`` callback, ``trainer.evaluate()`` does not
    report an accuracy, so no ``eval_accuracy`` key would exist.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Some models return extra outputs; the class scores come first.
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}


# Newer transformers releases name the evaluation-schedule argument
# `eval_strategy`; older ones only know `evaluation_strategy`. Use whichever
# the installed TrainingArguments declares.
_eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

# Training arguments
training_args = TrainingArguments(
    output_dir='/kaggle/working/bert_results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='/kaggle/working/bert_logs',
    logging_steps=10,
    save_strategy='epoch',
    load_best_model_at_end=True,
    **{_eval_strategy_kwarg: 'epoch'}
)

# Trainer
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train and evaluate
trainer.train()
bert_results = trainer.evaluate()
# The Trainer prefixes metric names with "eval_". Index directly so a missing
# metric fails loudly instead of being reported as an accuracy of 0.
bert_accuracy = bert_results['eval_accuracy']

print("\nBERT (DistilBERT) Performance:")
print("Accuracy:", bert_accuracy)

## 8. Compare Models and Save Outputs

In [None]:
# Compare models
fig = px.bar(x=['Logistic Regression', 'XGBoost', 'BERT'],
             y=[lr_accuracy, xgb_accuracy, bert_accuracy],
             title="Model Accuracy Comparison", labels={'x': 'Model', 'y': 'Accuracy'},
             color_discrete_sequence=['#2563eb'])
fig.show()

# Save comparison plot as JSON.
# fig.to_json() is Plotly's own serialiser. Dumping trace.__dict__ / layout.__dict__
# exposes internal objects that the json module cannot encode.
with open('/kaggle/working/model_comparison.json', 'w') as f:
    f.write(fig.to_json())

# Save models and vectorizer. selected_indices is saved too because it is
# needed to reduce a TF-IDF + engineered feature matrix to the columns the
# models were trained on.
joblib.dump(lr_model, '/kaggle/working/fake_news_lr_model.pkl')
joblib.dump(xgb_model, '/kaggle/working/fake_news_xgb_model.pkl')
joblib.dump(vectorizer, '/kaggle/working/tfidf_vectorizer.pkl')
joblib.dump(selected_indices, '/kaggle/working/selected_indices.pkl')
trainer.save_model('/kaggle/working/bert_model')

# Save selected features, one name per line
with open('/kaggle/working/selected_features.txt', 'w') as f:
    f.write('\n'.join(selected_features))

print("Models, vectorizer, and outputs saved to /kaggle/working/")