# Fake News Detection - Analysis Notebook

This notebook provides a comprehensive analysis of the fake news detection system, including:
- Data exploration and visualization
- Model training and comparison
- Performance evaluation
- Feature analysis
- Example predictions

In [None]:
# Import required libraries
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Add parent directory to path to import our modules
sys.path.append('../')
from src.preprocess import NewsPreprocessor, create_sample_data
from src.model import FakeNewsClassifier
from src.predict import FakeNewsPredictor, quick_predict

# Set up plotting style.
# The 'seaborn-v0_8' style sheet only exists in matplotlib >= 3.6; earlier
# releases ship the same sheet under the name 'seaborn'. Pick whichever is
# available so this cell does not raise on an older matplotlib.
PREFERRED_STYLE = 'seaborn-v0_8'
plt.style.use(PREFERRED_STYLE if PREFERRED_STYLE in plt.style.available else 'seaborn')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)  # default size for figures that do not pass figsize

print("📚 All libraries imported successfully!")

## 1. Data Loading and Exploration

In [None]:
# Read the sample dataset from its CSV file (path is relative to the notebook)
df = pd.read_csv('../data/sample_dataset.csv')

# Structural overview: (rows, columns) and the column names
print(
    f"Dataset Shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
# Last expression of the cell, so the notebook renders the preview as a table
df.head()

In [None]:
# Dataset statistics: class balance followed by text-size summaries
print("=== Dataset Statistics ===")
n_articles = len(df)
n_fake = sum(df['label'])  # labels are summed directly, so each 1 counts one fake article
n_real = n_articles - n_fake
print(f"Total articles: {n_articles}")
print(f"Fake news articles: {n_fake} ({n_fake/n_articles*100:.1f}%)")
print(f"Real news articles: {n_real} ({n_real/n_articles*100:.1f}%)")

# Text length analysis.
# Both columns are stored on df because the plotting cell below reads them.
df['text_length'] = df['text'].str.len()              # characters per article
df['word_count'] = df['text'].str.split().str.len()   # whitespace-separated tokens per article

print(
    f"\nText Length Statistics:",
    f"Average text length: {df['text_length'].mean():.1f} characters",
    f"Average word count: {df['word_count'].mean():.1f} words",
    sep="\n",
)

In [None]:
# Visualize class balance and text-size distributions in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Label encoding used throughout this notebook: 0 = real, 1 = fake
class_names = {0: 'Real News', 1: 'Fake News'}

# Class distribution.
# value_counts() orders its result by frequency, not by label value, so a
# fixed ['Real News', 'Fake News'] list mislabels the wedges whenever fake
# articles outnumber real ones. Sort by label and derive each wedge's name
# from the label value it actually represents.
class_counts = df['label'].value_counts().sort_index()
pie_labels = [class_names.get(value, str(value)) for value in class_counts.index]
axes[0, 0].pie(class_counts.values, labels=pie_labels, autopct='%1.1f%%', startangle=90)
axes[0, 0].set_title('Class Distribution')

# Overlaid per-class histograms for character length and word count
histogram_panels = (
    (axes[0, 1], 'text_length', 'Text Length (characters)', 'Text Length Distribution'),
    (axes[1, 0], 'word_count', 'Word Count', 'Word Count Distribution'),
)
for ax, column, axis_label, panel_title in histogram_panels:
    for class_value, class_name in class_names.items():
        ax.hist(df.loc[df['label'] == class_value, column], alpha=0.7, label=class_name, bins=20)
    ax.set_xlabel(axis_label)
    ax.set_ylabel('Frequency')
    ax.set_title(panel_title)
    ax.legend()

# Box plot of text lengths by class
df.boxplot(column='text_length', by='label', ax=axes[1, 1])
# pandas adds a figure-level "Boxplot grouped by label" title when `by` is
# used; clear it so it does not sit on top of the other three panels.
fig.suptitle('')
axes[1, 1].set_title('Text Length by Class')
axes[1, 1].set_xlabel('Class (0=Real, 1=Fake)')
axes[1, 1].set_ylabel('Text Length')

plt.tight_layout()
plt.show()

## 2. Text Preprocessing Analysis

In [None]:
# Build the preprocessor; max_features=1000 is forwarded to NewsPreprocessor
# (presumably the TF-IDF vocabulary cap — confirm in src/preprocess.py)
preprocessor = NewsPreprocessor(max_features=1000)

# Walk the first article through each preprocessing stage for illustration
sample_text = df['text'].iloc[0]
print("=== Text Preprocessing Example ===")
print(f"Original text: {sample_text}")
preprocessing_stages = (
    ("Cleaned text", preprocessor.clean_text),
    ("Processed text", preprocessor.preprocess_text),
)
for stage_caption, stage_fn in preprocessing_stages:
    print(f"{stage_caption}: {stage_fn(sample_text)}")

In [None]:
# Prepare data for analysis: prepare_data returns the preprocessed texts and
# their labels as two parallel sequences
texts, labels = preprocessor.prepare_data(df)

print(f"Preprocessed texts: {len(texts)}")
print(f"Labels: {len(labels)}")

# Show up to three examples.
# Slicing and zipping (rather than indexing with range(3)) avoids an
# IndexError/KeyError when fewer than three samples exist, and does not depend
# on the sequences supporting lookup by 0..2.
N_EXAMPLES = 3
print("\n=== Preprocessed Text Examples ===")
for position, (example_text, example_label) in enumerate(
    zip(texts[:N_EXAMPLES], labels[:N_EXAMPLES]), start=1
):
    print(f"Text {position} (Label: {example_label}): {example_text[:100]}...")

## 3. Model Training and Comparison

In [None]:
# Hold out 30% of the samples for testing; stratify keeps the class ratio the
# same in both partitions and random_state makes the split repeatable
split_parts = train_test_split(
    texts,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# Fit the vectorizer on the training texts only, then transform both
# partitions with it, so no test-set information is used when fitting
preprocessor.fit_vectorizer(X_train)
X_train_tfidf = preprocessor.transform_texts(X_train)
X_test_tfidf = preprocessor.transform_texts(X_test)

print(
    f"Training features shape: {X_train_tfidf.shape}",
    f"Test features shape: {X_test_tfidf.shape}",
    sep="\n",
)

In [None]:
# Fit every model the classifier manages, passing the held-out split along
# with the training data
classifier = FakeNewsClassifier()
print("🚀 Training all models...")
results = classifier.train_all_models(
    X_train_tfidf,
    y_train,
    X_test_tfidf,
    y_test,
)

print("\n✅ Training completed!")

In [None]:
# Collect the per-model scores into one table (reused by the plots and the
# summary further down) and render it with the notebook's rich display
comparison_df = classifier.get_model_comparison()
print("=== Model Performance Comparison ===")
display(comparison_df)

In [None]:
# Four views of the comparison table on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(cv_ax, split_ax), (err_ax, heat_ax) = axes

model_labels = comparison_df['Model']
cv_means = comparison_df['CV Mean']

# Top-left: mean cross-validation accuracy per model
cv_ax.bar(model_labels, cv_means)
cv_ax.set_title('Cross-Validation Scores')
cv_ax.set_ylabel('Accuracy')
cv_ax.tick_params(axis='x', rotation=45)

# Top-right: train and test accuracy side by side for each model
bar_positions = np.arange(len(comparison_df))
bar_width = 0.35
for offset, column, series_name in (
    (-bar_width / 2, 'Train Accuracy', 'Train'),
    (+bar_width / 2, 'Test Accuracy', 'Test'),
):
    split_ax.bar(bar_positions + offset, comparison_df[column], bar_width,
                 label=series_name, alpha=0.8)
split_ax.set_title('Train vs Test Accuracy')
split_ax.set_ylabel('Accuracy')
split_ax.set_xticks(bar_positions)
split_ax.set_xticklabels(model_labels, rotation=45)
split_ax.legend()

# Bottom-left: CV mean with its standard deviation as error bars
err_ax.errorbar(
    model_labels,
    cv_means,
    yerr=comparison_df['CV Std'],
    fmt='o',
    capsize=5,
    capthick=2,
    markersize=8,
)
err_ax.set_title('CV Scores with Standard Deviation')
err_ax.set_ylabel('Accuracy')
err_ax.tick_params(axis='x', rotation=45)

# Bottom-right: heatmap with metrics as rows and models as columns
heatmap_columns = ['Model', 'Train Accuracy', 'CV Mean', 'Test Accuracy']
performance_data = comparison_df[heatmap_columns].set_index('Model')
sns.heatmap(performance_data.T, annot=True, fmt='.3f', cmap='YlOrRd', ax=heat_ax)
heat_ax.set_title('Performance Heatmap')

plt.tight_layout()
plt.show()

top_cv_score = max(cv_means)
print(f"🏆 Best Model: {classifier.best_model_name} (CV Score: {top_cv_score:.3f})")

## 4. Detailed Model Analysis

In [None]:
# Pull the stored scores for whichever model the classifier marked as best
best_model_name = classifier.best_model_name
best_model_results = classifier.model_scores[best_model_name]

print(f"=== Detailed Analysis: {best_model_name.upper()} ===")
# (caption, key in the score dict) pairs, printed to four decimals
score_lines = (
    ("Training Accuracy", 'train_accuracy'),
    ("Test Accuracy", 'test_accuracy'),
    ("CV Mean", 'cv_mean'),
    ("CV Std", 'cv_std'),
)
for caption, score_key in score_lines:
    print(f"{caption}: {best_model_results[score_key]:.4f}")

print("\n=== Classification Report ===")
print(best_model_results['classification_report'])

In [None]:
# Confusion Matrix Visualization
# np.asarray makes sure .ravel() below works even if the matrix was stored as
# a nested list rather than an ndarray.
cm = np.asarray(best_model_results['confusion_matrix'])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Real News', 'Fake News'],
            yticklabels=['Real News', 'Fake News'])
plt.title(f'Confusion Matrix - {best_model_name.upper()}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    A model that never predicts a class (or a test split that lacks one)
    makes these denominators zero. Plain division would then produce NaN
    (silently, because warnings are suppressed in this notebook) or raise
    ZeroDivisionError for Python ints. Reporting 0.0 matches scikit-learn's
    default handling of undefined precision/recall.
    """
    return float(numerator) / float(denominator) if denominator else 0.0


# Calculate additional metrics, treating class 1 (fake news) as the positive
# class. For a 2x2 matrix with rows = true label and columns = predicted
# label, ravel() yields (tn, fp, fn, tp).
tn, fp, fn, tp = cm.ravel()
precision = _safe_ratio(tp, tp + fp)
recall = _safe_ratio(tp, tp + fn)
f1_score = _safe_ratio(2 * precision * recall, precision + recall)
specificity = _safe_ratio(tn, tn + fp)

print(f"\n=== Additional Metrics ===")
print(f"Precision (Fake News): {precision:.4f}")
print(f"Recall (Fake News): {recall:.4f}")
print(f"F1-Score (Fake News): {f1_score:.4f}")
print(f"Specificity (Real News): {specificity:.4f}")

## 5. Feature Analysis

In [None]:
# Vocabulary learned by the fitted vectorizer
feature_names = preprocessor.get_feature_names()
print(f"Total features: {len(feature_names)}")
print(f"Sample features: {list(feature_names[:20])}")

# Coefficient inspection only applies when a logistic regression model was
# trained; otherwise this section is skipped and feature_importance stays
# undefined.
if 'logistic_regression' in classifier.trained_models:
    lr_model = classifier.trained_models['logistic_regression']
    coefficients = lr_model.coef_[0]

    # One row per feature with its coefficient
    feature_importance = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})

    # Largest coefficients push towards class 1 (fake), smallest towards class 0 (real)
    top_fake = feature_importance.nlargest(10, 'coefficient')
    top_real = feature_importance.nsmallest(10, 'coefficient')

    for heading, top_rows in (
        ("\n=== Top Features Indicating FAKE NEWS ===", top_fake),
        ("\n=== Top Features Indicating REAL NEWS ===", top_real),
    ):
        print(heading)
        for feature, coefficient in zip(top_rows['feature'], top_rows['coefficient']):
            print(f"{feature}: {coefficient:.4f}")

    # Horizontal bar charts of the same two rankings, side by side
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    for panel, top_rows, panel_title in (
        (axes[0], top_fake, 'Top Features for Fake News Detection'),
        (axes[1], top_real, 'Top Features for Real News Detection'),
    ):
        bar_slots = range(len(top_rows))
        panel.barh(bar_slots, top_rows['coefficient'])
        panel.set_yticks(bar_slots)
        panel.set_yticklabels(top_rows['feature'])
        panel.set_title(panel_title)
        panel.set_xlabel('Coefficient Value')

    plt.tight_layout()
    plt.show()

## 6. Prediction Examples and Analysis

In [None]:
# Test with custom, hand-written example articles
test_articles = [
    "Scientists at MIT have developed a new renewable energy technology that could revolutionize solar power.",
    "BREAKING: Doctors HATE this one weird trick that cures everything! Click here to learn more!",
    "The Federal Reserve announced today that interest rates will remain unchanged following their monthly meeting.",
    "SHOCKING: Celebrity spotted with aliens, government cover-up exposed!",
    "Local university receives federal grant for climate change research project."
]

print("=== Prediction Examples ===")
for i, article in enumerate(test_articles, 1):
    # Run the article through the same preprocessing and vectorizer used for
    # training, then classify it
    processed_text = preprocessor.preprocess_text(article)
    text_tfidf = preprocessor.transform_texts([processed_text])
    prediction = classifier.predict(text_tfidf)[0]

    # Probabilities are best-effort: not every model exposes them.
    # `except Exception` (instead of a bare `except:`) keeps that behaviour
    # without also swallowing KeyboardInterrupt/SystemExit, and None replaces
    # the former "N/A" string so the values are never a str/float mix.
    try:
        probabilities = classifier.predict_proba(text_tfidf)[0]
        confidence = max(probabilities)
        fake_prob = probabilities[1]  # index 1 corresponds to the fake class
    except Exception:
        confidence = None
        fake_prob = None

    label = "FAKE NEWS" if prediction == 1 else "REAL NEWS"

    print(f"\n--- Article {i} ---")
    print(f"Text: {article}")
    print(f"Prediction: {label}")
    if confidence is not None:
        print(f"Confidence: {confidence:.2%}")
        print(f"Fake News Probability: {fake_prob:.2%}")

In [None]:
# Error Analysis - look at misclassified examples
print("=== Error Analysis ===")
test_predictions = classifier.predict(X_test_tfidf)

# np.where returns POSITIONS within the test split. If y_test / X_test are
# pandas Series they keep their original, shuffled index after
# train_test_split, so `y_test[idx]` would be a label lookup that raises
# KeyError or silently returns a different row. Converting to positional
# containers makes the lookups below correct for lists, arrays and Series.
y_test_values = np.asarray(y_test)
X_test_values = list(X_test)
misclassified_indices = np.where(np.asarray(test_predictions) != y_test_values)[0]

n_test = len(y_test_values)
n_wrong = len(misclassified_indices)
print(f"Total misclassified: {n_wrong} out of {n_test}")
print(f"Error rate: {n_wrong/n_test*100:.1f}%")

# Show a few misclassified examples
print("\n=== Sample Misclassified Examples ===")
for i, idx in enumerate(misclassified_indices[:3]):
    actual_label = "FAKE NEWS" if y_test_values[idx] == 1 else "REAL NEWS"
    predicted_label = "FAKE NEWS" if test_predictions[idx] == 1 else "REAL NEWS"

    print(f"\n--- Example {i+1} ---")
    print(f"Text: {X_test_values[idx]}")
    print(f"Actual: {actual_label}")
    print(f"Predicted: {predicted_label}")

## 7. Model Comparison Visualization

In [None]:
# 2x3 grid: the left 2x2 block holds one panel per model, the right column
# holds the cross-model comparison and the ranking
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Fixed panel assignment for the four expected models; a model that was not
# trained simply leaves its panel empty
per_model_panels = {
    'naive_bayes': axes[0, 0],
    'svm': axes[0, 1],
    'random_forest': axes[1, 0],
    'logistic_regression': axes[1, 1],
}
for model_name, panel in per_model_panels.items():
    if model_name not in classifier.model_scores:
        continue

    scores = classifier.model_scores[model_name]
    metrics = ['Train Acc', 'Test Acc', 'CV Mean']
    values = [scores['train_accuracy'], scores['test_accuracy'], scores['cv_mean']]

    panel.bar(metrics, values, alpha=0.7)
    panel.set_title(f'{model_name.upper()} Performance')
    panel.set_ylim([0, 1])
    panel.set_ylabel('Accuracy')

    # Print each bar's value just above it
    for bar_index, bar_value in enumerate(values):
        panel.text(bar_index, bar_value + 0.01, f'{bar_value:.3f}', ha='center', va='bottom')

# Top-right: CV score next to test score for every model in the table
overall_panel = axes[0, 2]
models = comparison_df['Model']
group_positions = np.arange(len(models))
group_width = 0.35
for offset, column, series_name in (
    (-group_width / 2, 'CV Mean', 'CV Score'),
    (+group_width / 2, 'Test Accuracy', 'Test Score'),
):
    overall_panel.bar(group_positions + offset, comparison_df[column], group_width,
                      label=series_name, alpha=0.8)
overall_panel.set_title('Model Comparison')
overall_panel.set_ylabel('Accuracy')
overall_panel.set_xticks(group_positions)
overall_panel.set_xticklabels(models, rotation=45, ha='right')
overall_panel.legend()
overall_panel.set_ylim([0, 1])

# Bottom-right: models ordered by CV score; ascending sort puts the best
# model at the top of the horizontal bar chart
ranking_panel = axes[1, 2]
sorted_df = comparison_df.sort_values('CV Mean', ascending=True)
rank_slots = range(len(sorted_df))
ranking_panel.barh(rank_slots, sorted_df['CV Mean'])
ranking_panel.set_yticks(rank_slots)
ranking_panel.set_yticklabels(sorted_df['Model'])
ranking_panel.set_title('Model Ranking (by CV Score)')
ranking_panel.set_xlabel('CV Score')

plt.tight_layout()
plt.show()

## 8. Summary and Recommendations

In [None]:
# Headline numbers: the highest CV mean in the table, and the test accuracy of
# the model the classifier reports as best
best_score = max(comparison_df['CV Mean'])
is_best_model = comparison_df['Model'] == classifier.best_model_name
best_test_score = comparison_df.loc[is_best_model, 'Test Accuracy'].iloc[0]

separator = "=" * 60
print(separator)
print("📊 FAKE NEWS DETECTION - ANALYSIS SUMMARY")
print(separator)
print(f"Dataset Size: {len(df)} articles")
# feature_names comes from the feature-analysis cell above
print(f"Feature Count: {len(feature_names)} TF-IDF features")
print(f"Models Trained: {len(classifier.trained_models)}")
print(f"\n🏆 Best Model: {classifier.best_model_name.upper()}")
print(f"🎯 Cross-Validation Score: {best_score:.4f}")
print(f"🎯 Test Accuracy: {best_test_score:.4f}")

# Ranking, best CV score first
print(f"\n📈 Model Rankings (by CV Score):")
ranked = comparison_df.sort_values('CV Mean', ascending=False)
for rank, (ranked_name, ranked_cv) in enumerate(zip(ranked['Model'], ranked['CV Mean']), 1):
    print(f"{rank}. {ranked_name.upper()}: {ranked_cv:.4f}")

# A CV score above 0.8 is treated as "good" here; anything else triggers the
# data/feature advice
print(f"\n💡 Recommendations:")
verdict = (
    "✅ Model performance is good for this dataset size"
    if best_score > 0.8
    else "⚠️  Consider collecting more training data or feature engineering"
)
print(verdict)

print("🚀 For production use:")
for advice in (
    f"   - Use the {classifier.best_model_name} model",
    "   - Consider ensemble methods for better performance",
    "   - Collect more diverse training data",
    "   - Implement confidence thresholds for predictions",
):
    print(advice)
print(separator)

## 9. Save Results

In [None]:
# Save analysis results.
# `os` is already imported in the first cell, so it is not re-imported here.
# The output directory is named once instead of being repeated in every path.
RESULTS_DIR = '../results'
os.makedirs(RESULTS_DIR, exist_ok=True)

# Save model comparison
comparison_df.to_csv(os.path.join(RESULTS_DIR, 'model_comparison.csv'), index=False)

# feature_importance only exists when the feature-analysis cell found a
# logistic regression model; check once and reuse the answer for both the
# export and the summary printed below.
has_feature_importance = 'feature_importance' in locals()
if has_feature_importance:
    feature_importance.to_csv(os.path.join(RESULTS_DIR, 'feature_importance.csv'), index=False)

# Save test predictions next to the true labels, with a per-row correctness
# flag. test_predictions comes from the error-analysis cell above.
test_results = pd.DataFrame({
    'text': X_test,
    'actual_label': y_test,
    'predicted_label': test_predictions,
    'correct': y_test == test_predictions
})
test_results.to_csv(os.path.join(RESULTS_DIR, 'test_predictions.csv'), index=False)

print("📁 Analysis results saved to ../results/ directory")
print("   - model_comparison.csv")
if has_feature_importance:
    print("   - feature_importance.csv")
print("   - test_predictions.csv")

---

## Conclusion

This notebook has provided a comprehensive analysis of the fake news detection system, including:

1. **Data Exploration**: Understanding the dataset characteristics and distributions
2. **Preprocessing Analysis**: Examining the text cleaning and feature extraction process
3. **Model Training**: Training and comparing multiple machine learning algorithms
4. **Performance Evaluation**: Detailed analysis of model performance metrics
5. **Feature Analysis**: Understanding which features are most important for classification
6. **Prediction Examples**: Testing the system on hand-written example articles
7. **Error Analysis**: Examining misclassified cases

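Whether the system performs well enough for fake news detection depends on the cross-validation and test scores reported above; because they are computed on a sample dataset, treat them as indicative rather than conclusive. The system can be further improved with more training data and advanced techniques.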

---

*Run this notebook to explore your own fake news detection analysis!*