# Fake vs Real News Classification - Data Exploration & Preprocessing

This notebook performs exploratory data analysis and preprocessing for the fake news classification project.

## Project Overview
- **Goal**: Build a classifier to distinguish between real (1) and fake (0) news
- **Dataset**: News articles with labels, titles, full text content, subjects, and dates
- **Approach**: Use NLP techniques to process text and train machine learning models
- **Output**: Preprocessed data ready for model training

## Dataset Structure
- `label`: 0 = fake news, 1 = real news
- `title`: Headline text
- `text`: Full article content
- `subject`: Article topic/category
- `date`: Publication date

## Workflow
1. Load and explore the training dataset (`data.csv`)
2. Perform data cleaning and preprocessing
3. Apply text processing techniques (tokenization, stopword removal, stemming); TF-IDF vectorization is left to the modeling step
4. Split data into training and test sets (stratified by label)
5. Save preprocessed data for model training

## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Text processing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Warnings
# NOTE: this hides every warning for the rest of the session, including
# pandas / scikit-learn deprecation notices.
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

print("Libraries imported successfully!")

# Download required NLTK data.
# nltk.download() reports most failures (e.g. no network access) by returning
# False instead of raising, so every return value is checked; otherwise the
# success message would be printed even when nothing was downloaded.
# NOTE(review): newer NLTK releases may additionally need the 'punkt_tab'
# resource for word_tokenize - confirm against the installed version.
nltk_packages = ['punkt', 'stopwords', 'wordnet', 'omw-1.4']
failed_packages = []
for package in nltk_packages:
    try:
        if not nltk.download(package, quiet=True):
            failed_packages.append(package)
    except Exception:
        # Best effort: a failed download must not abort the notebook, but
        # Ctrl-C / interpreter shutdown are deliberately not swallowed.
        failed_packages.append(package)

if failed_packages:
    print("NLTK data download failed - please download manually if needed")
    print(f"  Packages not downloaded: {', '.join(failed_packages)}")
else:
    print("NLTK data downloaded successfully!")

## 2. Load and Explore the Dataset

In [None]:
# Read the raw article dataset from disk into the notebook-wide frame `df`
print("Loading training dataset...")
df = pd.read_csv('../dataset/data.csv')

# Structural overview: dimensions, column names and per-column dtypes
print("Dataset Shape:", df.shape)
print("\nColumn Names:", list(df.columns))
print("\nData Types:", df.dtypes, sep="\n")

# Deep memory footprint (string contents included), converted bytes -> MB
bytes_used = df.memory_usage(deep=True).sum()
print(f"\nMemory usage: {bytes_used / (1024 * 1024):.2f} MB")

# Leave the preview as the last expression so the notebook renders it
print("\nFirst 5 rows:")
df.head()

In [None]:
# Check for missing values
print("Missing values per column:")
missing_info = df.isnull().sum()
print(missing_info)

# Check data distribution
print("\nLabel distribution (0=fake, 1=real):")
label_counts = df['label'].value_counts().sort_index()
print(label_counts)

# Calculate percentages
print("\nLabel distribution (percentages):")
label_pct = df['label'].value_counts(normalize=True).sort_index() * 100
for label, pct in label_pct.items():
    label_name = 'Fake' if label == 0 else 'Real'
    print(f"  {label_name} (Label {label}): {pct:.1f}%")

# Check subjects
print("\nSubject distribution:")
subject_counts = df['subject'].value_counts()
print(subject_counts.head(10))  # Show top 10 subjects

# Date range analysis
if not df['date'].isnull().all():
    print("\nDate range:")
    try:
        # errors='coerce' turns unparseable entries into NaT instead of raising.
        # The result is kept in a local Series rather than written to `df`, so
        # those NaT values do not become part of the dataset that later steps
        # copy and audit for missing values.
        parsed_dates = pd.to_datetime(df['date'], errors='coerce')
    except Exception as exc:
        # Best effort: the date range is informational only, so report the
        # actual error and carry on (Ctrl-C is not swallowed).
        print(f"Date column could not be parsed: {exc}")
    else:
        if parsed_dates.notna().any():
            print(f"From: {parsed_dates.min()}")
            print(f"To: {parsed_dates.max()}")
            # Entries that were present but did not parse as dates
            unparsed_count = int((parsed_dates.isna() & df['date'].notna()).sum())
            if unparsed_count:
                print(f"Unparseable date entries: {unparsed_count}")
        else:
            print("Date parsing failed - treating as string")

In [None]:
# Visualize label distribution
# One 2x2 figure: label balance as bar and pie charts (top row), subject
# frequencies and the fake/real split inside the leading subjects (bottom row)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_bar, ax_pie), (ax_subjects, ax_split) = axes

# Top-left: article count per label
label_counts.plot(kind='bar', ax=ax_bar, color=['red', 'green'])
ax_bar.set_xlabel('Label (0=Fake, 1=Real)')
ax_bar.set_ylabel('Count')
ax_bar.set_title('Distribution of Labels')
ax_bar.tick_params(axis='x', rotation=0)

# Top-right: the same counts as shares of the whole
label_counts.plot(
    kind='pie',
    ax=ax_pie,
    autopct='%1.1f%%',
    labels=['Fake News', 'Real News'],
    colors=['red', 'green'],
)
ax_pie.set_ylabel('')
ax_pie.set_title('Label Distribution')

# Bottom-left: the ten most frequent subjects
subject_counts.head(10).plot(kind='bar', ax=ax_subjects)
ax_subjects.set_xlabel('Subject')
ax_subjects.set_ylabel('Count')
ax_subjects.set_title('Top 10 News Subjects')
ax_subjects.tick_params(axis='x', rotation=45)

# Bottom-right: fake vs. real counts within the five most frequent subjects
top_subjects = subject_counts.head(5).index
in_top_subjects = df['subject'].isin(top_subjects)
subject_label_df = (
    df[in_top_subjects]
    .groupby(['subject', 'label'])
    .size()
    .unstack(fill_value=0)
)
subject_label_df.plot(kind='bar', ax=ax_split, color=['red', 'green'], width=0.8)
ax_split.set_xlabel('Subject')
ax_split.set_ylabel('Count')
ax_split.set_title('Label Distribution by Top Subjects')
ax_split.legend(['Fake', 'Real'])
ax_split.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Headline numbers for the raw dataset
total_articles = len(df)
fake_articles = int((df['label'] == 0).sum())
real_articles = int((df['label'] == 1).sum())
fake_share = fake_articles / total_articles * 100
real_share = real_articles / total_articles * 100

print("\nDataset Summary:")
print(f"Total articles: {total_articles:,}")
print(f"Fake articles: {fake_articles:,} ({fake_share:.1f}%)")
print(f"Real articles: {real_articles:,} ({real_share:.1f}%)")
print(f"Number of subjects: {df['subject'].nunique()}")
print(f"Articles with missing text: {df['text'].isnull().sum()}")
print(f"Articles with missing titles: {df['title'].isnull().sum()}")

In [None]:
# Text length analysis for title and text columns
print("Analyzing text lengths...")

# Calculate text statistics.
# Missing values are blanked first: astype(str) alone would turn NaN into the
# literal string 'nan', recording every missing title/text as 3 characters and
# 1 word. With the blank, missing entries correctly get length 0 / 0 words.
title_as_str = df['title'].fillna('').astype(str)
text_as_str = df['text'].fillna('').astype(str)
df['title_length'] = title_as_str.str.len()
df['text_length'] = text_as_str.str.len()
df['title_word_count'] = title_as_str.str.split().str.len()
df['text_word_count'] = text_as_str.str.split().str.len()

# Statistics by label
print("Text Statistics by Label (0=Fake, 1=Real):")
print("\n1. Title Character Length:")
title_stats = df.groupby('label')['title_length'].describe()
print(title_stats)

print("\n2. Article Text Character Length:")
text_stats = df.groupby('label')['text_length'].describe()
print(text_stats)

print("\n3. Title Word Count:")
title_word_stats = df.groupby('label')['title_word_count'].describe()
print(title_word_stats)

print("\n4. Article Text Word Count:")
text_word_stats = df.groupby('label')['text_word_count'].describe()
print(text_word_stats)

# Visualize text length distributions
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One row per panel:
# (axis, column, bins, plot log10(x + 1)?, title, x-axis label, x-limits or None)
# Article-level panels use a log scale because article sizes span several
# orders of magnitude; title panels are clipped to a fixed x-range instead.
panels = [
    (axes[0, 0], 'title_length', 50, False,
     'Title Length Distribution', 'Characters', (0, 300)),
    (axes[0, 1], 'text_length', 50, True,
     'Article Text Length Distribution (Log Scale)', 'Log10(Characters + 1)', None),
    (axes[1, 0], 'title_word_count', 30, False,
     'Title Word Count Distribution', 'Words', (0, 30)),
    (axes[1, 1], 'text_word_count', 50, True,
     'Article Word Count Distribution (Log Scale)', 'Log10(Words + 1)', None),
]
# (label value, display name, histogram colour)
class_styles = [(0, 'Fake', 'red'), (1, 'Real', 'green')]

for ax, column, bins, log_scale, panel_title, x_label, x_limits in panels:
    # Overlay one semi-transparent, density-normalised histogram per class
    for label, label_name, color in class_styles:
        data = df.loc[df['label'] == label, column]
        if log_scale:
            data = np.log10(data + 1)  # +1 keeps zero-length entries finite
        ax.hist(data, bins=bins, alpha=0.7, label=f'{label_name} News',
                color=color, density=True)
    ax.set_title(panel_title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Density')
    ax.legend()
    if x_limits is not None:
        ax.set_xlim(*x_limits)

plt.tight_layout()
plt.show()

## 3. Data Preprocessing and Cleaning

In [None]:
# Missing-value handling: report the gaps, drop unusable rows, fill the rest
print("Data Cleaning and Missing Value Handling")
print("=" * 50)

# Snapshot of the raw frame before anything is changed
print("BEFORE CLEANING:")
print(f"Dataset shape: {df.shape}")
print("\nMissing values:")
for col, missing_count in df.isnull().sum().items():
    if missing_count > 0:
        print(f"  {col}: {missing_count} ({missing_count/len(df)*100:.1f}%)")

# All changes go to a copy so the raw frame `df` stays untouched
df_clean = df.copy()

# A row with neither a title nor a body carries no text at all -> drop it
both_absent = df_clean['title'].isnull() & df_clean['text'].isnull()
df_clean = df_clean[~both_absent]
print(f"\nRemoved {both_absent.sum()} rows with both title and text missing")

# Remaining gaps are filled with a fixed placeholder per column:
# (column, placeholder value, noun used in the progress message)
placeholders = (
    ('title', "No Title", "titles"),
    ('text', "No Content", "text"),
    ('subject', "Unknown", "subjects"),
    ('date', "Unknown Date", "dates"),
)
for column, placeholder, noun in placeholders:
    is_missing = df_clean[column].isnull()
    df_clean.loc[is_missing, column] = placeholder
    print(f"Filled {is_missing.sum()} missing {noun}")

# Rows without a label cannot be used for supervised training
no_label = df_clean['label'].isnull()
df_clean = df_clean[~no_label]
print(f"Removed {no_label.sum()} rows with missing labels")

# Post-cleaning report
rows_dropped = len(df) - len(df_clean)
print("\nAFTER CLEANING:")
print(f"Dataset shape: {df_clean.shape}")
print(f"Rows removed: {rows_dropped} ({rows_dropped/len(df)*100:.1f}%)")
print(f"Remaining missing values: {df_clean.isnull().sum().sum()}")

# Class balance after cleaning, as raw counts and as percentages
print("\nLabel distribution after cleaning:")
cleaned_label_counts = df_clean['label'].value_counts().sort_index()
print(cleaned_label_counts)
n_clean = len(df_clean)
for label, count in cleaned_label_counts.items():
    label_name = 'Fake' if label == 0 else 'Real'
    print(f"  {label_name}: {count:,} ({count/n_clean*100:.1f}%)")

In [None]:
def clean_text(text):
    """
    Clean and normalize a single piece of text for NLP.

    Steps:
    1. Convert to string and lowercase
    2. Remove URLs and email addresses
    3. Remove every character that is not a-z or whitespace
    4. Collapse runs of whitespace (spaces, tabs, newlines) into one space
    5. Strip leading/trailing whitespace

    Whitespace is collapsed *after* characters are removed, so deleting a
    standalone token such as a number or a dash cannot leave a double space
    behind.

    Note: removed characters are deleted, not replaced by a space, so
    "well-known" becomes "wellknown" and non-ASCII letters are dropped.

    Args:
        text: Value to clean. Non-string values are converted with str().

    Returns:
        str: The cleaned text, or "" for missing (NaN/None) or empty input.
    """
    if pd.isna(text) or text == "":
        return ""

    # Convert to string and lowercase
    text = str(text).lower()

    # Remove URLs ('http\S+' already covers 'https...')
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove non-alphabetic characters but keep whitespace
    text = re.sub(r'[^a-z\s]', '', text)

    # Collapse whitespace last, so gaps opened by the removals above are merged
    text = re.sub(r'\s+', ' ', text)

    # Strip leading/trailing whitespace
    return text.strip()

# Run clean_text over the title and body columns and build the model input
print("Applying text cleaning...")

# Each raw column gets a '<name>_clean' companion column
for raw_column in ('title', 'text'):
    df_clean[f'{raw_column}_clean'] = df_clean[raw_column].apply(clean_text)

# Title and body are joined into one document per article - this is often
# effective for classification
df_clean['combined_text'] = df_clean['title_clean'] + ' ' + df_clean['text_clean']

# Articles whose combined text is blank after cleaning are dropped
empty_combined = df_clean['combined_text'].str.strip() == ''
print(f"Removing {empty_combined.sum()} rows with empty combined text")
df_clean = df_clean[~empty_combined]

print("Text cleaning completed!")

# Before/after view of the first remaining article
print("\nExamples of text cleaning:")
print("=" * 50)
sample_idx = df_clean.index[0]
sample_title = df_clean.loc[sample_idx, 'title']
sample_title_clean = df_clean.loc[sample_idx, 'title_clean']
sample_text = df_clean.loc[sample_idx, 'text']
sample_text_clean = df_clean.loc[sample_idx, 'text_clean']
sample_combined = df_clean.loc[sample_idx, 'combined_text']

print("Original title:")
print(f"  '{sample_title}'")
print("Cleaned title:")
print(f"  '{sample_title_clean}'")
print("\nOriginal text (first 200 chars):")
print(f"  '{sample_text[:200]}...'")
print("Cleaned text (first 200 chars):")
print(f"  '{sample_text_clean[:200]}...'")
print("\nCombined text (first 300 chars):")
print(f"  '{sample_combined[:300]}...'")

# Size of the cleaned corpus
combined = df_clean['combined_text']
mean_chars = combined.str.len().mean()
mean_words = combined.str.split().str.len().mean()
print("\nFinal dataset statistics:")
print(f"Total articles: {len(df_clean):,}")
print(f"Average combined text length: {mean_chars:.1f} characters")
print(f"Average combined word count: {mean_words:.1f} words")

## 4. Text Feature Engineering

In [None]:
# Advanced text processing with NLTK
print("Applying advanced NLP preprocessing...")

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()  # created here but not used by advanced_text_processing
stop_words = set(stopwords.words('english'))  # set -> O(1) membership tests

def advanced_text_processing(text, use_stemming=True, remove_stopwords=True):
    """
    Tokenize text, then optionally remove stopwords and stem the tokens.

    Processing order: tokenize -> drop tokens of 2 characters or fewer ->
    lowercase -> remove English stopwords -> Porter-stem. Relies on the
    module-level `stop_words` set and `stemmer` instance.

    Args:
        text (str): Input text to process (non-strings are converted with str()).
        use_stemming (bool): Whether to apply Porter stemming.
        remove_stopwords (bool): Whether to remove English stopwords.

    Returns:
        str: The surviving tokens joined by single spaces, or "" for missing
        (NaN/None) or empty input.
    """
    if pd.isna(text) or text == "":
        return ""

    text = str(text)

    # Tokenize the text
    try:
        tokens = word_tokenize(text)
    except Exception:
        # Best-effort fallback (e.g. tokenizer data not installed): split on
        # whitespace. `Exception` rather than a bare `except`, so Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop a long-running batch
        # instead of being silently turned into the fallback.
        tokens = text.split()

    # Convert to lowercase and remove very short tokens
    tokens = [token.lower() for token in tokens if len(token) > 2]

    # Remove stopwords if requested
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words]

    # Apply stemming if requested
    if use_stemming:
        tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)

# Run the NLTK pipeline over every combined document
print("Processing combined text with NLTK...")

# Work through the corpus in fixed-size chunks so progress can be reported
batch_size = 1000
n_articles = len(df_clean)
all_texts = df_clean['combined_text'].tolist()
processed_texts = []

for batch_no, start in enumerate(range(0, n_articles, batch_size), start=1):
    chunk = all_texts[start:start + batch_size]
    processed_texts += [advanced_text_processing(doc) for doc in chunk]

    # Progress line after every 10th chunk
    if batch_no % 10 == 0:
        print(f"Processed {start + len(chunk):,} / {n_articles:,} articles...")

df_clean['text_processed'] = processed_texts

print("Advanced text processing completed!")

# Articles left with no tokens (everything was a stopword or too short) are dropped
empty_processed = df_clean['text_processed'].str.strip() == ''
n_empty = empty_processed.sum()
if n_empty > 0:
    print(f"Removing {n_empty} articles with no processable text after NLP preprocessing")
    df_clean = df_clean[~empty_processed]

# Before/after view of the first remaining article
print("\nExample of NLP processing:")
print("=" * 50)
sample_idx = df_clean.index[0]
sample_before = df_clean.loc[sample_idx, 'combined_text']
sample_after = df_clean.loc[sample_idx, 'text_processed']
print("Before NLP processing:")
print(f"  '{sample_before[:200]}...'")
print("After NLP processing:")
print(f"  '{sample_after[:200]}...'")

# Size and class balance of the fully processed corpus
mean_tokens = df_clean['text_processed'].str.split().str.len().mean()
print("\nFinal processed dataset:")
print(f"Total articles: {len(df_clean):,}")
print(f"Average processed text length: {mean_tokens:.1f} words")
print("Label distribution:")
final_label_counts = df_clean['label'].value_counts().sort_index()
n_final = len(df_clean)
for label, count in final_label_counts.items():
    label_name = 'Fake' if label == 0 else 'Real'
    print(f"  {label_name}: {count:,} ({count/n_final*100:.1f}%)")

## 5. Split Data into Training and Testing Sets

In [None]:
# Split data into training and testing sets (stratified split)
print("Splitting data into training and testing sets...")
print("=" * 50)

# Prepare features and target
X = df_clean['text_processed']  # Processed text features
y = df_clean['label']           # Target labels (0=fake, 1=real)

# Additional features that could be useful
additional_features = df_clean[['subject', 'title_length', 'text_length', 'title_word_count', 'text_word_count']].copy()

print(f"Total samples for splitting: {len(X):,}")
print(f"Features: Processed text + {len(additional_features.columns)} additional features")
print(f"Target distribution:")
for label, count in y.value_counts().sort_index().items():
    label_name = 'Fake' if label == 0 else 'Real'
    print(f"  {label_name}: {count:,} ({count/len(y)*100:.1f}%)")

# Perform stratified train-test split
# Using 80% for training, 20% for testing.
# Text, additional features and labels are split in ONE call: train_test_split
# applies the same row selection to every array it is given, so the three
# outputs are row-aligned by construction rather than by two separate calls
# happening to share the same random_state and stratify arguments.
(
    X_train, X_test,
    X_train_additional, X_test_additional,
    y_train, y_test,
) = train_test_split(
    X, additional_features, y,
    test_size=0.2,           # 20% for testing
    random_state=42,         # Fixed seed for reproducibility
    stratify=y               # Maintain label distribution in both sets
)

print(f"\nSplit completed!")
print(f"Training set: {len(X_train):,} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"Testing set: {len(X_test):,} samples ({len(X_test)/len(X)*100:.1f}%)")

# Verify label distribution is maintained in both sets
print(f"\nLabel distribution in training set:")
train_label_counts = y_train.value_counts().sort_index()
for label, count in train_label_counts.items():
    label_name = 'Fake' if label == 0 else 'Real'
    print(f"  {label_name}: {count:,} ({count/len(y_train)*100:.1f}%)")

print(f"\nLabel distribution in testing set:")
test_label_counts = y_test.value_counts().sort_index()
for label, count in test_label_counts.items():
    label_name = 'Fake' if label == 0 else 'Real'
    print(f"  {label_name}: {count:,} ({count/len(y_test)*100:.1f}%)")

# Make sure the output folders exist before anything is written
import os
for directory in ('../data/processed', '../outputs'):
    os.makedirs(directory, exist_ok=True)

# Columns from the additional-feature frames that are stored with each split
saved_extras = ['subject', 'title_length', 'text_length']

# Assemble one frame per split: processed text, label, then the extra columns
train_data = pd.DataFrame({
    'text_processed': X_train,
    'label': y_train,
    **{column: X_train_additional[column] for column in saved_extras},
})

test_data = pd.DataFrame({
    'text_processed': X_test,
    'label': y_test,
    **{column: X_test_additional[column] for column in saved_extras},
})

# Write both splits without the pandas index
train_data.to_csv('../data/processed/train.csv', index=False)
test_data.to_csv('../data/processed/test.csv', index=False)

# The full cleaned frame is stored too: raw fields, every intermediate text
# column and the length/word-count features
full_columns = [
    'label', 'title', 'text', 'subject', 'date',
    'title_clean', 'text_clean', 'combined_text', 'text_processed',
    'title_length', 'text_length', 'title_word_count', 'text_word_count',
]
df_clean_final = df_clean.loc[:, full_columns].copy()
df_clean_final.to_csv('../data/processed/data_processed_full.csv', index=False)

# Summary of what was written
n_train, n_test, n_full = len(train_data), len(test_data), len(df_clean_final)
print("\nProcessed data saved:")
print(f"  - Training set: ../data/processed/train.csv ({n_train:,} samples)")
print(f"  - Testing set: ../data/processed/test.csv ({n_test:,} samples)")
print(f"  - Full processed dataset: ../data/processed/data_processed_full.csv ({n_full:,} samples)")

print("\nData preprocessing completed successfully!")
print("Ready for model training in notebook 02.")