## 1. Setup & Data Loading

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# NOTE: this silences every warning for the rest of the session, including
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Visualization settings
plt.style.use('seaborn-v0_8-darkgrid')  # Matplotlib style sheet used by all later figures
sns.set_palette("husl")  # default seaborn color palette
# IPython magic (not Python syntax): render figures inline in the notebook.
%matplotlib inline

# Set display options
pd.set_option('display.max_columns', None)  # None = show every column when a frame is printed
pd.set_option('display.max_colwidth', 100)  # truncate printed cell contents after 100 characters

print("✓ Libraries imported successfully")

In [None]:
# Load data from MinIO or local files
# Option 1: From MinIO (if running in environment)
# from minio import Minio
# from io import BytesIO
# minio_client = Minio(...)
# df = pd.read_parquet(BytesIO(...))

# Option 2: From local CSV files
# `os` is imported here because the setup cell does not import it and the
# path check below would otherwise fail with a NameError.
import os


def _locate_csv(filename):
    """Return the ../data copy of *filename* if it exists, else the bare name."""
    preferred = f"../data/{filename}"
    return preferred if os.path.exists(preferred) else filename


fake_df = pd.read_csv(_locate_csv("Fake.csv"))
real_df = pd.read_csv(_locate_csv("True.csv"))

# Add labels (0 = fake, 1 = real); later cells rely on this encoding.
fake_df['label'] = 0
real_df['label'] = 1

# Combine datasets into one frame with a fresh 0..n-1 index.
df = pd.concat([fake_df, real_df], ignore_index=True)

print("✓ Data loaded successfully")
print(f"Total records: {len(df):,}")
print(f"Fake news: {len(fake_df):,}")
print(f"Real news: {len(real_df):,}")

## 2. Basic Data Overview

In [None]:
# Dataset shape: report the row and column totals of the combined frame.
banner = "=" * 50
row_total, column_total = df.shape
print(banner, "DATASET SHAPE", banner, sep="\n")
print(f"Rows: {row_total:,}")
print(f"Columns: {column_total}")
print()

In [None]:
# First few rows. The trailing expression is what the notebook renders as the
# cell output, so it must stay last.
rule = "=" * 50
print("\n".join([rule, "FIRST 5 ROWS", rule]))
df.head(5)

In [None]:
# Data types and memory usage (df.info() prints its report directly).
divider = "=" * 50
for line in (divider, "DATA TYPES & MEMORY USAGE", divider):
    print(line)
df.info()

In [None]:
# Statistical summary. The describe() result is the cell's displayed output,
# so it must remain the final expression.
heading = "STATISTICAL SUMMARY"
print("=" * 50 + "\n" + heading + "\n" + "=" * 50)
df.describe()

## 3. Data Quality Assessment

In [None]:
# Missing values analysis
print("=" * 50, "MISSING VALUES ANALYSIS", "=" * 50, sep="\n")

# Count nulls once and derive both the absolute and relative figures from it.
null_counts = df.isnull().sum().values
summary = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / len(df) * 100).round(2),
})

# Keep only the columns that actually have gaps, worst offender first.
has_gaps = summary['Missing_Count'] > 0
missing_data = summary.loc[has_gaps].sort_values('Missing_Count', ascending=False)

if missing_data.empty:
    print("✓ No missing values found!")
else:
    print(missing_data)

    # Bar chart of the share of missing values per affected column.
    plt.figure(figsize=(10, 6))
    sns.barplot(data=missing_data, x='Column', y='Missing_Percentage')
    plt.title('Missing Values by Column (%)', fontsize=14, fontweight='bold')
    plt.xlabel('Column')
    plt.ylabel('Missing Percentage (%)')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

In [None]:
# Duplicate records: whole-row duplicates first, then duplicates judged on a
# single column (text, title) when that column is present.
print("=" * 50, "DUPLICATE ANALYSIS", "=" * 50, sep="\n")

total_rows = len(df)
duplicates = df.duplicated().sum()
print(f"Total duplicates: {duplicates:,} ({duplicates/total_rows*100:.2f}%)")

for column, caption in (('text', 'Duplicate texts'), ('title', 'Duplicate titles')):
    if column not in df.columns:
        continue
    repeated = df.duplicated(subset=[column]).sum()
    print(f"{caption}: {repeated:,} ({repeated/total_rows*100:.2f}%)")

## 4. Target Variable Analysis (Label Distribution)

In [None]:
# Label distribution
print("="*50)
print("LABEL DISTRIBUTION")
print("="*50)

# value_counts() sorts by frequency, so the first entry is whichever class is
# larger. Everything below labels positions as ('Fake', 'Real'), so pin the
# index to [0, 1]; a class that is absent is reported as 0 instead of raising.
label_order = [0, 1]
label_names = ['Fake', 'Real']
colors = ['#FF6B6B', '#4ECDC4']

label_counts = df['label'].value_counts().reindex(label_order, fill_value=0)
label_pcts = df['label'].value_counts(normalize=True).reindex(label_order, fill_value=0) * 100

print(f"Fake (0): {label_counts[0]:,} ({label_pcts[0]:.2f}%)")
print(f"Real (1): {label_counts[1]:,} ({label_pcts[1]:.2f}%)")
if label_counts[0]:
    print(f"\nBalance Ratio: 1:{label_counts[1]/label_counts[0]:.2f}")
else:
    # Avoid dividing by zero when there are no fake records.
    print("\nBalance Ratio: N/A (no fake records)")

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot
label_counts.plot(kind='bar', ax=axes[0], color=colors)
axes[0].set_title('Label Distribution (Count)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Label (0=Fake, 1=Real)')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(label_names, rotation=0)
axes[0].grid(axis='y', alpha=0.3)

# Add count labels on bars
for i, v in enumerate(label_counts):
    axes[0].text(i, v + 500, f'{v:,}', ha='center', fontweight='bold')

# Pie chart
axes[1].pie(label_counts, labels=label_names, autopct='%1.1f%%',
            colors=colors, startangle=90, textprops={'fontsize': 12, 'fontweight': 'bold'})
axes[1].set_title('Label Distribution (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 5. Text Length Analysis

In [None]:
# Derive character-length features for the text and title columns unless they
# already exist; missing values count as length 0.
for source_col, length_col in (('text', 'text_length'), ('title', 'title_length')):
    if length_col not in df.columns and source_col in df.columns:
        df[length_col] = df[source_col].apply(
            lambda value: len(str(value)) if pd.notna(value) else 0
        )

print("=" * 50, "TEXT LENGTH STATISTICS", "=" * 50, sep="\n")

# Overall statistics for whichever length columns are available.
for length_col, heading in (('text_length', "\nText Length:"), ('title_length', "\nTitle Length:")):
    if length_col in df.columns:
        print(heading)
        print(df[length_col].describe())

In [None]:
# Text length by label: summary table plus a 2x2 grid of comparison plots.
if 'text_length' in df.columns:
    print("=" * 50, "TEXT LENGTH BY LABEL", "=" * 50, sep="\n")

    lengths_by_label = df.groupby('label')['text_length']
    text_length_by_label = lengths_by_label.agg(['mean', 'median', 'min', 'max', 'std'])
    text_length_by_label.index = ['Fake (0)', 'Real (1)']
    print(text_length_by_label)

    class_names = ['Fake', 'Real']
    class_colors = ['#FF6B6B', '#4ECDC4']

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    (ax_hist, ax_box), (ax_violin, ax_mean) = axes

    # Top-left: side-by-side histograms of the two classes.
    fake_text_len = df.loc[df['label'] == 0, 'text_length']
    real_text_len = df.loc[df['label'] == 1, 'text_length']
    ax_hist.hist([fake_text_len, real_text_len], bins=50, label=class_names,
                 color=class_colors, alpha=0.7)
    ax_hist.set_title('Text Length Distribution by Label', fontsize=14, fontweight='bold')
    ax_hist.set_xlabel('Text Length')
    ax_hist.set_ylabel('Frequency')
    ax_hist.legend()
    ax_hist.grid(alpha=0.3)

    # Top-right: box plot. The title and labels are set after the pandas call
    # so they replace what boxplot(by=...) writes, and the figure-level
    # super-title is blanked afterwards.
    df.boxplot(column='text_length', by='label', ax=ax_box)
    ax_box.set_title('Text Length Box Plot by Label', fontsize=14, fontweight='bold')
    ax_box.set_xlabel('Label (0=Fake, 1=Real)')
    ax_box.set_ylabel('Text Length')
    ax_box.set_xticklabels(class_names)
    plt.suptitle('')

    # Bottom-left: violin plot of the same distributions.
    sns.violinplot(data=df, x='label', y='text_length', ax=ax_violin)
    ax_violin.set_title('Text Length Violin Plot by Label', fontsize=14, fontweight='bold')
    ax_violin.set_xlabel('Label')
    ax_violin.set_ylabel('Text Length')
    ax_violin.set_xticklabels(class_names)

    # Bottom-right: mean length per class, annotated with the rounded value.
    mean_lengths = lengths_by_label.mean()
    ax_mean.bar(class_names, mean_lengths, color=class_colors)
    ax_mean.set_title('Average Text Length by Label', fontsize=14, fontweight='bold')
    ax_mean.set_xlabel('Label')
    ax_mean.set_ylabel('Average Text Length')
    ax_mean.grid(axis='y', alpha=0.3)

    for position, average in enumerate(mean_lengths):
        ax_mean.text(position, average + 50, f'{average:.0f}', ha='center', fontweight='bold')

    plt.tight_layout()
    plt.show()

In [None]:
# Title length analysis: per-class summary table, box plot and mean bars.
if 'title_length' in df.columns:
    print("=" * 50, "TITLE LENGTH BY LABEL", "=" * 50, sep="\n")

    titles_by_label = df.groupby('label')['title_length']
    title_length_by_label = titles_by_label.agg(['mean', 'median', 'min', 'max', 'std'])
    title_length_by_label.index = ['Fake (0)', 'Real (1)']
    print(title_length_by_label)

    class_names = ['Fake', 'Real']

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    ax_box, ax_mean = axes

    # Left: box plot. Titles are set after the pandas call so they replace
    # what boxplot(by=...) writes; the figure super-title is blanked.
    df.boxplot(column='title_length', by='label', ax=ax_box)
    ax_box.set_title('Title Length Box Plot by Label', fontsize=14, fontweight='bold')
    ax_box.set_xlabel('Label (0=Fake, 1=Real)')
    ax_box.set_ylabel('Title Length')
    ax_box.set_xticklabels(class_names)
    plt.suptitle('')

    # Right: mean title length per class with the value printed above each bar.
    mean_title_lengths = titles_by_label.mean()
    ax_mean.bar(class_names, mean_title_lengths, color=['#FF6B6B', '#4ECDC4'])
    ax_mean.set_title('Average Title Length by Label', fontsize=14, fontweight='bold')
    ax_mean.set_xlabel('Label')
    ax_mean.set_ylabel('Average Title Length')
    ax_mean.grid(axis='y', alpha=0.3)

    for position, average in enumerate(mean_title_lengths):
        ax_mean.text(position, average + 2, f'{average:.1f}', ha='center', fontweight='bold')

    plt.tight_layout()
    plt.show()

## 6. Subject/Category Analysis

In [None]:
# Subject distribution: frequency table, horizontal bars and a top-10 pie.
if 'subject' in df.columns:
    print("=" * 50, "SUBJECT DISTRIBUTION", "=" * 50, sep="\n")

    subject_counts = df['subject'].value_counts()
    print(subject_counts)
    print(f"\nTotal unique subjects: {df['subject'].nunique()}")

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    ax_bars, ax_pie = axes

    # Left: every subject, one horizontal bar each.
    subject_counts.plot.barh(ax=ax_bars, color='skyblue')
    ax_bars.set_title('Subject Distribution (All)', fontsize=14, fontweight='bold')
    ax_bars.set_xlabel('Count')
    ax_bars.set_ylabel('Subject')
    ax_bars.grid(axis='x', alpha=0.3)

    # Right: the ten most frequent subjects; the percentages are shares of
    # those ten, not of the whole dataset.
    top_subjects = subject_counts.iloc[:10]
    ax_pie.pie(top_subjects, labels=top_subjects.index, autopct='%1.1f%%', startangle=90)
    ax_pie.set_title('Top 10 Subjects Distribution', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()

In [None]:
# Subject distribution by label: contingency table drawn twice, once stacked
# and once with the classes side by side.
if 'subject' in df.columns:
    print("=" * 50, "SUBJECT DISTRIBUTION BY LABEL", "=" * 50, sep="\n")

    subject_by_label = pd.crosstab(df['subject'], df['label'])
    subject_by_label.columns = ['Fake', 'Real']
    print(subject_by_label)

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    panels = (
        (axes[0], True, 'Subject Distribution by Label (Stacked)'),
        (axes[1], False, 'Subject Distribution by Label (Grouped)'),
    )
    for ax, is_stacked, panel_title in panels:
        subject_by_label.plot(kind='bar', stacked=is_stacked, ax=ax,
                              color=['#FF6B6B', '#4ECDC4'])
        ax.set_title(panel_title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Subject')
        ax.set_ylabel('Count')
        ax.legend(title='Label')
        ax.tick_params(axis='x', rotation=45)
        ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

## 7. Temporal Analysis (Date, Year, Month)

In [None]:
# Parse date column if exists
if 'date' in df.columns:
    # Values that cannot be parsed become NaT instead of raising.
    # NOTE(review): depending on the pandas version, rows whose format differs
    # from the inferred one may also be coerced to NaT - confirm the NaT count.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Extract temporal features if not present.
    # strftime keeps unparseable dates as missing values. The earlier
    # `.dt.year.astype(str)` produced the literal string 'nan' for them and,
    # because NaT forces a float column, rendered years as e.g. '2017.0';
    # `.dt.to_period('M').astype(str)` likewise produced the string 'NaT'.
    if 'year' not in df.columns:
        df['year'] = df['date'].dt.strftime('%Y')
    if 'month' not in df.columns:
        df['month'] = df['date'].dt.month_name()
    if 'year_month' not in df.columns:
        df['year_month'] = df['date'].dt.strftime('%Y-%m')

    print("✓ Temporal features processed")
else:
    print("No 'date' column found - temporal features skipped")

In [None]:
# Year distribution: article counts per year, overall and split by label.
if 'year' in df.columns:
    print("=" * 50, "YEAR DISTRIBUTION", "=" * 50, sep="\n")

    year_counts = df['year'].value_counts().sort_index()
    print(year_counts)

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    ax_total, ax_split = axes

    # Left: all articles per year.
    year_counts.plot.bar(ax=ax_total, color='steelblue')
    ax_total.set_title('News Articles by Year', fontsize=14, fontweight='bold')
    ax_total.set_xlabel('Year')
    ax_total.set_ylabel('Count')
    ax_total.tick_params(axis='x', rotation=45)
    ax_total.grid(axis='y', alpha=0.3)

    # Right: the same counts broken down by class.
    year_by_label = pd.crosstab(df['year'], df['label'])
    year_by_label.columns = ['Fake', 'Real']
    year_by_label.plot.bar(ax=ax_split, color=['#FF6B6B', '#4ECDC4'])
    ax_split.set_title('News Articles by Year and Label', fontsize=14, fontweight='bold')
    ax_split.set_xlabel('Year')
    ax_split.set_ylabel('Count')
    ax_split.legend(title='Label')
    ax_split.tick_params(axis='x', rotation=45)
    ax_split.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Month distribution: counts per calendar month, overall and split by label.
if 'month' in df.columns:
    print("=" * 50, "MONTH DISTRIBUTION", "=" * 50, sep="\n")

    # Calendar order; reindexing with it also inserts months that have no
    # articles, with a count of 0.
    month_order = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ]

    month_counts = df['month'].value_counts().reindex(month_order, fill_value=0)
    print(month_counts)

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    ax_total, ax_split = axes

    # Left: all articles per month.
    month_counts.plot.bar(ax=ax_total, color='coral')
    ax_total.set_title('News Articles by Month', fontsize=14, fontweight='bold')
    ax_total.set_xlabel('Month')
    ax_total.set_ylabel('Count')
    ax_total.tick_params(axis='x', rotation=45)
    ax_total.grid(axis='y', alpha=0.3)

    # Right: per-month counts for each class.
    month_table = pd.crosstab(df['month'], df['label'])
    month_by_label = month_table.reindex(month_order, fill_value=0)
    month_by_label.columns = ['Fake', 'Real']
    month_by_label.plot.bar(ax=ax_split, color=['#FF6B6B', '#4ECDC4'])
    ax_split.set_title('News Articles by Month and Label', fontsize=14, fontweight='bold')
    ax_split.set_xlabel('Month')
    ax_split.set_ylabel('Count')
    ax_split.legend(title='Label')
    ax_split.tick_params(axis='x', rotation=45)
    ax_split.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Time series analysis
if 'year_month' in df.columns:
    print("="*50)
    print("TIME SERIES ANALYSIS")
    print("="*50)

    # Rows whose date could not be parsed carry either a missing value or the
    # placeholder string 'NaT' in year_month. Neither is a real month, so
    # leave them out instead of plotting them as a trailing 'NaT' data point.
    has_month = df['year_month'].notna() & (df['year_month'] != 'NaT')
    dated = df[has_month]

    # Monthly article counts, one column per label.
    time_series = dated.groupby(['year_month', 'label']).size().unstack(fill_value=0)
    time_series.columns = ['Fake', 'Real']

    # Visualization
    fig, axes = plt.subplots(2, 1, figsize=(16, 10))

    # Line plot
    time_series.plot(ax=axes[0], color=['#FF6B6B', '#4ECDC4'], linewidth=2, marker='o')
    axes[0].set_title('News Articles Over Time by Label', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Year-Month')
    axes[0].set_ylabel('Count')
    axes[0].legend(title='Label')
    axes[0].grid(alpha=0.3)
    axes[0].tick_params(axis='x', rotation=45)

    # Stacked area plot. The values are per-month counts stacked on top of
    # each other, not running totals, hence "Stacked" rather than "Cumulative".
    time_series.plot.area(ax=axes[1], color=['#FF6B6B', '#4ECDC4'], alpha=0.7)
    axes[1].set_title('Stacked News Articles Over Time by Label', fontsize=14, fontweight='bold')
    axes[1].set_xlabel('Year-Month')
    axes[1].set_ylabel('Count')
    axes[1].legend(title='Label')
    axes[1].grid(alpha=0.3)
    axes[1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

## 8. Word Analysis

In [None]:
# Install additional libraries if needed
# !pip install wordcloud nltk

from wordcloud import WordCloud
import nltk
from collections import Counter
import re

# Download stopwords
# Look for the stopwords corpus first; nltk.download runs only when
# nltk.data.find raises LookupError.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Imported after the check above so the corpus is available when it is read.
from nltk.corpus import stopwords
# English stopwords as a set; later cells filter tokens against it and pass it
# to WordCloud.
stop_words = set(stopwords.words('english'))

print("✓ Word analysis libraries loaded")

In [None]:
# Word count analysis: whitespace-separated token count per article, compared
# across the two classes.
if 'text' in df.columns:
    print("=" * 50, "WORD COUNT ANALYSIS", "=" * 50, sep="\n")

    # Missing texts count as zero words.
    df['word_count'] = df['text'].apply(
        lambda value: len(str(value).split()) if pd.notna(value) else 0
    )

    words_by_label = df.groupby('label')['word_count']
    word_count_stats = words_by_label.agg(['mean', 'median', 'min', 'max', 'std'])
    word_count_stats.index = ['Fake (0)', 'Real (1)']
    print(word_count_stats)

    class_names = ['Fake', 'Real']

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    ax_box, ax_mean = axes

    # Left: box plot. Titles are set after the pandas call so they replace
    # what boxplot(by=...) writes; the figure super-title is blanked.
    df.boxplot(column='word_count', by='label', ax=ax_box)
    ax_box.set_title('Word Count Distribution by Label', fontsize=14, fontweight='bold')
    ax_box.set_xlabel('Label (0=Fake, 1=Real)')
    ax_box.set_ylabel('Word Count')
    ax_box.set_xticklabels(class_names)
    plt.suptitle('')

    # Right: mean word count per class, annotated with the rounded value.
    mean_word_counts = words_by_label.mean()
    ax_mean.bar(class_names, mean_word_counts, color=['#FF6B6B', '#4ECDC4'])
    ax_mean.set_title('Average Word Count by Label', fontsize=14, fontweight='bold')
    ax_mean.set_xlabel('Label')
    ax_mean.set_ylabel('Average Word Count')
    ax_mean.grid(axis='y', alpha=0.3)

    for position, average in enumerate(mean_word_counts):
        ax_mean.text(position, average + 5, f'{average:.0f}', ha='center', fontweight='bold')

    plt.tight_layout()
    plt.show()

In [None]:
# Most common words analysis
def get_top_words(texts, n=20, label_name='', stopwords=None):
    """Return the *n* most frequent non-stopword words found in *texts*.

    A word is a run of three or more ASCII letters; each text is lower-cased
    before matching, so counting is case-insensitive.

    Args:
        texts: Iterable of documents. Missing entries (``None``/``NaN``) are
            skipped; anything else is converted with ``str``.
        n: Number of ``(word, count)`` pairs to return.
        label_name: Unused; kept so existing calls keep working.
        stopwords: Optional collection of words to exclude. When omitted, the
            notebook-level ``stop_words`` set is used.

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    excluded = stop_words if stopwords is None else stopwords
    word_pattern = re.compile(r'\b[a-zA-Z]{3,}\b')

    # Feed each document straight into the counter rather than first
    # collecting every token of the corpus in one list.
    word_counts = Counter()
    for text in texts:
        if pd.isna(text):
            continue
        word_counts.update(
            word
            for word in word_pattern.findall(str(text).lower())
            if word not in excluded
        )

    return word_counts.most_common(n)

if 'text' in df.columns:
    print("=" * 50, "MOST COMMON WORDS", "=" * 50, sep="\n")

    # Fake articles (label 0): rank the words, then list them.
    top_fake_words = get_top_words(df.loc[df['label'] == 0, 'text'], n=20)

    print("\nTop 20 words in FAKE news:")
    for token, frequency in top_fake_words:
        print(f"{token}: {frequency}")

    # Real articles (label 1): same procedure.
    top_real_words = get_top_words(df.loc[df['label'] == 1, 'text'], n=20)

    print("\nTop 20 words in REAL news:")
    for token, frequency in top_real_words:
        print(f"{token}: {frequency}")

In [None]:
# Visualize top words: one horizontal bar chart per class, most frequent word
# at the top. Uses top_fake_words / top_real_words from the previous cell.
if 'text' in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    panels = (
        (axes[0], top_fake_words, '#FF6B6B', 'Top 20 Words in FAKE News'),
        (axes[1], top_real_words, '#4ECDC4', 'Top 20 Words in REAL News'),
    )
    for ax, word_pairs, bar_color, panel_title in panels:
        words_df = pd.DataFrame(word_pairs, columns=['Word', 'Count'])
        ax.barh(words_df['Word'], words_df['Count'], color=bar_color)
        ax.set_title(panel_title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Frequency')
        ax.invert_yaxis()  # flip so the first (largest) entry is drawn on top
        ax.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.show()

In [None]:
# Word clouds: one cloud per class, built from all of that class's texts
# joined into a single string.
if 'text' in df.columns:
    print("Generating word clouds...")

    fig, axes = plt.subplots(1, 2, figsize=(18, 8))

    cloud_specs = (
        (axes[0], 0, 'Reds', 'Word Cloud - FAKE News'),
        (axes[1], 1, 'Blues', 'Word Cloud - REAL News'),
    )
    for ax, label_value, color_map, panel_title in cloud_specs:
        corpus = ' '.join(df[df['label'] == label_value]['text'].astype(str))
        cloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            stopwords=stop_words,
            colormap=color_map,
        ).generate(corpus)

        ax.imshow(cloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title(panel_title, fontsize=16, fontweight='bold')

    plt.tight_layout()
    plt.show()

    print("✓ Word clouds generated")

## 9. Correlation Analysis

In [None]:
# Correlation matrix for numerical features, plus each feature's correlation
# with the label.
print("=" * 50, "CORRELATION ANALYSIS", "=" * 50, sep="\n")

# Select numerical columns
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)
print(f"Numerical columns: {numerical_cols}")

# A correlation matrix needs at least two numeric columns.
if len(numerical_cols) > 1:
    correlation_matrix = df[numerical_cols].corr()
    print("\nCorrelation Matrix:")
    print(correlation_matrix)

    # Heatmap of the full matrix.
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
    )
    plt.title('Correlation Matrix - Numerical Features', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Correlation with label
    if 'label' in numerical_cols:
        label_corr = correlation_matrix['label'].sort_values(ascending=False)
        print("\nCorrelation with Label:")
        print(label_corr)

        # Bar chart of every feature except the label itself (always 1.0).
        plt.figure(figsize=(10, 6))
        feature_corr = label_corr.drop('label')
        feature_corr.plot.barh(color='steelblue')
        plt.title('Feature Correlation with Label', fontsize=14, fontweight='bold')
        plt.xlabel('Correlation Coefficient')
        plt.ylabel('Feature')
        plt.axvline(x=0, color='red', linestyle='--', linewidth=1)
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        plt.show()

## 10. Statistical Tests

In [None]:
# Statistical significance tests
from scipy import stats

print("="*50)
print("STATISTICAL SIGNIFICANCE TESTS")
print("="*50)

# Two-sample t-tests comparing fake and real articles on each length feature.
# equal_var=False selects Welch's t-test: the two classes differ in size and
# nothing guarantees equal variances, which the default (Student) test assumes.
for column, caption in (('text_length', 'Text Length'), ('title_length', 'Title Length')):
    if column not in df.columns:
        continue

    fake_values = df.loc[df['label'] == 0, column]
    real_values = df.loc[df['label'] == 1, column]

    t_stat, p_value = stats.ttest_ind(fake_values, real_values, equal_var=False)

    print(f"\nT-Test: {caption} (Fake vs Real)")
    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value: {p_value:.4e}")
    print(f"Significant difference: {'YES' if p_value < 0.05 else 'NO'} (α=0.05)")

# Chi-square test of independence between subject and label.
if 'subject' in df.columns:
    subject_label_contingency = pd.crosstab(df['subject'], df['label'])
    chi2, p_value, dof, expected = stats.chi2_contingency(subject_label_contingency)

    print("\nChi-Square Test: Subject vs Label")
    print(f"Chi-square statistic: {chi2:.4f}")
    print(f"P-value: {p_value:.4e}")
    print(f"Degrees of freedom: {dof}")
    print(f"Significant association: {'YES' if p_value < 0.05 else 'NO'} (α=0.05)")

## 11. Summary & Key Findings

In [None]:
# Recap of the figures computed above. Counts are taken from df as it stands
# when this cell runs, so columns derived in earlier cells are included in the
# feature and missing-value totals.
print("="*70)
print("EDA SUMMARY - KEY FINDINGS")
print("="*70)

total_records = len(df)
fake_total = (df['label'] == 0).sum()
real_total = (df['label'] == 1).sum()

print("\n1. DATASET OVERVIEW")
print(f"   - Total records: {total_records:,}")
print(f"   - Features: {df.shape[1]}")
print(f"   - Fake news: {fake_total:,} ({fake_total/total_records*100:.1f}%)")
print(f"   - Real news: {real_total:,} ({real_total/total_records*100:.1f}%)")

print("\n2. DATA QUALITY")
print(f"   - Missing values: {df.isnull().sum().sum()}")
print(f"   - Duplicate records: {df.duplicated().sum()}")

if 'text_length' in df.columns:
    print("\n3. TEXT CHARACTERISTICS")
    print(f"   - Avg text length (Fake): {df[df['label']==0]['text_length'].mean():.0f} chars")
    print(f"   - Avg text length (Real): {df[df['label']==1]['text_length'].mean():.0f} chars")

if 'title_length' in df.columns:
    print(f"   - Avg title length (Fake): {df[df['label']==0]['title_length'].mean():.1f} chars")
    print(f"   - Avg title length (Real): {df[df['label']==1]['title_length'].mean():.1f} chars")

if 'subject' in df.columns:
    print("\n4. SUBJECT DISTRIBUTION")
    print(f"   - Unique subjects: {df['subject'].nunique()}")
    print(f"   - Most common subject: {df['subject'].value_counts().index[0]}")

if 'year' in df.columns:
    print("\n5. TEMPORAL DISTRIBUTION")
    # The year column holds strings and may contain placeholders such as 'nan'
    # or float-looking values such as '2017.0'. Converting to numbers drops the
    # placeholders and makes min/max a real chronological range instead of a
    # lexicographic one.
    valid_years = pd.to_numeric(df['year'], errors='coerce').dropna().astype(int)
    if valid_years.empty:
        print("   - Year range: N/A - N/A")
        print("   - Most active year: N/A")
    else:
        print(f"   - Year range: {valid_years.min()} - {valid_years.max()}")
        print(f"   - Most active year: {valid_years.value_counts().index[0]}")

print("\n6. RECOMMENDATIONS FOR MODELING")
print("   - Balance classes if needed (current ratio shows potential imbalance)")
print("   - Consider text length as a feature")
print("   - Use temporal features (year, month) for time-based patterns")
print("   - Subject can be a strong categorical feature")
print("   - Apply NLP techniques (TF-IDF, embeddings) for text processing")

print("\n" + "="*70)
print("EDA COMPLETED SUCCESSFULLY")
print("="*70)

## 12. Export Processed Data (Optional)

In [None]:
# Save processed dataframe with additional features
# Nothing is written by default: this cell only defines the target path and
# prints a reminder. The export itself is the commented-out code below.
output_path = '../data/processed_dataset.csv'

# Uncomment to save
# df.to_csv(output_path, index=False)
# print(f"✓ Processed data saved to: {output_path}")

print("\nTo save the processed dataset, uncomment the code above.")