# RQ1: Spread Pattern Analysis - Bot vs Human Misinformation

## Research Question
**What are the differences in how misinformation spreads on social media when driven by bots compared to when driven by human users?**

### Analysis Focus
1. Speed of spread (temporal dynamics)
2. Audience reach and size
3. Network structure of propagation
4. Cascading patterns and virality metrics

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from pathlib import Path
import sys

# Add src to path
sys.path.append('../src')

from analysis.rq1_spread_patterns import RQ1Analyzer
from utils.metrics import *
from utils.visualization import *

# Notebook-wide plotting defaults: wide figures on seaborn's "whitegrid" theme.
plt.rcParams['figure.figsize'] = (12, 6)
sns.set_style('whitegrid')

print("Environment setup complete!")

## Load Integrated Dataset

In [None]:
# Location of the integrated dataset (parquet) that this notebook analyses.
DATA_PATH = Path('../data/processed/integrated_dataset.parquet')

if not DATA_PATH.exists():
    # No data on disk: leave `df` as None so the cells below can detect the
    # missing dataset with `df is not None` and skip their work.
    print(f"Dataset not found at {DATA_PATH}")
    print("Please run data integration pipeline first.")
    df = None
else:
    df = pd.read_parquet(DATA_PATH)

    # Overall size and schema of what was loaded.
    print(f"Loaded {len(df)} records")
    print(f"\nDataset shape: {df.shape}")
    print(f"\nColumns: {df.columns.tolist()}")

    # Headline counts: distinct users, then posts per bot/human label and
    # per fake/real label.
    print("\nBasic Statistics:")
    print(f"- Unique users: {df['user_id'].nunique()}")
    print(f"- Bot posts: {(df['bot_label'] == 'bot').sum()}")
    print(f"- Human posts: {(df['bot_label'] == 'human').sum()}")
    print(f"- Fake news posts: {(df['label'] == 'fake').sum()}")
    print(f"- Real news posts: {(df['label'] == 'real').sum()}")

## Initialize RQ1 Analyzer

In [None]:
# Build the analyzer only when the dataset was actually loaded; without data
# `analyzer` is left undefined and the later cells skip themselves.
if df is None:
    print("Cannot initialize analyzer without data")
else:
    analyzer = RQ1Analyzer(df)
    print("RQ1 Analyzer initialized successfully")

## Analysis 1: Speed of Spread

In [None]:
# Split the cascades into a bot group and a human group, then compare how
# fast each group spreads. `bot_cascades` / `human_cascades` are reused by the
# reach, visualization and statistics cells further down.
if df is not None:
    bot_cascades, human_cascades = analyzer.separate_cascades()

    speed_results = analyzer.analyze_spread_speed(bot_cascades, human_cascades)

    # One metric per line, formatted to four decimal places.
    print("\nSpeed Analysis Results:")
    for metric_name, metric_value in speed_results.items():
        print(f"{metric_name}: {metric_value:.4f}")

## Analysis 2: Audience Reach

In [None]:
# Compare audience reach of the bot and human cascades produced by the
# speed-analysis cell above.
if df is not None:
    reach_results = analyzer.analyze_reach(bot_cascades, human_cascades)

    # One metric per line, formatted to four decimal places.
    print("\nReach Analysis Results:")
    for metric_name, metric_value in reach_results.items():
        print(f"{metric_name}: {metric_value:.4f}")

## Analysis 3: Temporal Patterns

In [None]:
# Run the analyzer's temporal-pattern analysis on the loaded dataset.
if df is not None:
    temporal_results = analyzer.analyze_temporal_patterns()

    # Values are printed with their default formatting (no numeric format
    # spec), unlike the speed and reach cells.
    print("\nTemporal Analysis Results:")
    for result_name, result_value in temporal_results.items():
        print(f"{result_name}: {result_value}")

## Visualizations

In [None]:
# Plot the cascade size distribution for bots vs humans.
# The size of a cascade is taken as len(cascade); both size lists are reused
# by the statistical test cell below.
if df is not None:
    bot_sizes = list(map(len, bot_cascades))
    human_sizes = list(map(len, human_cascades))

    plot_cascade_size_distribution(
        bot_sizes,
        human_sizes,
        save_path='../results/figures/rq1_cascade_sizes.png',
    )

In [None]:
# Plot posting activity over time, split by bot label.
# Skipped when the dataset is missing or carries no 'timestamp' column.
if df is not None:
    if 'timestamp' in df.columns:
        # NOTE(review): pandas >= 2.2 deprecates the uppercase 'H' frequency
        # alias in favour of 'h'. Whether that matters here depends on how
        # plot_temporal_patterns consumes `window` - confirm before changing.
        plot_temporal_patterns(
            df,
            time_col='timestamp',
            bot_label_col='bot_label',
            window='1H',
            save_path='../results/figures/rq1_temporal_patterns.png',
        )

## Statistical Testing

In [None]:
from scipy import stats

# Two-sided Mann-Whitney U test: do bot and human cascade sizes differ?
# `bot_sizes` / `human_sizes` are the per-cascade size lists built in the
# cascade size distribution cell.
if df is not None:
    print("Statistical Test: Cascade Size Difference")

    if not bot_sizes or not human_sizes:
        # The test needs at least one observation in each group. With an
        # empty sample SciPy either raises or returns NaN (depending on the
        # version), so report the situation instead of running the test.
        print(
            "Skipped: need at least one bot cascade and one human cascade "
            f"(bot={len(bot_sizes)}, human={len(human_sizes)})"
        )
    else:
        u_stat, p_value = stats.mannwhitneyu(
            bot_sizes, human_sizes, alternative='two-sided'
        )

        print(f"Mann-Whitney U statistic: {u_stat}")
        print(f"P-value: {p_value}")
        # Significance is judged at the conventional 5% level.
        print(f"Significant (p < 0.05): {p_value < 0.05}")

## Summary and Conclusions

In [None]:
# Print a short textual summary built from the speed and reach results
# computed in the earlier cells.
if df is not None:
    divider = "=" * 60

    print(divider)
    print("RQ1 ANALYSIS SUMMARY")
    print(divider)

    # NOTE(review): the wording below presumes both ratios are bot/human and
    # greater than 1 - confirm against RQ1Analyzer before quoting the output.
    print("\n1. Speed of Spread:")
    print(f"   - Bot cascades spread {speed_results['velocity_ratio']:.2f}x faster than human cascades")

    print("\n2. Audience Reach:")
    print(f"   - Bot cascades reach {reach_results['size_ratio']:.2f}x more users on average")

    # Placeholders to be replaced once the analysis has been run.
    print("\n3. Key Findings:")
    print("   - [To be filled after analysis]")
    print("\n4. Implications:")
    print("   - [To be filled after analysis]")
    print(divider)