# Notebook 02: Exploratory Data Analysis & Fixed Effects Regressions

This notebook performs:
1. Summary statistics
2. Time series visualization
3. Topic share analysis
4. Multicollinearity assessment (correlation matrix and VIF)
5. Sanity checks on the panel
6. Two-way fixed effects regressions

**Prerequisites:** Run Notebook 01 or `scripts/run_build_panel.py` first.

In [None]:
# Standard imports
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path so
# that the `src` package imported below can be resolved. This must run before
# the `from src ...` imports. Assumes the notebook is launched from a directory
# one level below the project root -- TODO confirm.
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Project imports
from src import config
from src.io_utils import load_panel
from src.analysis_fe import (
    compute_correlation_matrix,
    compute_vif,
    run_all_fe_regressions,
    format_regression_table,
)
from src.plots import (
    plot_timeseries,
    plot_timeseries_by_treatment,
    plot_correlation_heatmap,
    plot_vif_bars,
    plot_distribution_grid,
)

# Notebook-wide display settings: matplotlib style sheet, up to 50 columns shown
# when a DataFrame is displayed, and floats rendered with four decimal places.
plt.style.use('seaborn-v0_8-whitegrid')
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:.4f}'.format)

## Load Panel Data

In [None]:
# Resolve the output locations for the active category, then read the panel
paths = config.get_output_paths(config.ACTIVE_CATEGORY)
panel_df = load_panel(paths['panel_parquet'])

# Quick size report: overall shape plus distinct products and weeks
n_products = panel_df['parent_asin'].nunique()
n_weeks = panel_df['week_start'].nunique()
print(f"Panel shape: {panel_df.shape}")
print(f"Products: {n_products:,}")
print(f"Weeks: {n_weeks:,}")

In [None]:
# Define column groups
# Outcome variables this notebook expects to find in the panel.
expected_outcome_cols = ['ReviewCount', 'UniqueReviewers', 'AvgRating', 'RatingDisp',
                         'VerifiedShare', 'AvgHelpful', 'AvgLen', 'ImageShare', 'logReviewCount']

# Keep only the outcomes that are actually present (same treatment as topic_cols
# below), and say so explicitly instead of failing later with a KeyError when
# the columns are selected from the panel.
outcome_cols = [c for c in expected_outcome_cols if c in panel_df.columns]
missing_outcome_cols = [c for c in expected_outcome_cols if c not in panel_df.columns]
if missing_outcome_cols:
    print(f"WARNING: outcome columns missing from panel (skipped): {missing_outcome_cols}")

# Topic variables: the keys of config.TOPIC_KEYWORDS that exist as panel columns.
topic_cols = [c for c in config.TOPIC_KEYWORDS.keys() if c in panel_df.columns]

print(f"\nOutcome variables: {outcome_cols}")
print(f"Topic variables: {topic_cols}")

## 1. Summary Statistics

In [None]:
# Descriptive statistics (one row per variable) plus missing-value counts
summary_cols = outcome_cols + topic_cols
summary_frame = panel_df[summary_cols]

summary_stats = summary_frame.describe().transpose()
summary_stats['missing'] = summary_frame.isnull().sum()
# Missing values as a percentage of all panel rows, two decimals
summary_stats['missing_pct'] = (summary_stats['missing'] / len(panel_df) * 100).round(2)

print("Summary Statistics:")
summary_stats

In [None]:
# Write the summary table to the configured CSV location
summary_stats_path = paths['summary_stats']
summary_stats.to_csv(summary_stats_path)
print(f"Saved to {summary_stats_path}")

In [None]:
# Mean of each outcome within every value of the `treated` column
print("\nSummary by Treatment Group:")
by_treated = panel_df.groupby('treated')
by_treated[outcome_cols].mean().round(4)

In [None]:
# Mean of each outcome within every value of the `post` column
print("\nSummary by Pre/Post Period:")
by_period = panel_df.groupby('post')
by_period[outcome_cols].mean().round(4)

## 2. Time Series Visualization

In [None]:
# Review count over time (figure is also written to the configured path)
review_count_plot = dict(
    y_col='ReviewCount',
    title='Average Weekly Review Count per Product',
    ylabel='Review Count',
    save_path=paths['timeseries_review_count'],
)
plot_timeseries(panel_df, **review_count_plot)
plt.show()

In [None]:
# Review count split by treatment group (no save_path is passed here)
review_count_by_group = dict(
    y_col='ReviewCount',
    title='Average Weekly Review Count by Treatment Group',
    ylabel='Review Count',
)
plot_timeseries_by_treatment(panel_df, **review_count_by_group)
plt.show()

In [None]:
# Verified-purchase share over time (figure is also written to the configured path)
verified_share_plot = dict(
    y_col='VerifiedShare',
    title='Average Verified Purchase Share',
    ylabel='Share',
    save_path=paths['timeseries_verified_share'],
)
plot_timeseries(panel_df, **verified_share_plot)
plt.show()

In [None]:
# Verified-purchase share split by treatment group (no save_path is passed here)
verified_share_by_group = dict(
    y_col='VerifiedShare',
    title='Verified Purchase Share by Treatment Group',
    ylabel='Share',
)
plot_timeseries_by_treatment(panel_df, **verified_share_by_group)
plt.show()

In [None]:
# Average review length split by treatment group (no save_path is passed here)
avg_len_by_group = dict(
    y_col='AvgLen',
    title='Average Review Length by Treatment Group',
    ylabel='Characters',
)
plot_timeseries_by_treatment(panel_df, **avg_len_by_group)
plt.show()

In [None]:
# Average rating split by treatment group (no save_path is passed here)
avg_rating_by_group = dict(
    y_col='AvgRating',
    title='Average Rating by Treatment Group',
    ylabel='Rating',
)
plot_timeseries_by_treatment(panel_df, **avg_rating_by_group)
plt.show()

## 3. Topic Share Analysis

In [None]:
# One stacked panel per topic variable: weekly mean across the panel, with the
# AI rollout date marked by a dashed red vertical line.
if topic_cols:
    n_topics = len(topic_cols)
    # squeeze=False always returns a 2-D array of axes, so a single topic needs
    # no special-casing; column 0 holds the stacked panels.
    fig, axes_grid = plt.subplots(n_topics, 1, figsize=(14, 3 * n_topics), squeeze=False)

    rollout_date = pd.Timestamp(config.AI_ROLLOUT_DATE)
    by_week = panel_df.groupby('week_start')

    for ax, col in zip(axes_grid[:, 0], topic_cols):
        weekly_mean = by_week[col].mean()
        ax.plot(weekly_mean.index, weekly_mean.values, marker='o', markersize=3)
        ax.axvline(x=rollout_date, color='red', linestyle='--', label='AI Rollout')
        ax.set_ylabel(col)
        ax.set_title(f'{col} Over Time')
        ax.legend()

    plt.tight_layout()
    plt.show()

## 4. Multicollinearity Assessment

In [None]:
# Correlation matrix across the topic variables; printed and written to CSV
if topic_cols:
    corr_matrix = compute_correlation_matrix(panel_df, topic_cols)
    corr_matrix_path = paths['correlation_matrix']

    print("Topic Share Correlation Matrix:")
    print(corr_matrix.round(3))

    # Persist the unrounded matrix
    corr_matrix.to_csv(corr_matrix_path)
    print(f"\nSaved to {corr_matrix_path}")

In [None]:
# Heatmap of the correlation matrix computed in the previous cell
if topic_cols:
    heatmap_options = dict(
        title='Topic Share Correlations',
        save_path=paths['correlation_heatmap'],
    )
    plot_correlation_heatmap(corr_matrix, **heatmap_options)
    plt.show()

In [None]:
# Variance inflation factors for the topic variables; printed and written to CSV
if topic_cols:
    vif_df = compute_vif(panel_df, topic_cols)
    vif_table_path = paths['vif_table']

    print("Variance Inflation Factors:")
    print(vif_df.to_string(index=False))
    print("\nInterpretation: VIF > 5 suggests moderate multicollinearity, VIF > 10 is concerning")

    # Persist without the row index
    vif_df.to_csv(vif_table_path, index=False)
    print(f"\nSaved to {vif_table_path}")

In [None]:
# Bar chart of the VIF table from the previous cell, saved under FIGURES_DIR
if topic_cols:
    vif_figure_path = config.FIGURES_DIR / f"vif_{config.ACTIVE_CATEGORY}.png"
    plot_vif_bars(vif_df, save_path=vif_figure_path)
    plt.show()

## 5. Sanity Checks

In [None]:
# Check share variables in [0, 1]
print("Share Variable Bounds Check:")
print("-" * 50)

# Columns whose name contains 'Share', plus the topic variables (this notebook
# treats them as shares on a 0-1 scale). dict.fromkeys de-duplicates while
# preserving order.
share_cols = list(dict.fromkeys(
    [c for c in panel_df.columns if 'Share' in c] + list(topic_cols)
))
all_ok = True

for col in share_cols:
    min_val = panel_df[col].min()
    max_val = panel_df[col].max()

    if pd.isna(min_val) or pd.isna(max_val):
        # min()/max() skip missing values, so NaN here means the column has no
        # observed values at all. Comparisons with NaN are always False, which
        # would otherwise let such a column be reported as OK.
        print(f"ISSUE: {col}: all values missing - bounds cannot be checked")
        all_ok = False
    elif min_val < 0 or max_val > 1:
        print(f"ISSUE: {col}: min={min_val:.4f}, max={max_val:.4f}")
        all_ok = False
    else:
        print(f"OK: {col}: min={min_val:.4f}, max={max_val:.4f}")

if not share_cols:
    # Avoid a vacuous success message when nothing was actually checked
    print("\nNo share variables found to check.")
elif all_ok:
    print("\nAll share variables are in [0, 1] - OK!")

In [None]:
# Range of AvgLen and the number of rows where it is zero or negative
avg_len = panel_df['AvgLen']
non_positive_rows = (avg_len <= 0).sum()

print("\nReview Length Check:")
print(f"AvgLen min: {avg_len.min():.1f}")
print(f"AvgLen max: {avg_len.max():.1f}")
print(f"Rows with AvgLen <= 0: {non_positive_rows}")

In [None]:
# Per-column missing-value counts and percentages; only columns with gaps are shown
print("\nMissingness by Column:")
null_counts = panel_df.isnull().sum()

missing_df = pd.DataFrame({
    'missing': null_counts,
    'pct': (null_counts / len(panel_df) * 100).round(2),
})
has_gaps = missing_df['missing'] > 0
print(missing_df[has_gaps])

## 6. Two-Way Fixed Effects Regressions

Run baseline FE regressions:

$$Y_{it} = \beta' \cdot \text{TopicShares}_{it} + \alpha_i + \gamma_t + \varepsilon_{it}$$

Where:
- $\alpha_i$ = product (entity) fixed effects
- $\gamma_t$ = week (time) fixed effects
- Standard errors clustered by entity

In [None]:
# Run FE regressions
# Every other topic-dependent cell is guarded by `if topic_cols:`; do the same
# here so a panel without topic columns does not attempt a regression with an
# empty regressor list.
if topic_cols:
    fe_results_df, fe_full_results = run_all_fe_regressions(
        panel_df,
        outcomes=config.PRIMARY_OUTCOMES,
        features=topic_cols,
    )
else:
    # Empty placeholders keep the later cells working: they test
    # `fe_results_df.empty` and key membership in `fe_full_results`.
    print("No topic variables found in the panel - skipping FE regressions.")
    fe_results_df, fe_full_results = pd.DataFrame(), {}

In [None]:
# Print the formatted coefficient table when any regression produced output
has_fe_results = not fe_results_df.empty
if has_fe_results:
    print(format_regression_table(fe_results_df, topic_cols))

In [None]:
# Write the FE results table to CSV (without the row index) when it is non-empty
if not fe_results_df.empty:
    fe_results_path = paths['fe_results']
    fe_results_df.to_csv(fe_results_path, index=False)
    print(f"Saved to {fe_results_path}")

In [None]:
# Full model summary for the logReviewCount regression, if it was estimated
detail_outcome = 'logReviewCount'
if detail_outcome in fe_full_results:
    results = fe_full_results[detail_outcome]['model_results']
    print("\nDetailed Results: logReviewCount")
    print("=" * 60)
    # `summary` is read as an attribute here, not called
    print(results.summary)

## Interpretation Notes

### Topic Share Coefficients
- **Positive coefficient**: Higher mention share of the topic is associated with higher Y
- **Negative coefficient**: Higher mention share is associated with lower Y
- Coefficients are interpreted as follows: a one-unit increase in a topic share (i.e., going from 0% to 100%) is associated with a β-unit change in Y
- For practical interpretation, multiply β by 0.1 to get the effect of a 10-percentage-point change in the topic share

### R² Interpretation
- **Within R²**: Variation explained by topic shares after removing entity and time effects
- Low within R² is common in panel data with granular fixed effects

### Multicollinearity
- If VIF > 5 for some topics, consider combining or orthogonalizing
- High correlation between topics can inflate standard errors

## Next Steps

Proceed to **Notebook 03** for DiD and event study analysis.