# Notebook 01: Build Product-Week Panel (Diapers)

This notebook builds a product-week panel from the UCSD Amazon Reviews 2023 dataset,
focusing on diaper products.

**Pipeline steps:**
1. Load product metadata and filter by keywords
2. Stream reviews and filter by target products
3. Extract text features and topic mentions
4. Aggregate to product-week level
5. Merge product metadata
6. Save output panel

In [None]:
# Standard imports
import sys
from pathlib import Path

# Add the parent of the working directory to sys.path so the `src` package is importable
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Project imports
from src import config
from src.io_utils import print_data_summary
from src.build_panel import build_panel_from_config

## Configuration

Review and modify paths as needed.

In [None]:
# Show the active settings between two rule lines
rule = "=" * 60
print(
    "Current Configuration",
    rule,
    f"Reviews path: {config.REVIEWS_PATH}",
    f"Meta path: {config.META_PATH}",
    f"Output dir: {config.OUTPUT_DIR}",
    f"Active category: {config.ACTIVE_CATEGORY}",
    f"Keywords: {config.KEYWORD_GROUPS.get(config.ACTIVE_CATEGORY, [])}",
    f"AI rollout date: {config.AI_ROLLOUT_DATE}",
    f"Treatment threshold: {config.TREATMENT_THRESHOLD}",
    rule,
    sep="\n",
)

# Run the project's configuration validation on the settings shown above
config.validate_config()

In [None]:
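# Optional: override the data paths if your files live in a different location.
# Uncomment and edit the lines below, then re-run `config.validate_config()`
# (the validation cell above ran before these overrides were applied).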

# config.REVIEWS_PATH = Path("/path/to/Baby_Products.jsonl")
# config.META_PATH = Path("/path/to/meta_Baby_Products.jsonl")

## Build Panel

This will:
1. Load metadata and filter to diaper products
2. Stream reviews (memory-efficient)
3. Extract features and aggregate to product-week
4. Save to Parquet and CSV

In [None]:
# Build the panel for the active category.
# This may take several minutes depending on data size.
# The result is bound to `panel_df`, which the inspection cells below read.

panel_df = build_panel_from_config(config.ACTIVE_CATEGORY)

## Inspect Panel

In [None]:
# Report the panel's dimensions, column names, and in-memory footprint
print(f"Panel shape: {panel_df.shape}")
print(f"\nColumns: {list(panel_df.columns)}")

# deep=True also counts the contents of object columns
total_bytes = panel_df.memory_usage(deep=True).sum()
print(f"\nMemory usage: {total_bytes / 1e6:.2f} MB")

In [None]:
# Display the first ten rows of the panel
panel_df.head(n=10)

In [None]:
# Data type of every panel column (rendered as the cell's output)
panel_df.dtypes

In [None]:
# Descriptive statistics for the main outcome variables, rounded to 3 decimals
outcome_cols = [
    'ReviewCount',
    'UniqueReviewers',
    'AvgRating',
    'RatingDisp',
    'VerifiedShare',
    'AvgHelpful',
    'AvgLen',
    'ImageShare',
    'logReviewCount',
]

outcome_summary = panel_df[outcome_cols].describe()
outcome_summary.round(3)

In [None]:
# Topic columns are the '*Share' columns other than the two review-level shares
non_topic_shares = ('VerifiedShare', 'ImageShare')
topic_cols = [
    c
    for c in panel_df.columns
    if c.endswith('Share') and c not in non_topic_shares
]

# Only print a summary when at least one topic column is present
if topic_cols:
    topic_summary = panel_df[topic_cols].describe()
    print("Topic Share Summary:")
    print(topic_summary.round(4))

## Time Coverage

In [None]:
# Overall span of the panel's week_start column
week_starts = panel_df['week_start']
print(f"Date range: {week_starts.min()} to {week_starts.max()}")
print(f"\nNumber of unique weeks: {week_starts.nunique()}")

# Count distinct weeks strictly before vs. on/after the rollout date
ai_rollout = pd.Timestamp(config.AI_ROLLOUT_DATE)
before_rollout = week_starts < ai_rollout
from_rollout = week_starts >= ai_rollout
pre_weeks = week_starts[before_rollout].nunique()
post_weeks = week_starts[from_rollout].nunique()
print(f"\nPre-period weeks (before {config.AI_ROLLOUT_DATE}): {pre_weeks}")
print(f"Post-period weeks (on/after {config.AI_ROLLOUT_DATE}): {post_weeks}")

In [None]:
# Total review count per week, summed across products
weekly_reviews = panel_df.groupby('week_start')['ReviewCount'].sum()

# Line chart of weekly volume with a dashed marker at the rollout date
fig, ax = plt.subplots(figsize=(14, 5))
weekly_reviews.plot(ax=ax, marker='o', markersize=3)
ax.axvline(x=ai_rollout, color='red', linestyle='--', label='AI Summary Rollout')
ax.set(xlabel='Week', ylabel='Total Reviews', title='Weekly Review Volume')
ax.legend()
fig.tight_layout()
plt.show()

## Treatment Groups

In [None]:
# Number of distinct products (parent_asin) in each value of `treated`
products_per_group = panel_df.groupby('treated')['parent_asin'].nunique()

print(f"Treatment threshold: rating_number >= {config.TREATMENT_THRESHOLD}")
print("\nTreatment distribution:")
print(products_per_group)

In [None]:
# Group means of selected outcomes, split by the `treated` column
comparison_cols = ['ReviewCount', 'AvgRating', 'VerifiedShare', 'AvgLen']

print("\nMean values by treatment group:")
group_means = panel_df.groupby('treated')[comparison_cols].mean()
print(group_means.round(3))

## Sanity Checks

In [None]:
# Every column whose name contains 'Share' should lie within [0, 1]
share_cols = [c for c in panel_df.columns if 'Share' in c]

for col in share_cols:
    values = panel_df[col]
    min_val, max_val = values.min(), values.max()
    # Flag the column when either extreme falls outside the unit interval
    if 0 <= min_val and max_val <= 1:
        status = "OK"
    else:
        status = "ISSUE"
    print(f"{col}: min={min_val:.4f}, max={max_val:.4f} [{status}]")

In [None]:
# List only the columns that contain at least one null entry
print("\nMissing values:")
missing = panel_df.isna().sum()
has_missing = missing > 0
print(missing[has_missing])

In [None]:
# AvgLen should be strictly positive: show its range and count violations
avg_len = panel_df['AvgLen']
non_positive_rows = (avg_len <= 0).sum()
print(f"\nAvgLen: min={avg_len.min():.1f}, max={avg_len.max():.1f}")
print(f"Rows with AvgLen <= 0: {non_positive_rows}")

## Save Confirmation

In [None]:
# Report each expected output file with its size, or note that it is absent
paths = config.get_output_paths(config.ACTIVE_CATEGORY)

print("Output files:")
for name, path in paths.items():
    if not path.exists():
        print(f"  {name}: {path} (not created)")
        continue
    size_mb = path.stat().st_size / 1e6
    print(f"  {name}: {path} ({size_mb:.2f} MB)")

## Next Steps

Panel is built! Proceed to:
- **Notebook 02**: EDA and correlation analysis
- **Notebook 03**: DiD and event study analysis

Or run the full analysis pipeline:
```bash
python scripts/run_analysis.py
```