# Exploratory Data Analysis - Coupon Campaign

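This notebook performs initial exploratory data analysis (EDA) on a synthetic coupon campaign dataset, generated with a known ground-truth average treatment effect (ATE).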

In [None]:
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from coupon_causal import data, utils, viz

# Use seaborn's 'whitegrid' style for the figures drawn in this notebook
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

## 1. Load Data

In [None]:
# Read the run configuration and hand its random_state to the project's seeding helper
config = utils.load_config('../config/default.yaml')
seed = config['random_state']
utils.set_random_seed(seed)

# Generator settings live under the 'synthetic' section of the config
synthetic_cfg = config['synthetic']
df, ground_truth = data.generate_synthetic_coupon_data(
    n_samples=synthetic_cfg['n_samples'],
    treatment_rate=synthetic_cfg['treatment_rate'],
    true_ate=synthetic_cfg['true_ate'],
    random_state=seed,
)

# Report the size of the generated frame and the effect it was generated with
print(f"Dataset shape: {df.shape}")
print(f"True ATE: ${ground_truth['true_ate']:.2f}")

## 2. Basic Statistics

In [None]:
# Preview the first rows of the dataset (shown as the cell's output)
df.head()

In [None]:
# Descriptive statistics from DataFrame.describe() (shown as the cell's output)
df.describe()

In [None]:
# Group sizes and overall share for the treatment indicator
treatment_col = df['treatment']

print("Treatment Distribution:")
print(treatment_col.value_counts())
# The mean of the indicator is printed as a percentage
print(f"\nTreatment rate: {treatment_col.mean():.1%}")

## 3. Outcome Analysis by Treatment

In [None]:
# Unadjusted comparison: per-group count, mean and standard deviation of the outcome
outcome_by_treatment = (
    df.groupby('treatment')['outcome']
    .agg(['count', 'mean', 'std'])
)
print("Outcome by Treatment Group:")
print(outcome_by_treatment)

# Naive ATE = mean outcome where treatment == 1 minus mean outcome where treatment == 0
group_means = outcome_by_treatment['mean']
naive_ate = group_means.loc[1] - group_means.loc[0]

# Compare against the ground-truth effect returned by the data generator
print(f"\nNaive ATE (difference in means): ${naive_ate:.2f}")
print(f"True ATE: ${ground_truth['true_ate']:.2f}")
print(f"Bias: ${naive_ate - ground_truth['true_ate']:.2f}")

In [None]:
# Pass the treatment and outcome columns, as raw arrays, to the project's plotting helper
treatment_values = df['treatment'].values
outcome_values = df['outcome'].values

viz.plot_treatment_distribution(treatment_values, outcome_values)
plt.show()

## 4. Covariate Balance Check

In [None]:
# Check balance on key covariates: for each feature, compare the treated and
# control group means and record the standardized mean difference (SMD).
key_features = ['loyalty_score', 'recency_days', 'monetary_value', 'frequency_purchases']

# Split the frame once up front; the treatment masks do not depend on the
# feature being examined, so there is no need to rebuild them per iteration.
treated_df = df[df['treatment'] == 1]
control_df = df[df['treatment'] == 0]

balance_stats = []
for feature in key_features:
    treated = treated_df[feature].values
    control = control_df[feature].values

    # Call the helper through the `utils` module imported in the first cell,
    # rather than importing it again in the middle of the notebook.
    smd = utils.compute_standardized_mean_difference(treated, control)

    balance_stats.append({
        'feature': feature,
        'mean_treated': treated.mean(),
        'mean_control': control.mean(),
        'smd': smd
    })

# One row per feature: group means plus the SMD
balance_df = pd.DataFrame(balance_stats)
print("\nCovariate Balance (before adjustment):")
print(balance_df)

## 5. Feature Distributions

In [None]:
# Plot key feature distributions by treatment: one panel per entry in
# `key_features`, with control and treated histograms overlaid.

# Size the grid from the number of features (two panels per row) instead of
# assuming exactly four; with four features this is the same 2x2, 14x10 figure.
n_cols = 2
n_rows = max(1, (len(key_features) + n_cols - 1) // n_cols)  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 5 * n_rows), squeeze=False)
axes = axes.flatten()

# Split once; the treatment masks are the same for every feature.
control_df = df[df['treatment'] == 0]
treated_df = df[df['treatment'] == 1]

for i, feature in enumerate(key_features):
    # Compute one set of 30 bin edges from the pooled data so both groups are
    # binned identically; separate `bins=30` calls would give each group its
    # own edges and make the overlaid bars non-comparable.
    bin_edges = np.histogram_bin_edges(df[feature].dropna(), bins=30)

    axes[i].hist(control_df[feature], bins=bin_edges, alpha=0.6, label='Control', color='steelblue')
    axes[i].hist(treated_df[feature], bins=bin_edges, alpha=0.6, label='Treated', color='coral')
    axes[i].set_xlabel(feature, fontsize=11)
    axes[i].set_ylabel('Frequency', fontsize=11)
    axes[i].set_title(f'{feature} Distribution', fontsize=12, fontweight='bold')
    axes[i].legend()

# Hide any leftover panel when the number of features is odd
for unused_ax in axes[len(key_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 6. Segment Analysis

In [None]:
# Count and mean outcome for every (customer segment, treatment) combination
segment_stats = (
    df.groupby(['customer_segment', 'treatment'])['outcome']
    .agg(['count', 'mean'])
    .reset_index()
)
print("\nOutcome by Segment and Treatment:")
print(segment_stats)

# Reshape to one row per segment with one mean-outcome column per treatment
# value, then add the treatment == 1 minus treatment == 0 gap as 'diff'
segment_pivot = (
    segment_stats
    .pivot(index='customer_segment', columns='treatment', values='mean')
    .assign(diff=lambda means: means[1] - means[0])
)
print("\nNaive uplift by segment:")
print(segment_pivot)

## Summary

Key observations from EDA:
1. Treatment assignment is confounded: key covariates are imbalanced between the treated and control groups
2. The naive ATE (difference in means) is biased because of this confounding
3. Causal methods (propensity weighting, doubly robust (DR) estimation) are needed to obtain unbiased estimates

Next: Proceed to causal estimation notebook (10_causal_estimation.ipynb)