In [None]:
# Setup: imports and display options
import pandas as pd
import numpy as np

# Rendering limits for DataFrames shown in cell output
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 120)
pd.set_option('display.max_rows', 20)

# Confirm the environment is ready and record the pandas version in use
print("Setup complete!", f"pandas version: {pd.__version__}", sep="\n")

## 1. Load and Merge Data

We'll load the raw data and merge it (or load pre-merged data):

In [None]:
# Media-contact extract: read it, then normalise the headers
# (trim whitespace, lower-case, spaces -> underscores)
media_df = pd.read_csv('../data/media_contacts.csv')
media_df.columns = (
    media_df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)

# Socio-demographic extract: same treatment so both frames share one naming style
demo_df = pd.read_csv('../data/socio_demos.csv')
demo_df.columns = (
    demo_df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)

# Report (rows, columns) for each frame
print(f"Media: {media_df.shape}", f"Demo: {demo_df.shape}", sep="\n")

In [None]:
# Join media contacts to demographics on person_id.
# An inner join keeps only the person_id values present in both frames.
merged_df = media_df.merge(right=demo_df, how='inner', on='person_id')

print(f"Merged dataset: {merged_df.shape}")
merged_df.head()

## 2. GroupBy Basics

### What is GroupBy?

GroupBy splits data into groups, applies a function to each group, and combines the results.

**Think of it as**: "For each category, calculate..."

In [None]:
# Row count per gender: size() returns one entry per group
gender_counts = merged_df.groupby(by='gender').size()

# Show the counts and the type of object that groupby().size() produced
print(
    "Count by gender:",
    gender_counts,
    f"\nType: {type(gender_counts)}",
    sep="\n",
)

In [None]:
# Average of tv_total within each gender group
tv_by_gender = merged_df.groupby(by='gender')['tv_total'].agg('mean')

print("Average TV exposure by gender:", tv_by_gender, sep="\n")

### Viewing Groups

Let's see what groups actually look like:

In [None]:
# Build the groupby object once; nothing is computed until we ask for a result
grouped = merged_df.groupby(by='gender')

# .groups maps each group key to its row labels, so listing it yields the keys
print(f"Number of groups: {grouped.ngroups}")
print(f"\nGroup names: {list(grouped.groups)}")
print("\nGroup sizes:")
print(grouped.size())

In [None]:
# Iterating a groupby object yields (group key, sub-DataFrame) pairs
print("First 2 rows from each gender group:\n")
preview_columns = ['person_id', 'gender', 'tv_total', 'purchase']
for gender_value, gender_rows in grouped:
    print(f"Group: {gender_value}")
    print(gender_rows.loc[:, preview_columns].head(2))
    print()

## 3. Aggregation Functions

### Single Aggregation

In [None]:
# Several summary statistics of one column, computed per gender in a single pass
print("TV Total statistics by gender:")
summary_functions = ['mean', 'median', 'min', 'max', 'std', 'count']
print(merged_df.groupby(by='gender')['tv_total'].agg(summary_functions))

In [None]:
# One aggregation (mean) applied to several exposure columns at once
channel_cols = ['tv_total', 'online_total', 'print_total']
print("\nMean exposure across channels by gender:")
print(merged_df.groupby(by='gender')[channel_cols].agg('mean'))

### Multiple Aggregations with `.agg()`

In [None]:
# A dict passed to aggregate() maps each column to its own list of functions.
# The result has two-level columns: (source column, function name).
agg_result = merged_df.groupby(by='gender').aggregate(dict(
    tv_total=['mean', 'median', 'max'],
    online_total=['mean', 'sum'],
    purchase=['sum', 'count', 'mean'],
))

print("Multiple aggregations:", agg_result, sep="\n")

In [None]:
# Same dict-style aggregation as before; it yields two-level column labels
agg_flat = merged_df.groupby(by='gender').aggregate(dict(
    tv_total=['mean', 'median'],
    purchase=['sum', 'count', 'mean'],
))

# Collapse each (column, function) label pair into a single "column_function" name
agg_flat.columns = [
    '_'.join(level_pair).strip()
    for level_pair in agg_flat.columns.to_flat_index()
]
print("\nFlattened column names:", agg_flat, sep="\n")

## 4. Purchase Rate Analysis

Let's calculate purchase rates (conversion rates) by different segments:

### Purchase Rate by Gender

In [None]:
# Per gender: sum of purchase, count of non-null purchase values, and their mean,
# renamed to business-friendly column labels
purchase_by_gender = (
    merged_df.groupby(by='gender')['purchase']
    .agg(['sum', 'count', 'mean'])
    .rename(columns={
        'sum': 'purchases',
        'count': 'total_people',
        'mean': 'purchase_rate',
    })
)

print("Purchase analysis by gender:", purchase_by_gender, sep="\n")

# Baseline for comparison: mean of purchase over the whole merged dataset
overall_rate = merged_df['purchase'].mean()
print(f"\nOverall purchase rate: {overall_rate:.2%}")

### Purchase Rate by Age Bands

First, let's create age bands:

In [None]:
# Parse birthday into a datetime. The value is cast to int and then to str and
# read with the YYYYMMDD format, so the column is expected to hold values such as 19850314.
merged_df['birthday_dt'] = pd.to_datetime(
    merged_df['birthday'].astype(int).astype(str),
    format='%Y%m%d'
)

# Approximate age as a calendar-year difference (month and day are ignored).
# The reference year is pinned so the notebook gives the same result on every run.
reference_year = 2025
merged_df['age'] = reference_year - merged_df['birthday_dt'].dt.year

# Create age bands.
# right=False makes every bin left-closed, [lower, upper), so the bin edges agree
# with the labels: age 18 lands in '18-24' and age 25 in '25-34'. With the default
# right=True, 18-year-olds would be labelled '<18' and 25-year-olds '18-24'.
# The open-ended top edge keeps ages of 100 and above in '65+' instead of NaN,
# and the left-closed first bin keeps age 0 in '<18'.
merged_df['age_band'] = pd.cut(
    merged_df['age'],
    bins=[0, 18, 25, 35, 45, 55, 65, np.inf],
    right=False,
    labels=['<18', '18-24', '25-34', '35-44', '45-54', '55-64', '65+']
)

print("Age distribution:")
print(merged_df['age_band'].value_counts().sort_index())

In [None]:
# Purchase metrics plus average TV/online exposure for every age band.
# observed=True restricts the output to bands that actually occur in the data.
age_band_groups = merged_df.groupby(by='age_band', observed=True)
purchase_by_age = age_band_groups.aggregate(dict(
    purchase=['sum', 'count', 'mean'],
    tv_total='mean',
    online_total='mean',
))

# Replace the two-level labels with flat names, in the same order as above
purchase_by_age.columns = pd.Index(
    ['purchases', 'total', 'purchase_rate', 'avg_tv', 'avg_online']
)

print("\nPurchase rate by age band:", purchase_by_age, sep="\n")

### Purchase Rate by Household Size

In [None]:
# Extract the first run of digits from the household description.
# Rows that are missing or contain no digits become <NA> (nullable Int64) instead
# of making a plain astype(int) raise; groupby skips <NA> keys by default.
merged_df['household_size'] = pd.to_numeric(
    merged_df['people_in_household'].str.extract(r'(\d+)')[0],
    errors='coerce'
).astype('Int64')

# Purchase rate by household size.
# 'person_id' count is kept as a cross-check against the purchase count.
purchase_by_hh = merged_df.groupby('household_size').agg({
    'purchase': ['sum', 'count', 'mean'],
    'person_id': 'count'
}).round(4)

purchase_by_hh.columns = ['purchases', 'total_people', 'purchase_rate', 'count_check']

print("\nPurchase rate by household size:")
print(purchase_by_hh)

## 5. Weighted Means

Survey data often includes weights. Let's calculate weighted purchase rates:

In [None]:
# Calculate weighted purchase rate by gender
def weighted_mean(group, value_col='purchase', weight_col='weight'):
    """Calculate the weighted mean of one column of a group.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of one group; must contain ``value_col`` and ``weight_col``.
    value_col : str, default 'purchase'
        Column holding the values to average.
    weight_col : str, default 'weight'
        Column holding the weights.

    Returns
    -------
    float
        sum(value * weight) / sum(weight) over the rows where both the value
        and the weight are present, or NaN when the usable weights sum to zero
        (including the case of no usable rows).
    """
    values = group[value_col]
    weights = group[weight_col]

    # Use only rows where both pieces are known. Otherwise a missing value would
    # drop out of the numerator while its weight still inflated the denominator.
    usable = values.notna() & weights.notna()
    total_weight = weights[usable].sum()

    # No usable weight: the mean is undefined, so report NaN rather than divide by zero
    if total_weight == 0:
        return np.nan

    return (values[usable] * weights[usable]).sum() / total_weight

# Select only the columns the helper reads before calling apply(). This keeps the
# grouping column out of the frames handed to the function, which newer pandas
# versions warn about, and gives the same result on older versions.
weighted_purchase = merged_df.groupby('gender')[['purchase', 'weight']].apply(weighted_mean)

print("Weighted purchase rate by gender:")
print(weighted_purchase)

# Compare to unweighted
unweighted_purchase = merged_df.groupby('gender')['purchase'].mean()
print("\nUnweighted purchase rate by gender:")
print(unweighted_purchase)

# Both Series are indexed by gender, so the subtraction aligns group by group
print("\nDifference:")
print(weighted_purchase - unweighted_purchase)

In [None]:
# More complete weighted analysis
def weighted_stats(group):
    """Calculate weighted statistics for one group.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of one group; must contain 'purchase', 'tv_total',
        'online_total' and 'weight'.

    Returns
    -------
    pd.Series
        'weighted_purchase_rate', 'weighted_avg_tv', 'weighted_avg_online'
        (each a weighted mean over the rows where both the value and the
        weight are present, NaN when those weights sum to zero),
        'total_weight' (sum of all weights in the group) and
        'count' (number of rows in the group).
    """
    weights = group['weight']

    def _weighted_average(column):
        # Weighted mean of one column, using only rows where value and weight are both known.
        values = group[column]
        usable = values.notna() & weights.notna()
        denominator = weights[usable].sum()
        # Undefined when there is no usable weight; avoid dividing by zero
        if denominator == 0:
            return np.nan
        return (values[usable] * weights[usable]).sum() / denominator

    return pd.Series({
        'weighted_purchase_rate': _weighted_average('purchase'),
        'weighted_avg_tv': _weighted_average('tv_total'),
        'weighted_avg_online': _weighted_average('online_total'),
        'total_weight': weights.sum(),
        'count': len(group)
    })

# Hand apply() only the columns weighted_stats reads. This keeps the grouping
# column out of the per-group frames, which newer pandas versions warn about,
# and leaves the result unchanged on older versions.
weighted_by_gender = merged_df.groupby('gender')[
    ['purchase', 'tv_total', 'online_total', 'weight']
].apply(weighted_stats)

print("\nWeighted statistics by gender:")
print(weighted_by_gender)

## 6. Multiple GroupBy Columns

Group by more than one column for deeper insights:

In [None]:
# Group on two keys at once; the result is indexed by (gender, age_band).
# observed=True keeps only combinations that actually occur in the data.
segment_keys = ['gender', 'age_band']
multi_group = merged_df.groupby(by=segment_keys, observed=True).aggregate(dict(
    purchase=['sum', 'count', 'mean'],
    tv_total='mean',
))

# Flat, readable names in the same order as the aggregations above
multi_group.columns = pd.Index(['purchases', 'total', 'purchase_rate', 'avg_tv'])

print("Purchase rate by gender and age band:", multi_group, sep="\n")

In [None]:
# Move the innermost index level (age_band) into the columns,
# turning the long two-key result into a gender x age_band grid
purchase_rate_long = multi_group.loc[:, 'purchase_rate']
pivot_view = purchase_rate_long.unstack(level=-1)

print("\nPurchase rate pivot (Gender x Age Band):", pivot_view, sep="\n")

## 7. Named Aggregations (Cleaner Syntax)

pandas 0.25+ supports named aggregations for more readable code:

In [None]:
# Named aggregation: each keyword is an output column, and each value names the
# source column and the function to apply. pd.NamedAgg is the explicit spelling
# of the (column, aggfunc) tuple shorthand.
named_agg = merged_df.groupby(by='gender').agg(
    total_people=pd.NamedAgg(column='person_id', aggfunc='count'),
    total_purchases=pd.NamedAgg(column='purchase', aggfunc='sum'),
    purchase_rate=pd.NamedAgg(column='purchase', aggfunc='mean'),
    avg_tv_exposure=pd.NamedAgg(column='tv_total', aggfunc='mean'),
    avg_online_exposure=pd.NamedAgg(column='online_total', aggfunc='mean'),
    median_weight=pd.NamedAgg(column='weight', aggfunc='median'),
)

print("Named aggregations:", named_agg, sep="\n")

In [None]:
# Custom functions with named aggregations
def purchase_count(x):
    """Return how many entries of ``x`` are equal to 1 (i.e. purchases)."""
    is_purchase = x.eq(1)
    return is_purchase.sum()

# Named aggregation accepts callables as well as function names.
# Each callable receives one group's column as a Series and returns a scalar.
named_custom = merged_df.groupby(by='age_band', observed=True).agg(
    sample_size=pd.NamedAgg(column='person_id', aggfunc='count'),
    purchasers=pd.NamedAgg(column='purchase', aggfunc=purchase_count),
    non_purchasers=pd.NamedAgg(column='purchase', aggfunc=lambda x: x.eq(0).sum()),
    purchase_rate=pd.NamedAgg(column='purchase', aggfunc='mean'),
    # "High" TV users are those with tv_total strictly above 50
    high_tv_users=pd.NamedAgg(column='tv_total', aggfunc=lambda x: x.gt(50).sum()),
)

print("\nCustom aggregations by age band:", named_custom, sep="\n")

## 8. Filtering Groups

Filter out groups that don't meet certain criteria:

In [None]:
# Only keep age bands with at least 1000 people
min_sample = 1000

# filter() calls the function once per group and keeps every row of the groups
# for which it returns True; rows of the other groups are dropped.
large_groups = merged_df.groupby('age_band', observed=True).filter(
    lambda x: len(x) >= min_sample
)

print(f"Original rows: {len(merged_df)}")
print(f"After filtering (>= {min_sample} per group): {len(large_groups)}")
print(f"\nRemaining age bands:")
# age_band is categorical, so value_counts() would still list the filtered-out
# bands with a count of 0. Drop the unused categories for this display so only
# the bands that really remain are shown (large_groups itself is left untouched).
print(
    large_groups['age_band']
    .cat.remove_unused_categories()
    .value_counts()
    .sort_index()
)

## 9. Transform vs Aggregate

**Aggregate** reduces groups to summary statistics.  
**Transform** returns a value for each row:

In [None]:
# Aggregation collapses every group to a single number,
# so the result has one entry per gender
agg_result = merged_df.groupby(by='gender')['tv_total'].agg('mean')

print("Aggregate (one value per group):", agg_result, sep="\n")
print(f"Result size: {len(agg_result)}")

In [None]:
# transform() broadcasts each group's mean back onto that group's rows,
# so the result lines up with merged_df row for row
gender_tv_means = merged_df.groupby(by='gender')['tv_total'].transform('mean')
merged_df['tv_gender_mean'] = gender_tv_means

print("\nTransform (value for each row):")
preview = merged_df.loc[:, ['person_id', 'gender', 'tv_total', 'tv_gender_mean']]
print(preview.head(10))
print(f"Result size: {len(merged_df)}")

In [None]:
# Row-level feature: how far each person's TV exposure sits from the mean of
# their gender group (uses the tv_gender_mean column created by transform)
merged_df['tv_vs_gender_avg'] = merged_df['tv_total'].sub(merged_df['tv_gender_mean'])

print("\nDeviation from gender average:")
deviation_columns = ['gender', 'tv_total', 'tv_gender_mean', 'tv_vs_gender_avg']
print(merged_df.loc[:, deviation_columns].head(10))

## 10. Creating Crosstabs

Crosstabs are perfect for categorical data analysis:

In [None]:
# Frequency table: rows = gender, columns = purchase value.
# margins=True appends the "All" row and column totals.
crosstab = pd.crosstab(
    index=merged_df['gender'],
    columns=merged_df['purchase'],
    margins=True,
)

print("Crosstab: Gender x Purchase:", crosstab, sep="\n")

In [None]:
# normalize='index' divides each row by its own total, giving row shares;
# multiplying by 100 expresses them as percentages
row_shares = pd.crosstab(
    index=merged_df['gender'],
    columns=merged_df['purchase'],
    normalize='index',
)
crosstab_pct = row_shares.mul(100)

print("\nCrosstab with row percentages:", crosstab_pct.round(2), sep="\n")

In [None]:
# Crosstab that aggregates a value column instead of counting rows:
# each cell holds the plain (unweighted) mean of tv_total for that
# age band x gender combination
crosstab_values = pd.crosstab(
    index=merged_df['age_band'],
    columns=merged_df['gender'],
    values=merged_df['tv_total'],
    aggfunc='mean',
)

print("\nAverage TV exposure by Age Band x Gender:", crosstab_values.round(2), sep="\n")

## 11. Pivot Tables

Pivot tables are similar to crosstabs but more flexible:

In [None]:
# Create pivot table: mean of purchase for every age band (rows) x gender (columns).
# margins=True adds an "All" row and column.
# observed=True matches the groupby calls used on age_band elsewhere in this
# notebook and avoids relying on the default for categorical keys, which newer
# pandas versions warn is changing.
pivot = merged_df.pivot_table(
    values='purchase',
    index='age_band',
    columns='gender',
    aggfunc='mean',
    margins=True,
    observed=True
)

print("Pivot table: Purchase rate by Age Band x Gender:")
print(pivot.round(4))

In [None]:
# Multiple metrics in one pivot table: aggfunc maps each value column to its
# own aggregation, and the result has one column block per metric.
# observed=True matches the groupby calls used on age_band elsewhere in this
# notebook and avoids relying on the default for categorical keys, which newer
# pandas versions warn is changing.
pivot_multi = merged_df.pivot_table(
    values=['purchase', 'tv_total', 'online_total'],
    index='age_band',
    columns='gender',
    aggfunc={'purchase': 'mean', 'tv_total': 'mean', 'online_total': 'mean'},
    observed=True
)

print("\nMulti-metric pivot table:")
print(pivot_multi.round(2))

## 12. Real-World Analysis Example

Let's combine everything to answer: **"Which demographic segments have the highest purchase rates?"**

In [None]:
# One row per (gender, age_band) segment that occurs in the data, with size,
# purchase metrics, mean exposure per channel and mean weight, rounded to 4 decimals
segment_analysis = (
    merged_df
    .groupby(by=['gender', 'age_band'], observed=True)
    .agg(
        sample_size=pd.NamedAgg(column='person_id', aggfunc='count'),
        purchases=pd.NamedAgg(column='purchase', aggfunc='sum'),
        purchase_rate=pd.NamedAgg(column='purchase', aggfunc='mean'),
        avg_tv=pd.NamedAgg(column='tv_total', aggfunc='mean'),
        avg_online=pd.NamedAgg(column='online_total', aggfunc='mean'),
        avg_print=pd.NamedAgg(column='print_total', aggfunc='mean'),
        avg_weight=pd.NamedAgg(column='weight', aggfunc='mean'),
    )
    .round(4)
)

# Rank segments from the highest purchase rate to the lowest
segment_analysis_sorted = segment_analysis.sort_values(by='purchase_rate', ascending=False)

print("Top segments by purchase rate:", segment_analysis_sorted.head(10), sep="\n")

In [None]:
# Keep only segments whose sample size reaches the threshold,
# so a rate based on a handful of people cannot top the ranking
min_n = 500
is_large_enough = segment_analysis['sample_size'].ge(min_n)
large_segments = segment_analysis.loc[is_large_enough]

print(f"\nSegments with at least {min_n} people:")
print(large_segments.sort_values(by='purchase_rate', ascending=False))

## Summary

In this notebook, you learned:

✅ GroupBy fundamentals and mechanics  
✅ Aggregate with single and multiple functions  
✅ Calculate purchase rates by demographic segments  
✅ Use weighted means for survey data  
✅ Named aggregations for readable code  
✅ Filter groups based on size or criteria  
✅ Transform vs aggregate operations  
✅ Create crosstabs and pivot tables  
✅ Perform multi-dimensional analysis  
✅ Build real-world segment analysis

### Key Takeaways

1. **GroupBy is powerful**: Master it for data analysis
2. **Use named aggregations**: Makes code more readable
3. **Consider sample size**: Filter small groups for robust analysis
4. **Use weights when available**: Survey data requires weighted statistics
5. **Transform for row-level calculations**: Great for creating new features
6. **Crosstabs and pivots**: Perfect for categorical analysis

### Next Steps

In the next notebook (**06_reshaping_and_pivoting.ipynb**), we'll:
- Reshape data between wide and long formats
- Use melt() for tidy data
- Create complex pivot tables
- Handle multi-index DataFrames
- Prepare data for visualization

## 🎯 Practice Exercises

Try these on your own:

1. Calculate the median TV exposure by household size
2. Find which age band has the highest average online exposure
3. Create a crosstab of household size x purchase with row percentages
4. Calculate weighted purchase rate by age band
5. Find the top 5 segments (gender x age_band) by total purchases
6. Use transform to create a "percent of gender total" column for TV exposure
7. Filter to keep only age bands where purchase rate > 30%
8. Create a pivot table showing count of people by gender x household size

### Bonus Challenges

9. Calculate the correlation between TV exposure and purchase rate by age band
10. Create age x gender segments and find which has highest "TV per purchaser"
11. Use .apply() to create a custom metric: (purchases * avg_tv) / sample_size
12. Create a function that takes a groupby column and returns a formatted summary table

## Loading/Saving Data Between Notebooks

### Load Pre-Merged Data

If you saved merged data in notebook 04:

```python
# Uncomment to load previously merged data
# merged_df = pd.read_csv('../outputs/merged_data.csv')
# 
# # Or load from Parquet (faster)
# merged_df = pd.read_parquet('../outputs/merged_data.parquet')
#
# print(f"Loaded merged data: {merged_df.shape}")
```

### Save Aggregated Results

Save your summary tables for reporting:

```python
# Uncomment to save analysis results
# import os
# os.makedirs('../outputs', exist_ok=True)
#
# # Save segment analysis
# segment_analysis.to_csv('../outputs/segment_analysis.csv')
#
# # Save purchase rate summary
# purchase_by_age.to_csv('../outputs/purchase_by_age.csv')
#
# # Save pivot table
# pivot.to_csv('../outputs/purchase_pivot.csv')
#
# print("Analysis results saved!")
```