In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
from operator import attrgetter
import os

# Fix working directory (same purpose as the snippet in 01_data_prep)
def find_project_root(start, fallback):
    """Return the directory the relative data paths should be resolved from.

    Args:
        start: Directory the notebook/script lives in.
        fallback: Directory to use when *start* is not a ``notebooks`` folder.

    Returns:
        ``start.parent`` when the final path component of *start* is exactly
        ``notebooks``; otherwise *fallback*.
    """
    # Compare the directory's own name rather than searching the whole path
    # string: a substring test also fires on unrelated components such as
    # ".../notebooks_old/project" and would then chdir to the wrong place.
    return start.parent if start.name == 'notebooks' else fallback


# '__file__' may be absent from the namespace (presumably when run inside a
# notebook kernel); in that case the current working directory is used.
notebook_dir = Path(__file__).parent if '__file__' in dir() else Path.cwd()
project_root = find_project_root(notebook_dir, Path.cwd())
os.chdir(project_root)

# Load the CAC/LTV model dataset
df = pd.read_csv("data/raw/cac_ltv_model.csv")

# Parse dates - 'date' is parsed with the '%b-%y' format (abbreviated month
# name, two-digit year). errors='coerce' turns values that do not match into
# NaT instead of raising, and those rows are then removed.
rows_loaded = len(df)
df['date'] = pd.to_datetime(df['date'], format='%b-%y', errors='coerce')
df = df.dropna(subset=['date']).copy()

# Report discarded rows: every later count is computed from the surviving
# rows only, so a silent drop here would go unnoticed downstream.
rows_dropped = rows_loaded - len(df)
if rows_dropped:
    print(f'Warning: dropped {rows_dropped} of {rows_loaded} rows with a missing or unparseable date')

# Cohort analysis needs two monthly periods per row. The dataset carries a
# single 'date' column, so both are derived from it:
# - signup_month: month of the earliest date seen for that customer_id
# - active_month: month of the row's own date
first_seen = df.groupby('customer_id')['date'].transform('min')
df['signup_month'] = first_seen.dt.to_period('M')
df['active_month'] = df['date'].dt.to_period('M')

# One row per (signup cohort, active month) holding the number of distinct
# customers observed in that combination.
customers_per_cell = df.groupby(['signup_month', 'active_month'])['customer_id'].nunique()
cohort = customers_per_cell.rename('customers').reset_index()

# Months elapsed since signup: subtracting two monthly periods yields an
# offset object whose `.n` attribute is the whole number of months.
month_offsets = cohort['active_month'] - cohort['signup_month']
cohort['period_number'] = month_offsets.map(attrgetter('n'))

# Create the destination folder (and any missing parents) if it is not there yet
output_dir = Path("data/cleaned")
output_dir.mkdir(parents=True, exist_ok=True)

# Persist the cohort table without the positional index column
cohort.to_csv("data/cleaned/cohort_table.csv", index=False)

# Short confirmation plus a preview of the first ten rows
n_rows = len(cohort)
print(f'Wrote data/cleaned/cohort_table.csv with {n_rows} rows')
print('\nFirst few rows:')
preview = cohort.head(10)
print(preview)

Wrote data/cleaned/cohort_table.csv with 24 rows

First few rows:
  signup_month active_month  customers  period_number
0      2023-01      2023-01        302              0
1      2023-02      2023-02        270              0
2      2023-03      2023-03        307              0
3      2023-04      2023-04        306              0
4      2023-05      2023-05        268              0
5      2023-06      2023-06        336              0
6      2023-07      2023-07        230              0
7      2023-08      2023-08        230              0
8      2023-09      2023-09        365              0
9      2023-10      2023-10        393              0
