In [19]:
import pandas as pd
import numpy as np

# Load cleaned data
df = pd.read_csv('../data/cleaned/cleaned_retail.csv')

# Fix datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Sort chronologically
df = df.sort_values('InvoiceDate')

df.head()


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,TotalAmount,InvoiceMonth
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085,United Kingdom,83.4,2009-12
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,81.0,2009-12
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,81.0,2009-12
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085,United Kingdom,100.8,2009-12
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085,United Kingdom,30.0,2009-12


In [20]:
# Choose cutoff date (training data ends here)
cutoff_date = pd.Timestamp('2011-06-30')

# Define churn window (90 days after cutoff)
churn_window = 90

cutoff_date


Timestamp('2011-06-30 00:00:00')

In [21]:
# Data before cutoff (used for features)
df_past = df[df['InvoiceDate'] <= cutoff_date]

# Data after cutoff (used for churn labeling)
df_future = df[
    (df['InvoiceDate'] > cutoff_date) & 
    (df['InvoiceDate'] <= cutoff_date + pd.Timedelta(days=churn_window))
]

df_past.shape, df_future.shape


((579107, 10), (89246, 10))

In [22]:
# Snapshot date = cutoff date
snapshot_date = cutoff_date

customer_features = (
    df_past
    .groupby('Customer ID')
    .agg({
        'InvoiceDate': [
            lambda x: (snapshot_date - x.max()).days,   # Recency
            lambda x: (snapshot_date - x.min()).days    # Tenure
        ],
        'Invoice': 'nunique',                           # Frequency
        'TotalAmount': ['sum', 'mean']                  # Monetary & AOV
    })
)

customer_features.columns = [
    'Recency',
    'Tenure',
    'Frequency',
    'Monetary',
    'AvgOrderValue'
]

customer_features.head()


Unnamed: 0_level_0,Recency,Tenure,Frequency,Monetary,AvgOrderValue
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12346,162,562,12,77556.46,2281.072353
12347,20,241,5,3529.27,20.400405
12348,85,275,4,1709.4,35.6125
12349,244,426,3,2671.14,26.187647
12350,147,147,1,334.4,19.670588


In [23]:
# Purchase velocity = Frequency / Tenure (avoid divide by zero)
customer_features['PurchaseVelocity'] = (
    customer_features['Frequency'] / 
    customer_features['Tenure'].replace(0, 1)
)

customer_features.head()


Unnamed: 0_level_0,Recency,Tenure,Frequency,Monetary,AvgOrderValue,PurchaseVelocity
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12346,162,562,12,77556.46,2281.072353,0.021352
12347,20,241,5,3529.27,20.400405,0.020747
12348,85,275,4,1709.4,35.6125,0.014545
12349,244,426,3,2671.14,26.187647,0.007042
12350,147,147,1,334.4,19.670588,0.006803


In [24]:
# Customers who purchased in future window
future_customers = df_future['Customer ID'].unique()

# If customer NOT in future window â†’ churned
customer_features['Churn'] = ~customer_features.index.isin(future_customers)

customer_features['Churn'] = customer_features['Churn'].astype(int)

customer_features['Churn'].value_counts()


Churn
1    3333
0    1705
Name: count, dtype: int64

In [25]:
customer_features.head()


Unnamed: 0_level_0,Recency,Tenure,Frequency,Monetary,AvgOrderValue,PurchaseVelocity,Churn
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12346,162,562,12,77556.46,2281.072353,0.021352,1
12347,20,241,5,3529.27,20.400405,0.020747,0
12348,85,275,4,1709.4,35.6125,0.014545,0
12349,244,426,3,2671.14,26.187647,0.007042,1
12350,147,147,1,334.4,19.670588,0.006803,1


In [27]:
customer_features.to_csv('../data/cleaned/time_aware_churn_dataset.csv')

pd.read_csv('../data/cleaned/time_aware_churn_dataset.csv').head()


Unnamed: 0,Customer ID,Recency,Tenure,Frequency,Monetary,AvgOrderValue,PurchaseVelocity,Churn
0,12346,162,562,12,77556.46,2281.072353,0.021352,1
1,12347,20,241,5,3529.27,20.400405,0.020747,0
2,12348,85,275,4,1709.4,35.6125,0.014545,0
3,12349,244,426,3,2671.14,26.187647,0.007042,1
4,12350,147,147,1,334.4,19.670588,0.006803,1


In [28]:
customer_features['Churn'].value_counts(normalize=True) * 100


Churn
1    66.157205
0    33.842795
Name: proportion, dtype: float64