# Notebook 01 — Setup & Data Load 
Goal: load the real Hillstrom dataset, unify the schema, run basic sanity checks, and save a processed copy.


In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklift.datasets import fetch_hillstrom

In [8]:
# Project root: the working directory, unless the kernel was started inside
# a folder named `notebooks`, in which case the root is one level up.
BASE = Path.cwd()
if BASE.name == 'notebooks':
    BASE = BASE.parent

# Data folders hang off the project root; the processed folder (and any
# missing parents) is created on first run and left alone afterwards.
DATA_DIR = BASE.joinpath("data")
PROC_DIR = DATA_DIR.joinpath('processed')
PROC_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so the reader can confirm them.
print('Base: ', BASE)
print('Data Dir: ', DATA_DIR)
print('Process Dir: ', PROC_DIR)

Base:  D:\hillstrom-ab-rfm-ml
Data Dir:  D:\hillstrom-ab-rfm-ml\data
Process Dir:  D:\hillstrom-ab-rfm-ml\data\processed


In [36]:
def load_hillstrom():
    """Load the Hillstrom e-mail campaign data as a single DataFrame.

    The scikit-uplift loader is tried first. It is called once per outcome
    (``spend``, ``visit``, ``conversion``); the feature frame from the
    ``spend`` call is copied and the treatment label (stored as ``segment``)
    plus the three targets are attached to it.

    If anything in that path raises, the error is printed and the raw CSV is
    read from the MineThatData URL instead; a copy is cached as
    ``hillstrom.csv`` under ``DATA_DIR``.

    Returns
    -------
    tuple[pandas.DataFrame, str]
        The data and a label naming the source that produced it:
        ``'scikit uplift loader'`` or ``'direct url'``.

    Raises
    ------
    Exception
        Whatever ``pandas.read_csv`` or the cache write raises when the
        fallback fails as well.
    """
    try:
        b_spend = fetch_hillstrom(target_col='spend')
        b_visit = fetch_hillstrom(target_col='visit')
        b_conv = fetch_hillstrom(target_col='conversion')

        # Copy so the columns added below do not touch the loader's own frame.
        df = b_spend['data'].copy()
        df['segment'] = b_spend['treatment']
        df['spend'] = b_spend['target'].astype(float)
        df['visit'] = b_visit['target'].astype(int)
        df['conversion'] = b_conv['target'].astype(int)

        return df, 'scikit uplift loader'
    except Exception as e:
        # Deliberately broad: any loader failure should fall back to the raw
        # CSV rather than stop the notebook.
        print("loader failed: ", e)

        url = "http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv"
        # NOTE(review): the downloaded frame is returned as-is; presumably it
        # already carries segment/visit/conversion/spend - confirm.
        df = pd.read_csv(url)

        # Cache the download. Writing through pandas (instead of passing the
        # CSV text to Path.write_text) leaves line-ending handling to pandas.
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        df.to_csv(DATA_DIR / "hillstrom.csv", index=False)
        return df, "direct url"
        
# Run the loader once, then report the frame's shape and which source
# (library loader or direct download) supplied it.
df, source = load_hillstrom()
print('data set size: ', df.shape, ' from source:', source)
        
        

data set size:  (64000, 12)  from source: scikit uplift loader


In [37]:
# Normalise column names to snake_case: trim surrounding whitespace,
# lower-case, then turn both spaces and hyphens into underscores so the names
# line up with the identifiers in `required`.
df.columns = (df.columns.str.strip()
                            .str.lower()
                            .str.replace(' ', '_', regex=False)
                            .str.replace('-', '_', regex=False))

# Columns that must be present after normalisation.
required = {"recency","history","history_segment","segment","visit","conversion","spend"}
missing = required - set(df.columns)

if missing:
    # Stop here and name the absent columns instead of printing a generic
    # message and carrying on with an incomplete frame.
    raise KeyError(f'missing required columns: {sorted(missing)}')

print('every thing is in place')


every thing is in place


In [38]:
# Force these columns to numeric dtypes; any value that cannot be parsed
# becomes NaN (errors='coerce') instead of raising.
numeric_cols = ['recency', 'history', 'spend', 'visit', 'conversion']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Summary statistics for the three outcome columns.
df[['visit', 'conversion', 'spend']].describe()

Unnamed: 0,visit,conversion,spend
count,64000.0,64000.0,64000.0
mean,0.146781,0.009031,1.050908
std,0.35389,0.094604,15.036448
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,499.0


In [54]:
# Short labels for the three values expected in `segment`.
arm_map = {
    'Womens E-Mail': 'womens',
    'Mens E-Mail': 'mens',
    'No E-Mail': 'control'
}

df['arm'] = df['segment'].map(arm_map)

# .map() yields NaN for any label that is not a key of arm_map; such rows
# would get 0 in all three indicator columns below. Fail loudly instead of
# letting them pass through unnoticed.
unmapped = df.loc[df['arm'].isna(), 'segment'].unique()
if len(unmapped):
    raise ValueError(f'unexpected segment labels: {list(unmapped)}')

# 0/1 indicator columns, one per arm.
df['arm_men']     = (df['arm'] == 'mens').astype(int)
df['arm_women']   = (df['arm'] == 'womens').astype(int)
df['arm_control'] = (df['arm'] == 'control').astype(int)

In [55]:
# Show the original label next to the three indicator columns for a visual check.
df.loc[:, ['segment', 'arm_control', 'arm_men', 'arm_women']]

Unnamed: 0,segment,arm_control,arm_men,arm_women
0,Womens E-Mail,0,0,1
1,No E-Mail,1,0,0
2,Womens E-Mail,0,0,1
3,Mens E-Mail,0,1,0
4,Womens E-Mail,0,0,1
...,...,...,...,...
63995,Mens E-Mail,0,1,0
63996,Mens E-Mail,0,1,0
63997,Mens E-Mail,0,1,0
63998,Womens E-Mail,0,0,1


In [65]:
# Per-arm summary: group size, mean of each outcome, and the share of rows
# with strictly positive spend. Rows are ordered by arm label.
arm_groups = df.groupby('arm')
summary = arm_groups.agg(
    n=('arm', 'size'),
    visit_rate=('visit', 'mean'),
    conversion_rate=('conversion', 'mean'),
    avg_spend=('spend', 'mean'),
    p_spend_gt0=('spend', lambda spend: spend.gt(0).mean()),
)
summary = summary.sort_index()

In [66]:
# Display the per-arm summary table as this cell's output.
summary

Unnamed: 0_level_0,n,visit_rate,conversion_rate,avg_spend,p_spend_gt0
arm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
control,21306,0.106167,0.005726,0.652789,0.005726
mens,21307,0.182757,0.012531,1.422617,0.012531
womens,21387,0.1514,0.008837,1.077202,0.008837


In [68]:
# Target files for the cleaned frame, both inside the processed-data folder.
out_csv = PROC_DIR.joinpath('hillstrom_clean.csv')
out_parquet = PROC_DIR.joinpath('hillstrom_clean.parquet')

# Write the same frame in both formats; the CSV is written without the index.
df.to_csv(path_or_buf=out_csv, index=False)
df.to_parquet(path=out_parquet)

# Cell output: the CSV path and whether the parquet file now exists.
(out_csv, out_parquet.exists())

(WindowsPath('D:/hillstrom-ab-rfm-ml/data/processed/hillstrom_clean.csv'),
 True)