# SPRINT 2 — Feature Engineering
# Goal: Create behavioral, demographic, and RFM features to enable segmentation.


In [44]:
import numpy as np
import pandas as pd
from pathlib import Path
from IPython.display import display

# PATHS: the Step 1 output read below, and the folder that receives this step's exports
interim_path = Path("../Data/Interim/marketing_campaign_clean.csv")
processed_dir = Path("../Data/Processed")

# Create the export folder (and any missing parents) up front; no-op when it already exists.
processed_dir.mkdir(exist_ok=True, parents=True)

# 1) Load the clean dataset from Step 1, parsing Dt_Customer as datetimes
df = pd.read_csv(
    interim_path,
    parse_dates=["Dt_Customer"],
)

# 2) Column groups reused by the feature steps below
spend_cols = [
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
]
purchase_cols = [
    "NumWebPurchases",
    "NumCatalogPurchases",
    "NumStorePurchases",
    "NumDealsPurchases",
]

# Sanity check: fail fast, naming the first required column that is absent from df
for required in [
    *spend_cols,
    *purchase_cols,
    "NumWebVisitsMonth", "Response", "Year_Birth", "Kidhome", "Teenhome",
    "Recency", "Income", "Education", "Marital_Status",
]:
    assert required in df.columns, f"Brak kolumny w danych: {required}"

# 3) Snapshot date: the latest Dt_Customer shifted forward by the largest Recency (in days)
snapshot_date = df["Dt_Customer"].max() + pd.Timedelta(days=int(df["Recency"].max()))

# 4) Core engineered features
# Demographics: age in the snapshot year (bounded to 0..120) and number of children at home.
df["Age"] = (snapshot_date.year - df["Year_Birth"]).clip(lower=0, upper=120)
df["Children"] = df["Kidhome"].add(df["Teenhome"])
df["HasChildren"] = df["Children"].gt(0).astype(int)

# Spend / purchase totals; AOV (spend per purchase) falls back to 0.0 when there are no purchases.
df["TotalSpend"] = df[spend_cols].sum(axis="columns")
df["TotalPurchases"] = df[purchase_cols].sum(axis="columns")
df["AOV"] = np.where(
    df["TotalPurchases"].gt(0),
    df["TotalSpend"].div(df["TotalPurchases"]),
    0.0,
)

# Tenure: days between Dt_Customer and the snapshot, floored at 1 day.
# 30.44 is presumably the average month length (365.25 / 12) — confirm.
df["TenureDays"] = (snapshot_date - df["Dt_Customer"]).dt.days.clip(lower=1)
df["TenureMonths"] = df["TenureDays"].div(30.44).round(2)

# 5) Basket share by product category
# Output names keep the "Mnt" prefix (e.g. MntWines -> Share_MntWines): the prefix is
# stripped from the source column and re-attached after "Share_".
# Rows with zero TotalSpend get 0.0 for every category.
for spend_col in spend_cols:
    share_name = "Share_Mnt" + spend_col.replace("Mnt", "")
    df[share_name] = np.where(
        df["TotalSpend"].gt(0),
        df[spend_col].div(df["TotalSpend"]),
        0.0,
    )

# 6) Channel share (web, catalog, store, deals); 0.0 when TotalPurchases is zero
for channel_col in purchase_cols:
    df[f"Share_{channel_col}"] = np.where(
        df["TotalPurchases"].gt(0),
        df[channel_col].div(df["TotalPurchases"]),
        0.0,
    )

# 7) Web conversion proxy: NumWebPurchases per NumWebVisitsMonth.
# Unlike the shares above, rows with zero visits are left as NaN rather than 0.
df["WebConvRate_Approx"] = np.where(
    df["NumWebVisitsMonth"].gt(0),
    df["NumWebPurchases"].div(df["NumWebVisitsMonth"]),
    np.nan,
)

# 8) Customer Lifetime Value (CLV proxy)
# CLV = AOV * frequency_per_month * 12 * gross_margin
gross_margin = 0.30

# Purchases per month of tenure; 0.0 wherever TenureMonths is not positive.
freq_per_month = np.where(
    df["TenureMonths"].gt(0),
    df["TotalPurchases"].div(df["TenureMonths"]),
    0.0,
)

# Factors are multiplied left to right, in the same order as the formula above.
df["CLV_Proxy"] = df["AOV"].mul(freq_per_month).mul(12).mul(gross_margin)

# 9) RFM scoring: each dimension is cut into 5 quantile bins and stored as an integer 1-5
# Recency: labels run 5..1, so the lowest Recency values receive the highest score.
# NOTE(review): Recency is binned on raw values, without the rank tie-break used for F and M — confirm intended.
df["R_Score"] = pd.qcut(df["Recency"], q=5, labels=[5, 4, 3, 2, 1]).astype(int)

# Frequency / Monetary: values are ranked with method="first" before binning, so tied
# values still get distinct ranks; labels run 1..5 (more purchases / spend -> higher score).
df["F_Score"] = pd.qcut(
    df["TotalPurchases"].rank(method="first"), q=5, labels=[1, 2, 3, 4, 5]
).astype(int)
df["M_Score"] = pd.qcut(
    df["TotalSpend"].rank(method="first"), q=5, labels=[1, 2, 3, 4, 5]
).astype(int)

# Combined score: plain sum of the three component scores (range 3..15).
df["RFM_Score"] = df[["R_Score", "F_Score", "M_Score"]].sum(axis="columns")

# Business-driven segmentation using RFM labels
def rfm_label(row):
    """Map one customer's RFM scores to a business segment name.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide "R_Score", "F_Score" and "RFM_Score". The monetary
        score is not read directly; it only matters through "RFM_Score".

    Returns
    -------
    str
        One of "Champions", "Loyal", "New Customers", "At Risk" or
        "Regulars". Rules are checked top to bottom and the first match wins.
    """
    r, f, s = row["R_Score"], row["F_Score"], row["RFM_Score"]

    # High total score, recent and frequent.
    if s >= 12 and r >= 4 and f >= 4:
        return "Champions"
    # Good total score with at least mid-level frequency.
    if s >= 10 and f >= 3:
        return "Loyal"
    # Recent but with few purchases so far.
    if r >= 4 and f <= 2:
        return "New Customers"
    # Low total score and not seen recently.
    if s <= 6 and r <= 2:
        return "At Risk"
    # Everything that matched none of the rules above.
    return "Regulars"

# Label every customer row with its RFM segment (row-wise apply of rfm_label)
df["RFM_Label"] = df.apply(rfm_label, axis="columns")

# 10) Campaign acceptance: 1 when the AcceptedCmp* columns sum above zero for the row, else 0
camp_cols = [name for name in df.columns if name.startswith("AcceptedCmp")]
df["AcceptedAnyCampaign"] = df[camp_cols].sum(axis="columns").gt(0).astype(int)

# 11) QA checks: value ranges of the engineered and source columns
assert df["Age"].between(0, 120).all()
assert df["TotalSpend"].ge(0).all()
assert df["TotalPurchases"].ge(0).all()
assert df["Recency"].ge(0).all()
for amount_col in spend_cols + purchase_cols:
    assert df[amount_col].ge(0).all(), f"Ujemne wartości w {amount_col}"

# Validate share columns: every product and channel share must have been created
share_product_cols = ["Share_Mnt" + c.replace("Mnt", "") for c in spend_cols]
for expected in [*share_product_cols, *(f"Share_{c}" for c in purchase_cols)]:
    assert expected in df.columns, f"Brak oczekiwanej kolumny: {expected}"

# Segment labels must all come from the known set
valid_labels = {"Champions", "Loyal", "New Customers", "At Risk", "Regulars"}
assert set(df["RFM_Label"].unique()) <= valid_labels

# 12) Export processed datasets
master_cols = [
    # identifier, demographics
    "ID", "Age", "Income", "Education", "Marital_Status", "Children", "HasChildren",
    # spend / purchase behaviour
    "TotalSpend", "TotalPurchases", "AOV", "Recency", "TenureMonths",
    # RFM scores and segment
    "R_Score", "F_Score", "M_Score", "RFM_Score", "RFM_Label",
    # basket shares
    "Share_MntWines", "Share_MntFruits", "Share_MntMeatProducts", "Share_MntFishProducts",
    "Share_MntSweetProducts", "Share_MntGoldProds",
    # channel shares
    "Share_NumWebPurchases", "Share_NumCatalogPurchases", "Share_NumStorePurchases",
    "Share_NumDealsPurchases",
    # derived rates and campaign outcomes
    "WebConvRate_Approx", "CLV_Proxy", "AcceptedAnyCampaign", "Response",
]
# Make sure every requested column exists; if any is absent, report exactly which
missing = [col for col in master_cols if col not in df.columns]
assert not missing, f"Brakuje kolumn w master_cols: {missing}"

# Independent copies, so the exported frames are decoupled from df
customers_master = df.loc[:, master_cols].copy()
rfm_segments = df.loc[
    :, ["ID", "R_Score", "F_Score", "M_Score", "RFM_Score", "RFM_Label"]
].copy()

# Write both tables as CSV without the index column
path_master = processed_dir / "customers_master.csv"
path_rfm = processed_dir / "rfm_segments.csv"
customers_master.to_csv(path_master, index=False)
rfm_segments.to_csv(path_rfm, index=False)

# Report what was written and preview the master table
print("Feature engineering gotowe")
print("Zapisano:")
for written in (path_master, path_rfm):
    print(" -", written)
display(customers_master.head())


Feature engineering gotowe ✅
Zapisano:
 - ..\Data\Processed\customers_master.csv
 - ..\Data\Processed\rfm_segments.csv


Unnamed: 0,ID,Age,Income,Education,Marital_Status,Children,HasChildren,TotalSpend,TotalPurchases,AOV,...,Share_MntSweetProducts,Share_MntGoldProds,Share_NumWebPurchases,Share_NumCatalogPurchases,Share_NumStorePurchases,Share_NumDealsPurchases,WebConvRate_Approx,CLV_Proxy,AcceptedAnyCampaign,Response
0,5524,57,58138.0,Graduation,Single,0,0,1617.0,25.0,64.68,...,0.054422,0.054422,0.32,0.4,0.16,0.12,1.142857,232.568917,0,1
1,2174,60,46344.0,Graduation,Single,2,1,27.0,6.0,4.5,...,0.037037,0.222222,0.166667,0.166667,0.333333,0.333333,0.2,13.965517,0,0
2,4141,49,71613.0,Graduation,Together,0,0,776.0,21.0,36.952381,...,0.027062,0.054124,0.380952,0.095238,0.47619,0.047619,2.0,206.933333,0,0
3,6182,30,26646.0,Graduation,Together,1,1,53.0,8.0,6.625,...,0.056604,0.09434,0.25,0.0,0.5,0.25,0.333333,24.398977,0,0
4,5324,33,58293.0,Phd,Married,1,1,422.0,19.0,22.210526,...,0.063981,0.035545,0.263158,0.157895,0.315789,0.263158,1.0,177.892272,0,0
