In [2]:
# 02_feature_build.ipynb — Block 1
# imports + load v2 raw data

import pandas as pd
import numpy as np

# load data
invoices = pd.read_csv("../data/raw/client_invoices.csv")

# parse dates
date_cols = ["invoice_date","payment_due_date","payment_actual_date"]
for c in date_cols:
    invoices[c] = pd.to_datetime(invoices[c], errors="coerce")

print("✅ invoices loaded:", invoices.shape)
display(invoices.head(3))


✅ invoices loaded: (1800, 11)


Unnamed: 0,invoice_id,invoice_date,payment_due_date,payment_actual_date,invoice_amount,client_id,client_name,material_name,payment_method,delay_days,delayed_flag
0,INV00001,2024-05-20,2024-06-14,2024-06-21,660906,C036,Motherson Global Manufacturing Service - A div...,HR Coils,NEFT,7,1
1,INV00002,2024-12-04,2024-12-19,2025-01-04,188555,C024,HAMTEK TECHNOLOGIES INDIA PVT. LTD.,Galvanized Sheets,LC,16,1
2,INV00003,2024-02-21,2024-04-06,2024-04-04,237375,C015,COMMITTED ISPAT PRIVATE LIMITED,Binding Wire,RTGS,-2,0


In [3]:
# 02_feature_build.ipynb — Block 2
# current invoice features

feat_df = invoices.copy()

# term_days = contractual credit period
feat_df["term_days"] = (feat_df["payment_due_date"] - feat_df["invoice_date"]).dt.days

# month / quarter
feat_df["month"] = feat_df["invoice_date"].dt.month
feat_df["quarter"] = feat_df["invoice_date"].dt.quarter
feat_df["year"] = feat_df["invoice_date"].dt.year

# target
feat_df["target"] = feat_df["delayed_flag"]

print("✅ base features ready:", feat_df.shape)
display(feat_df[["invoice_id","client_id","invoice_amount","term_days","month","quarter","year","target"]].head(5))


✅ base features ready: (1800, 16)


Unnamed: 0,invoice_id,client_id,invoice_amount,term_days,month,quarter,year,target
0,INV00001,C036,660906,25,5,2,2024,1
1,INV00002,C024,188555,15,12,4,2024,1
2,INV00003,C015,237375,45,2,1,2024,0
3,INV00004,C033,368993,45,9,3,2023,0
4,INV00005,C053,503292,30,2,1,2024,0


In [4]:
# 02_feature_build.ipynb — Block 3
# sort by client + invoice_date

feat_df = feat_df.sort_values(["client_id","invoice_date"]).reset_index(drop=True)

print("✅ sorted by client/date")


✅ sorted by client/date


In [5]:
# 02_feature_build.ipynb — Block 4
# rolling historical features (per client)

def add_client_history_features(df):
    df = df.copy()
    df["client_prev_txn_count"] = 0
    df["client_prev_total_value"] = 0.0
    df["client_prev_delay_rate"] = 0.0
    df["client_prev_avg_delay"] = 0.0
    
    # group by client
    for cid, group in df.groupby("client_id"):
        past_txns = []
        prev_vals = []
        prev_delays = []
        
        for idx, row in group.iterrows():
            if len(past_txns) > 0:
                df.loc[idx,"client_prev_txn_count"] = len(past_txns)
                df.loc[idx,"client_prev_total_value"] = np.sum(prev_vals)
                df.loc[idx,"client_prev_delay_rate"] = np.mean(prev_delays)
                df.loc[idx,"client_prev_avg_delay"] = np.mean([d for d in prev_delays if d is not None])
            
            past_txns.append(row["invoice_id"])
            prev_vals.append(row["invoice_amount"])
            prev_delays.append(row["delayed_flag"])
    
    return df

feat_df = add_client_history_features(feat_df)

print("✅ historical features added")
display(feat_df[["invoice_id","client_id","invoice_amount",
                 "client_prev_txn_count","client_prev_total_value",
                 "client_prev_delay_rate","client_prev_avg_delay","target"]].head(10))


✅ historical features added


Unnamed: 0,invoice_id,client_id,invoice_amount,client_prev_txn_count,client_prev_total_value,client_prev_delay_rate,client_prev_avg_delay,target
0,INV00472,C001,216047,0,0.0,0.0,0.0,0
1,INV00193,C001,240835,1,216047.0,0.0,0.0,0
2,INV01088,C001,204797,2,456882.0,0.0,0.0,0
3,INV00936,C001,151179,3,661679.0,0.0,0.0,0
4,INV01576,C001,104261,4,812858.0,0.0,0.0,0
5,INV00203,C001,111843,5,917119.0,0.0,0.0,0
6,INV00795,C001,110200,6,1028962.0,0.0,0.0,1
7,INV00750,C001,101633,7,1139162.0,0.142857,0.142857,0
8,INV00581,C001,177680,8,1240795.0,0.125,0.125,0
9,INV00339,C001,207273,9,1418475.0,0.111111,0.111111,1


In [7]:
# 02_feature_build.ipynb — Block 6
# more intuitive features

feat_df = feat_df.copy()

# log amount (stabilizes variance)
feat_df["log_amount"] = np.log1p(feat_df["invoice_amount"])

# is quarter end / month end invoices (often riskier)
feat_df["is_month_end"] = (feat_df["invoice_date"].dt.is_month_end).astype(int)
feat_df["is_quarter_end"] = (feat_df["invoice_date"].dt.quarter.isin([3,6,9,12])).astype(int)  # simple proxy
# better: actual quarter end check
qend = feat_df["invoice_date"].dt.to_period("Q").dt.end_time.dt.date
feat_df["is_quarter_end"] = (feat_df["invoice_date"].dt.date == qend).astype(int)

# days to month end (due & invoice)
month_end = feat_df["invoice_date"].dt.to_period("M").dt.end_time
feat_df["days_to_month_end"] = (month_end - feat_df["invoice_date"]).dt.days

# due weekday / invoice weekday
feat_df["due_dow"] = feat_df["payment_due_date"].dt.weekday
feat_df["inv_dow"] = feat_df["invoice_date"].dt.weekday

print("✅ calendar features added")


✅ calendar features added


In [8]:
# 02_feature_build.ipynb — Block 7
# how big is this invoice vs client's own history? (z-score)
feat_df["client_amount_mean"] = feat_df.groupby("client_id")["invoice_amount"].transform(lambda s: s.shift().expanding().mean())
feat_df["client_amount_std"]  = feat_df.groupby("client_id")["invoice_amount"].transform(lambda s: s.shift().expanding().std())
feat_df["client_amount_z"]    = (feat_df["invoice_amount"] - feat_df["client_amount_mean"]) / (feat_df["client_amount_std"] + 1e-6)

# exposure velocity (last 90 days)
feat_df = feat_df.sort_values(["client_id","invoice_date"])
feat_df["client_rolling_90_value"] = (
    feat_df
    .groupby("client_id")
    .apply(lambda g: g.set_index("invoice_date")["invoice_amount"].rolling("90D").sum())
    .reset_index(level=0, drop=True)
    .values
)

print("✅ client-relative size & recent exposure added")


✅ client-relative size & recent exposure added


  .apply(lambda g: g.set_index("invoice_date")["invoice_amount"].rolling("90D").sum())


In [9]:
# 02_feature_build.ipynb — Block 8
cat_cols = ["payment_method", "material_name"]
feat_df = pd.get_dummies(feat_df, columns=cat_cols, drop_first=True)

print("✅ one-hot encoded:", [c for c in feat_df.columns if "payment_method_" in c or "material_name_" in c])


✅ one-hot encoded: ['payment_method_LC', 'payment_method_NEFT', 'payment_method_RTGS', 'material_name_Binding Wire', 'material_name_CR Coils', 'material_name_Cement', 'material_name_Fly Ash', 'material_name_GGBS', 'material_name_Galvanized Sheets', 'material_name_HDPE Pipes', 'material_name_HR Coils', 'material_name_MS Coils', 'material_name_PVC Pipes', 'material_name_Plates', 'material_name_Rebars', 'material_name_Stainless Steel', 'material_name_Structural Steel', 'material_name_TMT Bars']


In [10]:
# 02_feature_build.ipynb — Block 9
feat_df.to_csv("../data/processed/features_v2.csv", index=False)
print("✅ saved features_v2.csv:", feat_df.shape)


✅ saved features_v2.csv: (1800, 46)
