In [1]:
import os, json, textwrap, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
np.random.seed(42)
# --- Simulate data ---
# Every draw below pulls from NumPy's global RNG seeded above, so the generated values
# depend on the order of these statements: reordering or inserting a draw changes all
# columns sampled after it.
N = 8000

# Categorical attributes, each sampled independently with the listed probabilities.
roles = np.random.choice(["driver", "merchant"], size=N, p=[0.7, 0.3])
gender = np.random.choice(["M", "F"], size=N, p=[0.75, 0.25])
age_group = np.random.choice(["<=25", "26-35", "36-50", "50+"], size=N, p=[0.2, 0.4, 0.3, 0.1])
city_tier = np.random.choice([1,2,3], size=N, p=[0.45,0.35,0.20])
# Numeric features. Where np.clip is used, the last two arguments are the lower/upper bounds.
tenure_months = np.random.gamma(shape=3, scale=9, size=N).astype(int) + 1  # gamma draw truncated to int, +1 so the minimum is 1
days_active = np.clip(np.random.normal(22, 5, size=N), 5, 30).astype(int)
trips_per_week = np.clip(np.random.normal(55, 15, size=N), 5, 120).astype(int)
earnings_avg = np.clip(np.random.normal(700, 180, size=N), 100, 2000)  # per week (local currency units)
earnings_var = np.clip(np.random.gamma(2, 80, size=N), 5, 1500)
on_time_rate = np.clip(np.random.normal(0.93, 0.05, size=N), 0.6, 1.0)
cancel_rate = np.clip(np.random.beta(2, 18, size=N), 0.0, 0.45)
customer_rating = np.clip(np.random.normal(4.73, 0.2, size=N), 3.0, 5.0)
complaints = np.random.poisson(lam=0.15, size=N)
accidents = np.random.binomial(1, p=0.02, size=N) + np.random.binomial(1, p=0.01, size=N)  # sum of two Bernoulli draws: 0, 1 or 2, rarely non-zero
night_shift_pct = np.clip(np.random.beta(2, 5, size=N), 0.0, 1.0)
cashless_ratio = np.clip(np.random.beta(4, 2, size=N), 0.0, 1.0)
wallet_txn_volume = np.clip(np.random.normal(250, 100, size=N), 0, 1000)
vehicle_age = np.clip(np.random.normal(4.0, 2.0, size=N), 0, 15)  # years
past_due_history = np.random.binomial(1, p=0.08, size=N)  # whether historically late on any repayments (simulated proxy)

# Assemble the partner-level table: one row per simulated partner, with ids 1..N
# followed by the simulated attributes in the same order they were generated.
df = pd.DataFrame(
    dict(
        partner_id=np.arange(1, N + 1),
        role=roles,
        gender=gender,
        age_group=age_group,
        city_tier=city_tier,
        tenure_months=tenure_months,
        days_active=days_active,
        trips_per_week=trips_per_week,
        earnings_avg=earnings_avg,
        earnings_var=earnings_var,
        on_time_rate=on_time_rate,
        cancel_rate=cancel_rate,
        customer_rating=customer_rating,
        complaints=complaints,
        accidents=accidents,
        night_shift_pct=night_shift_pct,
        cashless_ratio=cashless_ratio,
        wallet_txn_volume=wallet_txn_volume,
        vehicle_age=vehicle_age,
        past_due_history=past_due_history,
    )
)

# --- Latent "true repayment propensity" derived from operational features ---
# Each pair is (column, signed weight). Positive weights raise the propensity
# (earnings, on-time rate, rating, cashless share, wallet volume, activity, tenure);
# negative weights lower it (earnings variance, cancellations, complaints, accidents,
# vehicle age, past-due history).
_z_terms = (
    ("earnings_avg", 0.0025),
    ("earnings_var", -0.0005),
    ("on_time_rate", 2.0),
    ("cancel_rate", -1.8),
    ("customer_rating", 0.4),
    ("complaints", -0.15),
    ("accidents", -0.35),
    ("cashless_ratio", 0.5),
    ("wallet_txn_volume", 0.003),
    ("trips_per_week", 0.0015),
    ("days_active", 0.002),
    ("tenure_months", 0.002),
    ("vehicle_age", -0.05),
    ("past_due_history", -0.9),
)

# Accumulate the weighted columns left to right, in the order listed above.
z = sum(weight * df[column] for column, weight in _z_terms)

# Inject *historical bias* that is unrelated to repayment (to test mitigation):
# pretend past processes unfairly penalized certain groups in the observed outcomes,
# i.e. the labels themselves are biased.
# Partners in tier-3 cities get a -0.25 shift and women a smaller -0.10 shift.
tier3_penalty = df["city_tier"].eq(3).astype(int).mul(-0.25)
female_penalty = df["gender"].eq("F").astype(int).mul(-0.10)
bias_shift = tier3_penalty + female_penalty

# Probability of *good repayment* (1 = good / approved class): logistic function of
# the latent propensity plus the bias shift, offset by a constant -3.0.
# NOTE(review): the original comment said this offset sets the overall good rate to
# ~60-70%. With the feature distributions simulated in this cell the logit appears to
# centre near +3, which would imply a much higher rate (roughly 95%) -- check
# df["good_repayment"].mean() and recalibrate the offset if ~60-70% is intended.
logit_good = z + bias_shift - 3.0
p_good = 1.0 / (1.0 + np.exp(-logit_good))

# One Bernoulli draw per partner gives the observed label, stored as a new column.
good = np.random.binomial(1, p_good, size=N)
df["good_repayment"] = good


In [3]:
# Show the simulated partner table; the recorded output below is elided by pandas
# (rows and columns replaced with "...").
df

Unnamed: 0,partner_id,role,gender,age_group,city_tier,tenure_months,days_active,trips_per_week,earnings_avg,earnings_var,...,cancel_rate,customer_rating,complaints,accidents,night_shift_pct,cashless_ratio,wallet_txn_volume,vehicle_age,past_due_history,good_repayment
0,1,driver,M,<=25,2,36,19,52,638.443420,101.826834,...,0.135795,4.837981,0,0,0.348988,0.350500,160.028448,4.336435,0,1
1,2,merchant,M,<=25,2,17,24,53,444.440684,69.151447,...,0.061321,4.778687,0,0,0.286113,0.915312,104.718719,5.482631,0,1
2,3,merchant,M,26-35,1,61,28,35,659.666189,223.266722,...,0.045608,4.418110,1,0,0.287171,0.642508,134.089531,6.072208,0,1
3,4,driver,F,<=25,3,33,18,58,503.720971,24.461841,...,0.053521,4.782128,0,0,0.279957,0.671180,268.472293,3.909398,0,0
4,5,driver,M,50+,3,27,19,57,637.742060,150.778596,...,0.150525,4.905054,0,0,0.108403,0.785409,0.000000,3.512688,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,7996,driver,M,50+,2,22,26,37,651.340236,178.112415,...,0.009458,4.877334,0,0,0.687393,0.451506,275.217454,4.551144,0,1
7996,7997,driver,M,26-35,3,12,23,31,804.831806,21.553709,...,0.109542,4.864437,0,0,0.562117,0.441517,323.231347,0.000000,1,1
7997,7998,driver,M,26-35,2,13,21,44,648.224239,48.960279,...,0.130103,4.602213,0,0,0.193823,0.741244,240.555107,1.850392,0,1
7998,7999,merchant,M,26-35,1,45,20,81,861.418504,49.013844,...,0.124617,4.975657,0,0,0.212527,0.836395,352.932530,4.547698,0,1


In [4]:
# Save raw data
# The target directory comes from the PARTNERS_RAW_DATA_DIR environment variable when
# set; otherwise it defaults to data/raw relative to the current working directory.
# This replaces an absolute path inside one user's home folder, which could not be
# written on any other machine.
# NOTE(review): confirm the default matches where downstream steps read partners.csv,
# or set PARTNERS_RAW_DATA_DIR to the project's data/raw directory.
raw_data_dir = os.environ.get("PARTNERS_RAW_DATA_DIR", os.path.join("data", "raw"))
os.makedirs(raw_data_dir, exist_ok=True)  # to_csv does not create missing parent directories
df.to_csv(os.path.join(raw_data_dir, "partners.csv"), index=False)