In [15]:
!pip install optuna



In [16]:
import pandas as pd
import numpy as np
import optuna
import json
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_sample_weight

In [17]:
df = pd.read_csv("/content/df_feature_engineered.csv")

In [18]:
df

Unnamed: 0,partner_id,role,gender,age_group,city_tier,earnings_avg,on_time_rate,cancel_rate,customer_rating,complaints,...,engagement_ratio,weekly_trip_ratio,reliability_index,complaint_rate,accident_rate,txn_freq_per_day,wallet_txn_flag,vehicle_utilization,earnings_vs_city_avg,rating_vs_age_avg
0,1,driver,M,<=25,2,638.443420,0.964244,0.135795,4.837981,0,...,0.513514,2.600000,0.833304,0.000000,0.0,8.001422,1,9.744333,0.913992,1.023845
1,2,merchant,M,<=25,2,444.440684,0.931376,0.061321,4.778687,0,...,1.333333,2.120000,0.874263,0.000000,0.0,4.188749,1,8.175693,0.636259,1.011297
2,3,merchant,M,26-35,1,659.666189,0.960345,0.045608,4.418110,1,...,0.451613,1.206897,0.916545,0.027778,0.0,4.623777,1,4.948950,0.942219,0.934736
3,4,driver,F,<=25,3,503.720971,0.850885,0.053521,4.782128,0,...,0.529412,3.052632,0.805345,0.000000,0.0,14.130121,1,11.814075,0.721940,1.012025
4,5,driver,M,50+,3,637.742060,0.970612,0.150525,4.905054,0,...,0.678571,2.850000,0.824510,0.000000,0.0,0.000000,0,12.631052,0.914021,1.038896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,7996,driver,M,50+,2,651.340236,0.963597,0.009458,4.877334,0,...,1.130435,1.370370,0.954483,0.000000,0.0,10.193239,1,6.665293,0.932455,1.033024
7996,7997,driver,M,26-35,3,804.831806,0.932786,0.109542,4.864437,0,...,1.769231,1.291667,0.830607,0.000000,0.0,13.467973,1,31.000000,1.153497,1.029165
7997,7998,driver,M,26-35,2,648.224239,0.916002,0.130103,4.602213,0,...,1.500000,2.000000,0.796827,0.000000,0.0,10.934323,1,15.436476,0.927994,0.973687
7998,7999,merchant,M,26-35,1,861.418504,0.932769,0.124617,4.975657,0,...,0.434783,3.809524,0.816530,0.000000,0.0,16.806311,1,14.420394,1.230387,1.052696


In [19]:
categorical_features = ["role", "gender", "age_group", "city_tier",
                        "vehicle_age_bin", "wallet_txn_bin"]

numeric_features = [col for col in df.columns if col not in categorical_features + ["partner_id", "good_repayment"]]

In [20]:
X = df.drop(columns=["partner_id", "good_repayment"])
y = df["good_repayment"]

In [21]:
def reweight_samples(df, sensitive_col="city_tier"):
    group_counts = df[sensitive_col].value_counts()
    weights = df[sensitive_col].map(lambda x: 1.0 / group_counts[x])
    return (weights / weights.mean()).values

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

w_train = reweight_samples(X_train, sensitive_col="city_tier")

In [23]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numeric_features)
    ]
)

In [24]:
def objective(trial):
    C = trial.suggest_loguniform("C", 1e-3, 10)
    penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    solver = "liblinear" if penalty == "l1" else "lbfgs"

    clf = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(
            C=C,
            penalty=penalty,
            solver=solver,
            class_weight="balanced",
            max_iter=1000,
            random_state=42
        ))
    ])

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs, fairness_penalties = [], []

    for train_idx, valid_idx in cv.split(X_train, y_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]
        w_tr = w_train[train_idx]

        clf.fit(X_tr, y_tr, classifier__sample_weight=w_tr)
        y_pred_prob = clf.predict_proba(X_val)[:, 1]

        auc = roc_auc_score(y_val, y_pred_prob)

        # --- Fairness penalty across all tiers ---
        group_means = []
        for g in [1, 2, 3]:
            mask = (X_val["city_tier"] == g)
            if mask.sum() > 0:   # only if group exists in this fold
                group_means.append(y_pred_prob[mask].mean())
        dpd = max(group_means) - min(group_means) if len(group_means) > 1 else 0

        aucs.append(auc)
        fairness_penalties.append(dpd)

    # Adjust the tradeoff: λ controls weight of fairness penalty
    return np.mean(aucs) - 0.5 * np.mean(fairness_penalties)


In [25]:
# === Run Optuna ===
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best params:", study.best_params)

[I 2025-09-09 15:10:48,498] A new study created in memory with name: no-name-5fd46a69-0a97-434e-a872-2c175a4634d3
  C = trial.suggest_loguniform("C", 1e-3, 10)
[I 2025-09-09 15:10:48,989] Trial 0 finished with value: 0.6123063784246343 and parameters: {'C': 0.7174571885946396, 'penalty': 'l2'}. Best is trial 0 with value: 0.6123063784246343.
  C = trial.suggest_loguniform("C", 1e-3, 10)
[I 2025-09-09 15:10:49,372] Trial 1 finished with value: 0.6254058503196928 and parameters: {'C': 0.01076433382408315, 'penalty': 'l2'}. Best is trial 1 with value: 0.6254058503196928.
  C = trial.suggest_loguniform("C", 1e-3, 10)
[I 2025-09-09 15:10:49,881] Trial 2 finished with value: 0.6119372222173158 and parameters: {'C': 1.6150754520846302, 'penalty': 'l2'}. Best is trial 1 with value: 0.6254058503196928.
  C = trial.suggest_loguniform("C", 1e-3, 10)
[I 2025-09-09 15:10:50,406] Trial 3 finished with value: 0.6125126460629162 and parameters: {'C': 0.5936119851369973, 'penalty': 'l2'}. Best is trial

Best params: {'C': 0.01588619088190863, 'penalty': 'l1'}


In [26]:
# === Train Final Model ===
best_params = study.best_params
penalty = best_params["penalty"]
solver = "liblinear" if penalty == "l1" else "lbfgs"

final_clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(
        C=best_params["C"],
        penalty=penalty,
        solver=solver,
        class_weight="balanced",
        max_iter=1000,
        random_state=42
    ))
])

final_clf.fit(X_train, y_train, classifier__sample_weight=w_train)

y_pred_prob = final_clf.predict_proba(X_test)[:, 1]
y_pred = final_clf.predict(X_test)

In [30]:
from sklearn.metrics import (
    roc_auc_score, f1_score, classification_report,
    accuracy_score, precision_score, recall_score, roc_curve
)
import json
import numpy as np

# After fitting best model:
y_pred_prob = final_clf.predict_proba(X_test)[:, 1]
y_pred = final_clf.predict(X_test)

# --- Metrics ---
auc = roc_auc_score(y_test, y_pred_prob)
gini = 2 * auc - 1

# KS statistic
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
ks = max(tpr - fpr)

metrics = {
    "AUC": auc,
    "Gini": gini,
    "KS": ks,
    "F1": f1_score(y_test, y_pred),
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "Report": classification_report(y_test, y_pred, output_dict=True)
}


with open("metrics.json", "w") as f:
    json.dump(metrics, f, indent=4)

In [28]:
def prob_to_score(prob, base=600, pdo=50, odds=20):
    odds_ratio = prob / (1 - prob)
    score = base + pdo / np.log(2) * np.log(odds / odds_ratio)
    return score

nova_scores = prob_to_score(y_pred_prob)
df_scores = pd.DataFrame({
    "partner_id": X_test.index,
    "probability": y_pred_prob,
    "nova_score": nova_scores
})
df_scores.to_csv("nova_scores.csv", index=False)

print("✅ Done! metrics.json and nova_scores.csv saved.")

✅ Done! metrics.json and nova_scores.csv saved.
