In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import d3rlpy
from d3rlpy.algos import DiscreteCQL
from d3rlpy.dataset import MDPDataset

# Directory that holds the cleaned LendingClub CSV read below.
processed_path = Path("../data/processed")

# Load the cleaned loan table; later cells reference it as `df`.
df = pd.read_csv(processed_path.joinpath("lendingclub_clean.csv"))

# Cell output: (n_rows, n_cols) and the mean of the `default` column.
(df.shape, df["default"].mean())


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  from .autonotebook import tqdm as notebook_tqdm


((1373915, 31), np.float64(0.21484298519195147))

In [2]:
# Candidate observation features for the RL state vector. Any name that is
# not a column of `df` is dropped silently, so the resulting list always
# matches the loaded table.
state_features = [
    col
    for col in (
        "loan_amnt",
        "term_months",
        "int_rate",
        "installment",
        "annual_inc",
        "dti",
        "delinq_2yrs",
        "inq_last_6mths",
        "open_acc",
        "pub_rec",
        "revol_bal",
        "revol_util",
        "total_acc",
        "mort_acc",
        "pub_rec_bankruptcies",
        "fico_range_low",
        "fico_range_high",
        "credit_age_years",
    )
    if col in df.columns
]

# Cell output: how many features survived the filter, and which ones.
(len(state_features), state_features)


(18,
 ['loan_amnt',
  'term_months',
  'int_rate',
  'installment',
  'annual_inc',
  'dti',
  'delinq_2yrs',
  'inq_last_6mths',
  'open_acc',
  'pub_rec',
  'revol_bal',
  'revol_util',
  'total_acc',
  'mort_acc',
  'pub_rec_bankruptcies',
  'fico_range_low',
  'fico_range_high',
  'credit_age_years'])

In [3]:
from pathlib import Path
import numpy as np
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from d3rlpy.dataset import MDPDataset
from d3rlpy.algos import DiscreteCQLConfig

# --- Build the offline-RL dataset and train a Discrete CQL critic ------------
# Each loan is a one-step episode: state = scaled applicant features,
# action = APPROVE (every logged loan was approved), reward = realised
# profit/loss in thousands.

SEED = 42  # shared by the train/test split and the d3rlpy / torch RNGs
# Seed d3rlpy's random sources so network init and minibatch sampling are
# repeatable from one Restart & Run All to the next.
d3rlpy.seed(SEED)

# Keep only rows whose outcome is a known binary label.
df_rl = df.dropna(subset=["default"]).copy()
df_rl = df_rl[df_rl["default"].isin([0, 1])].reset_index(drop=True)

print("RL rows:", len(df_rl), "| default rate:", df_rl["default"].mean().round(3))


def to_dec(r):
    """Convert interest rates to decimal fractions, element by element.

    Values greater than 1.0 are treated as percentages and divided by 100;
    values at or below 1.0 are returned unchanged. The result is float32.

    NOTE(review): the ``> 1.0`` test is applied per element, so a genuine
    percentage below 1 would be left unscaled -- confirm ``int_rate`` never
    falls in that range.
    """
    return np.where(r > 1.0, r / 100.0, r).astype("float32")


y_def = df_rl["default"].astype("int64").to_numpy()
loan = df_rl["loan_amnt"].astype("float32").to_numpy()
rate = to_dec(df_rl["int_rate"].to_numpy())

# Split BEFORE computing any statistics so nothing learned from the test rows
# (imputation medians, scaler mean/std) can leak into training features or
# into the scaler that is persisted for later use.
idx = np.arange(len(df_rl))
tr_idx, te_idx = train_test_split(idx, test_size=0.2, random_state=SEED, stratify=y_def)

# Impute missing feature values with medians estimated on the training rows only.
features_all = df_rl[state_features]
train_medians = features_all.iloc[tr_idx].median(numeric_only=True)
X_raw = features_all.fillna(train_medians)

# Fit the scaler on training rows only, then apply it to every row.
scaler = StandardScaler()
scaler.fit(X_raw.iloc[tr_idx].to_numpy(dtype="float32"))
X_all = scaler.transform(X_raw.to_numpy(dtype="float32"))

X_tr, X_te = X_all[tr_idx], X_all[te_idx]
def_tr, def_te = y_def[tr_idx], y_def[te_idx]
loan_tr, loan_te = loan[tr_idx], loan[te_idx]
rate_tr, rate_te = rate[tr_idx], rate[te_idx]

DENY, APPROVE = 0, 1

# The logged policy approved every loan, so the behaviour data contains only
# the APPROVE action.
act_tr = np.full_like(def_tr, APPROVE, dtype="int64")

# Reward for an approved loan: loan_amnt * rate if it was repaid, minus the
# full loan amount if it defaulted; expressed in thousands.
reward_tr = np.zeros_like(loan_tr, dtype="float32")
good, bad = def_tr == 0, def_tr == 1
reward_tr[good] = loan_tr[good] * rate_tr[good]
reward_tr[bad] = -loan_tr[bad]
reward_tr /= 1000.0

# loan_amnt / int_rate are not imputed above; a NaN there would silently turn
# the TD loss into NaN, so fail loudly instead.
assert np.isfinite(reward_tr).all(), "non-finite training reward: check loan_amnt / int_rate for missing values"

# Every transition is terminal (one-step episodes).
dataset = MDPDataset(
    X_tr.astype("float32"),
    act_tr,
    reward_tr,
    terminals=np.ones_like(reward_tr, dtype=bool),
)

print("Mean reward (train, approve-all):", reward_tr.mean().round(3))

cfg = DiscreteCQLConfig()
cql = cfg.create(device="cpu")
cql.fit(
    dataset,
    n_steps=100_000,
    n_steps_per_epoch=5_000,
    show_progress=True,
)

def rewards(actions, ln, rt, df):
    """Realised per-loan reward, in thousands, for a vector of decisions.

    Args:
        actions: array of decisions; entries equal to ``APPROVE`` count as
            funded, everything else earns nothing.
        ln: loan amounts.
        rt: interest rates (multiplied directly with ``ln``).
        df: default flags, 0 = repaid and 1 = defaulted. Inside this
            function the name shadows the notebook-level DataFrame ``df``.

    Returns:
        float32 array: ``ln * rt`` where an approved loan repaid, ``-ln``
        where an approved loan defaulted, 0 elsewhere; all divided by 1000.
    """
    approved = actions == APPROVE
    repaid = approved & (df == 0)
    defaulted = approved & (df == 1)
    # Nested selection replaces masked assignment; anything neither repaid
    # nor defaulted (denied, or an unexpected flag) stays at zero.
    payoff = np.where(repaid, ln * rt, np.where(defaulted, -ln, 0.0))
    return payoff.astype("float32") / 1000.0

# --- Held-out evaluation ------------------------------------------------------
# Baseline: the logged behaviour, which approved every test loan.
logged_act = np.full(def_te.shape, APPROVE, dtype="int64")
logged_r = rewards(logged_act, loan_te, rate_te, def_te).mean()

# Learned rule: approve only when the critic's value for APPROVE is positive
# (a denied loan always earns 0 under `rewards`).
q_app = cql.predict_value(
    X_te.astype("float32"),
    action=np.full(len(X_te), APPROVE, "int64"),
)
rl_act = np.where(q_app > 0.0, APPROVE, DENY).astype("int64")
rl_r = rewards(rl_act, loan_te, rate_te, def_te).mean()

print("\n--- Policy value (thousands per loan) ---")
print(f"Logged approve-all : {logged_r:.3f}")
print(f"RL Q>0 policy      : {rl_r:.3f}")

print("\nApproval rates   | default rates by decision")
print("Logged:", np.mean(logged_act == APPROVE).round(3))
for a, label in ((DENY, "DENY"), (APPROVE, "APP")):
    m = rl_act == a
    # Guard against an empty group so the default rate is never a mean of nothing.
    if m.any():
        print(f"RL {label}: n={m.sum():4d} | def_rate={def_te[m].mean():.3f}")
    else:
        print(f"RL {label}: n=0")

# Persist the trained critic and the feature scaler side by side.
# NOTE(review): `save_model` is used with a ".d3" file name -- confirm the
# loading code expects parameters written by `save_model` rather than `save`.
out_dir = Path("../data/processed")
cql.save_model(out_dir / "cql_policy.d3")
dump(scaler, out_dir / "rl_state_scaler.joblib")
print("\nSaved model and scaler to", out_dir)


RL rows: 1373915 | default rate: 0.215
2025-12-11 16:18.18 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('float32')], shape=[(18,)]) reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)])
2025-12-11 16:18.18 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.DISCRETE: 2>
2025-12-11 16:18.21 [info     ] Action size has been automatically determined. action_size=2
Mean reward (train, approve-all): -1.932
2025-12-11 16:18.23 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(18,)]), action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)
2025-12-11 16:18.23 [debug    ] Building models...            
2025-12-11

Epoch 1/20: 100%|██████████| 5000/5000 [00:26<00:00, 190.37it/s, loss=3.82, td_loss=3.81, conservative_loss=0.0124]


2025-12-11 16:18.51 [info     ] DiscreteCQL_20251211161824: epoch=1 step=5000 epoch=1 metrics={'time_sample_batch': 0.0008704916000366211, 'time_algorithm_update': 0.004081946516036987, 'loss': 3.81947074874416, 'td_loss': 3.807104358164221, 'conservative_loss': 0.012366387033186039, 'time_step': 0.005174429225921631} step=5000
2025-12-11 16:18.51 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_5000.d3


Epoch 2/20: 100%|██████████| 5000/5000 [00:27<00:00, 181.83it/s, loss=3.75, td_loss=3.75, conservative_loss=5.42e-5] 


2025-12-11 16:19.18 [info     ] DiscreteCQL_20251211161824: epoch=2 step=10000 epoch=2 metrics={'time_sample_batch': 0.0009021766662597657, 'time_algorithm_update': 0.00427284140586853, 'loss': 3.747914449682459, 'td_loss': 3.7478602888222783, 'conservative_loss': 5.41615898557211e-05, 'time_step': 0.005412972021102905} step=10000
2025-12-11 16:19.18 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_10000.d3


Epoch 3/20: 100%|██████████| 5000/5000 [00:26<00:00, 189.71it/s, loss=3.79, td_loss=3.79, conservative_loss=6.21e-6]


2025-12-11 16:19.45 [info     ] DiscreteCQL_20251211161824: epoch=3 step=15000 epoch=3 metrics={'time_sample_batch': 0.0008533257961273194, 'time_algorithm_update': 0.004112821817398071, 'loss': 3.790979060420394, 'td_loss': 3.7909728613853453, 'conservative_loss': 6.199032154017914e-06, 'time_step': 0.005192131614685059} step=15000
2025-12-11 16:19.45 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_15000.d3


Epoch 4/20: 100%|██████████| 5000/5000 [00:26<00:00, 190.55it/s, loss=3.77, td_loss=3.77, conservative_loss=7.01e-7]


2025-12-11 16:20.11 [info     ] DiscreteCQL_20251211161824: epoch=4 step=20000 epoch=4 metrics={'time_sample_batch': 0.0008439227104187012, 'time_algorithm_update': 0.0040979547023773195, 'loss': 3.7719131730653346, 'td_loss': 3.771912472178042, 'conservative_loss': 7.005341351032257e-07, 'time_step': 0.0051680573463439945} step=20000
2025-12-11 16:20.11 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_20000.d3


Epoch 5/20: 100%|██████████| 5000/5000 [00:25<00:00, 197.41it/s, loss=3.72, td_loss=3.72, conservative_loss=6.19e-8]


2025-12-11 16:20.36 [info     ] DiscreteCQL_20251211161824: epoch=5 step=25000 epoch=5 metrics={'time_sample_batch': 0.000836931037902832, 'time_algorithm_update': 0.003926714611053467, 'loss': 3.7243664292648435, 'td_loss': 3.7243664017915727, 'conservative_loss': 6.178691983222961e-08, 'time_step': 0.004988942766189575} step=25000
2025-12-11 16:20.36 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_25000.d3


Epoch 6/20: 100%|██████████| 5000/5000 [00:22<00:00, 218.25it/s, loss=3.78, td_loss=3.78, conservative_loss=4.85e-9]


2025-12-11 16:20.59 [info     ] DiscreteCQL_20251211161824: epoch=6 step=30000 epoch=6 metrics={'time_sample_batch': 0.0007836950302124023, 'time_algorithm_update': 0.003518218183517456, 'loss': 3.782096094980091, 'td_loss': 3.782096094895154, 'conservative_loss': 4.845112562179565e-09, 'time_step': 0.00451491904258728} step=30000
2025-12-11 16:20.59 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_30000.d3


Epoch 7/20: 100%|██████████| 5000/5000 [00:26<00:00, 185.25it/s, loss=3.75, td_loss=3.75, conservative_loss=3.7e-10] 


2025-12-11 16:21.26 [info     ] DiscreteCQL_20251211161824: epoch=7 step=35000 epoch=7 metrics={'time_sample_batch': 0.0008844974040985108, 'time_algorithm_update': 0.0041876310825347905, 'loss': 3.7454173569515348, 'td_loss': 3.7454173569515348, 'conservative_loss': 3.6954879760742186e-10, 'time_step': 0.005312508392333984} step=35000
2025-12-11 16:21.26 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_35000.d3


Epoch 8/20: 100%|██████████| 5000/5000 [00:22<00:00, 223.20it/s, loss=3.75, td_loss=3.75, conservative_loss=5.82e-11]


2025-12-11 16:21.49 [info     ] DiscreteCQL_20251211161824: epoch=8 step=40000 epoch=8 metrics={'time_sample_batch': 0.000761633014678955, 'time_algorithm_update': 0.0034409104347229, 'loss': 3.7480337739676237, 'td_loss': 3.7480337739676237, 'conservative_loss': 5.8114528656005857e-11, 'time_step': 0.004412642335891724} step=40000
2025-12-11 16:21.49 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_40000.d3


Epoch 9/20: 100%|██████████| 5000/5000 [00:21<00:00, 231.66it/s, loss=3.73, td_loss=3.73, conservative_loss=7.46e-12]


2025-12-11 16:22.10 [info     ] DiscreteCQL_20251211161824: epoch=9 step=45000 epoch=9 metrics={'time_sample_batch': 0.0007348190784454346, 'time_algorithm_update': 0.0033103960514068603, 'loss': 3.7263733174517752, 'td_loss': 3.7263733174517752, 'conservative_loss': 7.450580596923828e-12, 'time_step': 0.004252180624008179} step=45000
2025-12-11 16:22.10 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_45000.d3


Epoch 10/20: 100%|██████████| 5000/5000 [00:20<00:00, 239.75it/s, loss=3.71, td_loss=3.71, conservative_loss=5.97e-12]


2025-12-11 16:22.31 [info     ] DiscreteCQL_20251211161824: epoch=10 step=50000 epoch=10 metrics={'time_sample_batch': 0.0007154226303100586, 'time_algorithm_update': 0.0031917922019958495, 'loss': 3.715200203165412, 'td_loss': 3.715200203165412, 'conservative_loss': 5.960464477539063e-12, 'time_step': 0.004110252618789673} step=50000
2025-12-11 16:22.31 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_50000.d3


Epoch 11/20: 100%|██████████| 5000/5000 [00:20<00:00, 238.55it/s, loss=3.72, td_loss=3.72, conservative_loss=4.48e-11]


2025-12-11 16:22.52 [info     ] DiscreteCQL_20251211161824: epoch=11 step=55000 epoch=11 metrics={'time_sample_batch': 0.0007193023204803467, 'time_algorithm_update': 0.003209153699874878, 'loss': 3.723073830935359, 'td_loss': 3.723073830935359, 'conservative_loss': 4.470348358154297e-11, 'time_step': 0.004130197095870971} step=55000
2025-12-11 16:22.52 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_55000.d3


Epoch 12/20: 100%|██████████| 5000/5000 [00:21<00:00, 232.64it/s, loss=3.74, td_loss=3.74, conservative_loss=7.09e-11]


2025-12-11 16:23.14 [info     ] DiscreteCQL_20251211161824: epoch=12 step=60000 epoch=12 metrics={'time_sample_batch': 0.000730507230758667, 'time_algorithm_update': 0.003297607755661011, 'loss': 3.740699812617898, 'td_loss': 3.740699812617898, 'conservative_loss': 7.078051567077637e-11, 'time_step': 0.004237124300003052} step=60000
2025-12-11 16:23.14 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_60000.d3


Epoch 13/20: 100%|██████████| 5000/5000 [00:21<00:00, 235.17it/s, loss=3.71, td_loss=3.71, conservative_loss=2.69e-10]


2025-12-11 16:23.35 [info     ] DiscreteCQL_20251211161824: epoch=13 step=65000 epoch=13 metrics={'time_sample_batch': 0.0007224892616271972, 'time_algorithm_update': 0.003262483787536621, 'loss': 3.711511780309677, 'td_loss': 3.711511780142784, 'conservative_loss': 2.682209014892578e-10, 'time_step': 0.004190690231323242} step=65000
2025-12-11 16:23.35 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_65000.d3


Epoch 14/20: 100%|██████████| 5000/5000 [00:20<00:00, 238.54it/s, loss=3.73, td_loss=3.73, conservative_loss=9.55e-11]


2025-12-11 16:23.56 [info     ] DiscreteCQL_20251211161824: epoch=14 step=70000 epoch=14 metrics={'time_sample_batch': 0.0007188632011413574, 'time_algorithm_update': 0.0032079299926757814, 'loss': 3.7264795622348785, 'td_loss': 3.7264795622348785, 'conservative_loss': 9.5367431640625e-11, 'time_step': 0.004131971311569214} step=70000
2025-12-11 16:23.56 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_70000.d3


Epoch 15/20: 100%|██████████| 5000/5000 [00:21<00:00, 232.47it/s, loss=3.68, td_loss=3.68, conservative_loss=1.13e-10]


2025-12-11 16:24.17 [info     ] DiscreteCQL_20251211161824: epoch=15 step=75000 epoch=15 metrics={'time_sample_batch': 0.0007321259021759033, 'time_algorithm_update': 0.0032984567165374756, 'loss': 3.677728113435209, 'td_loss': 3.677728113435209, 'conservative_loss': 1.1324882507324219e-10, 'time_step': 0.004239110851287842} step=75000
2025-12-11 16:24.18 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_75000.d3


Epoch 16/20: 100%|██████████| 5000/5000 [00:21<00:00, 232.31it/s, loss=3.7, td_loss=3.7, conservative_loss=5.6e-11]   


2025-12-11 16:24.39 [info     ] DiscreteCQL_20251211161824: epoch=16 step=80000 epoch=16 metrics={'time_sample_batch': 0.0007332563877105713, 'time_algorithm_update': 0.0032988405704498292, 'loss': 3.695740035907924, 'td_loss': 3.695740035907924, 'conservative_loss': 5.587935447692871e-11, 'time_step': 0.0042404850959777834} step=80000
2025-12-11 16:24.39 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_80000.d3


Epoch 17/20: 100%|██████████| 5000/5000 [00:21<00:00, 228.88it/s, loss=3.69, td_loss=3.69, conservative_loss=1.24e-10]


2025-12-11 16:25.01 [info     ] DiscreteCQL_20251211161824: epoch=17 step=85000 epoch=17 metrics={'time_sample_batch': 0.0007497227191925049, 'time_algorithm_update': 0.0033425195217132568, 'loss': 3.6927394653201104, 'td_loss': 3.692739465272427, 'conservative_loss': 1.2367963790893555e-10, 'time_step': 0.004304240274429321} step=85000
2025-12-11 16:25.01 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_85000.d3


Epoch 18/20: 100%|██████████| 5000/5000 [00:21<00:00, 235.88it/s, loss=3.71, td_loss=3.71, conservative_loss=4.7e-11] 


2025-12-11 16:25.22 [info     ] DiscreteCQL_20251211161824: epoch=18 step=90000 epoch=18 metrics={'time_sample_batch': 0.0007196068286895752, 'time_algorithm_update': 0.003248333692550659, 'loss': 3.7061887362152337, 'td_loss': 3.706188736191392, 'conservative_loss': 4.693865776062012e-11, 'time_step': 0.004177911376953125} step=90000
2025-12-11 16:25.22 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_90000.d3


Epoch 19/20: 100%|██████████| 5000/5000 [00:21<00:00, 235.48it/s, loss=3.7, td_loss=3.7, conservative_loss=5.6e-11]   


2025-12-11 16:25.43 [info     ] DiscreteCQL_20251211161824: epoch=19 step=95000 epoch=19 metrics={'time_sample_batch': 0.0007236002445220948, 'time_algorithm_update': 0.003249044370651245, 'loss': 3.6987307060092687, 'td_loss': 3.6987307060092687, 'conservative_loss': 5.587935447692871e-11, 'time_step': 0.004185031318664551} step=95000
2025-12-11 16:25.43 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_95000.d3


Epoch 20/20: 100%|██████████| 5000/5000 [00:21<00:00, 235.15it/s, loss=3.7, td_loss=3.7, conservative_loss=6.05e-11]  


2025-12-11 16:26.05 [info     ] DiscreteCQL_20251211161824: epoch=20 step=100000 epoch=20 metrics={'time_sample_batch': 0.0007323276996612549, 'time_algorithm_update': 0.003247660732269287, 'loss': 3.7001272270798684, 'td_loss': 3.7001272270798684, 'conservative_loss': 6.0349702835083e-11, 'time_step': 0.004191861248016357} step=100000
2025-12-11 16:26.05 [info     ] Model parameters are saved to d3rlpy_logs\DiscreteCQL_20251211161824\model_100000.d3

--- Policy value (thousands per loan) ---
Logged approve-all : -1.942
RL Q>0 policy      : -1.547

Approval rates   | default rates by decision
Logged: 1.0
RL DENY: n=14415 | def_rate=0.479
RL APP: n=260368 | def_rate=0.200

Saved model and scaler to ..\data\processed
