# Imports

In [1]:
import pandas as pd
import numpy as np
import d3rlpy
from sklearn.model_selection import train_test_split

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [2]:
SEED = 42
d3rlpy.seed(SEED)

In [3]:
DATA_PATH = '../data/processed_loans.csv'
df = pd.read_csv(DATA_PATH)

# Reward Engineering

<p>Logic for reward engineering:<br>
1. Action = Deny: Reward = 0 (We will generate these later in Step 2)<br>
2. Action = Approve & Fully Paid: Reward = + (loan_amnt * int_rate) (Profit)<br>
3. Action = Approve & Default: Reward = - loan_amnt (Loss)</p>

In [6]:
def calculated_reward(row):
    if row['target'] == 0: # Fully paid
        return row['loan_amnt'] * (row['int_rate']/100.0) # Reward
    
    else: # Default
        return -1.0 * row['loan_amnt'] # Reward

In [7]:
df['reward'] = df.apply(calculated_reward, axis=1)

## Financial inspection based upon rewards

In [8]:
print("Reward Statistics: ")
print(df['reward'].describe())

print(f"\nMax Profit (Best Loan):   ${df['reward'].max():.2f}")
print(f"Max Loss (Worst Loan):    ${df['reward'].min():.2f}")
print(f"Total Portfolio Value:    ${df['reward'].sum():,.2f}")

Reward Statistics: 
count    176083.000000
mean      -1805.502019
std        8039.184595
min      -35000.000000
25%         380.700000
50%         983.200000
75%        1893.600000
max       10146.500000
Name: reward, dtype: float64

Max Profit (Best Loan):   $10146.50
Max Loss (Worst Loan):    $-35000.00
Total Portfolio Value:    $-317,918,212.10


<p>The historical strategy (approving these specific people) was a financial disaster. Even though ~80% of people paid back, the 20% who defaulted caused such massive losses (up to -$35,000 each) that they wiped out all the profit from the good loans.</p>

<p>The DL Model (from Phase 2) learned to mimic this history. It would approve almost everyone to get high accuracy.

Result: The DL model would lose the company $317 Million.

The RL Agent's Job: It needs to look at this mess and learn to say "NO" (Action 0). If it simply denied everyone, the profit would be $0, which is effectively a $317 Million improvement over the current strategy!</p>

# MDP Construction & Augmentation

<p>Create the "Training Data" for the RL agent. We need to teach the agent that Denying (Action 0) is a valid option that yields $0 Reward.</p>

Batch A (The Reality): The actual loans. <br>
State: Applicant Features <br>
Action: 1 (Approve) <br>
Reward: -$1,805 (on average) <br>

Batch B (The Simulation): The same applicants, but hypothetically denied. <br>
State: Applicant Features <br>
Action: 0 (Deny) <br>
Reward: 0 (Risk-Free) <br>

In [9]:
feature_cols = df.drop(columns=['target', 'reward']).columns
X = df[feature_cols].values

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
print(f"Shape: {X_scaled.shape}")

Shape: (176083, 79)


## Batch A: "Real Approvals" (Action = 1)

In [14]:
obs_approve = X_scaled
act_approve = np.ones(len(df), dtype=int) # Action 1
rew_approve = df['reward'].values         # Real Profit/Loss

## Batch B: "Synthetic Denials" (Action = 0)

In [15]:
obs_deny = X_scaled
act_deny = np.zeros(len(df), dtype=int)   # Action 0
rew_deny = np.zeros(len(df), dtype=float) # Reward 0 (Safe)

## Combining A & B for Training Set

In [16]:
observations = np.concatenate([obs_approve, obs_deny], axis=0)
actions = np.concatenate([act_approve, act_deny], axis=0)
rewards = np.concatenate([rew_approve, rew_deny], axis=0)
terminals = np.ones(len(observations), dtype=int)

In [18]:
dataset = d3rlpy.dataset.MDPDataset(
    observations=observations,
    actions=actions,
    rewards=rewards,
    terminals=terminals,
    #discrete_action=True
)

2025-12-09 12:38.12 [info     ] Signatures have been automatically determined. action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]) observation_signature=Signature(dtype=[dtype('float64')], shape=[(79,)]) reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)])
2025-12-09 12:38.12 [info     ] Action-space has been automatically determined. action_space=<ActionSpace.DISCRETE: 2>
2025-12-09 12:38.13 [info     ] Action size has been automatically determined. action_size=2


In [19]:
print(f"Total Transitions: {len(observations)}")
print(f"Action Distribution: {np.unique(actions, return_counts=True)}")
print(f"Average Reward in Dataset: {np.mean(rewards):.2f}")

Total Transitions: 352166
Action Distribution: (array([0, 1]), array([176083, 176083]))
Average Reward in Dataset: -902.75


# Training the RL Agent (CQL based)

In [20]:
cql = d3rlpy.algos.DiscreteCQLConfig(
    batch_size=256,
    learning_rate=1e-4,
    alpha=1.0 # Controls how "conservative" the agent is 
).create(device="cuda:0")

In [22]:
cql.fit(
    dataset,
    n_steps=10000,
    n_steps_per_epoch=1000,
    save_interval=1000,
    #eval_episodes=None, # don't have a gym environment to evaluate live
    experiment_name="loan_cql_run"
)

2025-12-09 12:45.17 [info     ] dataset info                   dataset_info=DatasetInfo(observation_signature=Signature(dtype=[dtype('float64')], shape=[(79,)]), action_signature=Signature(dtype=[dtype('int64')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float64')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)
2025-12-09 12:45.17 [debug    ] Building models...            
2025-12-09 12:45.18 [debug    ] Models have been built.       
2025-12-09 12:45.18 [info     ] Directory is created at d3rlpy_logs\loan_cql_run_20251209124518
2025-12-09 12:45.18 [info     ] Parameters                     params={'observation_shape': [79], 'action_size': 2, 'config': {'type': 'discrete_cql', 'params': {'batch_size': 256, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': {'type': 'none', 'params': {}}, 'reward_scaler': {'type': 'none', 'params': {}}, 'compile_graph': False, 'learning_rate': 0.0001, 'optim_factory': {'type': 'ada

Epoch 1/10:   0%|          | 0/1000 [00:00<?, ?it/s]

2025-12-09 12:45.30 [info     ] loan_cql_run_20251209124518: epoch=1 step=1000 epoch=1 metrics={'time_sample_batch': 0.004452955484390259, 'time_algorithm_update': 0.0071353626251220706, 'loss': 2234.8534659423826, 'td_loss': 2233.859842895508, 'conservative_loss': 0.9936207665205001, 'time_step': 0.011640814304351807} step=1000


Epoch 2/10:   0%|          | 0/1000 [00:00<?, ?it/s]

2025-12-09 12:45.41 [info     ] loan_cql_run_20251209124518: epoch=2 step=2000 epoch=2 metrics={'time_sample_batch': 0.004081830501556396, 'time_algorithm_update': 0.007274195432662964, 'loss': 2227.985461425781, 'td_loss': 2226.917547729492, 'conservative_loss': 1.0679138877987862, 'time_step': 0.011447357416152954} step=2000


Epoch 3/10:   0%|          | 0/1000 [00:00<?, ?it/s]

2025-12-09 12:45.53 [info     ] loan_cql_run_20251209124518: epoch=3 step=3000 epoch=3 metrics={'time_sample_batch': 0.0047473304271698, 'time_algorithm_update': 0.006556984901428223, 'loss': 2242.6324913330077, 'td_loss': 2241.5626302490236, 'conservative_loss': 1.0698606004714966, 'time_step': 0.011416449785232544} step=3000


Epoch 4/10:   0%|          | 0/1000 [00:00<?, ?it/s]

2025-12-09 12:46.05 [info     ] loan_cql_run_20251209124518: epoch=4 step=4000 epoch=4 metrics={'time_sample_batch': 0.004105709314346314, 'time_algorithm_update': 0.0071660706996917725, 'loss': 2234.772651489258, 'td_loss': 2233.7125447998046, 'conservative_loss': 1.0601074762940408, 'time_step': 0.011385433197021485} step=4000


Epoch 5/10:   0%|          | 0/1000 [00:00<?, ?it/s]

2025-12-09 12:46.17 [info     ] loan_cql_run_20251209124518: epoch=5 step=5000 epoch=5 metrics={'time_sample_batch': 0.004933040618896484, 'time_algorithm_update': 0.007271283149719239, 'loss': 2246.772549560547, 'td_loss': 2245.697715698242, 'conservative_loss': 1.0748399902582169, 'time_step': 0.012377523183822632} step=5000


Epoch 6/10:   0%|          | 0/1000 [00:00<?, ?it/s]

2025-12-09 12:46.29 [info     ] loan_cql_run_20251209124518: epoch=6 step=6000 epoch=6 metrics={'time_sample_batch': 0.004644346714019775, 'time_algorithm_update': 0.00751648998260498, 'loss': 2235.352747558594, 'td_loss': 2234.2765003662107, 'conservative_loss': 1.0762469405531883, 'time_step': 0.012233129262924195} step=6000


Epoch 7/10:   0%|          | 0/1000 [00:00<?, ?it/s]

2025-12-09 12:46.43 [info     ] loan_cql_run_20251209124518: epoch=7 step=7000 epoch=7 metrics={'time_sample_batch': 0.004695441484451294, 'time_algorithm_update': 0.008285058498382568, 'loss': 2249.717970825195, 'td_loss': 2248.648846435547, 'conservative_loss': 1.0691236688494683, 'time_step': 0.013110782623291016} step=7000


Epoch 8/10:   0%|          | 0/1000 [00:00<?, ?it/s]

2025-12-09 12:46.57 [info     ] loan_cql_run_20251209124518: epoch=8 step=8000 epoch=8 metrics={'time_sample_batch': 0.004977902889251709, 'time_algorithm_update': 0.008572404623031616, 'loss': 2240.7507578125, 'td_loss': 2239.6659310302734, 'conservative_loss': 1.084826292693615, 'time_step': 0.01367981219291687} step=8000


Epoch 9/10:   0%|          | 0/1000 [00:00<?, ?it/s]

2025-12-09 12:47.09 [info     ] loan_cql_run_20251209124518: epoch=9 step=9000 epoch=9 metrics={'time_sample_batch': 0.004460729837417602, 'time_algorithm_update': 0.007578107357025146, 'loss': 2252.0469572753905, 'td_loss': 2250.964448364258, 'conservative_loss': 1.0825099548697472, 'time_step': 0.012147035837173461} step=9000


Epoch 10/10:   0%|          | 0/1000 [00:00<?, ?it/s]

2025-12-09 12:47.21 [info     ] loan_cql_run_20251209124518: epoch=10 step=10000 epoch=10 metrics={'time_sample_batch': 0.004477681159973145, 'time_algorithm_update': 0.007120664358139038, 'loss': 2227.849716430664, 'td_loss': 2226.769458984375, 'conservative_loss': 1.080257894217968, 'time_step': 0.011683770418167115} step=10000


[(1,
  {'time_sample_batch': 0.004452955484390259,
   'time_algorithm_update': 0.0071353626251220706,
   'loss': 2234.8534659423826,
   'td_loss': 2233.859842895508,
   'conservative_loss': 0.9936207665205001,
   'time_step': 0.011640814304351807}),
 (2,
  {'time_sample_batch': 0.004081830501556396,
   'time_algorithm_update': 0.007274195432662964,
   'loss': 2227.985461425781,
   'td_loss': 2226.917547729492,
   'conservative_loss': 1.0679138877987862,
   'time_step': 0.011447357416152954}),
 (3,
  {'time_sample_batch': 0.0047473304271698,
   'time_algorithm_update': 0.006556984901428223,
   'loss': 2242.6324913330077,
   'td_loss': 2241.5626302490236,
   'conservative_loss': 1.0698606004714966,
   'time_step': 0.011416449785232544}),
 (4,
  {'time_sample_batch': 0.004105709314346314,
   'time_algorithm_update': 0.0071660706996917725,
   'loss': 2234.772651489258,
   'td_loss': 2233.7125447998046,
   'conservative_loss': 1.0601074762940408,
   'time_step': 0.011385433197021485}),
 (5,

In [23]:
cql.save_model("cql_loan_agent.pt")
print("Model saved to cql_loan_agent.pt")

Model saved to cql_loan_agent.pt


# Evaluating the RL model

<p>1. Select ONLY the Real Historical Data (Batch A) <br>
In Step 2, we concatenated [Real, Synthetic]. The first half of the dataset is the Real data.</p>

In [24]:
n_real = len(df)
real_observations = observations[:n_real]
real_rewards = rewards[:n_real]

<p>2. Ask the RL Agent to make decisions</p>

In [25]:
# predict() returns the action (0 or 1) for each state
rl_actions = cql.predict(real_observations)

<p>3. Calculate the RL values</p>

In [26]:
rl_portfolio_rewards = []
rl_approve_count = 0

In [27]:
for i in range(n_real):
    action = rl_actions[i]
    
    if action == 0: # Agent Denies
        rl_portfolio_rewards.append(0) # Safe, no gain/loss
    else: # Agent Approves
        # We get the ACTUAL historical result
        rl_portfolio_rewards.append(real_rewards[i])
        rl_approve_count += 1

## Compare Human vs RL-based policies

In [28]:
human_avg_reward = np.mean(real_rewards) # actual
rl_avg_reward = np.mean(rl_portfolio_rewards) # What RL would have done

In [30]:
print(f"Human Banker Avg Reward:   ${human_avg_reward:.2f}")
print(f"RL Agent Avg Reward:       ${rl_avg_reward:.2f}")
print("-" * 40)
print(f"Human Approval Rate:       100.0% (By definition of dataset's subset)")
print(f"RL Agent Approval Rate:    {rl_approve_count / n_real * 100:.1f}%")

Human Banker Avg Reward:   $-1805.50
RL Agent Avg Reward:       $-1325.56
----------------------------------------
Human Approval Rate:       100.0% (By definition of dataset's subset)
RL Agent Approval Rate:    95.2%


## Total Impact

In [31]:
total_gain = (rl_avg_reward - human_avg_reward) * n_real
print(f"Total Portfolio Improvement: ${total_gain:,.2f}")

Total Portfolio Improvement: $84,510,026.31


# Saving the results

In [None]:
comparison_df = pd.DataFrame({
    'Metric': ['Average Reward per Loan', 'Total Portfolio Value'],
    'Human Policy': [human_avg_reward, np.sum(real_rewards)],
    'RL Agent Policy': [rl_avg_reward, np.sum(rl_portfolio_rewards)]
})
print("\nFinal Comparison Table:")
display(comparison_df)


Final Comparison Table:


Unnamed: 0,Metric,Human Policy,RL Agent Policy
0,Average Reward per Loan,-1805.502,-1325.558
1,Total Portfolio Value,-317918200.0,-233408200.0
