# Initial cleaning of dataset (similar to supervised model cleaning)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the raw loan export. low_memory=False makes pandas parse the file in one pass
# instead of in chunks, which avoids mixed-type inference within a column.
df = pd.read_csv('accepted_2007_to_2018Q4.csv', low_memory=False)

In [None]:
# Keep only the loans whose status is one of the three final outcomes listed below;
# .copy() detaches the result so later column assignments do not touch the raw frame.
final_status = ['Fully Paid', 'Charged Off', 'Default']
has_final_status = df['loan_status'].isin(final_status)
df = df.loc[has_final_status].copy()

In [None]:
# Binary target: 1 when the loan was charged off or defaulted, 0 otherwise.
bad_statuses = ['Charged Off', 'Default']
df['target'] = df['loan_status'].isin(bad_statuses).astype('int64')

# Drop every column in which more than half of the values are missing.
missing_ratio = df.isnull().mean()
mostly_missing = missing_ratio.index[missing_ratio > 0.5]
df = df.drop(columns=mostly_missing)

# Drop identifier, date and free-text columns, plus loan_status now that the target
# has been derived from it. Names that were already removed above are skipped.
# NOTE(review): columns such as total_pymnt, recoveries and last_pymnt_amnt are still
# present in the printed feature list; they look like information recorded after the
# loan outcome -- confirm they are meant to be part of the model input.
drop_cols = ['member_id', 'id', 'url', 'title', 'zip_code', 'issue_d',
             'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d',
             'emp_title', 'loan_status', 'pymnt_plan', 'initial_list_status',
             'disbursement_method']
present_drop_cols = [name for name in drop_cols if name in df.columns]
df = df.drop(columns=present_drop_cols)

# Encode the Y/N flags as 1/0 (values other than 'Y'/'N' become NaN under .map).
binary_map = {'Y': 1, 'N': 0}
flag_cols = [name for name in ('hardship_flag', 'debt_settlement_flag') if name in df.columns]
for col in flag_cols:
    df[col] = df[col].map(binary_map)

# Fill missing values: a sentinel label for emp_length, the column median for the
# two numeric ratios.
df['emp_length'] = df['emp_length'].fillna('Unknown')
for numeric_col in ('dti', 'revol_util'):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())

# Ordinal encoding: term is 0 when the text contains '36' and 1 otherwise;
# grade A..G becomes 1..7.
df['term'] = df['term'].map(lambda term_text: 1 if '36' not in term_text else 0)
grade_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
df['grade'] = df['grade'].map(grade_map)
# Clean emp_length
import re
def clean_emp_length(x):
    """Convert an employment-length label into a whole number of years.

    Parameters
    ----------
    x : object
        Raw label such as '10+ years', '3 years', '1 year', '< 1 year' or the
        'Unknown' placeholder. Non-string values are converted with str().

    Returns
    -------
    int
        The first integer found in the label; 0 for 'Unknown', for labels that
        start with '<' (less than one year) and for labels without any digit.
    """
    if x == 'Unknown':
        return 0
    text = str(x).strip()
    # '< 1 year' contains the digit 1 but means less than a full year; without this
    # guard it would be encoded as 1 and become indistinguishable from '1 year'.
    if text.startswith('<'):
        return 0
    match = re.search(r'\d+', text)
    return int(match.group()) if match else 0
# Turn the emp_length labels into integers, one value at a time.
df['emp_length'] = df['emp_length'].map(clean_emp_length)

# One-hot encode categorical columns; drop_first=True omits the first level of each
# column so the remaining dummies are not redundant.
categorical_cols = ['home_ownership', 'verification_status', 'purpose', 'addr_state', 'application_type']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Final cleanup: whatever is still object-typed was not encoded above and is removed.
leftover_text_cols = df.select_dtypes(include='object').columns
df = df.drop(columns=leftover_text_cols)

print(list(df.columns))



['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'emp_length', 'annual_inc', 'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'mths_since_last_delinq', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_amnt', 'last_fico_range_high', 'last_fico_range_low', 'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_

In [None]:
# Save full cleaned DataFrame including target
# index=False keeps the row index out of the file, so it does not come back as an
# extra column when the CSV is read again.
df.to_csv('cleaned_loan_data_with_target.csv', index=False)

# Using the cleaned dataset for Reinforcement learning

In [None]:
import pandas as pd
# Start the RL part from the cleaned CSV on disk; this rebinds df to the reloaded frame.
df = pd.read_csv('cleaned_loan_data_with_target.csv')

In [None]:
# Working copy that carries the logged action: every historical loan was approved (1).
df_rl = df.assign(action=1)

# Reward, stored on df: interest income (loan_amnt * int_rate / 100) when the loan was
# repaid (target == 0), otherwise the full principal as a loss. Computed column-wise
# instead of row by row.
repaid = df['target'] == 0
interest_income = df['loan_amnt'] * df['int_rate'] / 100
df['reward'] = interest_income.where(repaid, -df['loan_amnt'])

# State: every column of df_rl except the label, the reward and the action.
non_state = ['target', 'reward', 'action']
state_cols = [name for name in df_rl.columns if name not in non_state]

# Normalizing rewards and features

In [None]:
# Divide the monetary reward by a fixed constant so its magnitude is smaller.
reward_scale = 10000.0
paid_back = df['target'] == 0
scaled_profit = (df['loan_amnt'] * df['int_rate'] / 100) / reward_scale
scaled_loss = -df['loan_amnt'] / reward_scale
# Repaid loans earn the scaled interest; all others lose the scaled principal.
df['reward'] = scaled_profit.where(paid_back, scaled_loss)


In [None]:
# Count missing values per state column and show only the columns that have any.
nan_cols = df[state_cols].isna().sum()
print(nan_cols.loc[nan_cols.gt(0)])


mths_since_last_delinq        1807
mths_since_rcnt_il              88
il_util                        458
bc_open_to_buy                  42
bc_util                         44
mo_sin_old_il_acct              88
mths_since_recent_bc            41
mths_since_recent_inq          408
num_actv_rev_tl                  1
num_bc_sats                      1
num_bc_tl                        1
num_il_tl                        1
num_op_rev_tl                    1
num_rev_accts                    1
num_rev_tl_bal_gt_0              1
num_sats                         1
num_tl_120dpd_2m               314
num_tl_30dpd                     1
num_tl_90g_dpd_24m               1
num_tl_op_past_12m               1
pct_tl_nvr_dlq                   1
percent_bc_gt_75                48
pub_rec_bankruptcies             1
tax_liens                        1
tot_hi_cred_lim                  1
total_bal_ex_mort                1
total_bc_limit                   1
total_il_high_credit_limit       1
hardship_flag       

In [None]:
# Replacing NaN values based on whether it is numerical or categorical data.
# is_numeric_dtype recognises every numeric dtype (float32, int32, nullable Int64, ...),
# whereas comparing against 'float' / 'int' only matches float64 and the platform's
# default integer; any other numeric column would wrongly receive the string 'Unknown'.
for col in state_cols:
    if not df[col].isnull().any():
        continue
    if pd.api.types.is_numeric_dtype(df[col]):
        # Numeric column: impute with the column median.
        df[col] = df[col].fillna(df[col].median())
    else:
        # Non-numeric column: impute with a sentinel label.
        df[col] = df[col].fillna('Unknown')


In [None]:
# Guard: every state column must be fully populated before the features are scaled.
assert df[state_cols].notna().all().all(), "Still NaNs after filling!"


In [None]:
# Scaling features
from sklearn.preprocessing import StandardScaler

# Learn the per-column scaling parameters from the state columns, then apply them to
# the same columns to obtain the standardized feature matrix.
scaler = StandardScaler().fit(df[state_cols])
X_scaled = scaler.transform(df[state_cols])


In [None]:
# Confirming there is no NaN values after scaling
# np.isnan flags NaN only; infinite values would not trip this assertion.
assert not np.isnan(X_scaled).any(), "NaNs in scaled features!"


# Training the RL Agent using DQN (Deep Q-Network)

In [None]:
from d3rlpy.dataset import MDPDataset
from d3rlpy.algos import CQL
from sklearn.model_selection import train_test_split
import numpy as np

# Preparing the dataset
# Observations: the standardized state columns. They are produced with the scaler that
# was fitted earlier, instead of re-reading the raw df values, which would silently
# discard the scaling and feed unscaled features to the agent. Cast to float32.
X_scaled = scaler.transform(df[state_cols]).astype(np.float32)
# Logged action: every historical loan was approved, so the action is always 1.
# NOTE(review): with no action-0 transitions in this dataset, nothing in the data says
# what "deny" is worth -- confirm predictions of action 0 from an agent trained on it
# are meaningful.
actions = np.ones(len(df), dtype=np.int32)
rewards = df['reward'].values.astype(np.float32)
# Every row is marked terminal, i.e. each loan is a one-step episode.
terminals = np.ones_like(rewards, dtype=bool)

In [None]:
# Splitting the dataset into training and validation sets (no separate test split is
# made here).
from sklearn.model_selection import train_test_split

# train_test_split returns a (train, held-out) pair for every array passed in, in the
# same order, so the ten names on the left must stay aligned with the five inputs.
# test_size=0.1 holds out 10% of the rows; random_state=42 makes the shuffle repeatable.
# The last input, range(len(X_scaled)), yields the row positions that landed in each
# part; X_val_indices is used later to look the validation rows up in df.
X_train, X_val, a_train, a_val, r_train, r_val, t_train, t_val, train_indices, X_val_indices = train_test_split(
    X_scaled, actions, rewards, terminals, range(len(X_scaled)), test_size=0.1, random_state=42
)

# Test approach
Creating an MDP dataset for training the reinforcement learning agent
Stores:
- States (X)
- actions (a)
- rewards (r)
- terminal flags (t)

In [None]:
from d3rlpy.dataset import MDPDataset

# Positional arguments are observations, actions, rewards and terminal flags of the
# training split, in that order.
dataset = MDPDataset(X_train, a_train, r_train, t_train)


[2m2025-10-29 22:00.07[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int32')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(153,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(1,)])[0m
[2m2025-10-29 22:00.07[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.DISCRETE: 2>[0m
[2m2025-10-29 22:00.07[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m2[0m


# Testing the DQN training pipeline to check whether dataset structure is valid or not

In [None]:
from d3rlpy.algos import DQN, DQNConfig

# Default DQN hyperparameters (the parameter log printed by fit shows batch_size=32
# and gamma=0.99).
config = DQNConfig()

# Initialize agent with required arguments
dqn = DQN(config=config, device='cpu', enable_ddp=False)

# Smoke test only: 10 update steps are enough to confirm that the dataset structure is
# accepted, not to learn a usable policy.
dqn.fit(dataset, n_steps=10)

[2m2025-10-29 22:00.12[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('float32')], shape=[(153,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)[0m
[2m2025-10-29 22:00.12[0m [[32m[1mdebug    [0m] [1mBuilding models...            [0m
[2m2025-10-29 22:00.12[0m [[32m[1mdebug    [0m] [1mModels have been built.       [0m
[2m2025-10-29 22:00.12[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/DQN_20251029220012[0m
[2m2025-10-29 22:00.12[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [153], 'action_size': 2, 'config': {'type': 'dqn', 'params': {'batch_size': 32, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': 

[]

# Predicting actions from the dataset

In [None]:
# One predicted action per validation state.
# NOTE(review): at this point dqn has only had the 10-step smoke-test fit and its
# training data contains action 1 exclusively -- treat these predictions as a pipeline
# check rather than a learned policy.
predicted_actions = dqn.predict(X_val)

# Evaluating the predicted policy on validation data by computing the expected reward (realistic reward function)

Reward logic:
- If loan is approved and customer repays (target=0): positive profit = loan_amount * interest.
- If loan is approved and customer defaults (target=1): loss = -loan_amount.
- If loan is denied: no profit/loss = 0.

In [None]:
# Rebuild the validation rows from df by position and attach the agent's decision to
# each of them (assign returns a new frame, so df itself is left untouched).
val_df = df.iloc[X_val_indices].assign(predicted_action=predicted_actions)

# Computing reward based on predicted action (exploratory step)
def compute_reward(row):
    """Monetary reward of the predicted decision for one validation loan.

    Parameters
    ----------
    row : mapping
        Must provide 'predicted_action', 'target', 'loan_amnt' and 'int_rate'.

    Returns
    -------
    float
        0.0 when the loan is denied (predicted_action other than 1); for an approved
        loan, loan_amnt * int_rate / 100 if it was repaid (target == 0), otherwise
        -loan_amnt.
    """
    # Loan denied: nothing gained, nothing lost
    if row['predicted_action'] != 1:
        return 0.0

    # Loan approved and repaid: the lender earns the interest
    if row['target'] == 0:
        return row['loan_amnt'] * row['int_rate'] / 100

    # Loan approved but not repaid: the principal is lost
    return -row['loan_amnt']

# axis=1 hands each row to compute_reward, giving one reward per validation loan.
val_df['predicted_reward'] = val_df.apply(compute_reward, axis=1)

# Computing average expected reward from the validation predictions

In [None]:
# Mean reward per validation loan under the predicted actions; this uses the unscaled
# monetary reward from compute_reward, not the scaled reward column used for training.
average_reward = val_df['predicted_reward'].mean()
print("Average Predicted Reward:", average_reward)


Average Predicted Reward: -190.95464572192515


# Checking how many approvals vs denials the agent predicted - to assess policy bias

In [None]:
# Number of validation loans per predicted action (0 = deny, 1 = approve).
print(val_df['predicted_action'].value_counts())


predicted_action
0    296
1     78
Name: count, dtype: int64


# Analyzing average reward per predicted action

In [None]:
# Mean reward within each predicted action. Denied loans always score 0.0 because
# compute_reward returns 0.0 for them.
val_df.groupby('predicted_action')['predicted_reward'].mean()


Unnamed: 0_level_0,predicted_reward
predicted_action,Unnamed: 1_level_1
0,0.0
1,-915.603045


# Reward system for the DQN agent to learn a general policy for loan approval (simplified reward definition for RL training)

In [None]:
# Simplified reward: +1.0 for a repaid loan (target == 0), -1.0 for any other target.
# This overwrites the reward column already present on df.
df['reward'] = (df['target'] == 0).map({True: 1.0, False: -1.0})


In [None]:
# Assuming all historical loans were approved
# Adds an 'action' column to df; later state_cols definitions exclude it by name.
df['action'] = 1


# Creating a synthetic dataset for both possible actions:
- approved_df: historical real loans (action=1)
- denied_df: hypothetical 'deny' cases (action=0, zero reward)

In [None]:
# Approve branch: the historical loans with action 1 and a +1.0 / -1.0 reward
# depending on whether the loan was repaid (target == 0).
approved_df = df.assign(
    action=1,
    reward=(df['target'] == 0).map({True: 1.0, False: -1.0}),
)

# Deny branch: the same loans with action 0; no loan = no gain/loss.
denied_df = df.assign(action=0, reward=0.0)

# Stack both branches, shuffle reproducibly and renumber the rows. Every loan therefore
# appears twice in combined_df, once per action.
stacked_df = pd.concat([approved_df, denied_df], ignore_index=True)
combined_df = stacked_df.sample(frac=1, random_state=42).reset_index(drop=True)


# Preparing numerical features for training

In [None]:
# Every column except the label, the reward and the action belongs to the state vector.
state_cols = [c for c in combined_df.columns if c not in ['target', 'reward', 'action']]
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on all rows before the train/validation split that
# follows, so validation rows contribute to the scaling statistics -- consider fitting
# on the training rows only.
scaler = StandardScaler()
# StandardScaler returns float64; cast to float32 so the observations have the same
# dtype as the rewards and as the first dataset built in this notebook.
X = scaler.fit_transform(combined_df[state_cols]).astype(np.float32)
actions = combined_df['action'].values.astype(np.int32)
rewards = combined_df['reward'].values.astype(np.float32)
# Every row is marked terminal, i.e. each loan decision is a one-step episode.
terminals = np.ones_like(rewards, dtype=bool)


# Splitting the dataset into training and validation sets for model evaluation

In [None]:
from sklearn.model_selection import train_test_split

# train_test_split returns a (train, held-out) pair per input, in input order; 10% of
# the rows are held out and random_state=42 fixes the shuffle. val_indices holds the
# row positions of the held-out part within combined_df.
# NOTE(review): combined_df contains each loan twice (approve and deny copies), so a
# row-level split can place the same applicant features in both parts -- confirm this
# is acceptable for the evaluation.
X_train, X_val, a_train, a_val, r_train, r_val, t_train, t_val, train_indices, val_indices = train_test_split(
    X, actions, rewards, terminals, range(len(X)), test_size=0.1, random_state=42
)


# Creating an MDP dataset for training the reinforcement learning agent
Stores:
- States (X)
- actions (a)
- rewards (r)
- terminal flags (t)

In [None]:
from d3rlpy.dataset import MDPDataset

# Rebinds dataset to the approve/deny training split: observations, actions, rewards
# and terminal flags, in that order.
dataset = MDPDataset(X_train, a_train, r_train, t_train)


[2m2025-10-29 22:04.38[0m [[32m[1minfo     [0m] [1mSignatures have been automatically determined.[0m [36maction_signature[0m=[35mSignature(dtype=[dtype('int32')], shape=[(1,)])[0m [36mobservation_signature[0m=[35mSignature(dtype=[dtype('float64')], shape=[(153,)])[0m [36mreward_signature[0m=[35mSignature(dtype=[dtype('float32')], shape=[(1,)])[0m
[2m2025-10-29 22:04.38[0m [[32m[1minfo     [0m] [1mAction-space has been automatically determined.[0m [36maction_space[0m=[35m<ActionSpace.DISCRETE: 2>[0m
[2m2025-10-29 22:04.38[0m [[32m[1minfo     [0m] [1mAction size has been automatically determined.[0m [36maction_size[0m=[35m2[0m


# Training the DQN agent on the offline loan dataset

In [None]:
# Default DQN hyperparameters again; a fresh agent is created so nothing carries over
# from the earlier 10-step run.
config = DQNConfig()

# Initializing agent with required arguments
dqn = DQN(config=config, device='cpu', enable_ddp=False)

# Full training run: 50000 update steps, which the log reports as 5 epochs of 10000
# steps with a model checkpoint saved after each epoch.
dqn.fit(dataset, n_steps=50000)

[2m2025-10-29 22:05.47[0m [[32m[1minfo     [0m] [1mdataset info                  [0m [36mdataset_info[0m=[35mDatasetInfo(observation_signature=Signature(dtype=[dtype('float64')], shape=[(153,)]), action_signature=Signature(dtype=[dtype('int32')], shape=[(1,)]), reward_signature=Signature(dtype=[dtype('float32')], shape=[(1,)]), action_space=<ActionSpace.DISCRETE: 2>, action_size=2)[0m
[2m2025-10-29 22:05.47[0m [[32m[1mdebug    [0m] [1mBuilding models...            [0m
[2m2025-10-29 22:05.47[0m [[32m[1mdebug    [0m] [1mModels have been built.       [0m
[2m2025-10-29 22:05.47[0m [[32m[1minfo     [0m] [1mDirectory is created at d3rlpy_logs/DQN_20251029220547[0m
[2m2025-10-29 22:05.47[0m [[32m[1minfo     [0m] [1mParameters                    [0m [36mparams[0m=[35m{'observation_shape': [153], 'action_size': 2, 'config': {'type': 'dqn', 'params': {'batch_size': 32, 'gamma': 0.99, 'observation_scaler': {'type': 'none', 'params': {}}, 'action_scaler': 

Epoch 1/5:   0%|          | 0/10000 [00:00<?, ?it/s]

[2m2025-10-29 22:06.22[0m [[32m[1minfo     [0m] [1mDQN_20251029220547: epoch=1 step=10000[0m [36mepoch[0m=[35m1[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0007092863082885742, 'time_algorithm_update': 0.002721205759048462, 'loss': 0.010905840966625693, 'time_step': 0.00352421293258667}[0m [36mstep[0m=[35m10000[0m
[2m2025-10-29 22:06.22[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DQN_20251029220547/model_10000.d3[0m


Epoch 2/5:   0%|          | 0/10000 [00:00<?, ?it/s]

[2m2025-10-29 22:06.56[0m [[32m[1minfo     [0m] [1mDQN_20251029220547: epoch=2 step=20000[0m [36mepoch[0m=[35m2[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0006769908666610718, 'time_algorithm_update': 0.0025758283376693726, 'loss': 0.00020183251825974367, 'time_step': 0.003344250702857971}[0m [36mstep[0m=[35m20000[0m
[2m2025-10-29 22:06.56[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DQN_20251029220547/model_20000.d3[0m


Epoch 3/5:   0%|          | 0/10000 [00:00<?, ?it/s]

[2m2025-10-29 22:07.30[0m [[32m[1minfo     [0m] [1mDQN_20251029220547: epoch=3 step=30000[0m [36mepoch[0m=[35m3[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0006808190107345581, 'time_algorithm_update': 0.002578108763694763, 'loss': 0.00014586059204357298, 'time_step': 0.0033563970565795897}[0m [36mstep[0m=[35m30000[0m
[2m2025-10-29 22:07.30[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DQN_20251029220547/model_30000.d3[0m


Epoch 4/5:   0%|          | 0/10000 [00:00<?, ?it/s]

[2m2025-10-29 22:08.04[0m [[32m[1minfo     [0m] [1mDQN_20251029220547: epoch=4 step=40000[0m [36mepoch[0m=[35m4[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0006780359029769897, 'time_algorithm_update': 0.002568923234939575, 'loss': 0.000115547615496439, 'time_step': 0.0033425585746765137}[0m [36mstep[0m=[35m40000[0m
[2m2025-10-29 22:08.04[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DQN_20251029220547/model_40000.d3[0m


Epoch 5/5:   0%|          | 0/10000 [00:00<?, ?it/s]

[2m2025-10-29 22:08.38[0m [[32m[1minfo     [0m] [1mDQN_20251029220547: epoch=5 step=50000[0m [36mepoch[0m=[35m5[0m [36mmetrics[0m=[35m{'time_sample_batch': 0.0006920247793197632, 'time_algorithm_update': 0.0026267071723937987, 'loss': 9.508389734853608e-05, 'time_step': 0.0034264955043792726}[0m [36mstep[0m=[35m50000[0m
[2m2025-10-29 22:08.38[0m [[32m[1minfo     [0m] [1mModel parameters are saved to d3rlpy_logs/DQN_20251029220547/model_50000.d3[0m


[(1,
  {'time_sample_batch': 0.0007092863082885742,
   'time_algorithm_update': 0.002721205759048462,
   'loss': 0.010905840966625693,
   'time_step': 0.00352421293258667}),
 (2,
  {'time_sample_batch': 0.0006769908666610718,
   'time_algorithm_update': 0.0025758283376693726,
   'loss': 0.00020183251825974367,
   'time_step': 0.003344250702857971}),
 (3,
  {'time_sample_batch': 0.0006808190107345581,
   'time_algorithm_update': 0.002578108763694763,
   'loss': 0.00014586059204357298,
   'time_step': 0.0033563970565795897}),
 (4,
  {'time_sample_batch': 0.0006780359029769897,
   'time_algorithm_update': 0.002568923234939575,
   'loss': 0.000115547615496439,
   'time_step': 0.0033425585746765137}),
 (5,
  {'time_sample_batch': 0.0006920247793197632,
   'time_algorithm_update': 0.0026267071723937987,
   'loss': 9.508389734853608e-05,
   'time_step': 0.0034264955043792726})]

# Evaluation metric

In [None]:
# Score the held-out states with the trained agent and attach each decision to the
# matching combined_df row (looked up by position; assign returns a new frame).
predicted_actions = dqn.predict(X_val)
val_df = combined_df.iloc[val_indices].assign(predicted_action=predicted_actions)

def compute_reward(row):
    """Simplified reward of the predicted decision for one validation row.

    This definition replaces the monetary compute_reward defined earlier in the
    notebook.

    Parameters
    ----------
    row : mapping
        Must provide 'predicted_action' and 'target'.

    Returns
    -------
    float
        0.0 when the loan is denied (predicted_action other than 1); for an approved
        loan, 1.0 if it was repaid (target == 0) and -1.0 otherwise.
    """
    # Denied: no gain, no loss
    if row['predicted_action'] != 1:
        return 0.0
    # Approved: +1 for a repaid loan, -1 for any other outcome
    return 1.0 if row['target'] == 0 else -1.0

# Reward the agent's decision on every validation row, then report the mean.
# NOTE(review): combined_df holds each loan twice (approve and deny copies), so the same
# applicant features can sit in both the training and validation parts -- confirm this
# figure is not inflated by that overlap.
val_df['predicted_reward'] = val_df.apply(compute_reward, axis=1)
print("Average Predicted Reward:", val_df['predicted_reward'].mean())


Average Predicted Reward: 0.8235294117647058
