# Bachelor's thesis

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score,roc_curve, brier_score_loss
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from scipy.stats import mannwhitneyu
from scipy.stats import ks_2samp

### Loading the data 

In [2]:
# Placeholder: start from an empty frame; the following cell overwrites it
# with the contents of the CSV file.
unbiased_data = pd.DataFrame()

In [3]:
# Read accepts.csv (path is relative to the notebook's working directory).
unbiased_data = pd.read_csv(
    '../data/accepts.csv',
    encoding="ISO-8859-1",
    low_memory=False,
)

## Rebalancing

### In this step I rebalance my data so that it contains a sensible proportion of good and bad loans and can represent an unbiased set. First I scale the numerical features and then use SMOTE to resample. You can try other resampling methods if you like.

In [4]:
# Standardise the continuous features before resampling. The fitted scaler is
# kept so the same columns can be mapped back to their original units later.
numeric_cols = ['loan_amnt', 'dti', 'fico', 'int_rate', "bc_open_to_buy", "acc_open_past_24mths"]
scaler = StandardScaler()
scaler.fit(unbiased_data[numeric_cols])
scaled_values = scaler.transform(unbiased_data[numeric_cols])
unbiased_data[numeric_cols] = scaled_values

In [5]:
# Separate the target column from the features for resampling.
y = unbiased_data["loan_status"]
X = unbiased_data.drop(columns=["loan_status"])

In [None]:
# Oversample with SMOTE; sampling_strategy=0.9 is the requested
# minority-to-majority ratio after resampling, and the seed fixes the
# synthetic samples that are generated.
sm = SMOTE(
    sampling_strategy=0.9,
    random_state=42,
)
X_unbiased, y_unbiased = sm.fit_resample(X, y)

In [None]:
# Put the resampled features and target back into one frame and inspect it.
resampled_parts = [X_unbiased, y_unbiased]
unbiased_data = pd.concat(resampled_parts, axis="columns")
unbiased_data.info()

### Now I invert the scaling and round the values so that they look like the original data. For example, FICO scores in our data only end in 0 or 5.

In [None]:
# Undo the standardisation so the resampled columns are back in original units.
scaled_cols = ['loan_amnt', 'dti', 'fico', 'int_rate', "bc_open_to_buy", "acc_open_past_24mths"]
unbiased_data[scaled_cols] = scaler.inverse_transform(
    unbiased_data[scaled_cols],
    copy=True,
)

In [None]:
# Round the listed columns to the nearest whole number and store them as ints.
columns_to_convert = ['loan_amnt', 'fico', 'bc_open_to_buy', 'acc_open_past_24mths']
rounded = unbiased_data[columns_to_convert].round()
unbiased_data[columns_to_convert] = rounded.astype(int)

In [None]:
def round_to_5_or_10(x):
    """Return the multiple of 5 nearest to ``x`` (so the result ends in 0 or 5).

    Exact ties are resolved by Python's built-in ``round``.
    """
    step = 5
    n_steps = round(x / step)
    return step * n_steps

# Snap every FICO value onto the 5-point grid, element by element.
unbiased_data['fico'] = unbiased_data['fico'].map(round_to_5_or_10)

In [None]:
def round_to_25_intervals(x):
    """Return the multiple of 25 nearest to ``x``.

    Exact ties are resolved by Python's built-in ``round``.
    """
    step = 25
    n_steps = round(x / step)
    return step * n_steps

# Snap every loan amount onto the 25-unit grid, element by element.
unbiased_data['loan_amnt'] = unbiased_data['loan_amnt'].map(round_to_25_intervals)

In [None]:
# Print column dtypes and non-null counts to check the result of the rounding steps.
unbiased_data.info()

## Scorecard to predict Accepts and Rejects 

### I use a simple XGB classifier to predict the probabilities of default and then use these predictions to assign the observations to the accepts or rejects. You can also drop the variables int_rate, bc_open_to_buy and acc_open_past_24mths to see how well a model with only the observed variables would do.

In [None]:
# Rebuild the feature matrix and target from the resampled, rounded frame.
y = unbiased_data["loan_status"]
X = unbiased_data.drop(columns=["loan_status"])

In [None]:
# Fit the scorecard on the full frame and score the very same rows (in-sample).
xgb_clf = XGBClassifier(
    n_estimators=150,
    learning_rate=1,
    max_depth=3,
    random_state=42,
)
xgb_clf.fit(X, y)

# One row per observation, one probability column per class.
yhat = xgb_clf.predict_proba(X)

### These are the results of the model

In [None]:
# Column 1 of yhat is the predicted probability of class 1.
p_class1 = yhat[:, 1]
auc = roc_auc_score(y, p_class1)
print("AUC:", auc)

In [None]:
# Brier score of the class-1 probabilities, with label 1 as the positive class.
p_class1 = yhat[:, 1]
brier = brier_score_loss(y, p_class1, pos_label=1)
print("Brier score:", brier)

In [None]:
# KS statistic: the largest gap between the true-positive and false-positive
# rates over all thresholds of the ROC curve.
p_class1 = yhat[:, 1]
fpr, tpr, thresholds = roc_curve(y, p_class1, pos_label=1)
rate_gaps = tpr - fpr
ks_statistic = max(rate_gaps)
print("KS-Statistic:", ks_statistic)

## Final synthetic data

### Now I assign my observations to the accepts or rejects based on the percentiles of the predictions, which gives me my new accepts and new rejects. I also drop the unobserved variables so that the reject inference models compared later don't have access to them.

In [None]:
# Store column 0 of predict_proba, i.e. the predicted probability of class 0
# (the complement of the class-1 probability used for AUC / Brier / KS).
# Higher values therefore mean a lower predicted probability of class 1.
# NOTE(review): presumably class 1 marks bad loans, making this a
# "probability of good" score; confirm against the loan_status encoding.
unbiased_data['predictions'] = yhat[:, 0]

In [None]:
# Remove the unobserved variables in one call.
unobserved_cols = ["int_rate", "bc_open_to_buy", "acc_open_past_24mths"]
unbiased_data = unbiased_data.drop(columns=unobserved_cols)

In [None]:
# Print the remaining columns and their dtypes after removing the unobserved variables.
unbiased_data.info()

In [None]:
# Split on the 70th percentile of the stored scores: rows at or above the
# cut-off become accepts, rows strictly below become rejects. Both cut-offs
# are currently identical, so every row lands in exactly one group.
scores = unbiased_data['predictions']
accept_percentile = np.percentile(scores, 70)  # Adjust percentile as required
reject_percentile = np.percentile(scores, 70)
New_accepts = unbiased_data.loc[scores >= accept_percentile]
New_rejects = unbiased_data.loc[scores < reject_percentile]

In [None]:
# The score was only needed for the split; remove it from both groups.
New_rejects = New_rejects.drop(columns='predictions')
New_accepts = New_accepts.drop(columns='predictions')

### I also drop duplicates, which makes it easier to analyse the models later.

In [None]:
# Tag each group so it can be recovered after de-duplicating the combined frame.
X_acc = New_accepts.assign(Rejected=0)
X_rej = New_rejects.assign(Rejected=1)
All_data = pd.concat([X_acc, X_rej])

# keep=False discards every row whose key combination occurs more than once,
# whether the repeats are within one group or across both.
dedup_keys = ['loan_amnt', 'emp_length', 'dti', 'fico', 'addr_state']
All_data = All_data.drop_duplicates(subset=dedup_keys, keep=False)

# Split back into the two groups and remove the helper flag from each.
New_accepts = All_data.loc[All_data['Rejected'] == 0].drop(columns='Rejected')
New_rejects = All_data.loc[All_data['Rejected'] == 1].drop(columns='Rejected')

### These are our final accepts and rejects and their values for loan_status

In [None]:
# Number of accepts per loan_status value.
New_accepts.value_counts(subset=['loan_status'])

In [None]:
# Number of rejects per loan_status value.
New_rejects.value_counts(subset=['loan_status'])

In [None]:
# Number of rows per loan_status value in the combined, de-duplicated frame.
All_data.value_counts(subset=['loan_status'])

### Here we save these newly created accepts and rejects.

In [None]:
# Write both groups as UTF-8 CSV files without the index column.
outputs = (
    (New_rejects, '../data/New_rejects.csv'),
    (New_accepts, '../data/New_accepts.csv'),
)
for frame, csv_path in outputs:
    frame.to_csv(csv_path, encoding='utf-8', index=False)

## Statistics

### Now I show some statistics about the created accepts and rejects and perform some statistical tests. Here you can see the Mann-Whitney U test for the variable loan amount as an example. I used the test statistic to calculate my effect sizes.

In [None]:
# Summary statistics of the accepts, with every value formatted to one decimal place.
New_accepts.describe().apply(lambda col: col.map('{0:.1f}'.format))

In [None]:
# Summary statistics of the rejects, with every value formatted to one decimal place.
New_rejects.describe().apply(lambda col: col.map('{0:.1f}'.format))

In [None]:
# Mann-Whitney U test comparing the loan amounts of accepts and rejects.
# The column is selected by name rather than by position, so the test keeps
# targeting loan_amnt even if the column order of the input file changes.
mannwhitneyu(New_accepts['loan_amnt'], New_rejects['loan_amnt'])