In [50]:
import pandas as pd
import numpy as np

from sklearn.utils import resample
from sklearn.model_selection import train_test_split

import xgboost as xgb

from sklearn.metrics import roc_auc_score
# ROC is used for evaluation of model

In [51]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_output = pd.DataFrame()
df_output['id'] = df_test['id'].copy()

In [52]:
df_train.describe()

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0,58645.0
mean,29322.0,27.550857,64046.17,4.701015,9217.556518,10.677874,0.159238,5.813556,0.142382
std,16929.497605,6.033216,37931.11,3.959784,5563.807384,3.034697,0.091692,4.029196,0.349445
min,0.0,20.0,4200.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,14661.0,23.0,42000.0,2.0,5000.0,7.88,0.09,3.0,0.0
50%,29322.0,26.0,58000.0,4.0,8000.0,10.75,0.14,4.0,0.0
75%,43983.0,30.0,75600.0,7.0,12000.0,12.99,0.21,8.0,0.0
max,58644.0,123.0,1900000.0,123.0,35000.0,23.22,0.83,30.0,1.0


In [53]:
def remove_outliers(df : pd.DataFrame, columns : list, significance: int):
    for column in columns:
        upperLimit = df[column].quantile(1-significance*.01)
        lowerLimit = df[column].quantile(significance*.01)
        df = df.loc[(df[column] > lowerLimit) & (df[column] < upperLimit)]
    return df

def normalize_data(df : pd.DataFrame, columns : list):
    for column in columns:
        df[column] = df[column] / df[column].abs().max() 
    return df

def create_dummies(df : pd.DataFrame, columns : list):
    for column in columns:
        df = pd.concat([df.drop(column, axis=1), pd.get_dummies(df[column], prefix=column)], axis=1)
    return df

In [54]:
columns_outliers = ['person_income', 'loan_amnt', 'person_age', 'person_emp_length', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
columns_normalization = ['person_income', 'loan_amnt', 'person_age', 'person_emp_length', 'loan_percent_income', 'cb_person_cred_hist_length']
columns_dummies = ['person_home_ownership', 'loan_intent', 'loan_grade']

#df_train = remove_outliers(df_train, columns_outliers, 1)
df_train = normalize_data(df_train, columns_normalization)
df_train = create_dummies(df_train, columns_dummies)
df_train['cb_person_default_on_file'] = df_train['cb_person_default_on_file'].map(dict(Y=1, N=0))

df_test = normalize_data(df_test, columns_normalization)
df_test = create_dummies(df_test, columns_dummies)
df_test['cb_person_default_on_file'] = df_test['cb_person_default_on_file'].map(dict(Y=1, N=0))

In [55]:
df_train['loan_status'].value_counts()

loan_status
0    50295
1     8350
Name: count, dtype: int64

In [56]:
# Balance Dataset by oversampling
minority_class = df_train[df_train['loan_status'] == 1]
majority_class = df_train[df_train['loan_status'] == 0]


minority_upsampled = resample(minority_class, replace=True, n_samples=len(majority_class), random_state=42)
df_train = pd.concat([majority_class, minority_upsampled])

In [57]:
df_train['loan_status'].value_counts()

loan_status
0    50295
1    50295
Name: count, dtype: int64

In [58]:
df_train.head()

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G
0,0,0.300813,0.018421,0.0,0.171429,11.49,0.204819,0,0.466667,0,...,False,False,False,False,True,False,False,False,False,False
1,1,0.178862,0.029474,0.04878,0.114286,13.35,0.084337,0,0.066667,0,...,True,False,False,False,False,True,False,False,False,False
2,2,0.235772,0.015158,0.065041,0.171429,8.9,0.253012,0,0.333333,0,...,False,True,False,True,False,False,False,False,False,False
3,3,0.243902,0.036842,0.113821,0.342857,11.11,0.204819,0,0.166667,0,...,False,False,True,False,True,False,False,False,False,False
4,4,0.178862,0.031579,0.01626,0.171429,6.92,0.120482,0,0.1,0,...,True,False,False,True,False,False,False,False,False,False


In [59]:
df_train, df_validation = train_test_split(df_train, test_size=0.2, random_state=42)

In [60]:
clf = xgb.XGBClassifier(tree_method="hist")
clf.fit(df_train.loc[:, df_train.columns != 'loan_status'], df_train['loan_status'])
predictions = clf.predict(df_validation.loc[:, df_validation.columns != 'loan_status'])

In [61]:
roc_auc_score(df_validation['loan_status'],predictions)

0.9379907200177358

In [62]:
predictions_test = clf.predict(df_test)
df_output['loan_status'] = predictions_test
df_output = df_output.set_index('id')
df_output.to_csv('Predictions_v3.csv')
