# Loan Delinquency Prediction

Import the basic packages

In [64]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)

# Reading the Datasets 

In [65]:
train = pd.read_csv("train.csv")

In [66]:
test = pd.read_csv("test.csv")

# Data Preprocessing

Some of the features in the given dataset are categorical. Hence we can replace them with unique numerical values

In [67]:
train['source'] = train['source'].map({'X':1,'Y':2,'Z':3}) 
train['loan_purpose'] = train['loan_purpose'].map({'A23':1,'B12':2,'C86':3}) 
train["financial_institution"] = train["financial_institution"].map({'Browning-Hart':1,'Swanson, Newton and Miller':2,
                                                                     'Edwards-Hoffman':3,'Martinez, Duffy and Bird':4,
                                                                    'Miller, Mcclure and Allen':5,'Nicholson Group':6,
                                                                    'Turner, Baldwin and Rhodes':7,'Suarez Inc':8,
                                                                    'Cole, Brooks and Vincent':9,'Richards-Walters':10,
                                                                    'Richards-Walters':11,'Taylor, Hunt and Rodriguez':12,
                                                                    'Sanchez-Robinson':13,'Sanchez, Hays and Wilkerson':14,
                                                                    'Romero, Woods and Johnson':15,'Thornton-Davis':16,
                                                                    'Anderson-Taylor':17,'Richardson Ltd':18,
                                                                    'Chapman-Mcmahon':19,'OTHER':20})
test['source'] = test['source'].map({'X':1,'Y':2,'Z':3}) 
test['loan_purpose'] = test['loan_purpose'].map({'A23':1,'B12':2,'C86':3}) 
test["financial_institution"] = test["financial_institution"].map({'Browning-Hart':1,'Swanson, Newton and Miller':2,
                                                                     'Edwards-Hoffman':3,'Martinez, Duffy and Bird':4,
                                                                    'Miller, Mcclure and Allen':5,'Nicholson Group':6,
                                                                    'Turner, Baldwin and Rhodes':7,'Suarez Inc':8,
                                                                    'Cole, Brooks and Vincent':9,'Richards-Walters':10,
                                                                    'Richards-Walters':11,'Taylor, Hunt and Rodriguez':12,
                                                                    'Sanchez-Robinson':13,'Sanchez, Hays and Wilkerson':14,
                                                                    'Romero, Woods and Johnson':15,'Thornton-Davis':16,
                                                                    'Anderson-Taylor':17,'Richardson Ltd':18,
                                                                    'Chapman-Mcmahon':19,'OTHER':20})

In [68]:
train.head()

Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,origination_date,first_payment_date,loan_to_value,number_of_borrowers,debt_to_income_ratio,borrower_credit_score,loan_purpose,insurance_percent,co-borrower_credit_score,insurance_type,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13
0,268055008619,3,7,4.25,214000,360,2012-03-01,05/2012,95,1.0,22.0,694.0,3,30.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,672831657627,2,2,4.875,144000,360,2012-01-01,03/2012,72,1.0,44.0,697.0,2,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,1
2,742515242108,3,16,3.25,366000,180,2012-01-01,03/2012,49,1.0,33.0,780.0,2,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,601385667462,1,20,4.75,135000,360,2012-02-01,04/2012,46,2.0,44.0,633.0,2,0.0,638.0,0.0,0,0,0,0,0,0,0,0,1,1,1,1,1
4,273870029961,1,20,4.75,124000,360,2012-02-01,04/2012,80,1.0,43.0,681.0,3,0.0,0.0,0.0,0,1,2,3,4,5,6,7,8,9,10,11,1


# Feature Engineering

As we need to test the data for multicollinearity. We first filter out the numerical features.

In [69]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

new_train = train.select_dtypes(include=numerics)

One of the most important step to be followed in feature engineering is to remove the numerical features with high correlation values. This is done to reduce the noise produced in the model due to multicollinearity.

In [70]:
# Create correlation matrix
corr_matrix = new_train.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.9
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]

In [71]:
new_train = new_train.drop(new_train[to_drop], axis=1)

In [72]:
new_train.head()

Unnamed: 0,loan_id,source,financial_institution,interest_rate,unpaid_principal_bal,loan_term,loan_to_value,number_of_borrowers,debt_to_income_ratio,borrower_credit_score,loan_purpose,insurance_percent,insurance_type,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13
0,268055008619,3,7,4.25,214000,360,95,1.0,22.0,694.0,3,30.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,1
1,672831657627,2,2,4.875,144000,360,72,1.0,44.0,697.0,2,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,1
2,742515242108,3,16,3.25,366000,180,49,1.0,33.0,780.0,2,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,601385667462,1,20,4.75,135000,360,46,2.0,44.0,633.0,2,0.0,0.0,0,0,0,0,0,0,0,0,1,1,1,1,1
4,273870029961,1,20,4.75,124000,360,80,1.0,43.0,681.0,3,0.0,0.0,0,1,2,3,4,5,6,7,8,9,10,11,1


In [73]:
new_train.m13.value_counts()

0    115422
1       636
Name: m13, dtype: int64

As we can see from above that the data is highly unbalanced. Here there are two ways in which we can resample the dataset. We either under sample the class with the higher count or we over sample the class with the lower count. oversampling of the minor class might add lot of noise to the data while undersampling the major class might remove most the data. A better way in this case is to do both oversampling of minor class and undersampling of major class by a factor of 3 or 4 to get better accuracy while modelling. 

In [83]:
from sklearn.utils import resample
from math import ceil,floor
df_major = new_train[new_train.m13 == 0]
df_minor = new_train[new_train.m13 == 1]
major_count = len(df_major)
minor_count = len(df_minor)
df_major_downsampled = resample(df_major, replace = True, n_samples = floor(major_count/(3.6*100))*100, random_state = 2018)
df_minor_upsampled = resample(df_minor, replace = True, n_samples = floor((minor_count*4)/100)*100, random_state = 2018)
final_sample = pd.concat([df_major_downsampled, df_minor_upsampled])
final_sample.m13.value_counts()

0    32000
1     2500
Name: m13, dtype: int64

Splitting the Train data into train and test

In [15]:
from sklearn.model_selection import train_test_split
X = final_sample.drop('m13', axis = 1)
Y = final_sample.m13
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.15, random_state=0)

In [16]:
#from imblearn.over_sampling import SMOTE
#sm = SMOTE(random_state=12, ratio = 1.0)
#xtrain, ytrain = sm.fit_sample(xtrain, ytrain)

Another Important step of feature Engineering is to scale the data inorder to normalize it and speed up the algorithm. Choosing the right scalar method also makes a significant difference interms of accuracy. Here we use the Robust Scalar as the values are not influenced by large outliers.

In [17]:
from sklearn.preprocessing import RobustScaler
mms = RobustScaler()
mms.fit(xtrain)
xtrain_scaled = mms.transform(xtrain)

In [18]:
from sklearn.metrics import accuracy_score, roc_auc_score,classification_report, confusion_matrix
from sklearn.metrics import recall_score, roc_auc_score, f1_score
def evaluate_model(ytest, ypred, ypred_proba = None):
    if ypred_proba is not None:
        print('ROC-AUC score of the model:   {}'.format(roc_auc_score(ytest, ypred_proba[:, 1])))
    print('Accuracy of the model: {}\n'.format(accuracy_score(ytest, ypred)))
    print('Classification report: \n{}\n'.format(classification_report(ytest, ypred)))
    print('Confusion matrix: \n{}\n'.format(confusion_matrix(ytest, ypred)))

# Build a Random Forest Model

In [19]:
from sklearn.ensemble import RandomForestClassifier
def random_forest(xtrain, xtest, ytrain):
    rf_params = {
        'n_estimators': 126, 
        'max_depth': 14
    }

    rf = RandomForestClassifier(**rf_params)
    rf.fit(xtrain, ytrain)
    rfpred = rf.predict(xtest)
    rfpred_proba = rf.predict_proba(xtest)
    
    return rfpred, rfpred_proba

In [20]:
mms.fit(xtest)
xtest_scaled = mms.transform(xtest)

In [21]:
rfpred, rfpred_proba = random_forest(xtrain_scaled, xtest_scaled, ytrain)

In [22]:
evaluate_model(ytest, rfpred, rfpred_proba)

ROC-AUC score of the model:   0.9612532981677651
Accuracy of the model: 0.9706280193236715

Classification report: 
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      4802
           1       0.96      0.62      0.75       373

    accuracy                           0.97      5175
   macro avg       0.97      0.81      0.87      5175
weighted avg       0.97      0.97      0.97      5175


Confusion matrix: 
[[4793    9]
 [ 143  230]]



# Build a LightGBM Model

In [23]:
import lightgbm

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [24]:
lbg_params = {
    'n_estimators': 8000,
    'max_depth': 100,
    'objective': 'binary',
    'learning_rate' : 0.02,
    'num_leaves' : 250,
    'feature_fraction': 0.64, 
    'bagging_fraction': 0.8, 
    'bagging_freq': 1,
    'boosting_type' : 'gbdt'
}

In [25]:
lgb = lightgbm.LGBMClassifier(**lbg_params)

In [26]:
col_list = list(X.columns)

In [27]:
new_test = test[col_list]

In [28]:
lgb.fit(xtrain_scaled, ytrain)

LGBMClassifier(bagging_fraction=0.8, bagging_freq=1, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.64,
               importance_type='split', learning_rate=0.02, max_depth=100,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=8000, n_jobs=-1, num_leaves=250, objective='binary',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [29]:
lgb_pred = lgb.predict(xtest_scaled)

In [30]:
lgb_pred_proba = lgb.predict_proba(xtest_scaled)

In [31]:
evaluate_model(ytest, lgb_pred, lgb_pred_proba)

ROC-AUC score of the model:   0.9962498869438895
Accuracy of the model: 0.9885990338164251

Classification report: 
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4802
           1       0.98      0.86      0.92       373

    accuracy                           0.99      5175
   macro avg       0.99      0.93      0.95      5175
weighted avg       0.99      0.99      0.99      5175


Confusion matrix: 
[[4797    5]
 [  54  319]]



We can see that the accuracy and ROC-AUC score of the LightGBM model is much better than the random forest model

# Predict the model on unseen Test Data

In [32]:
mms.fit(new_test)
new_test_scaled = mms.transform(new_test)

In [33]:
lgb_pred = lgb.predict(new_test_scaled)

In [34]:
new_test["m13"] = lgb_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [35]:
final = new_test[["loan_id","m13"]]

In [36]:
final.to_csv("output.csv",index=False)

In [37]:
final["m13"].value_counts()

0    35717
1      149
Name: m13, dtype: int64