# Importing Libraries

In [1]:
#Importing required libraries
import pandas as pd 
import numpy as np

from sklearn.metrics import f1_score

### Loading the dataset

In [2]:
# Load the cleaned loan records from disk into a DataFrame
csv_path = 'Loan data cleaned.csv'
data = pd.read_csv(csv_path)

In [3]:
#shape of the data as a (rows, columns) tuple
data.shape

(4368, 64)

In [4]:
#preview the first five rows of the data to sanity-check the load
data.head()

Unnamed: 0,loannumber,loanamount,totaldue,termdays,good_bad_flag,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,...,ploan_additional charges_50%_x,ploan_additional charges_75%_x,ploan_additional charges_max_x,ploan_additional charges_mean_y,ploan_additional charges_min_y,ploan_additional charges_25%_y,ploan_additional charges_50%_y,ploan_additional charges_75%_y,ploan_additional charges_max_y,not_repaid_percentage
0,12,30000.0,34500.0,30,Good,Other,3.43201,6.433055,Diamond Bank,Permanent,...,3800.0,4500.0,4500.0,3900.0,3000.0,3400.0,3800.0,4500.0,4500.0,64.0
1,2,15000.0,17250.0,30,Good,Savings,3.885298,7.3207,GT Bank,Permanent,...,2665.558293,3037.525723,3543.851778,2730.359972,2102.661161,2364.738902,2665.558293,3037.525723,3543.851778,63.573297
2,7,20000.0,22250.0,15,Good,Other,11.13935,10.292041,EcoBank,Permanent,...,1500.0,1500.0,3000.0,1750.0,1500.0,1500.0,1500.0,1500.0,3000.0,50.0
3,3,10000.0,11500.0,15,Good,Savings,3.98577,7.491708,First Bank,Permanent,...,2250.0,2625.0,3000.0,2250.0,1500.0,1875.0,2250.0,2625.0,3000.0,0.0
4,9,40000.0,44000.0,30,Good,Other,7.457913,9.076574,GT Bank,Permanent,...,3800.0,5100.0,9000.0,4800.0,3000.0,3000.0,3800.0,5100.0,9000.0,100.0


In [5]:
# Count the missing values in each column
data.isna().sum()

loannumber                        0
loanamount                        0
totaldue                          0
termdays                          0
good_bad_flag                     0
                                 ..
ploan_additional charges_25%_y    0
ploan_additional charges_50%_y    0
ploan_additional charges_75%_y    0
ploan_additional charges_max_y    0
not_repaid_percentage             0
Length: 64, dtype: int64

In [6]:
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each column so the dummy columns are not perfectly collinear.
categorical_cols = ['bank_account_type', 'bank_name_clients', 'employment_status_clients']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

### Separating independent and dependent variables.

In [8]:
# Features: every column except the target label.
x = data.drop(columns=['good_bad_flag'])

# Target: encode the label as 0 (Good) / 1 (Bad).
# `.map` is used instead of `.replace` because replacing strings with ints
# relies on an implicit object->int downcast that newer pandas deprecates,
# and because `.replace` would silently pass unexpected labels through.
label_codes = {'Good': 0, 'Bad': 1}
y = data['good_bad_flag'].map(label_codes)

# Fail fast if the column holds anything other than 'Good' / 'Bad'
# (unmapped values come back from `.map` as NaN).
if y.isna().any():
    unexpected = data.loc[y.isna(), 'good_bad_flag'].unique()
    raise ValueError(f"Unexpected good_bad_flag labels: {list(unexpected)}")

# All values are mapped, so an explicit integer dtype is safe.
y = y.astype('int64')

### Creating the train and test datasets

In [9]:
#import the train-test split
from sklearn.model_selection import train_test_split

In [10]:
# Split into train and test sets with train_test_split's default proportions.
# stratify=y keeps the class ratio the same in both splits; random_state
# makes the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    stratify=y,
    random_state=101,
)

In [11]:
#Scaling the train and testing datasets
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training split only, so no statistics from the test
# split leak into the transformation.
x_train_scaled = scaler.fit_transform(train_x)
# Rebuild the DataFrame with the original column names AND the original row
# index; without index= the rows would be relabelled 0..n-1 and would no
# longer align by label with train_y.
train_x = pd.DataFrame(x_train_scaled, columns=train_x.columns, index=train_x.index)

# Apply the already-fitted scaler to the test split (transform only, no refit).
x_test_scaled = scaler.transform(test_x)
test_x = pd.DataFrame(x_test_scaled, columns=test_x.columns, index=test_x.index)

## Building a Decision Tree Model

In [12]:
#Importing Decision Tree Classifier 
from sklearn.tree import DecisionTreeClassifier

In [13]:
# Configure a depth-limited decision tree; class_weight='balanced' reweights
# the classes and random_state makes the fit reproducible.
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    max_leaf_nodes=15,
    min_samples_split=30,
    class_weight='balanced',
    random_state=101,
)

In [14]:
# Fit the decision tree on the scaled training features and labels
clf.fit(X=train_x, y=train_y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight='balanced', criterion='gini',
                       max_depth=4, max_features=None, max_leaf_nodes=15,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=30,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=101, splitter='best')

In [15]:
# Score the fitted model on the training split ...
pred_train = clf.predict(train_x)
f1_train = f1_score(train_y, pred_train)

# ... and on the held-out test split, to compare against the training score.
pred_test = clf.predict(test_x)
f1_test = f1_score(test_y, pred_test)

In [16]:
#display the train and test F1 scores side by side as a tuple
f1_train,f1_test

(0.4778856526429342, 0.43686006825938567)

## Building a Random Forest Model

In [17]:
#Importing random forest classifier 
from sklearn.ensemble import RandomForestClassifier

In [47]:
# Configure a 500-tree random forest with the same per-tree limits as the
# decision tree above. Note: this rebinds `clf`, replacing the earlier model.
clf = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    max_depth=4,
    max_leaf_nodes=15,
    min_samples_split=30,
    class_weight='balanced',
    random_state=101,
)

In [48]:
# Fit the random forest on the scaled training features and labels
clf.fit(X=train_x, y=train_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=15, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=30,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=101,
                       verbose=0, warm_start=False)

In [49]:
# Score the fitted model on the training split ...
pred_train = clf.predict(train_x)
f1_train = f1_score(train_y, pred_train)

# ... and on the held-out test split, to compare against the training score.
pred_test = clf.predict(test_x)
f1_test = f1_score(test_y, pred_test)

In [50]:
#display the train and test F1 scores side by side as a tuple
f1_train,f1_test

(0.4993412384716732, 0.42588726513569936)

In [38]:
#looking at the feature importance (raw, unlabelled array from the fitted model)
clf.feature_importances_

array([0.00878583, 0.00710632, 0.01639431, 0.00410486, 0.01166729,
       0.00827829, 0.00148545, 0.00239597, 0.01767869, 0.00030155,
       0.01074299, 0.01194062, 0.00076832, 0.0008411 , 0.00335265,
       0.01277368, 0.00645804, 0.01883257, 0.00026698, 0.00453941,
       0.00529754, 0.00634612, 0.01351786, 0.01377379, 0.        ,
       0.00029366, 0.00081516, 0.00072952, 0.00072047, 0.00944803,
       0.00087162, 0.00077501, 0.00021107, 0.0025771 , 0.00244649,
       0.0375445 , 0.01593586, 0.02254147, 0.02162272, 0.05348977,
       0.02390222, 0.05826645, 0.05441601, 0.06225816, 0.11446636,
       0.11204262, 0.05764728, 0.00889765, 0.00359449, 0.00446433,
       0.00172381, 0.01303375, 0.0130792 , 0.00298581, 0.00231215,
       0.00589056, 0.00469487, 0.00988726, 0.0043798 , 0.06215824,
       0.00710875, 0.00741896, 0.00092255, 0.        , 0.        ,
       0.00020522, 0.        , 0.0012301 , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00201

In [22]:
# Label each importance with its feature name, then rank from most to least important
importances = pd.Series(clf.feature_importances_, index=train_x.columns)
importances.sort_values(ascending=False)

ploan_first_return_days_75%             0.089982
ploan_first_return_days_50%             0.089761
ploan_first_return_days_25%             0.086724
ploan_first_return_days_min             0.067334
ploan_first_return_days_mean            0.065163
                                          ...   
bank_name_clients_Standard Chartered    0.000000
bank_name_clients_Heritage Bank         0.000000
bank_name_clients_Keystone Bank         0.000000
bank_name_clients_FCMB                  0.000000
bank_name_clients_Skye Bank             0.000000
Length: 80, dtype: float64