In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [4]:
df=pd.read_csv(r'D:\DS-Input-Files\loan_data.csv')
df.head(2)

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0


In [5]:
df.isin([0]).sum()

credit.policy        1868
purpose                 0
int.rate                0
installment             0
log.annual.inc          0
dti                    89
fico                    0
days.with.cr.line       0
revol.bal             321
revol.util            297
inq.last.6mths       3637
delinq.2yrs          8458
pub.rec              9019
not.fully.paid       8045
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   credit.policy      9578 non-null   int64  
 1   purpose            9578 non-null   object 
 2   int.rate           9578 non-null   float64
 3   installment        9578 non-null   float64
 4   log.annual.inc     9578 non-null   float64
 5   dti                9578 non-null   float64
 6   fico               9578 non-null   int64  
 7   days.with.cr.line  9578 non-null   float64
 8   revol.bal          9578 non-null   int64  
 9   revol.util         9578 non-null   float64
 10  inq.last.6mths     9578 non-null   int64  
 11  delinq.2yrs        9578 non-null   int64  
 12  pub.rec            9578 non-null   int64  
 13  not.fully.paid     9578 non-null   int64  
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB


In [7]:
y=df.pop('not.fully.paid')

In [8]:
df.columns

Index(['credit.policy', 'purpose', 'int.rate', 'installment', 'log.annual.inc',
       'dti', 'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec'],
      dtype='object')

In [14]:
df['purpose']=pd.get_dummies(df['purpose'])

In [15]:
df['purpose'].uniqueque()

array([0, 1], dtype=uint8)

In [16]:
x_train,x_test,y_train,y_test=train_test_split(df,y,test_size=0.4)

In [17]:
logistic=LogisticRegression()

In [18]:
logistic.fit(x_train,y_train)

LogisticRegression()

In [20]:
test_pred_log=logistic.predict(x_test)

In [23]:
print("accuracy in logistic is ",accuracy_score(y_test,test_pred_log)*100)

accuracy in logistic is  84.44676409185804


In [None]:
##Random Forest Classifier

In [24]:
rfc=RandomForestClassifier()

In [25]:
rfc.fit(x_train,y_train)

RandomForestClassifier()

In [26]:
test_pred_rfc=rfc.predict(x_test)

In [28]:
print("accuracy score is ",accuracy_score(y_test,test_pred_rfc)*100)

accuracy score is  84.3945720250522


In [30]:
scaler=StandardScaler()

In [34]:
x_train_scaled=scaler.fit_transform(x_train)
x_test_scaled=scaler.fit_transform(x_test)

In [36]:
rfc.fit(x_train_scaled,y_train)

RandomForestClassifier()

In [37]:
test_pred_scaled=rfc.predict(x_test_scaled)

In [40]:
print("accuracy after scaling is ",accuracy_score(y_test,test_pred_scaled)*100)

accuracy after scaling is  84.3945720250522


In [None]:
##Decison Tree Classsifier

In [58]:
def fit_predict(x_train,x_test,y_train,y_test,scaler,max_depth,max_features=2,min_samples_split=2,criterion='entropy'):
    dtc=DecisionTreeClassifier(max_depth=max_depth,max_features=max_features,min_samples_split=min_samples_split,
                               criterion=criterion,random_state=45)
    x_train_dt_scaled=scaler.fit_transform(x_train)
    x_test_dt_scaled=scaler.fit_transform(x_test)
    dtc=DecisionTreeClassifier()
    dtc.fit(x_train_dt_scaled,y_train)
    dtc_pred=dtc.predict(x_test_dt_scaled)
    print("Accuracy of DTC ",accuracy_score(y_test,dtc_pred)*100)

In [59]:
dtc=DecisionTreeClassifier()
dtc.fit(x_train,y_train)
dtc_pred_nonscaled=dtc.predict(x_test)
print("accuracy of DTC ",accuracy_score(y_test,dtc_pred_nonscaled)*100)

accuracy of DTC  73.82567849686848


In [None]:
#Max depth tuning:

In [60]:
for i in range(1,10):
    print("Accuracy when Max depth is ",i, end =": ")
    fit_predict(x_train,x_test,y_train,y_test,StandardScaler(),i)

Accuracy when Max depth is  1: Accuracy of DTC  73.22546972860125
Accuracy when Max depth is  2: Accuracy of DTC  73.82567849686848
Accuracy when Max depth is  3: Accuracy of DTC  73.2776617954071
Accuracy when Max depth is  4: Accuracy of DTC  72.86012526096033
Accuracy when Max depth is  5: Accuracy of DTC  73.38204592901879
Accuracy when Max depth is  6: Accuracy of DTC  73.35594989561586
Accuracy when Max depth is  7: Accuracy of DTC  73.46033402922755
Accuracy when Max depth is  8: Accuracy of DTC  72.67745302713988
Accuracy when Max depth is  9: Accuracy of DTC  72.91231732776617


Accuracy is maximum when max depth is 2 which is 73.82

In [None]:
##Max features tuning

In [63]:
for i in range(1,10):
    print("Accuracy when max features is ", i , end = ": ")
    fit_predict(x_train,x_test,y_train,y_test,scaler,max_depth=18,max_features=i)

Accuracy when max features is  1: Accuracy of DTC  74.0866388308977
Accuracy when max features is  2: Accuracy of DTC  73.35594989561586
Accuracy when max features is  3: Accuracy of DTC  73.12108559498957
Accuracy when max features is  4: Accuracy of DTC  73.53862212943632
Accuracy when max features is  5: Accuracy of DTC  73.46033402922755
Accuracy when max features is  6: Accuracy of DTC  73.59081419624218
Accuracy when max features is  7: Accuracy of DTC  72.99060542797496
Accuracy when max features is  8: Accuracy of DTC  73.40814196242171
Accuracy when max features is  9: Accuracy of DTC  73.25156576200418


Accuracy is maximum when max features is 1, which is 74.08

In [None]:
##Criterion tuning

In [65]:
for i in ['gini','entropy']:
    print("Accuracy when criterion is ",i, end=": ")
    fit_predict(x_train,x_test,y_train,y_test,scaler,max_depth=2,max_features=2,min_samples_split=2,criterion=i)

Accuracy when criterion is  gini: Accuracy of DTC  73.53862212943632
Accuracy when criterion is  entropy: Accuracy of DTC  72.78183716075156


Accuracy is maximum when criterion is gini

In [None]:
##Min samples split tuning

In [66]:
for i in range(1,10):
    print("Accuracy when min samples split is ",i, end=": ")
    fit_predict(x_train,x_test,y_train,y_test,scaler,max_depth=2,max_features=2,criterion='gini',min_samples_split=i)

Accuracy when min samples split is  1: Accuracy of DTC  73.2776617954071
Accuracy when min samples split is  2: Accuracy of DTC  72.57306889352819
Accuracy when min samples split is  3: Accuracy of DTC  73.98225469728601
Accuracy when min samples split is  4: Accuracy of DTC  73.43423799582463
Accuracy when min samples split is  5: Accuracy of DTC  73.5125260960334
Accuracy when min samples split is  6: Accuracy of DTC  73.19937369519833
Accuracy when min samples split is  7: Accuracy of DTC  73.5125260960334
Accuracy when min samples split is  8: Accuracy of DTC  73.19937369519833
Accuracy when min samples split is  9: Accuracy of DTC  73.32985386221294


Accuracy is maximum when min samples split is 3, which is 73.98

## Best parameters are:
Max_depth is 2 ,
max_features is 1,
min_samples_split is 3,
criterion is gini

In [72]:
dtc=DecisionTreeClassifier(max_depth=2,max_features=1,min_samples_split=3,
                               criterion='gini',random_state=45)


In [73]:
dtc=DecisionTreeClassifier()
dtc.fit(x_train,y_train)
dtc_pred_nonscaled=dtc.predict(x_test)
print("accuracy of DTC ",accuracy_score(y_test,dtc_pred_nonscaled)*100)

accuracy of DTC  74.13883089770354


In [89]:
from sklearn.model_selection import GridSearchCV

In [91]:
gs=GridSearchCV(rfc,params,verbose=1)

In [90]:
params = {
    'n_estimators': [200,300,400],
    'max_depth' : [20,30,40],
    'min_samples_leaf' : [2,3,4]
}

In [92]:
gs.fit(x_train,y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 12.1min finished


GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [20, 30, 40],
                         'min_samples_leaf': [2, 3, 4],
                         'n_estimators': [200, 300, 400]},
             verbose=1)

In [93]:
gs.best_estimator_

RandomForestClassifier(max_depth=30, min_samples_leaf=2, n_estimators=300)