In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_input = pd.read_csv('Credit_Risk_train_Data.csv')
test_input = pd.read_csv('Credit_Risk_Validate_Data.csv')

In [3]:
print(train_input.columns)
print(test_input.columns)

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'outcome'],
      dtype='object')


In [4]:
#the last column has a different name in both
#lets make the names same. and then merge them together
#so that we can fill the missing values simultaneously
test_input.rename(columns={'outcome': 'Loan_Status'}, inplace= True)

In [5]:
data_all= pd.concat([train_input,test_input], axis= 0)
data_all.shape

(981, 13)

In [6]:
help(data_all.reset_index)

Help on method reset_index in module pandas.core.frame:

reset_index(level: 'Hashable | Sequence[Hashable] | None' = None, drop: 'bool' = False, inplace: 'bool' = False, col_level: 'Hashable' = 0, col_fill: 'Hashable' = '') -> 'DataFrame | None' method of pandas.core.frame.DataFrame instance
    Reset the index, or a level of it.
    
    Reset the index of the DataFrame, and use the default one instead.
    If the DataFrame has a MultiIndex, this method can remove one or more
    levels.
    
    Parameters
    ----------
    level : int, str, tuple, or list, default None
        Only remove the given levels from the index. Removes all levels by
        default.
    drop : bool, default False
        Do not try to insert index into dataframe columns. This resets
        the index to the default integer index.
    inplace : bool, default False
        Modify the DataFrame in place (do not create a new object).
    col_level : int or str, default 0
        If the columns have multiple l

In [7]:
data_all.reset_index(inplace= True, drop= True)
#reset index else merging will have issues
print(data_all.tail())

      Loan_ID Gender Married Dependents     Education Self_Employed  \
976  LP002971   Male     Yes         3+  Not Graduate           Yes   
977  LP002975   Male     Yes          0      Graduate            No   
978  LP002980   Male      No          0      Graduate            No   
979  LP002986   Male     Yes          0      Graduate            No   
980  LP002989   Male      No          0      Graduate           Yes   

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
976             4009             1777.0       113.0             360.0   
977             4158              709.0       115.0             360.0   
978             3250             1993.0       126.0             360.0   
979             5000             2393.0       158.0             360.0   
980             9200                0.0        98.0             180.0   

     Credit_History Property_Area Loan_Status  
976             1.0         Urban           Y  
977             1.0         Urban     

In [8]:
data_all.isnull().sum() #gives the missing values of all columns

Loan_ID               0
Gender               24
Married               3
Dependents           25
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [9]:
data_all.shape #read description of each column from pdf document

(981, 13)

In [10]:
#before proceeding to model building, lets fill the missing values

In [11]:
Counter(data_all['Gender'])

Counter({'Male': 775, 'Female': 182, nan: 24})

In [12]:
#lets fill them by male
#these rows are null for gender
#lets fill them with the model of gender which have higher percentage i.e Male
gender_null = data_all[data_all['Gender'].isnull()].index.tolist()
print(gender_null)

[23, 126, 171, 188, 314, 334, 460, 467, 477, 507, 576, 588, 592, 636, 665, 720, 752, 823, 845, 859, 893, 910, 917, 932]


gender_null_M = gender_null[:12]
gender_null_F = gender_null[12:]
data_null['Gender'].iloc[gender_null_M] = 'Male'
data_null['Gender'].iloc[gender_null_F] = 'Female'

In [13]:
data_all['Gender'].iloc[gender_null] = 'Male'

In [14]:
#check if failed
print(sum(data_all['Gender'].isnull()))#okay done
Counter(data_all['Gender'])

0


Counter({'Male': 799, 'Female': 182})

In [15]:
#lets fill marries now
print(Counter(data_all['Married']))#most are married

Counter({'Yes': 631, 'No': 347, nan: 3})


In [16]:
married_null = data_all[data_all['Married'].isnull()].index.tolist()
married_null

[104, 228, 435]

In [17]:
data_all['Married'].iloc[married_null]= 'Yes'

In [18]:
data_all.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents           25
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [19]:
# lets see dependents wrt to marriage
pd.crosstab(data_all['Married'],data_all['Dependents'].isnull(),margins=True)

Dependents,False,True,All
Married,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,338,9,347
Yes,618,16,634
All,956,25,981


In [20]:
pd.crosstab(data_all['Dependents'], data_all['Married'], margins=True)

Married,No,Yes,All
Dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,276,269,545
1,36,124,160
2,14,146,160
3+,12,79,91
All,338,618,956


In [21]:
#for bachelors , lets fill missing dependents as 0
#lets find the index of all rows with dependents missing and Marries No
bachelor_nulldependent =data_all[(data_all['Married']=='No') & 
                         (data_all['Dependents'].isnull())].index.tolist()
print(bachelor_nulldependent)

[293, 332, 355, 597, 684, 752, 879, 916, 926]


In [22]:
data_all['Dependents'].iloc[bachelor_nulldependent] = '0'

In [23]:
Counter(data_all['Dependents'])

Counter({'0': 554, '1': 160, '2': 160, '3+': 91, nan: 16})

In [24]:
#for the remaining 16 missing dependents,
#lets see how many dependents Male & Female have
pd.crosstab(data_all['Gender'], data_all['Dependents'])

Dependents,0,1,2,3+
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,127,32,13,9
Male,427,128,147,82


In [25]:
pd.crosstab(data_all['Gender'], data_all['Dependents'].isnull())

Dependents,False,True
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,181,1
Male,784,15


In [26]:
#so female have less dependents
#lets see the gender of the 16 missing dependents
data_all['Gender'].iloc[data_all[data_all['Dependents'].isnull()].index.tolist()]


102      Male
104      Male
120      Male
226      Male
228      Male
301      Male
335      Male
346      Male
435    Female
517      Male
571      Male
660      Male
725      Male
816      Male
861      Male
865      Male
Name: Gender, dtype: object

In [27]:
pd.crosstab((data_all['Gender']== 'Male')&
             (data_all['Married']== 'Yes'), data_all['Dependents'])

Dependents,0,1,2,3+
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,318,48,23,15
True,236,112,137,76


In [28]:
#lets fill the dependents with 1
data_all['Dependents'].iloc[data_all[data_all['Dependents'].isnull()].index.tolist()] ='1'

In [29]:
data_all.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [30]:
Counter(data_all['Self_Employed'])

Counter({'No': 807, 'Yes': 119, nan: 55})

In [31]:
self_emp_null = data_all[data_all['Self_Employed'].isnull()].index.tolist()

In [32]:
#fill missing selfemployed with NO
data_all['Self_Employed'].iloc[self_emp_null]= 'No'

In [33]:
data_all.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [35]:
#to check if any row with both LoanAmount and Loan_Amount_Term as NAN
pd.crosstab(data_all['LoanAmount'].isnull(),
            data_all['Loan_Amount_Term'].isnull())

Loan_Amount_Term,False,True
LoanAmount,Unnamed: 1_level_1,Unnamed: 2_level_1
False,934,20
True,27,0


In [36]:
pd.crosstab(data_all['LoanAmount'].isnull(),data_all['Loan_Amount_Term'])

Loan_Amount_Term,6.0,12.0,36.0,60.0,84.0,120.0,180.0,240.0,300.0,350.0,360.0,480.0
LoanAmount,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
False,1,2,3,3,7,4,64,7,20,1,800,22
True,0,0,0,0,0,0,2,1,0,0,23,1


In [37]:
data_all.groupby(data_all['Loan_Amount_Term'])['LoanAmount'].mean()

Loan_Amount_Term
6.0       95.000000
12.0     185.500000
36.0     117.666667
60.0     139.666667
84.0     121.142857
120.0     36.750000
180.0    131.125000
240.0    128.857143
300.0    166.250000
350.0    133.000000
360.0    144.420000
480.0    137.181818
Name: LoanAmount, dtype: float64

In [38]:
#lets fill the missing values in LoanAmount
#with mean of the respective Loan_Term
#we see that 180 and 240 has almost same loan amount 128-131
#&360 has high i.e 144
#so lets fill only 360 by 144
#and all remaining by 130
data_all['LoanAmount'][(data_all['LoanAmount'].isnull())
                       & (data_all['Loan_Amount_Term']==360)]=144
data_all['LoanAmount'][(data_all['LoanAmount'].isnull())
                       & (data_all['Loan_Amount_Term']==480)]=137

In [39]:
data_all['LoanAmount'][(data_all['LoanAmount'].isnull())]=130

In [40]:
#lets fill loan amount term
(data_all['Loan_Amount_Term']).value_counts()

360.0    823
180.0     66
480.0     23
300.0     20
240.0      8
84.0       7
120.0      4
60.0       3
36.0       3
12.0       2
350.0      1
6.0        1
Name: Loan_Amount_Term, dtype: int64

In [41]:
#lets fill the loantenure by the mode i.e 360
data_all['Loan_Amount_Term'][data_all['Loan_Amount_Term'].isnull()]=360

In [42]:
data_all.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [43]:
data_all['Credit_History'].value_counts()

1.0    754
0.0    148
Name: Credit_History, dtype: int64

In [44]:
pd.crosstab(data_all['Gender'],data_all['Credit_History'])
#gender makes no difference

Credit_History,0.0,1.0
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,30,135
Male,118,619


In [45]:
pd.crosstab(data_all['Self_Employed'],data_all['Credit_History'])
#self employed makes no difference

Credit_History,0.0,1.0
Self_Employed,Unnamed: 1_level_1,Unnamed: 2_level_1
No,134,658
Yes,14,96


In [46]:
pd.crosstab(data_all['Education'],data_all['Credit_History'])
#education makes no difference


Credit_History,0.0,1.0
Education,Unnamed: 1_level_1,Unnamed: 2_level_1
Graduate,106,596
Not Graduate,42,158


In [47]:
pd.crosstab(data_all['Married'],data_all['Credit_History'])
#married makes no difference

Credit_History,0.0,1.0
Married,Unnamed: 1_level_1,Unnamed: 2_level_1
No,56,263
Yes,92,491


In [48]:
data_all['Credit_History'][data_all['Credit_History'].isnull()]=1

In [49]:
data_all.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [50]:
data_all.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,144.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [51]:
data_all_new = pd.get_dummies(data_all.drop(['Loan_ID'],axis=1),
                              drop_first=True)

In [52]:
data_all_new.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,5849,0.0,144.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1,1


In [72]:
X= data_all_new.drop(['Loan_Status_Y'],axis=1)
y=data_all_new['Loan_Status_Y']

In [73]:
X.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,5849,0.0,144.0,360.0,1.0,1,0,0,0,0,0,0,0,1
1,4583,1508.0,128.0,360.0,1.0,1,1,1,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,1,0,0,0,0,1,0,1
3,2583,2358.0,120.0,360.0,1.0,1,1,0,0,0,1,0,0,1
4,6000,0.0,141.0,360.0,1.0,1,0,0,0,0,0,0,0,1


In [55]:
y.head()

0    1
1    0
2    1
3    1
4    1
Name: Loan_Status_Y, dtype: uint8

# Train Test Split

In [74]:
from sklearn.model_selection import train_test_split

In [75]:
X_train,X_test,y_train,y_test= train_test_split(X,y)

In [76]:
X_train.shape

(735, 14)

In [77]:
X_test.shape

(246, 14)

# Data Preprocessing 

In [78]:
from sklearn.preprocessing import StandardScaler

In [79]:
scaler = StandardScaler()

In [80]:
#fit only to the training data
scaler.fit(X)

StandardScaler()

In [81]:
#now apply the the tarnsformationto the data
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [82]:
X_train[:5]

array([[-0.44882376,  1.71049766,  0.58309499,  0.2705276 ,  0.42151046,
         0.47726799,  0.73980985, -0.46758266, -0.44145701, -0.31976115,
        -0.53452248, -0.37155221, -0.74311183, -0.73158135],
       [-0.30792923, -0.5895062 , -0.05900143,  0.2705276 ,  0.42151046,
        -2.09525891, -1.35169869, -0.46758266,  2.26522626, -0.31976115,
        -0.53452248, -0.37155221, -0.74311183,  1.36690199],
       [-0.48922992, -0.5895062 ,  0.01962262,  0.2705276 ,  0.42151046,
         0.47726799,  0.73980985,  2.13865931, -0.44145701, -0.31976115,
        -0.53452248,  2.6914118 ,  1.34569248, -0.73158135],
       [-0.2146437 , -0.5895062 , -0.42591367,  0.2705276 ,  0.42151046,
        -2.09525891, -1.35169869, -0.46758266, -0.44145701, -0.31976115,
        -0.53452248, -0.37155221, -0.74311183, -0.73158135],
       [ 0.39724614, -0.5895062 ,  0.67482305,  0.2705276 ,  0.42151046,
        -2.09525891, -1.35169869, -0.46758266, -0.44145701, -0.31976115,
        -0.53452248,  2.69

# Training the model using Logistic Regression

In [83]:
from sklearn.linear_model import LogisticRegression

In [84]:
clf = LogisticRegression()
clf.fit(X_train,y_train)

LogisticRegression()

In [85]:
predictions = clf.predict(X_test)

In [86]:
from sklearn.metrics import classification_report, confusion_matrix

In [88]:
print(confusion_matrix(y_test,predictions))

[[ 38  27]
 [  3 178]]


In [89]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.93      0.58      0.72        65
           1       0.87      0.98      0.92       181

    accuracy                           0.88       246
   macro avg       0.90      0.78      0.82       246
weighted avg       0.88      0.88      0.87       246

