Gathering/Exploring the data

numpy - array, matrix
pandas - data analysis, data exploration
matplotlib - for plotting 
sklearn - which contain ml algorithms, data preprocessing 
          functions
nltk - text mining

In [26]:
import pandas as pd

In [27]:
data  = pd.read_csv("loan_data.csv")

In [28]:
data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [29]:
data.shape

(614, 13)

In [30]:
data.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

Preprocessing/Data cleaning

1. Filling the missing value
  (For categorical attribute we use mode,
   and for numerical attribute use mean)

In [31]:
data.isnull()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,False,False,False,False,False,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,False,False,False,False,False,False,False,False,False,False,False,False,False
610,False,False,False,False,False,False,False,False,False,False,False,False,False
611,False,False,False,False,False,False,False,False,False,False,False,False,False
612,False,False,False,False,False,False,False,False,False,False,False,False,False


In [32]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [33]:
data['Gender'].fillna(data['Gender'].mode()[0],inplace = True)

In [34]:
data['Married'].fillna(data['Married'].mode()[0],inplace = True)

In [35]:
data['Dependents'].fillna(data['Dependents'].mode()[0],inplace = True)
data['Self_Employed'].fillna(data['Self_Employed'].mode()[0],inplace = True)

In [36]:
data['LoanAmount'].fillna(data['LoanAmount'].mean(),inplace = True)

In [37]:
data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mean(),inplace = True)
data['Credit_History'].fillna(data['Credit_History'].mean(),inplace = True)

In [38]:
data.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

2. Convert all the categorical attribute to numerical attributes

In [39]:
data.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [40]:
data['Gender'] = data['Gender'].map({'Male':0,'Female':1})

In [41]:
data['Education'].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

In [42]:
data['Education'] = data['Education'].map({'Graduate':0,'Not Graduate':1})

In [43]:
data['Self_Employed'].unique()

array(['No', 'Yes'], dtype=object)

In [44]:
data['Self_Employed'] = data['Self_Employed'].map({'No':0,'Yes':1})

In [45]:
data['Dependents'].unique()

array(['0', '1', '2', '3+'], dtype=object)

In [46]:
data['Dependents'] = data['Dependents'].map({'0':0,'1':1,'2':2,'3+':3})

In [47]:
data['Married'].unique()

array(['No', 'Yes'], dtype=object)

In [48]:
data['Married'] = data['Married'].map({'No':0,'Yes':1})

In [49]:
data['Property_Area'].unique()

array(['Urban', 'Rural', 'Semiurban'], dtype=object)

In [50]:
data['Property_Area'] = data['Property_Area'].map({'Urban':0,'Rural':1,'Semiurban':2})

In [51]:
data.dtypes

Loan_ID               object
Gender                 int64
Married                int64
Dependents             int64
Education              int64
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
Loan_Status           object
dtype: object

In [52]:
data.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,0,0,0,0,0,5849,0.0,146.412162,360.0,1.0,0,Y
1,LP001003,0,1,1,0,0,4583,1508.0,128.0,360.0,1.0,1,N
2,LP001005,0,1,0,0,1,3000,0.0,66.0,360.0,1.0,0,Y
3,LP001006,0,1,0,1,0,2583,2358.0,120.0,360.0,1.0,0,Y
4,LP001008,0,0,0,0,0,6000,0.0,141.0,360.0,1.0,0,Y
5,LP001011,0,1,2,0,1,5417,4196.0,267.0,360.0,1.0,0,Y
6,LP001013,0,1,0,1,0,2333,1516.0,95.0,360.0,1.0,0,Y
7,LP001014,0,1,3,0,0,3036,2504.0,158.0,360.0,0.0,2,N
8,LP001018,0,1,2,0,0,4006,1526.0,168.0,360.0,1.0,0,Y
9,LP001020,0,1,1,0,0,12841,10968.0,349.0,360.0,1.0,2,N


3. Data normalisation
   min max method
   making to 0-1 scale

   x(norm) = ( x - xmin ) / ( xmax - xmin )


In [53]:
for i in data.columns[1:11]:
  data[i] = (data[i]-data[i].min())/(data[i].max()-data[i].min())

In [54]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,0.0,0.0,0.0,0.0,0.0,0.070489,0.0,0.19886,0.74359,1.0,0,Y
1,LP001003,0.0,1.0,0.333333,0.0,0.0,0.05483,0.036192,0.172214,0.74359,1.0,1,N
2,LP001005,0.0,1.0,0.0,0.0,1.0,0.03525,0.0,0.082489,0.74359,1.0,0,Y
3,LP001006,0.0,1.0,0.0,1.0,0.0,0.030093,0.056592,0.160637,0.74359,1.0,0,Y
4,LP001008,0.0,0.0,0.0,0.0,0.0,0.072356,0.0,0.191027,0.74359,1.0,0,Y


Modelling

Drop unwanted columns

In [55]:
X = data.drop(['Loan_ID','Loan_Status'],axis=1)

In [56]:
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,0.0,0.0,0.000000,0.0,0.0,0.070489,0.000000,0.198860,0.743590,1.0,0
1,0.0,1.0,0.333333,0.0,0.0,0.054830,0.036192,0.172214,0.743590,1.0,1
2,0.0,1.0,0.000000,0.0,1.0,0.035250,0.000000,0.082489,0.743590,1.0,0
3,0.0,1.0,0.000000,1.0,0.0,0.030093,0.056592,0.160637,0.743590,1.0,0
4,0.0,0.0,0.000000,0.0,0.0,0.072356,0.000000,0.191027,0.743590,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...
609,1.0,0.0,0.000000,0.0,0.0,0.034014,0.000000,0.089725,0.743590,1.0,1
610,0.0,1.0,1.000000,0.0,0.0,0.048930,0.000000,0.044863,0.358974,1.0,1
611,0.0,1.0,0.333333,0.0,0.0,0.097984,0.005760,0.353111,0.743590,1.0,0
612,0.0,1.0,0.666667,0.0,0.0,0.091936,0.000000,0.257598,0.743590,1.0,0


In [57]:
Y = data['Loan_Status']

In [58]:
Y

0      Y
1      N
2      Y
3      Y
4      Y
      ..
609    Y
610    Y
611    Y
612    Y
613    N
Name: Loan_Status, Length: 614, dtype: object

<b>Split the data into training set and testing set </b>
Training set is used to train the data model
Testing set is used to evaluate the data model

Use sklearn to model selection and train split

In [59]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.25, random_state=0)

Training the K-NN model 

In [60]:
from sklearn.neighbors import KNeighborsClassifier
#no of neighbours is squre root of total size
classifier = KNeighborsClassifier( n_neighbors = 5, metric = 'minkowski', p=2)

In [61]:
#fit is used to train the model
classifier.fit(X_train,Y_train)

KNeighborsClassifier()

In [62]:
#predicting: predict
y_pred = classifier.predict(X_test)

In [63]:
y_pred

array(['Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N'],
      dtype=object)

In [64]:
Y_test

454    Y
52     N
536    Y
469    N
55     Y
      ..
399    N
89     Y
271    Y
563    Y
162    N
Name: Loan_Status, Length: 154, dtype: object

In [65]:
#print the report of model accuracy
from sklearn.metrics import classification_report
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           N       0.74      0.47      0.57        43
           Y       0.82      0.94      0.87       111

    accuracy                           0.81       154
   macro avg       0.78      0.70      0.72       154
weighted avg       0.80      0.81      0.79       154



In [66]:
#for predicting create new data to the classifier
import numpy as np
#to give the data as frames use double bracket
x_new = np.array([[0,1,3,1,1,56000,230,2300,360,1,0]])

In [67]:
prediction = classifier.predict(x_new)
prediction

  "X does not have valid feature names, but"


array(['Y'], dtype=object)

In [68]:
#predicting for multiple values
x_new = np.array([[0,1,3,1,1,56000,230,2300,360,1,0],
                  [1,1,2,1,1,60000,450,2300,789,1,0]
                ])
prediction = classifier.predict(x_new)
prediction

  "X does not have valid feature names, but"


array(['Y', 'Y'], dtype=object)