**IMPORTING DEPENDENCIES**

In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import normalize

**IMPORTING DATASET**

In [63]:
# Read the training and test tables from CSV files in the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train_set.csv', 'test_set.csv'))

In [64]:
# Show the first rows of the training table to check that it loaded as expected.
df_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


**DATA CLEANING**

1. Replacing `3+` with `3` in the Dependents column

In [65]:
# Turn the open-ended '3+' bucket into 3 so the column can later be cast to int.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the column selection: that chained in-place
# form operates on a temporary object under pandas copy-on-write and would
# leave the DataFrame itself unchanged.
df_train['Dependents'] = df_train['Dependents'].replace('3+', 3)
df_test['Dependents'] = df_test['Dependents'].replace('3+', 3)

2. Dropping columns that are not required

In [66]:
# Remove the Loan_ID column from both tables; it is not used as a model input.
df_train = df_train.drop(columns='Loan_ID')
df_test = df_test.drop(columns='Loan_ID')

3. Checking for null/NaN values and dropping the rows that contain them.

In [67]:
# Number of missing values in each column of the training table.
df_train.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [68]:
# Number of missing values in each column of the test table.
df_test.isnull().sum()

Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [69]:
# Show the first rows of the training table again, now without Loan_ID.
df_train.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [70]:
# Discard every row that has at least one missing value, in both tables.
df_train = df_train.dropna(how='any')
df_test = df_test.dropna(how='any')

4. LABEL ENCODING THE CATEGORICAL COLUMNS

In [71]:
# Encode each categorical feature as integers. The encoder is fitted on the
# training column only and then reused on the test column, so a category always
# receives the same integer code in both tables. Fitting a second time on the
# test column could assign different codes whenever its set of categories
# differs from the training set.
categorical_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']
encoders = {}
for col in categorical_cols:
    lb = LabelEncoder()
    df_train[col] = lb.fit_transform(df_train[col])
    # transform() raises ValueError for a category that never occurred in training.
    df_test[col] = lb.transform(df_test[col])
    encoders[col] = lb  # kept so codes can be mapped back via inverse_transform

# Encode the target explicitly (Y -> 1, N -> 0) instead of replacing "Y"/"N"
# across the whole frame. The already-encoded values 1 and 0 are included in the
# mapping so that re-running the cell is harmless. Any other label becomes NaN,
# which makes the astype(int) cast fail loudly, and the cast guarantees an
# integer dtype regardless of how the pandas version handles object columns.
loan_status_codes = {'Y': 1, 'N': 0, 1: 1, 0: 0}
df_train['Loan_Status'] = df_train['Loan_Status'].map(loan_status_codes).astype(int)

# The test table shown in this notebook has no Loan_Status column; encode it
# only if one is present.
if 'Loan_Status' in df_test.columns:
    df_test['Loan_Status'] = df_test['Loan_Status'].map(loan_status_codes).astype(int)

In [72]:
# Check the column dtypes after encoding.
df_train.dtypes

Gender                 int64
Married                int64
Dependents            object
Education              int64
Self_Employed          int64
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area          int64
Loan_Status            int64
dtype: object

In [73]:
# Cast Dependents to int in both tables. astype() returns a new Series, so the
# result has to be assigned back; without the assignment the column keeps its
# object dtype. Going through str first handles a mix of string and int values.
df_train['Dependents'] = df_train['Dependents'].astype(str).astype(int)
df_test['Dependents'] = df_test['Dependents'].astype(str).astype(int)

# Display the converted test column as the cell output.
df_test['Dependents']

0      0
1      1
2      2
4      0
5      0
      ..
361    1
362    3
363    0
365    0
366    0
Name: Dependents, Length: 289, dtype: int64

In [74]:
# Make sure the target column is stored as int. astype() returns a new Series,
# so the result is assigned back; the bare expression alone changes nothing.
df_train['Loan_Status'] = df_train['Loan_Status'].astype(str).astype(int)

# Display the target column as the cell output.
df_train['Loan_Status']

1      0
2      1
3      1
4      1
5      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 480, dtype: int64

**DEFINING INDEPENDENT AND DEPENDENT VARIABLES**

In [75]:
# The last column of the training table is the target; every column before it
# is a feature. The whole test table is used as features.
y_train = df_train.iloc[:, -1].to_numpy()
x_train = df_train.iloc[:, :-1].to_numpy()
x_test = df_test.to_numpy()

**STANDARDISING THE DATA**

In [76]:
# Standardise the features to zero mean and unit variance.
# The scaler learns its mean and standard deviation from the training data only;
# the test data is then transformed with those same statistics. Calling
# fit_transform on the test set would rescale it with its own statistics, so the
# same raw value would map to different scaled values in the two sets.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

**TIME TO TRAIN**

**MODEL 1 : DECISION TREES**

In [110]:
# Decision tree with default hyper-parameters; random_state is fixed so the
# fitted tree is reproducible between runs.
model_1 = DecisionTreeClassifier( random_state = 0)
model_1.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [111]:
# Predict on the same data the model was fitted on, so any score computed from
# y_pred is a training-set score, not an estimate of performance on unseen data.
y_pred = model_1.predict(x_train)

In [112]:
# Confusion matrix (rows: true class, columns: predicted class), then accuracy.
cm = confusion_matrix(y_train, y_pred)
print(cm)
accuracy_score(y_train, y_pred)

[[148   0]
 [  0 332]]


1.0

**MODEL 2 : K NEAREST NEIGHBOURS**

In [107]:
# k-nearest neighbours with k = 2; Minkowski distance with p = 2 is the
# Euclidean distance.
model_2 = KNeighborsClassifier(n_neighbors = 2, metric = 'minkowski', p = 2)
model_2.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

In [108]:
# Predict on the training data itself; scores derived from y_pred are
# training-set scores.
y_pred = model_2.predict(x_train)

In [109]:
# Confusion matrix (rows: true class, columns: predicted class), then accuracy.
cm = confusion_matrix(y_train, y_pred)
print(cm)
accuracy_score(y_train, y_pred)

[[148   0]
 [ 72 260]]


0.85

**MODEL 3 : RANDOM FOREST CLASSIFIER**

In [83]:
# Random forest of 10 trees that split on entropy; random_state is fixed so the
# fitted forest is reproducible between runs.
model_3 = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
model_3.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [84]:
# Predict on the training data itself; scores derived from y_pred are
# training-set scores.
y_pred = model_3.predict(x_train)

In [85]:
# Confusion matrix (rows: true class, columns: predicted class), then accuracy.
cm = confusion_matrix(y_train, y_pred)
print(cm)
accuracy_score(y_train, y_pred)

[[144   4]
 [  1 331]]


0.9895833333333334

**MODEL 4 : NAIVE BAYES CLASSIFIER**

In [86]:
# Gaussian naive Bayes with default settings.
model_4 = GaussianNB()
model_4.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [87]:
# Predict on the training data itself; scores derived from y_pred are
# training-set scores.
y_pred = model_4.predict(x_train)

In [88]:
# Confusion matrix (rows: true class, columns: predicted class), then accuracy.
cm = confusion_matrix(y_train, y_pred)
print(cm)
accuracy_score(y_train, y_pred)

[[ 72  76]
 [ 19 313]]


0.8020833333333334

**MODEL 5 : SUPPORT VECTOR MACHINE(LINEAR)**

In [89]:
# Support vector classifier with a linear kernel.
model_5 = SVC(kernel = 'linear', random_state = 0)
model_5.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [90]:
# Predict on the training data itself; scores derived from y_pred are
# training-set scores.
y_pred = model_5.predict(x_train)

In [91]:
# Confusion matrix (rows: true class, columns: predicted class), then accuracy.
cm = confusion_matrix(y_train, y_pred)
print(cm)
accuracy_score(y_train, y_pred)

[[ 63  85]
 [  7 325]]


0.8083333333333333

**MODEL 6 : SUPPORT VECTOR MACHINE(NON LINEAR)**

In [92]:
# Support vector classifier with an RBF (non-linear) kernel.
model_6 =  SVC(kernel = 'rbf', random_state = 0)
model_6.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [93]:
# Predict on the training data itself; scores derived from y_pred are
# training-set scores.
y_pred = model_6.predict(x_train)

In [94]:
# Confusion matrix (rows: true class, columns: predicted class), then accuracy.
cm = confusion_matrix(y_train, y_pred)
print(cm)
accuracy_score(y_train, y_pred)

[[ 71  77]
 [  5 327]]


0.8291666666666667

**MODEL 7 : LOGISTIC REGRESSION**

In [95]:
# Logistic regression with default settings.
model_7 = LogisticRegression(random_state= 0)
model_7.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [96]:
# Predict on the training data itself; scores derived from y_pred are
# training-set scores.
y_pred = model_7.predict(x_train)

In [97]:
# Confusion matrix (rows: true class, columns: predicted class), then accuracy.
cm = confusion_matrix(y_train, y_pred)
print(cm)
accuracy_score(y_train, y_pred)

[[ 64  84]
 [  7 325]]


0.8104166666666667