In [1]:
import joblib
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
data = pd.read_csv("clean_loan_data.csv")

In [3]:
data.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y


In [4]:
X = data[["Married", "Education","ApplicantIncome","LoanAmount","Credit_History"]]

In [5]:
X

Unnamed: 0,Married,Education,ApplicantIncome,LoanAmount,Credit_History
0,Yes,Graduate,4583,128.0,1.0
1,Yes,Graduate,3000,66.0,1.0
2,Yes,Not Graduate,2583,120.0,1.0
3,No,Graduate,6000,141.0,1.0
4,Yes,Not Graduate,2333,95.0,1.0
...,...,...,...,...,...
303,Yes,Graduate,5703,128.0,1.0
304,Yes,Graduate,3232,108.0,1.0
305,No,Graduate,2900,71.0,1.0
306,Yes,Graduate,4106,40.0,1.0


In [6]:
y = data[['Loan_Status']]

In [7]:
y

Unnamed: 0,Loan_Status
0,N
1,Y
2,Y
3,Y
4,Y
...,...
303,Y
304,Y
305,Y
306,Y


In [8]:
labelencoder = preprocessing.LabelEncoder()

In [9]:
y["Loan_Status"] = labelencoder.fit_transform(y["Loan_Status"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y["Loan_Status"] = labelencoder.fit_transform(y["Loan_Status"])


In [10]:
y # Y is 1 & N is 0

Unnamed: 0,Loan_Status
0,0
1,1
2,1
3,1
4,1
...,...
303,1
304,1
305,1
306,1


In [11]:
labelencoder.classes_

array(['N', 'Y'], dtype=object)

In [12]:
X

Unnamed: 0,Married,Education,ApplicantIncome,LoanAmount,Credit_History
0,Yes,Graduate,4583,128.0,1.0
1,Yes,Graduate,3000,66.0,1.0
2,Yes,Not Graduate,2583,120.0,1.0
3,No,Graduate,6000,141.0,1.0
4,Yes,Not Graduate,2333,95.0,1.0
...,...,...,...,...,...
303,Yes,Graduate,5703,128.0,1.0
304,Yes,Graduate,3232,108.0,1.0
305,No,Graduate,2900,71.0,1.0
306,Yes,Graduate,4106,40.0,1.0


In [13]:
X["Married"] = labelencoder.fit_transform(X["Married"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Married"] = labelencoder.fit_transform(X["Married"])


In [14]:
X[['Married']]

Unnamed: 0,Married
0,1
1,1
2,1
3,0
4,1
...,...
303,1
304,1
305,0
306,1


In [15]:
labelencoder.classes_

array(['No', 'Yes'], dtype=object)

In [16]:
X["Education"] = labelencoder.fit_transform(X["Education"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["Education"] = labelencoder.fit_transform(X["Education"])


In [17]:
labelencoder.classes_

array(['Graduate', 'Not Graduate'], dtype=object)

In [18]:
X

Unnamed: 0,Married,Education,ApplicantIncome,LoanAmount,Credit_History
0,1,0,4583,128.0,1.0
1,1,0,3000,66.0,1.0
2,1,1,2583,120.0,1.0
3,0,0,6000,141.0,1.0
4,1,1,2333,95.0,1.0
...,...,...,...,...,...
303,1,0,5703,128.0,1.0
304,1,0,3232,108.0,1.0
305,0,0,2900,71.0,1.0
306,1,0,4106,40.0,1.0


In [19]:
scaler = StandardScaler()

In [20]:
X = scaler.fit_transform(X)

In [21]:
X

array([[ 0.81539246, -0.58734842,  0.67389339,  0.79689813,  0.41364557],
       [ 0.81539246, -0.58734842, -0.41036528, -1.31665281,  0.41364557],
       [ 0.81539246,  1.70256694, -0.6959849 ,  0.52418188,  0.41364557],
       ...,
       [-1.22640329, -0.58734842, -0.47885919, -1.14620515,  0.41364557],
       [ 0.81539246, -0.58734842,  0.34717742, -2.20298062,  0.41364557],
       [-1.22640329, -0.58734842,  0.67389339,  0.96734579, -2.41752858]])

In [22]:
joblib.dump(scaler,'Scaler.pkl')

['Scaler.pkl']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [24]:
def modelperformance(predictions):
    print(f"Accuracy score in model is {accuracy_score(y_test, predictions)}")

In [25]:
lg_model = LogisticRegression()

In [26]:
lg_model

In [27]:
lg_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [28]:
lg_prediction = lg_model.predict(X_test)

In [29]:
modelperformance(lg_prediction)

Accuracy score in model is 0.8064516129032258


In [30]:
param_grid = {"n_neighbors": [1,2,3,4,5,6,7,8,9,10],
             "weights": ["uniform", "distance"]}
gridkn = GridSearchCV(KNeighborsClassifier(),param_grid)

In [31]:
gridkn.fit(X_train, y_train.values.ravel())

In [32]:
gridkn.best_params_

{'n_neighbors': 7, 'weights': 'uniform'}

In [33]:
knpredict = gridkn.predict(X_test)

In [34]:
modelperformance(knpredict)

Accuracy score in model is 0.8064516129032258


In [35]:
svm = SVC()
param_grid_svm = {"C": [0.01,0.02,0.03,0.04,0.05, 0.1, 0.5], "kernel": ['linear','rbf','poly']}

In [36]:
gridsvc = GridSearchCV(svm, param_grid_svm)

In [37]:
gridsvc.fit(X_train, y_train.values.ravel())

In [38]:
gridsvc.best_params_

{'C': 0.01, 'kernel': 'linear'}

In [39]:
svcpredict = gridsvc.predict(X_test)

In [40]:
modelperformance(svcpredict)

Accuracy score in model is 0.8064516129032258


In [41]:
joblib.dump(gridsvc, 'model.pkl')

['model.pkl']