In this notebook we try to practice all the classification algorithms that we learned in this course.

We load a dataset with the Pandas library, apply the algorithms below, and use accuracy evaluation metrics to find the one that performs best on this specific dataset.

Let's first load the required libraries:

In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
from sklearn import preprocessing

# About dataset

About dataset: This dataset is about past loans. The Loan_Data_set.csv data set includes details of 96 customers whose loans are already paid off or defaulted. It includes the following fields:

Loan_ID: Loan ID
Gender: The gender of the applicant
Married: The marital status of the applicant
Dependents: The total number of dependents
Education: Graduate or non-graduate
Self_Employed: Whether the applicant is self-employed
ApplicantIncome: Income of the applicant
CoapplicantIncome: Income of the co-applicant
LoanAmount: Loan amount needed
Loan_Amount_Term: Term of the loan
Credit_History: Whether the applicant has a credit history
Property_Area: Area of the property: rural, urban or semi-urban
Loan_Status: Loan status (yes or no)

# Load Data From CSV File

In [None]:
# Read the loan records, then print three quick overviews: the class balance of the
# target column, the column names, and the first rows of the frame.
df = pd.read_csv('loan_data_set.csv')
for overview in (df['Loan_Status'].value_counts(), df.columns, df.head()):
    print(overview)

In [None]:
# Feature matrix X: every descriptive column except the Loan_ID identifier.
# scikit-learn works on NumPy arrays, so the selected columns are converted with .values.
feature_columns = [
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
    'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term',
    'Credit_History', 'Property_Area',
]
X = df[feature_columns].values

# Target vector y: the loan status label of each row; preview the first five labels.
y = df['Loan_Status'].values
y[0:5]

In [None]:
# Data standardization gives every feature zero mean and unit variance. It is good
# practice, especially for distance-based algorithms such as KNN.
#
# StandardScaler only accepts numeric input without missing values, and X still holds
# raw text categories at this point, so the features are prepared first:
#   1. rebuild a frame from X and let pandas recover the numeric dtypes,
#   2. impute missing values (median for numeric columns, most frequent value for
#      the others) -- rows are never dropped, so X stays aligned with y,
#   3. one-hot encode the non-numeric columns.
features = pd.DataFrame(X).infer_objects()
features = features.fillna(features.median(numeric_only=True))
for col in features.select_dtypes(exclude='number').columns:
    features[col] = features[col].fillna(features[col].mode().iloc[0])
features = pd.get_dummies(features, dtype=float)

X = preprocessing.StandardScaler().fit_transform(features.to_numpy(dtype=float))
X[0:5]

In [None]:
from sklearn.model_selection import train_test_split

# Train/test split: half of the rows are held out for testing (test_size=0.5) and
# random_state is fixed so the split is the same on every run.
splits = train_test_split(X, y, test_size=0.5, random_state=4)
X_train, X_test, y_train, y_test = splits

# Report the shape of the feature matrix and of the label vector for each part.
for label, part_X, part_y in (('Train set:', X_train, y_train), ('Test set:', X_test, y_test)):
    print(label, part_X.shape, part_y.shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Start with k = 3 neighbours for the first KNN model.
k = 3

# Build the classifier and fit it on the training split.
neigh = KNeighborsClassifier(n_neighbors=k)
neigh.fit(X_train, y_train)

# Predict the labels of the test split and preview the first five predictions.
yhat = neigh.predict(X_test)
yhat[0:5]

In [None]:
from sklearn import metrics

# Accuracy of the KNN model fitted in the previous cell (`neigh`, with its test
# predictions `yhat`).
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# Sweep k over 1 .. Ks-1 and record the test accuracy of each model together with its
# standard error. The sweep starts at k=1 so that every slot of mean_acc/std_acc is
# filled; starting at 2 would leave the first slot at 0 while still plotting it and
# feeding it to argmax.
Ks = 10
k_values = range(1, Ks)
mean_acc = np.zeros(len(k_values))
std_acc = np.zeros(len(k_values))
ConfustionMx = []  # not used in this cell; kept in case another cell reads it
for idx, n in enumerate(k_values):
    # Loop-local names so the sweep does not silently overwrite `neigh` and `yhat`.
    candidate = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
    candidate_pred = candidate.predict(X_test)
    mean_acc[idx] = metrics.accuracy_score(y_test, candidate_pred)
    std_acc[idx] = np.std(candidate_pred == y_test) / np.sqrt(candidate_pred.shape[0])

# Accuracy per k with a band of one standard error on either side.
plt.plot(k_values, mean_acc, 'g')
plt.fill_between(k_values, mean_acc - 1 * std_acc, mean_acc + 1 * std_acc, alpha=0.10)
plt.legend(('Accuracy ', '+/- 1xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (K)')
plt.tight_layout()
plt.show()

best_k = int(mean_acc.argmax()) + 1  # mean_acc[0] belongs to k=1
print("The best accuracy was with", mean_acc.max(), "with k=", best_k)

# Leave `neigh`/`yhat` pointing at the best-scoring k rather than at whichever k the
# sweep happened to visit last, so later cells evaluate the model reported above.
# NOTE(review): k is chosen on the test split here, so the reported accuracy is
# optimistic; a separate validation split or cross-validation would avoid that.
neigh = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
yhat = neigh.predict(X_test)

# Decision tree 

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree that splits on entropy and is limited to a depth of four,
# fitted on the training split.
loanTree = DecisionTreeClassifier(criterion="entropy", max_depth=4)
loanTree.fit(X_train, y_train)

# Predict the test split and report the share of correct predictions.
predTree = loanTree.predict(X_test)
tree_accuracy = metrics.accuracy_score(y_test, predTree)
print("DecisionTrees's Accuracy: ", tree_accuracy)

# Support Vector Machine

In [None]:
from sklearn import svm

# Support vector classifier with scikit-learn's default settings, fitted on the
# training split in the same statement (fit returns the estimator itself).
clf = svm.SVC().fit(X_train, y_train)

# Predict the test split and preview the first five predictions.
yhat = clf.predict(X_test)
yhat[0:5]

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver and C=0.01, fitted on the training split.
LR = LogisticRegression(C=0.01, solver='liblinear')
LR.fit(X_train, y_train)

# Predicted labels for the test split; the whole array is shown as the cell output.
yhat = LR.predict(X_test)
yhat

# Model Evaluation using Test set


# 1.Knn

In [None]:
# jaccard_similarity_score was removed in scikit-learn 0.23; jaccard_score replaces it.
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
# predicted y
yhat_knn = neigh.predict(X_test)
# jaccard -- averaged over the classes weighted by support, the same averaging as the
# F1 score below; this also avoids having to name a positive label.
# The old jaccard_similarity_score equalled plain accuracy for this kind of target,
# so the value will differ from what older runs printed.
jaccard_knn = jaccard_score(y_test, yhat_knn, average='weighted')
print("KNN Jaccard index: ", jaccard_knn)
# f1_score
f1_score_knn = f1_score(y_test, yhat_knn, average='weighted')
print("KNN F1-score: ", f1_score_knn)

# 2.Decision tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# The metrics are imported in this cell so it does not depend on another evaluation
# cell having run first. jaccard_score replaces jaccard_similarity_score, which was
# removed in scikit-learn 0.23.
from sklearn.metrics import f1_score, jaccard_score
# predicted y
yhat_dt = loanTree.predict(X_test)

# jaccard -- averaged over the classes weighted by support, the same averaging as the
# F1 score below; this also avoids having to name a positive label.
jaccard_dt = jaccard_score(y_test, yhat_dt, average='weighted')
print("DT Jaccard index: ", jaccard_dt)

# f1_score
f1_score_dt = f1_score(y_test, yhat_dt, average='weighted')
print("DT F1-score: ", f1_score_dt)

# 3.SVM

In [None]:
from sklearn import svm
# The metrics are imported in this cell so it does not depend on another evaluation
# cell having run first. jaccard_score replaces jaccard_similarity_score, which was
# removed in scikit-learn 0.23.
from sklearn.metrics import f1_score, jaccard_score
# predicted y
yhat_svm = clf.predict(X_test)

# jaccard -- averaged over the classes weighted by support, the same averaging as the
# F1 score below; this also avoids having to name a positive label.
jaccard_svm = jaccard_score(y_test, yhat_svm, average='weighted')
print("SVM Jaccard index: ", jaccard_svm)

# f1_score
f1_score_svm = f1_score(y_test, yhat_svm, average='weighted')
print("SVM F1-score: ", f1_score_svm)

# 4.Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
# The metrics are imported in this cell so it does not depend on another evaluation
# cell having run first. jaccard_score replaces jaccard_similarity_score, which was
# removed in scikit-learn 0.23.
from sklearn.metrics import f1_score, jaccard_score, log_loss
# predicted y (hard labels) and the per-class probabilities needed for log loss
yhat_lg = LR.predict(X_test)
yhat_lg_prob = LR.predict_proba(X_test)

# jaccard -- averaged over the classes weighted by support, the same averaging as the
# F1 score below; this also avoids having to name a positive label.
jaccard_lg = jaccard_score(y_test, yhat_lg, average='weighted')
print("LR Jaccard index: ", jaccard_lg)

# f1_score
f1_score_lg = f1_score(y_test, yhat_lg, average='weighted')
print("LR F1-score: ", f1_score_lg)

# logloss -- computed from the predicted probabilities, not from the hard labels
logloss_lg = log_loss(y_test, yhat_lg_prob)
print("LR log loss: ", logloss_lg)