In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

# Read the loan train and test datasets.
df = pd.read_csv('loan_train.csv')
df1 = pd.read_csv('loan_test.csv')

# Impute missing values: mode for the categorical/discrete columns, median for
# Loan_Amount. Each frame is imputed from its own statistics.
#
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on a column selection: that chained form operates on a
# temporary object under pandas Copy-on-Write, so the frame would silently
# keep its NaNs.
#
# 'Married' is imputed in the test frame as well; a leftover NaN there would be
# an unseen label for the encoder fitted on the training column below.
mode_filled = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Term', 'Credit_History']
median_filled = ['Loan_Amount']
for frame in (df, df1):
    for col in mode_filled:
        frame[col] = frame[col].fillna(frame[col].mode()[0])
    for col in median_filled:
        frame[col] = frame[col].fillna(frame[col].median())

# Convert categorical variables to integer codes. Each encoder is fitted on the
# training column only and then reused for the test column, so both frames
# share the same category -> code mapping.
cat_vars = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Area']
for var in cat_vars:
    le = LabelEncoder()
    df[var] = le.fit_transform(df[var])
    df1[var] = le.transform(df1[var])

# Hold out 20% of the training data for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    df.drop('Status', axis=1), df['Status'], test_size=0.2, random_state=335
)

# KNN model.
# NOTE(review): features are not scaled before the distance computation and
# k=191 is large relative to the training split; the recorded output shows
# this model predicting 'Y' for every validation row. Consider scaling the
# features and tuning k.
knn = KNeighborsClassifier(n_neighbors=191)
knn.fit(X_train, y_train)

# Naive Bayes model.
nb = GaussianNB()
nb.fit(X_train, y_train)

# Predictions on the validation split (used for the confusion matrices).
knn_pred = knn.predict(X_val)
nb_pred = nb.predict(X_val)

# Predictions for the test dataset; these are displayed in the next cell.
knn_pred_test = knn.predict(df1)
nb_pred_test = nb.predict(df1)

# Accuracy and confusion matrix for the KNN model on the validation split.
knn_acc = knn.score(X_val, y_val)
knn_cm = confusion_matrix(y_val, knn_pred)

# Accuracy and confusion matrix for the Naive Bayes model on the validation split.
nb_acc = nb.score(X_val, y_val)
nb_cm = confusion_matrix(y_val, nb_pred)

print("KNN accuracy on validation dataset:", knn_acc)
print("KNN confusion matrix:\n", knn_cm)
print("Naive Bayes accuracy on validation dataset:", nb_acc)
print("Naive Bayes confusion matrix:\n", nb_cm)


KNN accuracy on validation dataset: 0.7886178861788617
KNN confusion matrix:
 [[ 0 26]
 [ 0 97]]
Naive Bayes accuracy on validation dataset: 0.7560975609756098
Naive Bayes confusion matrix:
 [[ 2 24]
 [ 6 91]]


In [2]:

# Show the test-set predictions of both fitted models, KNN first.
for heading, predictions in (
    ("KNN predictions for test dataset:\n", knn_pred_test),
    ("Naive Bayes predictions for test dataset:\n", nb_pred_test),
):
    print(heading, predictions)

KNN predictions for test dataset:
 ['Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y'

In [4]:
# Dimensions of the training frame as (rows, columns).
df.shape

(614, 12)

In [5]:
# Dimensions of the test frame as (rows, columns).
df1.shape

(367, 11)

In [6]:
# Preview the first rows of the training frame; the categorical columns have
# already been replaced by integer codes at this point.
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Area,Status
0,1,0,0,0,0,584900,0.0,15000000,360.0,1.0,2,Y
1,1,1,1,0,0,458300,150800.0,12800000,360.0,1.0,0,N
2,1,1,0,0,1,300000,0.0,6600000,360.0,1.0,2,Y
3,1,1,0,1,0,258300,235800.0,12000000,360.0,1.0,2,Y
4,1,0,0,0,0,600000,0.0,14100000,360.0,1.0,2,Y


In [7]:
# Remove the 'Dependents' feature from the training frame.
# NOTE(review): the next cell re-reads loan_train.csv into df, which discards
# this drop; re-running this cell on its own raises KeyError because the
# column is already gone.
df = df.drop(columns=['Dependents'])

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

# Read the loan train and test datasets again, so this cell starts from the raw files.
df = pd.read_csv('loan_train.csv')
df1 = pd.read_csv('loan_test.csv')

# Impute missing values per frame: mode for categorical/discrete columns,
# median for Loan_Amount.
#
# Columns are reassigned instead of using fillna(inplace=True) on a column
# selection, which acts on a temporary under pandas Copy-on-Write and leaves
# the frame unchanged.
#
# The test frame's 'Married' column is imputed too, so the encoder below never
# meets a NaN label it was not fitted on.
fill_with_mode = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Term', 'Credit_History']
fill_with_median = ['Loan_Amount']
for frame in (df, df1):
    for col in fill_with_mode:
        frame[col] = frame[col].fillna(frame[col].mode()[0])
    for col in fill_with_median:
        frame[col] = frame[col].fillna(frame[col].median())

# Convert categorical variables to integer codes; encoders are fitted on the
# training column and reused unchanged on the test column.
cat_vars = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Area']
for var in cat_vars:
    le = LabelEncoder()
    df[var] = le.fit_transform(df[var])
    df1[var] = le.transform(df1[var])

# Split the training data into train and validation parts (80/20, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    df.drop('Status', axis=1), df['Status'], test_size=0.2, random_state=335
)

# KNN model.
# NOTE(review): no feature scaling is applied and k=191 is large; the recorded
# confusion matrix shows only 'Y' being predicted on the validation split.
knn = KNeighborsClassifier(n_neighbors=191)
knn.fit(X_train, y_train)

# Naive Bayes model.
nb = GaussianNB()
nb.fit(X_train, y_train)

# Predictions on the validation split.
knn_pred = knn.predict(X_val)
nb_pred = nb.predict(X_val)

# Predictions for the test dataset (kept in variables, not printed here).
knn_pred_test = knn.predict(df1)
nb_pred_test = nb.predict(df1)

# Accuracy and confusion matrix for the KNN model on the validation split.
knn_acc = knn.score(X_val, y_val)
knn_cm = confusion_matrix(y_val, knn_pred)

# Accuracy and confusion matrix for the Naive Bayes model on the validation split.
nb_acc = nb.score(X_val, y_val)
nb_cm = confusion_matrix(y_val, nb_pred)

# NOTE(review): the labels below say "test", but every metric printed here is
# computed on the validation split (X_val / y_val); the test frame is only
# used for the predictions above. Consider renaming the labels.
print("KNN test accuracy:", knn_acc)
print("KNN test confusion matrix:\n", knn_cm)
print("Naive Bayes test accuracy:", nb_acc)
print("Naive Bayes test confusion matrix:\n", nb_cm)


KNN test accuracy: 0.7886178861788617
KNN test confusion matrix:
 [[ 0 26]
 [ 0 97]]
Naive Bayes test accuracy: 0.7560975609756098
Naive Bayes test confusion matrix:
 [[ 2 24]
 [ 6 91]]
