In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import tree, metrics, ensemble, svm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Calculates the per-class accuracy given predicted and true output labels.
def class_accs(y_pred, y_true):
    """Return the accuracy within true class 0 and within true class 1.

    Parameters
    ----------
    y_pred, y_true : array-like
        Predicted and true binary labels (0 / 1). Lists, tuples and
        NumPy arrays are all accepted; both are coerced with ``np.asarray``.

    Returns
    -------
    tuple
        ``(acc0, acc1)`` where ``accK`` is the fraction of samples whose true
        label is ``K`` that were predicted correctly. If a class does not
        occur in ``y_true`` its accuracy is NaN.
    """
    # Coerce first: with plain Python lists `y_pred == y_true` would be a
    # single bool rather than an element-wise comparison.
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    hits = y_pred == y_true

    def _accuracy_for(label):
        # Fraction of the samples truly labelled `label` that were hit.
        in_class = y_true == label
        n_in_class = in_class.sum()
        if n_in_class == 0:
            # Undefined for an absent class; return NaN explicitly instead of
            # triggering a 0/0 division warning.
            return float("nan")
        return (hits & in_class).sum() / n_in_class

    return _accuracy_for(0), _accuracy_for(1)

# Prints a summary of performance metrics given predicted and true output labels.
def print_metrics(y_pred, y_true):
    """Print F1, overall accuracy and the two per-class accuracies.

    Each metric goes on its own tab-indented line; the per-class lines are
    indented one level deeper than the overall accuracy they break down.
    Nothing is returned.
    """
    f1_value = metrics.f1_score(y_true, y_pred)
    overall_acc = metrics.accuracy_score(y_true, y_pred)
    class0_acc, class1_acc = class_accs(y_pred, y_true)

    # Assemble the report first, then emit it in one go.
    report_lines = [
        f'\tF1 = {f1_value}',
        f'\tAccuracy = {overall_acc}',
        f'\t\tclass 0: {class0_acc}',
        f'\t\tclass 1: {class1_acc}',
    ]
    print("\n".join(report_lines))

In [None]:
# Import data and make it nice (see notebook data_exploration_loan.ipynb)

# Keep the raw frame under its own name so re-running this cell always starts
# from the untouched CSV contents.
loan_raw = pd.read_csv('../loan_data_set.csv')

# String category -> numeric code, one mapping per kind of column.
loan_status_codes = {"Y": 1, "N": 0}
yes_no_codes = {"Yes": 1, "No": 0}
gender_codes = {"Male": 1, "Female": 0}
dependents_codes = {"0": 0, "1": 1, "2": 2, "3+": 3}
education_codes = {"Graduate": 1, "Not Graduate": 0}
# Evenly spaced codes, i.e. the area is treated as an ordered scale.
property_area_codes = {"Rural": 0, "Semiurban": 0.5, "Urban": 1}

# Column name -> mapping applied to that column.
cleanup_nums = {
    "Loan_Status": loan_status_codes,
    "Married": yes_no_codes,
    "Self_Employed": yes_no_codes,
    "Gender": gender_codes,
    "Dependents": dependents_codes,
    "Education": education_codes,
    "Property_Area": property_area_codes,
}

df = (
    loan_raw
    .drop(columns="Loan_ID")   # presumably a row identifier, not a feature
    .dropna(axis=0)            # discard every row with a missing value
    .replace(cleanup_nums)
    # Do not rely on `replace` silently downcasting the former string columns
    # to numeric dtypes (deprecated in recent pandas); request it explicitly.
    .infer_objects()
)

# `replace` leaves any value without a mapping untouched, so a new category
# would otherwise survive as a string and only fail later, inside the model.
leftover_columns = df.select_dtypes(exclude="number").columns.tolist()
assert not leftover_columns, f"columns not fully encoded: {leftover_columns}"

In [None]:
# Preview the encoded frame; a bare last expression is rendered with the
# notebook's rich DataFrame display.
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,1,1,1,1,0,4583,1508.0,128.0,360.0,1.0,0.0,0
2,1,1,0,1,1,3000,0.0,66.0,360.0,1.0,1.0,1
3,1,1,0,0,0,2583,2358.0,120.0,360.0,1.0,1.0,1
4,1,0,0,1,0,6000,0.0,141.0,360.0,1.0,1.0,1
5,1,1,2,1,1,5417,4196.0,267.0,360.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,2900,0.0,71.0,360.0,1.0,0.0,1
610,1,1,3,1,0,4106,0.0,40.0,180.0,1.0,0.0,1
611,1,1,1,1,0,8072,240.0,253.0,360.0,1.0,1.0,1
612,1,1,2,1,0,7583,0.0,187.0,360.0,1.0,1.0,1


In [None]:
# Convert everything to a NumPy array
# An explicit float dtype guarantees a numeric matrix even if some encoded
# columns are still object-typed, and fails right here if a string remains.
loan_matrix = df.to_numpy(dtype=float)
X = loan_matrix[:, :-1]   # every column except the last one
y = loan_matrix[:, -1]    # last column is the target
# Plain list of column names rather than a pandas Index: some scikit-learn
# releases validate `plot_tree(feature_names=...)` as a list and reject an Index.
feature_names = df.columns[:-1].tolist()

In [None]:
# Fixed seed so the stratified train/test partition is identical on every run.
seed = 1812
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,          # keep the class proportions of y in both parts
    random_state=seed,
)

In [None]:
def test_model(X_train, y_train, X_test, y_test, model, plot_tree=False,
               tree_feature_names=None, tree_path="Decision_tree_loan.png"):
    """Fit ``model`` on the training split and print its test-split metrics.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels passed to ``model.fit``.
    X_test, y_test : array-like
        Held-out features and labels used for the printed metrics.
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``.
    plot_tree : bool, default False
        If true, draw ``model`` with ``sklearn.tree.plot_tree`` and save the
        picture to ``tree_path``. Only meaningful for a decision tree.
    tree_feature_names : sequence of str, optional
        Names shown in the tree plot. Defaults to the notebook-level
        ``feature_names``.
    tree_path : str, default "Decision_tree_loan.png"
        Where the tree picture is written.

    Returns
    -------
    None
        Results are printed, not returned, so the calling cell shows no
        extra output.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    bal_acc = metrics.balanced_accuracy_score(y_test, y_pred)
    print("Balanced accuracy: {}".format(bal_acc))
    print_metrics(y_pred, y_test)

    if plot_tree:
        if tree_feature_names is None:
            # Fall back to the column names defined at notebook level.
            tree_feature_names = feature_names
        # Draw on a dedicated figure so saving and closing can never touch
        # (or wipe) some other figure that happens to be current.
        fig, ax = plt.subplots()
        try:
            # list(...): some scikit-learn releases reject a pandas Index here.
            tree.plot_tree(model, feature_names=list(tree_feature_names), ax=ax)
            fig.savefig(tree_path, dpi=800)
        finally:
            # Release the figure even if drawing or saving failed.
            plt.close(fig)

In [None]:
# Seed the stochastic estimators so Restart & Run All reproduces the same
# scores; the split is already seeded with the same value.
test_model(X_train, y_train, X_test, y_test,
           tree.DecisionTreeClassifier(random_state=seed), plot_tree=True)
test_model(X_train, y_train, X_test, y_test,
           ensemble.RandomForestClassifier(random_state=seed))
# An RBF kernel works on distances between samples, so standardise the
# features first. On the raw features the recorded run got class-0 accuracy
# 0.0 and class-1 accuracy 1.0, i.e. it predicted class 1 for every test row.
test_model(X_train, y_train, X_test, y_test,
           make_pipeline(StandardScaler(), svm.SVC(kernel="rbf")))

Balanced accuracy: 0.6167372191468576
	F1 = 0.7560975609756098
	Accuracy = 0.6666666666666666
		class 0: 0.4864864864864865
		class 1: 0.7469879518072289
Balanced accuracy: 0.7070986649299902
	F1 = 0.8603351955307261
	Accuracy = 0.7916666666666666
		class 0: 0.4864864864864865
		class 1: 0.927710843373494
Balanced accuracy: 0.5
	F1 = 0.8177339901477833
	Accuracy = 0.6916666666666667
		class 0: 0.0
		class 1: 1.0


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=5161fece-8ef6-4825-9ea6-16ebba884483' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>