In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.decomposition import PCA
#from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#from sklearn.neighbors import NeighborhoodComponentsAnalysis
#from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np
import os
# Font sizes passed as `fontsize=` to the matplotlib calls in the cells below
# (MEDIUM_SIZE for axis labels, BIGGER_SIZE for plot titles).
# NOTE(review): SMALL_SIZE is not referenced in the visible cells — confirm it is still needed.
SMALL_SIZE = 8
MEDIUM_SIZE = 16
BIGGER_SIZE = 24

In [2]:
# Read the validation CSV and append natural-log versions of the total-income
# and loan-amount columns.
validation_csv = os.path.join('../data', 'cleanLoanDataValidationAllIncome.csv')
test_data = pd.read_csv(validation_csv).assign(
    LogTotalIncome=lambda frame: np.log(frame['TotalIncome']),
    LogLoanAmount=lambda frame: np.log(frame["LoanAmount"]),
)
# Show every column when a frame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
test_data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,TotalIncome,LogTotalIncome,LogLoanAmount
0,0,1,0,1,0,5720.0,0.0,110000.0,360.0,1,2,5720.0,8.651724,11.608236
1,0,1,1,1,0,3076.0,1500.0,126000.0,360.0,1,2,4576.0,8.428581,11.744037
2,0,1,2,1,0,5000.0,1800.0,208000.0,360.0,1,2,6800.0,8.824678,12.245293
3,0,1,2,1,0,2340.0,2546.0,100000.0,360.0,0,2,4886.0,8.494129,11.512925
4,0,0,0,0,0,3276.0,0.0,78000.0,360.0,1,2,3276.0,8.094378,11.264464
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340,0,1,3,0,1,4009.0,1777.0,113000.0,360.0,1,2,5786.0,8.663196,11.635143
341,0,1,0,1,0,4158.0,709.0,115000.0,360.0,1,2,4867.0,8.490233,11.652687
342,0,0,0,1,0,3250.0,1993.0,126000.0,360.0,0,1,5243.0,8.564649,11.744037
343,0,1,0,1,0,5000.0,2393.0,158000.0,360.0,1,0,7393.0,8.908289,11.970350


In [3]:
# Read the training CSV, then add a "Log<column>" natural-log companion for
# each of the two monetary columns.
train_csv = os.path.join('../data', 'cleanLoanDataTrainAllIncome.csv')
train_data = pd.read_csv(train_csv)
for source_col in ('TotalIncome', 'LoanAmount'):
    train_data['Log' + source_col] = np.log(train_data[source_col])
# Show every column when a frame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
train_data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome,LogTotalIncome,LogLoanAmount
0,0,1,1,1,0,4583.0,1508.0,128000.0,360.0,1,0,0,6091.0,8.714568,11.759786
1,0,1,0,1,1,3000.0,0.0,66000.0,360.0,1,2,1,3000.0,8.006368,11.097410
2,0,1,0,0,0,2583.0,2358.0,120000.0,360.0,1,2,1,4941.0,8.505323,11.695247
3,0,0,0,1,0,6000.0,0.0,141000.0,360.0,1,2,1,6000.0,8.699515,11.856515
4,0,1,2,1,1,5417.0,4196.0,267000.0,360.0,1,2,1,9613.0,9.170872,12.495004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558,1,0,0,1,0,2900.0,0.0,71000.0,360.0,1,0,1,2900.0,7.972466,11.170435
559,0,1,3,1,0,4106.0,0.0,40000.0,180.0,1,0,1,4106.0,8.320205,10.596635
560,0,1,1,1,0,8072.0,240.0,253000.0,360.0,1,2,1,8312.0,9.025456,12.441145
561,0,1,2,1,0,7583.0,0.0,187000.0,360.0,1,2,1,7583.0,8.933664,12.138864


## 0. Dimensionality Reduction with Neighborhood Components Analysis (adapted from https://scikit-learn.org/stable/auto_examples/neighbors/plot_nca_dim_reduction.html#sphx-glr-auto-examples-neighbors-plot-nca-dim-reduction-py)

# Compare three low-dimensional embeddings (PCA, LDA, NCA) of the loan features
# by the accuracy of a k-nearest-neighbours classifier fitted on each embedding.
# The imports live in this cell because the corresponding lines in the import
# cell are commented out.
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import NeighborhoodComponentsAnalysis
from sklearn.pipeline import make_pipeline

n_neighbors = 3
random_state = 57

target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]
data = train_data.drop(["Loan_Status", "ApplicantIncome", "CoapplicantIncome", "LogTotalIncome", "LogLoanAmount"], axis=1)
feature_names = data.columns
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.5, stratify=target, random_state=random_state)

# `data` is a DataFrame, so the feature count comes from its shape
# (`data[0]` would look up a column labelled 0 and raise KeyError).
dim = data.shape[1]
n_classes = len(np.unique(target))

# Reduce dimension to 2 with PCA
pca = make_pipeline(StandardScaler(),
                    PCA(n_components=2, random_state=random_state))

# LDA yields at most min(n_features, n_classes - 1) components, so asking for 2
# on a two-class target raises ValueError; cap the request accordingly.
lda_components = min(2, dim, n_classes - 1)
lda = make_pipeline(StandardScaler(),
                    LinearDiscriminantAnalysis(n_components=lda_components))

# Reduce dimension to 2 with NeighborhoodComponentAnalysis
nca = make_pipeline(StandardScaler(),
                    NeighborhoodComponentsAnalysis(n_components=2,
                                                   random_state=random_state))

# Use a nearest neighbor classifier to evaluate the methods
knn = KNeighborsClassifier(n_neighbors=n_neighbors)

# Make a list of the methods to be compared
dim_reduction_methods = [('PCA', pca), ('LDA', lda), ('NCA', nca)]

for name, model in dim_reduction_methods:
    plt.figure()

    # Fit the method's model
    model.fit(X_train, y_train)

    # Fit a nearest neighbor classifier on the embedded training set
    knn.fit(model.transform(X_train), y_train)

    # Compute the nearest neighbor accuracy on the embedded test set
    acc_knn = knn.score(model.transform(X_test), y_test)

    # Embed the full data set using the fitted model
    X_embedded = model.transform(data)

    # A one-component embedding has no second axis; draw it along y = 0.
    if X_embedded.shape[1] > 1:
        second_axis = X_embedded[:, 1]
    else:
        second_axis = np.zeros(X_embedded.shape[0])

    # Plot the projected points and show the evaluation score
    plt.scatter(X_embedded[:, 0], second_axis, c=target, s=30, cmap='Set1')
    plt.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name,
                                                              n_neighbors,
                                                              acc_knn))
plt.show()

## 1a. KNN (Combined income)

In [None]:
# Display names for the two classes, and the label column to predict.
# NOTE(review): presumably 0 = "Denied" and 1 = "Approved" — confirm against the data-cleaning step.
target_names = ["Denied", "Approved"]
target = train_data["Loan_Status"]

In [None]:
# Variant 1a features: remove the label, the separate applicant/co-applicant
# incomes and both log-transformed columns.
excluded_columns = [
    "Loan_Status",
    "ApplicantIncome",
    "CoapplicantIncome",
    "LogTotalIncome",
    "LogLoanAmount",
]
data = train_data.drop(columns=excluded_columns)
feature_names = data.columns

In [None]:
# Partition features and labels into train/test sets; the fixed seed keeps the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=57,
)

In [None]:
# Create a StandardScaler and fit it on the training features only.
scaler_unfitted = StandardScaler()
X_scaler = scaler_unfitted.fit(X_train)

In [None]:
# Apply the fitted scaler to both partitions, then display the scaled test features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)
X_test_scaled

In [None]:
# Loop through different k values to see which has the highest accuracy.
# Only odd k are tried so the neighbour vote cannot end in a tie.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 1a", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_1a.png')
plt.show()


In [None]:
# k = 5 looked like roughly the best value in the sweep; refit with it and
# report accuracy on the held-out split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
held_out_accuracy = knn.score(X_test_scaled, y_test)
print('k=5 Test Acc: %.3f' % held_out_accuracy)

## 1b. KNN (Applicant and co-applicant income separate)

In [None]:
# Labels and class display names.
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]

# Variant 1b features: remove the label, the combined income and both
# log-transformed columns (applicant and co-applicant incomes stay separate).
excluded_columns = ["Loan_Status", "TotalIncome", "LogTotalIncome", "LogLoanAmount"]
data = train_data.drop(columns=excluded_columns)
feature_names = data.columns

In [None]:
# Seeded train/test partition of the 1b feature set.
X_train, X_test, y_train, y_test = train_test_split(
    data, target,
    random_state=57,
)

In [None]:
# Create a StandardScaler and fit it to the training data.
fresh_scaler = StandardScaler()
X_scaler = fresh_scaler.fit(X_train)

In [None]:
# Scale the test partition first, then the training partition (the two calls
# are independent), and display the scaled test features.
X_test_scaled = X_scaler.transform(X_test)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled

In [None]:
# Loop through different k values to see which has the highest accuracy.
# Only odd k are tried so the neighbour vote cannot end in a tie.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 1b", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_1b.png')
plt.show()

In [None]:
# k = 5 appeared to be about the best choice in the sweep; refit and score it.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
test_accuracy = knn.score(X_test_scaled, y_test)
print('k=5 Test Acc: %.3f' % test_accuracy)

## 1c. KNN (log(TotalIncome))

In [None]:
# Labels and class display names.
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]

# Variant 1c features: remove the label, every raw income column and the
# log-transformed loan amount (the log of total income is kept).
excluded_columns = [
    "Loan_Status",
    "TotalIncome",
    "ApplicantIncome",
    "CoapplicantIncome",
    "LogLoanAmount",
]
data = train_data.drop(columns=excluded_columns)
feature_names = data.columns

In [None]:
# Seeded train/test partition of the 1c feature set.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    random_state=57,
)

In [None]:
# Create a StandardScaler and fit it to the training data.
blank_scaler = StandardScaler()
X_scaler = blank_scaler.fit(X_train)

In [None]:
# Scale both partitions with the training-fitted scaler and display the scaled
# test features.
scaled_partitions = [X_scaler.transform(X_train), X_scaler.transform(X_test)]
X_train_scaled, X_test_scaled = scaled_partitions
X_test_scaled

In [None]:
# Loop through different k values to see which has the highest accuracy.
# Only odd k are tried so the neighbour vote cannot end in a tie.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 1c", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_1c.png')
plt.show()

In [None]:
# k = 5 appeared to be about the best choice in the sweep; refit and score it.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
accuracy_at_k = knn.score(X_test_scaled, y_test)
print('k=5 Test Acc: %.3f' % accuracy_at_k)

## 1d. KNN (log(TotalIncome), log(LoanAmount))

In [None]:
# Labels and class display names.
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]

# Variant 1d features: remove the label, every raw income column and the raw
# loan amount (both log-transformed columns are kept).
excluded_columns = [
    "Loan_Status",
    "TotalIncome",
    "ApplicantIncome",
    "CoapplicantIncome",
    "LoanAmount",
]
data = train_data.drop(columns=excluded_columns)
feature_names = data.columns

In [None]:
# Seeded train/test partition of the 1d feature set.
X_train, X_test, y_train, y_test = train_test_split(
    data, target,
    random_state=57,
)

In [None]:
# Fit a StandardScaler on the training features only.
new_scaler = StandardScaler()
X_scaler = new_scaler.fit(X_train)

In [None]:
# Scale both partitions with the training-fitted scaler and display the scaled
# test features.
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))
X_test_scaled

In [None]:
# Loop through different k values to see which has the highest accuracy.
# Only odd k are tried so the neighbour vote cannot end in a tie.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 1d", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_1d.png')
plt.show()

In [None]:
# Refit at the k chosen for this variant and report held-out accuracy.
# The model and the printed label both read `best_k`, so they cannot drift
# apart (the previous comment here still said k = 5 while the code used 7).
best_k = 7
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_scaled, y_train)
print('k=%d Test Acc: %.3f' % (best_k, knn.score(X_test_scaled, y_test)))

## check test

## 2a. KNN, took out Loan_Amount_Term (Combined incomes)

In [None]:
# Labels and class display names.
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]

# Variant 2a features: combined income, with Loan_Amount_Term removed.
# 'Loan_Amount_Term' was missing from this drop list, which made 2a an exact
# repeat of 1a; it is dropped here as in the other 2x variants.
data = train_data.drop(
    ["Loan_Status", 'Loan_Amount_Term', "ApplicantIncome", "CoapplicantIncome", "LogTotalIncome", "LogLoanAmount"],
    axis=1,
)
feature_names = data.columns

In [None]:
# Seeded train/test partition, then standardise both partitions using a scaler
# fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit(X_train).transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Fit a KNN classifier for each odd k below 40 and record train/test accuracy.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 2a", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_2a.png')
plt.show()

In [None]:
# Refit at the chosen k and report accuracy on the held-out split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
held_out_accuracy = knn.score(X_test_scaled, y_test)
print('k=5 Test Acc: %.3f' % held_out_accuracy)

## 2b. KNN, took out Loan_Amount_Term (Applicant and co-applicant incomes separate)

In [None]:
# Labels and class display names.
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]

# Variant 2b features: remove the label, the combined income, the loan term
# and both log-transformed columns.
excluded_columns = [
    "Loan_Status",
    "TotalIncome",
    'Loan_Amount_Term',
    "LogTotalIncome",
    "LogLoanAmount",
]
data = train_data.drop(columns=excluded_columns)
feature_names = data.columns

In [None]:
# Seeded train/test partition, then standardise both partitions using a scaler
# fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Fit a KNN classifier for each odd k below 40 and record train/test accuracy.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 2b", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_2b.png')
plt.show()

In [None]:
# Refit at the chosen k and report accuracy on the held-out split.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train_scaled, y_train)
test_accuracy = knn.score(X_test_scaled, y_test)
print('k=11 Test Acc: %.3f' % test_accuracy)

## 2c. KNN, took out Loan_Amount_Term (log(TotalIncome))

In [None]:
# Labels and class display names.
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]

# Variant 2c features: remove the label, every raw income column, the loan
# term and the log-transformed loan amount.
excluded_columns = [
    "Loan_Status",
    "TotalIncome",
    'Loan_Amount_Term',
    "ApplicantIncome",
    "CoapplicantIncome",
    "LogLoanAmount",
]
data = train_data.drop(columns=excluded_columns)
feature_names = data.columns

In [None]:
# Seeded train/test partition, then standardise both partitions using a scaler
# fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit(X_train).transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Fit a KNN classifier for each odd k below 40 and record train/test accuracy.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 2c", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_2c.png')
plt.show()

In [None]:
# Refit at the chosen k and report accuracy on the held-out split.
knn = KNeighborsClassifier(n_neighbors=13).fit(X_train_scaled, y_train)
accuracy_at_k = knn.score(X_test_scaled, y_test)
print('k=13 Test Acc: %.3f' % accuracy_at_k)

## 2d. KNN, took out Loan_Amount_Term (log(TotalIncome)), used log(LoanAmount)

In [None]:
# Labels and class display names.
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]

# Variant 2d features: remove the label, every raw income column, the loan
# term and the raw loan amount.
excluded_columns = [
    "Loan_Status",
    "TotalIncome",
    'Loan_Amount_Term',
    "ApplicantIncome",
    "CoapplicantIncome",
    "LoanAmount",
]
data = train_data.drop(columns=excluded_columns)
feature_names = data.columns

In [None]:
# Seeded train/test partition, then standardise both partitions using a scaler
# fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Fit a KNN classifier for each odd k below 40 and record train/test accuracy.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 2d", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_2d.png')
plt.show()

In [None]:
# Refit at the chosen k and report accuracy on the held-out split.
knn = KNeighborsClassifier(n_neighbors=19)
knn.fit(X_train_scaled, y_train)
held_out_accuracy = knn.score(X_test_scaled, y_test)
print('k=19 Test Acc: %.3f' % held_out_accuracy)

## check test

## 3a. KNN, took out Loan_Amount_Term and LoanAmount (Combined incomes)

In [None]:
# Labels and class display names.
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]

# Variant 3a features: remove the label, the loan amount and term, the
# separate incomes and both log-transformed columns.
excluded_columns = [
    "Loan_Status",
    'LoanAmount',
    'Loan_Amount_Term',
    "ApplicantIncome",
    "CoapplicantIncome",
    "LogTotalIncome",
    "LogLoanAmount",
]
data = train_data.drop(columns=excluded_columns)
feature_names = data.columns

In [None]:
# Seeded train/test partition, then standardise both partitions using a scaler
# fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit(X_train).transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Fit a KNN classifier for each odd k below 40 and record train/test accuracy.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 3a", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_3a.png')
plt.show()

In [None]:
# Refit at the chosen k and report accuracy on the held-out split.
knn = KNeighborsClassifier(n_neighbors=21).fit(X_train_scaled, y_train)
test_accuracy = knn.score(X_test_scaled, y_test)
print('k=21 Test Acc: %.3f' % test_accuracy)

## 3b. KNN, took out Loan_Amount_Term and LoanAmount (Applicant and co-applicant incomes separate)

In [None]:
# Labels and class display names.
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]

# Variant 3b features: remove the label, the loan amount and term, the
# combined income and both log-transformed columns.
excluded_columns = [
    "Loan_Status",
    'LoanAmount',
    'Loan_Amount_Term',
    "TotalIncome",
    "LogTotalIncome",
    "LogLoanAmount",
]
data = train_data.drop(columns=excluded_columns)
feature_names = data.columns

In [None]:
# Seeded train/test partition, then standardise both partitions using a scaler
# fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Fit a KNN classifier for each odd k below 40 and record train/test accuracy.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 3b", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_3b.png')
plt.show()

In [None]:
# Refit at the chosen k and report accuracy on the held-out split.
knn = KNeighborsClassifier(n_neighbors=21)
knn.fit(X_train_scaled, y_train)
accuracy_at_k = knn.score(X_test_scaled, y_test)
print('k=21 Test Acc: %.3f' % accuracy_at_k)

## 3c. KNN, took out Loan_Amount_Term and LoanAmount (log(Total Income))

In [None]:
# Labels and class display names.
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]

# Variant 3c features: remove the label, the loan amount and term, every raw
# income column and the log-transformed loan amount.
excluded_columns = [
    "Loan_Status",
    'LoanAmount',
    'Loan_Amount_Term',
    "TotalIncome",
    "CoapplicantIncome",
    "ApplicantIncome",
    "LogLoanAmount",
]
data = train_data.drop(columns=excluded_columns)
feature_names = data.columns

In [None]:
# Seeded train/test partition, then standardise both partitions using a scaler
# fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit(X_train).transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Fit a KNN classifier for each odd k below 40 and record train/test accuracy.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 3c", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_3c.png')
plt.show()

In [None]:
# Refit at the chosen k and report accuracy on the held-out split.
# The classifier here uses k = 19 but the message used to say "k=21"; both now
# read `best_k`, so the printed label always matches the fitted model.
# NOTE(review): confirm 19 (not 21) is the intended k for this variant.
best_k = 19
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_scaled, y_train)
print('k=%d Test Acc: %.3f' % (best_k, knn.score(X_test_scaled, y_test)))

## 4a. KNN, took out Loan_Amount_Term and LoanAmount and also factors less important in Random Forest tests (Combined incomes)

In [None]:
# Labels and class display names.
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]

# Variant 4a features: remove the label, the separate incomes, the loan amount
# and term, four demographic columns and both log-transformed columns.
excluded_columns = [
    "Loan_Status",
    "ApplicantIncome",
    "CoapplicantIncome",
    'Self_Employed',
    'LoanAmount',
    'Education',
    'Married',
    'Gender',
    'Loan_Amount_Term',
    "LogTotalIncome",
    "LogLoanAmount",
]
data = train_data.drop(columns=excluded_columns)
feature_names = data.columns

In [None]:
# Seeded train/test partition, then standardise both partitions using a scaler
# fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Fit a KNN classifier for each odd k below 40 and record train/test accuracy.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 4a", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_4a.png')
plt.show()

In [None]:
# Refit at the chosen k and report accuracy on the held-out split.
knn = KNeighborsClassifier(n_neighbors=17).fit(X_train_scaled, y_train)
held_out_accuracy = knn.score(X_test_scaled, y_test)
print('k=17 Test Acc: %.3f' % held_out_accuracy)

## 4b. KNN, took out Loan_Amount_Term and LoanAmount and also factors less important in Random Forest tests (Applicant and Co-Applicant incomes considered separately)

In [None]:
# Labels and class display names.
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]

# Variant 4b features: remove the label, the combined income, the loan amount
# and term, four demographic columns and both log-transformed columns.
excluded_columns = [
    "Loan_Status",
    "TotalIncome",
    'Self_Employed',
    'LoanAmount',
    'Education',
    'Gender',
    'Married',
    'Loan_Amount_Term',
    "LogTotalIncome",
    "LogLoanAmount",
]
data = train_data.drop(columns=excluded_columns)
feature_names = data.columns

In [None]:
# Seeded train/test partition, then standardise both partitions using a scaler
# fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit(X_train).transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Fit a KNN classifier for each odd k below 40 and record train/test accuracy.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 4b", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_4b.png')
plt.show()

In [None]:
# Refit at the chosen k and report accuracy on the held-out split.
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train_scaled, y_train)
test_accuracy = knn.score(X_test_scaled, y_test)
print('k=15 Test Acc: %.3f' % test_accuracy)

## 4c. KNN, took out Loan_Amount_Term and LoanAmount and also factors less important in Random Forest tests (log(TotalIncome))

In [None]:
# Labels and class display names.
target = train_data["Loan_Status"]
target_names = ["Denied", "Approved"]

# Variant 4c features: remove the label, every raw income column, the loan
# amount and term, four demographic columns and the log-transformed loan amount.
excluded_columns = [
    "Loan_Status",
    "TotalIncome",
    'Self_Employed',
    'LoanAmount',
    'Education',
    'Gender',
    'Married',
    'Loan_Amount_Term',
    "ApplicantIncome",
    "CoapplicantIncome",
    "LogLoanAmount",
]
data = train_data.drop(columns=excluded_columns)
feature_names = data.columns

In [None]:
# Seeded train/test partition, then standardise both partitions using a scaler
# fitted on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=57)
X_scaler = StandardScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Fit a KNN classifier for each odd k below 40 and record train/test accuracy.
k_values = range(1, 40, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each line is labelled and the y-label is neutral.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors", fontsize=MEDIUM_SIZE)
plt.ylabel("Accuracy score", fontsize=MEDIUM_SIZE)
plt.axis([0, 40, 0.58, 1.02])
plt.title("KNN 4c", fontsize=BIGGER_SIZE)
plt.legend()
plt.savefig('../static/images/KNN_4c.png')
plt.show()

In [None]:
# Refit at the chosen k and report accuracy on the held-out split.
knn = KNeighborsClassifier(n_neighbors=11).fit(X_train_scaled, y_train)
accuracy_at_k = knn.score(X_test_scaled, y_test)
print('k=11 Test Acc: %.3f' % accuracy_at_k)