In [16]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [17]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))



In [19]:
# Convert categorical data to numeric and separate target data
y_train = train_df['target']  
X_train = train_df.drop('target', axis=1)
y_test = test_df['target']
X_test = test_df.drop('target', axis=1)

train_dummies = pd.get_dummies(X_train)
test_dummies = pd.get_dummies(X_test)

In [30]:
# add missing dummy variables to testing set
train_dummies, test_dummies = train_dummies.align(test_dummies, join='outer', axis=1, fill_value=0)
train_dummies

Unnamed: 0,acc_now_delinq,acc_open_past_24mths,all_util,annual_inc,application_type_Individual,application_type_Joint App,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,...,total_il_high_credit_limit,total_pymnt,total_pymnt_inv,total_rec_int,total_rec_late_fee,total_rec_prncp,total_rev_hi_lim,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified
0,0.0,2.0,70.0,75000.0,True,False,13591.0,7092.0,79.7,0.0,...,10000.0,754.41,754.41,309.78,0.0,444.63,66200.0,True,False,False
1,0.0,3.0,55.0,102000.0,True,False,26649.0,4685.0,47.9,0.0,...,71044.0,2891.26,2891.26,1560.39,0.0,1330.87,55700.0,False,True,False
2,0.0,9.0,39.0,45000.0,True,False,1637.0,26933.0,18.4,0.0,...,46328.0,3512.60,3512.60,2481.45,0.0,1031.15,45900.0,False,False,True
3,0.0,2.0,71.0,38000.0,False,True,32787.0,1937.0,85.7,0.0,...,52017.0,2672.95,2672.95,801.10,0.0,1871.85,15300.0,True,False,False
4,0.0,9.0,37.0,43000.0,True,False,4007.0,22609.0,32.1,0.0,...,78680.0,10158.98,10158.98,3280.34,0.0,6878.64,41600.0,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,0.0,5.0,56.0,28000.0,True,False,1879.0,9425.0,49.6,0.0,...,19055.0,7181.35,7181.35,3511.51,0.0,3669.84,31000.0,True,False,False
12176,0.0,7.0,50.0,50000.0,True,False,3564.0,11060.0,16.8,0.0,...,54824.0,5200.43,5200.43,2610.41,0.0,2590.02,15500.0,False,False,True
12177,0.0,10.0,66.0,60000.0,True,False,3940.0,8595.0,41.9,0.0,...,53065.0,1571.64,1571.64,642.07,0.0,929.57,30700.0,True,False,False
12178,0.0,2.0,60.0,62000.0,False,True,34422.0,6843.0,2.2,0.0,...,32930.0,5225.14,5225.14,1053.13,0.0,4172.01,7000.0,False,True,False


Prediction: The logisitic regression will perform better than KNN because:
1. The data has reasoanbly high dimensions, so the distance between points carries less information about the relationships as a lot of it becomes noise.
2. KNN is more sensitive to outliers, and this is a dataset which may easily contain outliers.

In [24]:
# Train the Logistic Regression model on the unscaled data and print the model score
logistic_model = LogisticRegression()
logistic_model.fit(train_dummies, y_train)
print("Logistic Regression score (unscaled data):", logistic_model.score(test_dummies, y_test))


Logistic Regression score (unscaled data): 0.5072309655465759


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
# Train a K-Nearest Neighbors model and print the model score
knn_model = KNeighborsClassifier()
knn_model.fit(train_dummies, y_train)
print("K-Nearest Neighbors score (unscaled data):", knn_model.score(test_dummies, y_test))

K-Nearest Neighbors score (unscaled data): 0.5065929391748192


In [27]:
# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_dummies)
X_test_scaled = scaler.transform(test_dummies)

In [28]:
# Train the Logistic Regression model on the scaled data and print the model score
logistic_model_scaled = LogisticRegression()
logistic_model_scaled.fit(X_train_scaled, y_train)
print("Logistic Regression score (scaled data):", logistic_model_scaled.score(X_test_scaled, y_test))


Logistic Regression score (scaled data): 0.7607401105912378


In [29]:
# Train a K-Nearest Neighbors model on the scaled data and print the model score
knn_model_scaled = KNeighborsClassifier()
knn_model_scaled.fit(X_train_scaled, y_train)
print("K-Nearest Neighbors score (scaled data):", knn_model_scaled.score(X_test_scaled, y_test))


K-Nearest Neighbors score (scaled data): 0.5531688643130582


Logistic Regression performed better when scaled (scaling may also affect KNN as it relies on distance between points)