In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the cleaned SBA dataset and discard the columns that are not used as
# model inputs.
# NOTE(review): 'Unnamed: 0' is presumably a leftover index column written by
# an earlier to_csv call -- confirm against the cleaning notebook.
unused_columns = ['Unnamed: 0', 'NAICS', 'Industry']
sba_processed = pd.read_csv('../data/SBA_cleaned_data.csv')
# DataFrame.drop returns a new frame, so sba_processed itself is left intact.
df = sba_processed.drop(columns=unused_columns)

In [3]:
# Separate the target from the predictors: y is the 'Default' column and X is
# every remaining column of df.
# NOTE(review): a pd.get_dummies encoding of X was previously present here but
# disabled -- confirm that all remaining columns are already numeric.
target_column = 'Default'
y = df[target_column]
X = df.drop(columns=[target_column])

In [4]:
# Hold out 25% of the rows as a test set. The split is stratified on y so both
# partitions keep the same class proportions, and the seed is fixed so the
# partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=520,
    stratify=y,
)

In [None]:
# Baseline model: a k-nearest-neighbours classifier with scikit-learn's default
# settings, fitted on the training split and scored on the held-out test split.
# NOTE(review): KNN is distance based -- confirm the features in X are on
# comparable scales (no scaling step is applied in the cells shown here).
knn = neighbors.KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [6]:
# Improve Model
# Coarse hyper-parameter sweep: 5 neighbourhood sizes x 2 weighting schemes,
# each candidate scored by accuracy under 5-fold cross-validation on the
# training split only. With refit=True the best candidate is refit on all of
# X_train once the search finishes.
params = dict(
    n_neighbors=[3, 5, 7, 10, 20],
    weights=['uniform', 'distance'],
)
grid_search = GridSearchCV(
    estimator=neighbors.KNeighborsClassifier(),
    param_grid=params,
    scoring="accuracy",
    cv=5,
    refit=True,
    n_jobs=-1,  # use every available core
    verbose=1,
)
grid_search.fit(X_train, y_train)

# NOTE(review): the 'bt' prefix in these labels looks like a leftover from
# another notebook -- confirm before renaming, since saved outputs contain it.
print(f'bt best hyperparams      : {grid_search.best_params_}')
print(f'bt best mean cv accuracy : {grid_search.best_score_:.5f}')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
bt best hyperparams      : {'n_neighbors': 5, 'weights': 'uniform'}
bt best mean cv accuracy : 0.91713


In [9]:
# Improve Model again
# Finer sweep over small neighbourhood sizes (1-9), again crossed with both
# weighting schemes and scored by 5-fold cross-validated accuracy on the
# training split. This rebinds `params` and `grid_search`, replacing whatever
# those names held before this cell ran.
neighbor_counts = [1, 2, 4, 5, 6, 8, 9]
weighting_schemes = ['uniform', 'distance']
params = {'n_neighbors': neighbor_counts, 'weights': weighting_schemes}

search_options = dict(refit=True, cv=5, n_jobs=-1, verbose=1, scoring="accuracy")
grid_search = GridSearchCV(neighbors.KNeighborsClassifier(), param_grid=params, **search_options)
grid_search = grid_search.fit(X_train, y_train)

best_params_line = f'bt best hyperparams      : {grid_search.best_params_}'
best_score_line = f'bt best mean cv accuracy : {grid_search.best_score_:.5f}'
print(best_params_line)
print(best_score_line)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
bt best hyperparams      : {'n_neighbors': 5, 'weights': 'uniform'}
bt best mean cv accuracy : 0.91713


In [10]:
# Tuned model and its test accuracy.
# The grid search was created with refit=True, so `best_estimator_` is already
# a KNeighborsClassifier fitted on the whole training split with the winning
# hyper-parameters. Reusing it avoids re-typing those values by hand (which
# silently goes stale if the search grid or data changes) and avoids a
# redundant second fit.
knn_improved = grid_search.best_estimator_

# Backward-compatible alias: this cell used to store the tuned model as an
# ad-hoc `improve` attribute on the baseline `knn` estimator. Prefer
# `knn_improved` in new code.
knn.improve = knn_improved

# Score on the held-out test split, which played no part in the search.
y_pred_improved = knn_improved.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_improved))

Accuracy: 0.9180816311878522
