# K Nearest Neighbors

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

In [2]:
# Read the raw data, separate the target column from the features, and make
# a seeded train/test split so the partition is the same on every run.
df = pd.read_csv('data/UCI_Credit_Card.csv')
X = df.drop(columns='default.payment.next.month')
y = df['default.payment.next.month']
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=0)

In [3]:
def get_metrics(model):
    """Score a fitted classifier on the notebook's current train and test data.

    The function reads the module-level ``X_train``, ``X_test``, ``y_train``
    and ``y_test``, so the numbers reflect whatever those names hold at the
    moment of the call.

    Parameters
    ----------
    model : object
        A fitted estimator exposing ``predict``.

    Returns
    -------
    dict
        Maps ``'train_<metric>'`` and ``'test_<metric>'`` to the score, for
        each metric in accuracy, f1, precision and recall (in that order,
        train before test).
    """
    # Predict once per split and keep the matching ground truth alongside.
    splits = {
        'train': (y_train, model.predict(X_train)),
        'test': (y_test, model.predict(X_test)),
    }
    scorers = (
        ('accuracy', accuracy_score),
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    # Outer loop over metrics, inner over splits, so keys come out as
    # train_accuracy, test_accuracy, train_f1, test_f1, ...
    return {
        f'{split}_{metric}': scorer(y_true, y_pred)
        for metric, scorer in scorers
        for split, (y_true, y_pred) in splits.items()
    }

## Baseline Model

In [4]:
# Baseline: default-parameter KNN fitted on the training split exactly as it
# came out of train_test_split (no cleaning or scaling yet).
clf0 = KNeighborsClassifier().fit(X_train, y_train)
get_metrics(clf0)

{'train_accuracy': 0.8119111111111111,
 'test_accuracy': 0.7518666666666667,
 'train_f1': 0.4337704040674338,
 'test_f1': 0.23194387123400742,
 'train_precision': 0.6562753036437247,
 'test_precision': 0.3552465233881163,
 'train_recall': 0.3239408473221423,
 'test_recall': 0.1721813725490196}

## Data Cleaning

In [5]:
# Column groups used by the preprocessing cells.
to_drop = ['ID']
categoricals = ['SEX', 'EDUCATION', 'MARRIAGE']
continuous = [
    'LIMIT_BAL', 'AGE',
    # The PAY_* status columns go PAY_0, PAY_2, ..., PAY_6 (there is no PAY_1 here).
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    *[f'BILL_AMT{month}' for month in range(1, 7)],
    *[f'PAY_AMT{month}' for month in range(1, 7)],
]
target = ['default.payment.next.month']

In [6]:
# Remove the columns listed in `to_drop`. Reassigning (rather than dropping
# in place) yields fresh frames that are no longer derived slices of `df`,
# so the column assignments below operate on independent copies.
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)

# Collapse every non-positive value in the PAY_* status columns to 0 and
# leave positive values untouched. `clip(lower=0)` is the vectorized
# equivalent of mapping `0 if x <= 0 else x` over each element.
pay_status_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
X_train[pay_status_cols] = X_train[pay_status_cols].clip(lower=0)
X_test[pay_status_cols] = X_test[pay_status_cols].clip(lower=0)

In [7]:
# Refit the default KNN on the cleaned features to measure the effect of the
# cleaning step alone.
clf1 = KNeighborsClassifier().fit(X_train, y_train)
get_metrics(clf1)

{'train_accuracy': 0.8152888888888888,
 'test_accuracy': 0.7624,
 'train_f1': 0.4446819882415821,
 'test_f1': 0.26424442609413706,
 'train_precision': 0.6709677419354839,
 'test_precision': 0.4050632911392405,
 'train_recall': 0.33253397282174263,
 'test_recall': 0.19607843137254902}

## Creating Dummy Variables

In [8]:
# One-hot encode the categorical columns.
#
# The category levels are taken from the training split and applied to both
# splits. Encoding train and test independently would produce different
# column sets (or a different dropped baseline under `drop_first=True`)
# whenever a level is absent from one split.
for col in categoricals:
    train_labels = X_train[col].map(str)
    # Sorted string levels match the order get_dummies uses for plain string
    # columns, so the training columns are the same as encoding the strings directly.
    level_dtype = pd.CategoricalDtype(categories=sorted(train_labels.unique()))
    X_train[col] = train_labels.astype(level_dtype)
    # A level that appears only in the test split becomes NaN here and is
    # therefore encoded as all zeros, i.e. the same as the dropped baseline.
    X_test[col] = X_test[col].map(str).astype(level_dtype)
X_train = pd.get_dummies(X_train, drop_first=True, dtype=int)
X_test = pd.get_dummies(X_test, drop_first=True, dtype=int)

In [9]:
# Refit the default KNN after one-hot encoding the categorical columns.
clf2 = KNeighborsClassifier().fit(X_train, y_train)
get_metrics(clf2)

{'train_accuracy': 0.8153777777777778,
 'test_accuracy': 0.7621333333333333,
 'train_f1': 0.4448008553862604,
 'test_f1': 0.26341866226259286,
 'train_precision': 0.6715092816787732,
 'test_precision': 0.40379746835443037,
 'train_recall': 0.33253397282174263,
 'test_recall': 0.1954656862745098}

## Transforming Continuous Variables

In [10]:
# Min-max scale every feature column. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
#
# The scaler returns plain arrays, so the column labels and row index are
# re-attached explicitly. Without this the frames would get integer column
# names and a fresh 0..n-1 index that no longer lines up with y_train / y_test.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [11]:
# Refit the default KNN on the min-max scaled features.
clf3 = KNeighborsClassifier().fit(X_train, y_train)
get_metrics(clf3)

{'train_accuracy': 0.8435555555555555,
 'test_accuracy': 0.792,
 'train_f1': 0.5680981595092024,
 'test_f1': 0.4104308390022675,
 'train_precision': 0.7358550540368722,
 'test_precision': 0.5355029585798816,
 'train_recall': 0.4626298960831335,
 'test_recall': 0.3327205882352941}

In [12]:
# Apply log(1 + x) to every cell of both splits. `np.log1p` works on the whole
# frame at once and returns a DataFrame, replacing the deprecated per-element
# `DataFrame.applymap` call.
# NOTE: this cell overwrites its own inputs, so running it twice applies the
# transform twice.
# NOTE(review): X_test was scaled with training min/max, so a test value
# below -1 would yield NaN here; confirm that cannot occur.
X_train = np.log1p(X_train)
X_test = np.log1p(X_test)

In [13]:
# Refit the default KNN on the log-transformed features.
clf4 = KNeighborsClassifier().fit(X_train, y_train)
get_metrics(clf4)

{'train_accuracy': 0.8452,
 'test_accuracy': 0.7938666666666667,
 'train_f1': 0.5748809959721713,
 'test_f1': 0.4161631419939577,
 'train_precision': 0.7384760112888052,
 'test_precision': 0.5423228346456693,
 'train_recall': 0.4706235011990408,
 'test_recall': 0.33762254901960786}

## Addressing Class Imbalance

In [14]:
# Class counts of the target in the full data, the training split and the
# test split, in that order.
tuple(labels.value_counts() for labels in (df[target[0]], y_train, y_test))

(0    23364
 1     6636
 Name: default.payment.next.month, dtype: int64,
 0    17496
 1     5004
 Name: default.payment.next.month, dtype: int64,
 0    5868
 1    1632
 Name: default.payment.next.month, dtype: int64)

In [16]:
# Oversample the minority class in the training split only; the test split is
# left untouched. SMOTE generates synthetic samples at random, so the seed is
# fixed (same value as the train/test split) to make the resampled set and
# the metrics that follow reproducible.
# NOTE: this cell overwrites X_train / y_train, so re-running it resamples
# the already-resampled data.
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)

In [17]:
# Refit the default KNN on the SMOTE-resampled training data. get_metrics
# reads the current X_train / y_train, so the train_* scores here are computed
# on the resampled set while the test_* scores use the original test split.
clf5 = KNeighborsClassifier().fit(X_train, y_train)
get_metrics(clf5)

{'train_accuracy': 0.8606824417009602,
 'test_accuracy': 0.6646666666666666,
 'train_f1': 0.8717881282381718,
 'test_f1': 0.434958436306448,
 'train_precision': 0.8074243679056852,
 'test_precision': 0.34338417878680383,
 'train_recall': 0.9473022405121171,
 'test_recall': 0.5931372549019608}