### K-Nearest Neighbours
# Steps
- First encode categorical columns using `pd.get_dummies` (one-hot encoding)
- Split the data into training and validation data; use stratify
- Create an instance of KNeighborsClassifier
- Create a parameter grid
- Before training drop useless columns like LoanID
- Also scale the input data, as KNN relies on distance measures
- Conduct a randomized search over the param_grid to get best hyper-parameters (train the model). Here n_neighbors is an important parameter
- Use the best parameters to provide predictions or labels for test data

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

# Load the labelled training set and the test set to be predicted.
df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

# LoanID is needed again when the submission file is written, so set it
# aside and remove it from the test features in the same step.
ids = test_df.pop('LoanID')

# Show both schemas, separated by a blank line.
print(df.columns, test_df.columns, sep='\n\n')

# Select the columns that need one-hot encoding: every column whose dtype is
# not numeric. LoanID (dropped before training) and Default (the target,
# which the test frame does not have) are never candidates.
to_encode = [
    column
    for column in df.columns
    if column not in ('LoanID', 'Default')
    and not pd.api.types.is_numeric_dtype(df[column])
]

# Give train and test the *same* categorical dtype, built from the categories
# seen in training. pd.get_dummies derives its output columns from each frame
# on its own, so without this a category missing from one frame would yield
# different columns, and drop_first=True could drop a different reference
# category in each frame. Test values unseen in training become NaN and are
# encoded as all zeros.
for column in to_encode:
    train_categories = df[column].astype('category').dtype
    df[column] = df[column].astype(train_categories)
    test_df[column] = test_df[column].astype(train_categories)

# One-hot encode; the first category of each column is dropped as the
# reference level and the dummies are stored as 0/1 integers.
df = pd.get_dummies(df, columns=to_encode, drop_first=True, dtype=int)
test_df = pd.get_dummies(test_df, columns=to_encode, drop_first=True, dtype=int)

print('encoding done')


# Categorical columns are encoded at this point; drop the LoanID column
# before modelling.
df = df.drop(columns='LoanID')

# Hold out 20% of the rows for validation. Stratifying on Default keeps the
# class proportions the same in both parts; the fixed seed makes the split
# reproducible.
train_df, validation_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['Default'],
    random_state=17,
)

# Separate the features from the Default target for both splits.
x_train_df = train_df.drop(columns=['Default'])
y_train_df = train_df['Default']
x_validation_df = validation_df.drop(columns=['Default'])
y_validation_df = validation_df['Default']

# KNN relies on distance measures, so the features must be on a common scale.
# StandardScaler returns new arrays and leaves its input untouched, so the
# results have to be assigned back for the scaling to take effect. The scaler
# is fitted on the training features only and then reused for the validation
# and test features. Each result is wrapped back into a DataFrame so the
# column names and row index are preserved.
scaler = StandardScaler()
x_train_df = pd.DataFrame(
    scaler.fit_transform(x_train_df),
    columns=x_train_df.columns,
    index=x_train_df.index,
)
x_validation_df = pd.DataFrame(
    scaler.transform(x_validation_df),
    columns=x_validation_df.columns,
    index=x_validation_df.index,
)
test_df = pd.DataFrame(
    scaler.transform(test_df),
    columns=test_df.columns,
    index=test_df.index,
)


# Base estimator. For reference, the constructor signature is
# KNeighborsClassifier(n_neighbors=5, *, weights='uniform', algorithm='auto',
#                      leaf_size=30, p=2, metric='minkowski',
#                      metric_params=None, n_jobs=None)
knn_clf = KNeighborsClassifier()

# Candidate values sampled by the randomized search.
# Not searched here: algorithm 'kd_tree' / 'brute', and p in [1, 2].
param_grid = {
    "n_neighbors": [40, 50, 60],  # 50 is the best
    "algorithm": ['auto', 'ball_tree'],
    "weights": ['uniform', 'distance'],
    "leaf_size": [20, 30, 50],
    "metric": ['minkowski', 'euclidean'],
}

# Randomized search with a fixed seed; the exhaustive alternative would be
# GridSearchCV(estimator=knn_clf, param_grid=param_grid).
rs = RandomizedSearchCV(
    estimator=knn_clf,
    param_distributions=param_grid,
    random_state=17,
)
rs.fit(x_train_df, y_train_df)
print("Best Parameters:", rs.best_params_)


# Predict the validation labels with the fitted search object and measure
# accuracy on both splits, so the training and validation scores can be
# compared.
y_pred = rs.predict(x_validation_df)
valid_acc = accuracy_score(y_validation_df, y_pred)
train_acc = accuracy_score(y_train_df, rs.predict(x_train_df))

print(f'Training accuracy {train_acc}')
print(f'Validation accuracy {valid_acc}')




# Build the submission: one predicted Default label per test LoanID.
newdf = pd.DataFrame({"LoanID": ids, "Default": rs.predict(X=test_df)})

# to_csv does not create missing directories, so make sure the output folder
# exists before writing.
submission_path = './csv_submissions/knn.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
newdf.to_csv(submission_path, index=False)

Index(['LoanID', 'Age', 'Income', 'LoanAmount', 'CreditScore',
       'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm',
       'DTIRatio', 'Education', 'EmploymentType', 'MaritalStatus',
       'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner',
       'Default'],
      dtype='object')

Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Education',
       'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents',
       'LoanPurpose', 'HasCoSigner'],
      dtype='object')
encoding done
Best Parameters: {'weights': 'uniform', 'n_neighbors': 50, 'metric': 'minkowski', 'leaf_size': 50, 'algorithm': 'ball_tree'}
Training accuracy 0.8841336180784599
Validation accuracy 0.8837380066575289
