# Load in the data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [40]:
from google.colab import drive
drive.mount('/gdrive')



Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [41]:
%cd  /gdrive/MyDrive/364/'KNN Comp'/
%ls

/gdrive/MyDrive/364/KNN Comp
KNN-sample-features.csv  model.pickle  [0m[01;34mPrograms[0m/
KNN-sample-target.csv    process.py    [01;34m__pycache__[0m/


In [None]:
df = pd.read_csv('KNN-data.csv')
df

# Base Model

In [None]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [None]:
def evaluateKNN(x, scale='standard'):
  y= df['y'].to_numpy()
  X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=101)
  if scale == 'standard':
    pipe = make_pipeline(StandardScaler(), KNeighborsClassifier())
  elif scale == 'normalize':
    pipe = make_pipeline(MinMaxScaler(), KNeighborsClassifier()) 
  else:
     pipe = make_pipeline(KNeighborsClassifier()) 

  pipe.fit(X_train, y_train)  # apply scaling on training data
  print(pipe.score(X_test, y_test))
  


In [None]:
def evaluateKNNGrid(x, scale='standard'):
  y= df['y'].to_numpy()
  X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=101)
  
  if scale == 'standard':
    pipe = make_pipeline(StandardScaler(), KNeighborsClassifier())
  elif scale == 'normalize':
    pipe = make_pipeline(MinMaxScaler(), KNeighborsClassifier()) 
  else:
    pipe = make_pipeline(KNeighborsClassifier())
  

  k_range = list(range(1, 31))
  param_grid = dict(kneighborsclassifier__n_neighbors=k_range, kneighborsclassifier__weights=['uniform', 'distance'], kneighborsclassifier__p=[1,2])
  
  # defining parameter range
  grid = GridSearchCV(pipe, param_grid, cv=10, scoring='accuracy', return_train_score=False,verbose=1)
  
  # fitting the model for grid search
  result=grid.fit(X_train, y_train)

  # summarize result
  print('Best Score: %s' % result.best_score_)
  print('Best Hyperparameters: %s' % result.best_params_)

In [None]:
X = df.drop(columns='y').to_numpy()
y= df['y'].to_numpy()
X_df = df.drop(columns='y')
y_df = df['y']


In [None]:

evaluateKNN(X, scale='standard'), evaluateKNN(X, scale='normalize'), evaluateKNN(X, scale=None)
# standard wins 


0.6715686274509803
0.6666666666666666
0.6225490196078431


(None, None, None)

In [None]:

evaluateKNNGrid(X, scale='standard')

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
Best Score: 0.7852303523035231
Best Hyperparameters: {'kneighborsclassifier__n_neighbors': 22, 'kneighborsclassifier__p': 1, 'kneighborsclassifier__weights': 'distance'}


# Models with processing

In [None]:
%pip install ipynb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import sys
sys.path.insert(0,'/gdrive/MyDrive/364')

In [None]:
# import processing
from process import *

In [None]:
X1 = process_zero_col(X, method='mean')
evaluateKNNGrid(X1, scale='standard')

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
Best Score: 0.7852002408912978
Best Hyperparameters: {'kneighborsclassifier__n_neighbors': 21, 'kneighborsclassifier__p': 2, 'kneighborsclassifier__weights': 'distance'}


In [None]:
X2 = log_transform(X1)  
evaluateKNNGrid(X2, scale='standard')    # best model 

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
Best Score: 0.7876392652815417
Best Hyperparameters: {'kneighborsclassifier__n_neighbors': 15, 'kneighborsclassifier__p': 2, 'kneighborsclassifier__weights': 'distance'}


In [None]:
X3 = col_mult(X1)
evaluateKNNGrid(X3, scale='standard')

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
Best Score: 0.7680367359229148
Best Hyperparameters: {'kneighborsclassifier__n_neighbors': 24, 'kneighborsclassifier__p': 2, 'kneighborsclassifier__weights': 'distance'}


In [None]:
X4= col_mult(X2)
evaluateKNNGrid(X4, scale='standard')

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
Best Score: 0.7754140319180969
Best Hyperparameters: {'kneighborsclassifier__n_neighbors': 26, 'kneighborsclassifier__p': 1, 'kneighborsclassifier__weights': 'distance'}


In [None]:
X5 = col_mult(X)
evaluateKNNGrid(X5, scale='standard')

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
Best Score: 0.7680969587473652
Best Hyperparameters: {'kneighborsclassifier__n_neighbors': 9, 'kneighborsclassifier__p': 2, 'kneighborsclassifier__weights': 'distance'}


In [None]:
X6 = col_div(X1)
evaluateKNNGrid(X6, scale='standard')

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
Best Score: 0.76806684733514
Best Hyperparameters: {'kneighborsclassifier__n_neighbors': 23, 'kneighborsclassifier__p': 1, 'kneighborsclassifier__weights': 'distance'}


In [None]:
X7 = col_div(X1)
X7 = col_mult(X7)
evaluateKNNGrid(X7, scale='standard')

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
Best Score: 0.758250526949714
Best Hyperparameters: {'kneighborsclassifier__n_neighbors': 29, 'kneighborsclassifier__p': 2, 'kneighborsclassifier__weights': 'distance'}


In [None]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(4, interaction_only=False) # 4 was the best
X8 = poly.fit_transform(X1)
evaluateKNNGrid(X8, scale='standard')


### best model

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
Best Score: 0.7582204155374888
Best Hyperparameters: {'kneighborsclassifier__n_neighbors': 25, 'kneighborsclassifier__p': 2, 'kneighborsclassifier__weights': 'distance'}


In [None]:
X10 = log_transform(X8)

evaluateKNNGrid(X10, scale='standard')

Fitting 10 folds for each of 120 candidates, totalling 1200 fits
Best Score: 0.7582204155374888
Best Hyperparameters: {'kneighborsclassifier__n_neighbors': 25, 'kneighborsclassifier__p': 2, 'kneighborsclassifier__weights': 'distance'}


# Export Best Model

In [None]:
model = KNeighborsClassifier(n_neighbors=25, p = 2, weights= 'distance')
X_clean =  process(X)

X_train, X_test, y_train, y_test = train_test_split(X_clean, y, test_size=0.2)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8235294117647058

In [None]:
import pickle
pickle.dump(model, open("model.pickle", "wb"))