# __KNN regression - k-nearest neighbors__

## __Import Packages and Tools__

import numpy as np

import pandas as pd

import seaborn as sns


from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.preprocessing import StandardScaler

from math import sqrt


import warnings

# Suppress every warning of category UserWarning raised from this point on.
warnings.filterwarnings(action="ignore", category=UserWarning)

---

## __Import Dataset(s)__

X_test = pd.read_csv('../data/file_test.csv')


X_train = pd.read_csv('../data/file_train.csv')

---

## __Overview Dataset(s)__

X_train.head()


X_test.head()

<br />

print('Train:', file_train.shape)

print('Test:', file_test.shape)

---

## __Define y_train and y_test__

# Take the 'target' column of each raw frame as the label vector.
y_train, y_test = file_train['target'], file_test['target']

<br />

# Feature frames: the raw frames without the label column and without any
# other column listed here.
columns_to_drop = ['target', 'further_columns_if_needed']

X_train = file_train.drop(columns=columns_to_drop)

X_test = file_test.drop(columns=columns_to_drop)

<br />

# Report the shape of the labels and of the features for both splits.
for label, data in (
    ('y Train:', y_train),
    ('y Test:', y_test),
    ('X Train:', X_train),
    ('X Test:', X_test),
):
    print(label, data.shape)

---

## __Create Dummy Features if needed__

# One-hot encode the training features; drop_first removes one level per
# categorical column.
X_train = pd.get_dummies(X_train, drop_first=True)


# Encode every level found in the test features, then reindex onto the training
# columns: levels that were dropped from or never seen in training are
# discarded, and levels missing from the test split are added as all-zero
# columns. Encoding both splits independently with drop_first=True could yield
# different column sets (or a different dropped level), which makes predict()
# fail or silently compare mismatched features.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)


print('X Train:', X_train.shape)


print('X Test:', X_test.shape)

---

## __Train the model__

# Baseline model: 600 neighbours, Minkowski metric with p=2, predictions
# weighted by distance, n_jobs=-1 for the neighbour search.
knn = KNeighborsRegressor(
    n_neighbors=600,
    weights='distance',
    p=2,
    metric='minkowski',
    n_jobs=-1,
)


# The target is flattened to a 1-D array before fitting.
target_1d = np.ravel(y_train)
knn.fit(X_train, target_1d)

---

## __Predictions__

# Predict on both splits with the fitted baseline model.
y_pred_train, y_pred_test = knn.predict(X_train), knn.predict(X_test)

<br />

# Root mean squared error per split: square root of the mean squared error.
rmse_train = sqrt(mean_squared_error(y_train, y_pred_train))
print('RMSE Train:', rmse_train)


rmse_test = sqrt(mean_squared_error(y_test, y_pred_test))
print('RMSE Test:', rmse_test)

---
---
## ___GRIDSEARCH ONLY IF NEEDED (TAKES A LONG TIME)___

---
---

---

## __Randomized Search for KNN: Prepare the Search__

#### Candidate hyperparameter values for the randomized search (as dictionary)
param_grid = dict(
    n_neighbors=[5, 10, 50, 100, 500],
    weights=['uniform', 'distance'],
    p=[1, 2, 3, 4],
)

<br />

#### Instantiate the randomized search and define the metric to optimize
rs_KNN = RandomizedSearchCV(
    estimator=KNeighborsRegressor(),
    param_distributions=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

<br />

#### Fit the randomized search object to the training data
rs_KNN.fit(X=X_train, y=y_train)

<br />

#### Best score and parameters
# The search is scored with 'neg_root_mean_squared_error', so best_score_ is the
# negated RMSE: values closer to zero are better.
best_score = round(rs_KNN.best_score_, 3)
print('Best score:', best_score)


best_params = rs_KNN.best_params_
print('Best parameters:', best_params)

---

## __Predictions after Randomized Search__

# Predict on both splits through the fitted randomized-search object.
y_pred_train, y_pred_test = rs_KNN.predict(X_train), rs_KNN.predict(X_test)

# RMSE per split: square root of the mean squared error.
for label, truth, predicted in (
    ('RMSE Train:', y_train, y_pred_train),
    ('RMSE Test:', y_test, y_pred_test),
):
    print(label, sqrt(mean_squared_error(truth, predicted)))

---

## __Grid Search__

#### Hyperparameter values to combine exhaustively in the grid search (as dictionary)
param_grid = dict(
    n_neighbors=[10, 50, 100],
    weights=['uniform', 'distance'],
    p=[1, 2, 3],
)

<br />

#### Instantiate the grid search and define the metric to optimize
gs_KNN = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

<br />

#### Fit the grid search object to the training data
gs_KNN.fit(X=X_train, y=y_train)

#### Best score and parameters
# The search is scored with 'neg_root_mean_squared_error', so best_score_ is the
# negated RMSE: values closer to zero are better.
best_score = round(gs_KNN.best_score_, 3)
print('Best score:', best_score)


best_params = gs_KNN.best_params_
print('Best parameters:', best_params)

---

## __Predictions after GridSearch__

# Predict on both splits through the fitted grid-search object.
y_pred_train, y_pred_test = gs_KNN.predict(X_train), gs_KNN.predict(X_test)

# RMSE per split: square root of the mean squared error.
rmse_train = sqrt(mean_squared_error(y_train, y_pred_train))
print('RMSE Train:', rmse_train)

rmse_test = sqrt(mean_squared_error(y_test, y_pred_test))
print('RMSE Test:', rmse_test)

---

## __Scaling with Standard Scaler - if needed__

# Fit the scaler on the training features only, then apply that same fitted
# transformation to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)

X_test_scaled = scaler.transform(X_test)

---

## __Train the model after GridSearch__

# Model on the scaled features: 10 neighbours, Minkowski metric with p=1,
# predictions weighted by distance.
knn_scaled = KNeighborsRegressor(
    n_neighbors=10,
    weights='distance',
    p=1,
    metric='minkowski',
    n_jobs=-1,
)

# The target is flattened to a 1-D array before fitting.
target_1d = np.ravel(y_train)
knn_scaled.fit(X_train_scaled, target_1d)

---

## __Last Predictions__

# Predict on the scaled versions of both splits.
y_pred_train, y_pred_test = (
    knn_scaled.predict(X_train_scaled),
    knn_scaled.predict(X_test_scaled),
)

# RMSE per split: square root of the mean squared error.
for label, truth, predicted in (
    ('RMSE Train:', y_train, y_pred_train),
    ('RMSE Test:', y_test, y_pred_test),
):
    print(label, sqrt(mean_squared_error(truth, predicted)))