# K-Nearest Neighbors

## Importing the Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Read the CSV (path is relative to the notebook's working directory) into a DataFrame.
dataset = pd.read_csv('ML_training&testing_v01shuffled_20220317.csv')
# dataset  # uncomment to preview the loaded frame

In [3]:
# List the column names; per the output below, the last column is 'Soil Moisture'.
dataset.columns

Index(['Year', 'DOY', 'EVI', 'Evapo', 'LST_Daily', 'LST_Diff', 'NDVI', 'TI',
       'T_air', 'API', 'Clay', 'Elevation', 'lat', 'lon', 'OMC', 'Porosity',
       'Sand', 'Silt', 'Preci', 'Soil Moisture'],
      dtype='object')

## Construction of the feature matrix (X) and the dependent variable vector (y)

In [4]:
# The target is the last column; every column before it is a feature.
target_position = -1
X = dataset.iloc[:, :target_position].values
y = dataset.iloc[:, target_position].values

## Division of the dataset into the Training Set and the Test Set

In [5]:
from sklearn.model_selection import train_test_split
# Hold out a quarter of the rows for testing; the fixed seed keeps the split repeatable.
TEST_FRACTION = 0.25
SPLIT_SEED = 0
split_parts = train_test_split(X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED)
X_train, X_test, y_train, y_test = split_parts

## Very important: Feature scaling of X (standardization to zero mean and unit variance; most values then typically fall roughly between -3 and 3)

### Note: `fit_transform` is applied only to X_train; X_test is transformed with the statistics learned from the training set, to prevent data leakage

In [6]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training rows only ...
sc = StandardScaler().fit(X_train)
# ... then apply that same fitted scaler to both splits.
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Construction and training of the regression model over the training set


In [7]:
%%time
from sklearn.neighbors import KNeighborsRegressor  #the class "KNeighborsRegressor" from "neighbors" module of scikit-learn 
# Baseline model: KNeighborsRegressor constructed with no arguments, i.e. the library defaults.
regressor = KNeighborsRegressor()
# fit() is the cell's last expression, so the fitted estimator is echoed as the cell output.
regressor.fit(X_train, y_train)

CPU times: user 46.4 ms, sys: 18.5 ms, total: 64.9 ms
Wall time: 61.3 ms


KNeighborsRegressor()

## Build our own "GridSearchCV", which can run our code in parallel across many worker processes

In [None]:
import itertools

# Candidate values for each KNeighborsRegressor hyper-parameter.
n_neighbors = [3, 4, 5, 6, 7]
weights = ['uniform', 'distance']
p = [1, 2]
leaf_size = [20, 30, 40]
algorithm = ['auto', 'ball_tree']

# One keyword-argument dict per point of the Cartesian product. The rightmost
# list ('algorithm') varies fastest, so a set's position in the list is
# determined by the order of the value lists above.
param_sets_my = [
    dict(zip(('n_neighbors', 'weights', 'p', 'leaf_size', 'algorithm'), combo))
    for combo in itertools.product(n_neighbors, weights, p, leaf_size, algorithm)
]

In [None]:
# Sanity check: 5 * 2 * 2 * 3 * 2 value combinations should give 120 parameter sets.
len(param_sets_my)

120

In [None]:
# Peek at the first parameter set to confirm the dict layout passed to the regressor.
param_sets_my[0]

{'n_neighbors': 3,
 'weights': 'uniform',
 'p': 1,
 'leaf_size': 20,
 'algorithm': 'auto'}

In [35]:
%%time
from multiprocessing import Pool
# param_sets = [
#     {'weights': 'uniform',
#      'leaf_size': 20,
#      'p': 1
#     },
#     {'weights': 'distance',
#      'leaf_size': 30,
#      'p': 2
#     }
# ]

def f(x):
    """Fit and score one KNN configuration from the hyper-parameter grid.

    Parameters
    ----------
    x : int
        Index into the module-level ``param_sets_my`` list.

    Returns
    -------
    tuple
        ``(x, score)``, where ``score`` is the value returned by
        ``regressor.score(X_test, y_test)``.

    Notes
    -----
    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    and prints one ``"{param_set}: {score}"`` line per call.

    NOTE(review): every configuration is scored on the test split, and the best
    configuration is later chosen from these scores, so the winning score is
    selected on the same data it is reported on. Consider cross-validating on
    the training split instead.
    """
    param_set = param_sets_my[x]
    regressor = KNeighborsRegressor(**param_set)
    regressor.fit(X_train, y_train)
    # score() performs the prediction itself, so a separate predict() call would
    # only repeat the neighbor search and discard the result.
    # For a quicker trial run, slice both arrays identically,
    # e.g. X_test[:100000], y_test[:100000].
    score = regressor.score(X_test, y_test)
    print(f'{param_set}: {score}')
    return (x, score)

if __name__ == '__main__':
    # Upper bound on worker processes.
    # NOTE(review): 70 is machine-specific; lower it if the host has fewer cores.
    MAX_WORKERS = 70
    # Never start more workers than there are configurations to evaluate, and
    # always at least one, because Pool rejects a process count below 1.
    n_workers = max(1, min(MAX_WORKERS, len(param_sets_my)))
    # The pool gets its own name so it does not overwrite the module-level list
    # `p` of hyper-parameter values that the grid is built from.
    with Pool(n_workers) as pool:
        # map() returns results in input order, so res[i] is f(i).
        res = pool.map(f, range(len(param_sets_my)))
        # For a quick smoke test, map over a few indices instead, e.g. [0, 11, 20, 3].

{'n_neighbors': 3, 'weights': 'distance', 'p': 1, 'leaf_size': 20, 'algorithm': 'ball_tree'}: 0.8842513083417756
{'n_neighbors': 3, 'weights': 'uniform', 'p': 1, 'leaf_size': 20, 'algorithm': 'ball_tree'}: 0.8799341372776761
{'n_neighbors': 3, 'weights': 'uniform', 'p': 2, 'leaf_size': 30, 'algorithm': 'auto'}: 0.8280451855567557
{'n_neighbors': 4, 'weights': 'uniform', 'p': 1, 'leaf_size': 20, 'algorithm': 'ball_tree'}: 0.8794007206609618
{'n_neighbors': 5, 'weights': 'distance', 'p': 1, 'leaf_size': 20, 'algorithm': 'ball_tree'}: 0.8835407915641089
{'n_neighbors': 3, 'weights': 'uniform', 'p': 2, 'leaf_size': 40, 'algorithm': 'auto'}: 0.8280451855567557
{'n_neighbors': 3, 'weights': 'distance', 'p': 2, 'leaf_size': 30, 'algorithm': 'auto'}: 0.8340462775496291
{'n_neighbors': 4, 'weights': 'distance', 'p': 1, 'leaf_size': 20, 'algorithm': 'ball_tree'}: 0.8847602664133686
{'n_neighbors': 3, 'weights': 'distance', 'p': 2, 'leaf_size': 40, 'algorithm': 'auto'}: 0.8340462775496291
{'n_nei

In [38]:
# Collect the (index, score) pairs returned by the workers into a table:
# 'x' is the position in param_sets_my and 'r2' is the score returned for it.
df = pd.DataFrame(res, columns=['x', 'r2'])
# Only the last expression of a cell is displayed, so show the shape here.
df.shape

(120, 2)

In [39]:
# Display the results table (the rendered output elides the middle rows).
df

Unnamed: 0,x,r2
0,0,0.879934
1,1,0.879934
2,2,0.879934
3,3,0.879934
4,4,0.879934
...,...,...
115,115,0.829038
116,116,0.829038
117,117,0.829038
118,118,0.829038


In [40]:
# Keep the row(s) whose score equals the column maximum (exact float comparison,
# so several rows appear only if their scores tie exactly).
df[df.r2 == df.r2.max()]

Unnamed: 0,x,r2
37,37,0.88476


In [41]:
# Look up the winning hyper-parameters from the results table instead of a
# hand-copied index, so the cell stays correct when the grid or scores change.
# idxmax() returns the first row holding the maximum; 'x' is that row's
# position in param_sets_my.
best_idx = int(df.loc[df['r2'].idxmax(), 'x'])
param_sets_my[best_idx]

{'n_neighbors': 4,
 'weights': 'distance',
 'p': 1,
 'leaf_size': 20,
 'algorithm': 'ball_tree'}