# Gradient Boosting Regression

## Importing the Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Location of the shuffled training/testing table (absolute path on the machine
# this notebook was run on).
DATA_CSV = '/data/private/VM/data/ML_training&testing_v01shuffled_20220317.csv'
dataset = pd.read_csv(DATA_CSV)

In [3]:
# Display the column labels of the loaded table.
dataset.columns

Index(['Year', 'DOY', 'EVI', 'Evapo', 'LST_Daily', 'LST_Diff', 'NDVI', 'TI',
       'T_air', 'API', 'Clay', 'Elevation', 'lat', 'lon', 'OMC', 'Porosity',
       'Sand', 'Silt', 'Preci', 'Soil Moisture'],
      dtype='object')

## Construction of the matrix of characteristics (X) and the dependent variable vector (y)

In [4]:
# Every column except the last one is a feature; the last column is the target.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]

# Work with the underlying arrays from here on.
X = feature_frame.values
y = target_series.values

## Division of the dataset into the Training Set and the Test Set

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

## Very important: feature scaling of X (standardization to zero mean and unit variance; most values end up roughly between -3 and 3)

### Note: `fit_transform` is applied only to X_train to prevent data leakage; X_test is transformed with the statistics learned from X_train

In [6]:
from sklearn.preprocessing import StandardScaler
# Learn the per-feature scaling statistics from the training rows only ...
sc = StandardScaler().fit(X_train)
# ... then apply that same fitted transformation to both splits.
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Construction and training of the regression model over the training set


In [7]:
%%time
from sklearn.ensemble import GradientBoostingRegressor  #"GradientBoostingRegressor" class from "ensemble" module 
# Baseline: gradient boosting with scikit-learn's default hyper-parameters.
# fit() returns the estimator itself, so construction and training are chained.
regressor = GradientBoostingRegressor().fit(X_train, y_train)
regressor

CPU times: user 2min 15s, sys: 53.9 ms, total: 2min 15s
Wall time: 2min 15s


## Build our own "GridSearchCV" that can run our code in parallel on servers with many cores

In [8]:
import itertools

# Candidate values for each tuned hyper-parameter.
n_estimators = [80, 90, 100, 110, 120]
max_depth = [1, 2, 3, 4, 5]
learning_rate = [0.1, 0.3, 0.5]

# One keyword-argument dict per point of the Cartesian product of the three
# lists; the last list (learning_rate) varies fastest.
param_names = ('n_estimators', 'max_depth', 'learning_rate')
param_sets_my = [
    dict(zip(param_names, combo))
    for combo in itertools.product(n_estimators, max_depth, learning_rate)
]

In [9]:
# Number of hyper-parameter combinations in the grid.
len(param_sets_my)

75

In [10]:
# Peek at the first parameter set to confirm the dictionary layout.
param_sets_my[0]

{'n_estimators': 80, 'max_depth': 1, 'learning_rate': 0.1}

In [75]:
%%time
from multiprocessing import Pool

def f(x):
    """Fit and score the gradient-boosting model for one parameter set.

    Parameters
    ----------
    x : int
        Index into the module-level ``param_sets_my`` list.

    Returns
    -------
    tuple
        ``(x, score)``, where ``score`` is the value returned by
        ``regressor.score(X_test, y_test)`` (the R^2 coefficient for
        scikit-learn regressors).

    Notes
    -----
    Reads the module-level names ``param_sets_my``, ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``, and prints one line per evaluated parameter set.

    NOTE(review): the parameter sets are compared on the test split; consider a
    separate validation split if an unbiased final test score is needed.
    """
    param_set = param_sets_my[x]
    regressor = GradientBoostingRegressor(**param_set)
    regressor.fit(X_train, y_train)
    # score() performs its own prediction on X_test, so no separate predict()
    # call is needed. For a quicker trial, score on a slice such as
    # X_test[:100000], y_test[:100000].
    score = regressor.score(X_test, y_test)
    print(f'{param_set}: {score}')
    return (x, score)

if __name__ == '__main__':
    # Evaluate every parameter set in parallel worker processes; map() returns
    # the (index, score) tuples in the same order as the submitted indices.
    task_indices = range(len(param_sets_my))
    with Pool(processes=70) as p:
        res = p.map(f, task_indices)
    # For a quick trial, map over a handful of indices instead, e.g. [0, 11, 20, 3].

{'n_estimators': 120, 'max_depth': 5, 'learning_rate': 0.5}: 0.7976773438869168
CPU times: user 167 ms, sys: 835 ms, total: 1 s
Wall time: 5min 46s


In [77]:
# Collect the (index, score) tuples returned by the workers into a table.
# Naming the columns in the constructor also works when `res` is empty, where
# assigning `df.columns` afterwards would raise a length-mismatch error.
df = pd.DataFrame(res, columns=['x', 'r2'])
# Only the last expression of a cell is displayed, so show the shape here.
df.shape

(1, 2)

In [78]:
# Display the results table (one row per evaluated parameter set).
df

Unnamed: 0,x,r2
0,0,0.797677


In [79]:
# Keep the row(s) whose r2 equals the column maximum, i.e. the best-scoring run(s).
is_best = df['r2'].eq(df['r2'].max())
df[is_best]

Unnamed: 0,x,r2
0,0,0.797677


In [81]:
# Look up the parameter set stored at index 0.
# NOTE(review): the recorded output here differs from the earlier display of
# `param_sets_my[0]`, which suggests `param_sets_my` was redefined between
# runs; confirm with a fresh Restart & Run All.
param_sets_my[0]

{'n_estimators': 120, 'max_depth': 5, 'learning_rate': 0.5}

In [71]:
# Positional slice that keeps row 24 as a one-row DataFrame.
df.iloc[24:25]

Unnamed: 0,x,r2
24,24,0.614651
