# K-Nearest Neighbors

## Importing the Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import make_scorer

## Importing the dataset

In [2]:
# Read the shuffled training/testing table into a DataFrame.
# The last column ('Soil Moisture') is the regression target; see the split into X / y below.
dataset = pd.read_csv(
    filepath_or_buffer='/data/private/VM/data/ML_training&testing_v01shuffled_20220317.csv'
)
dataset

Unnamed: 0,Year,DOY,EVI,Evapo,LST_Daily,LST_Diff,NDVI,TI,T_air,API,Clay,Elevation,lat,lon,OMC,Porosity,Sand,Silt,Preci,Soil Moisture
0,2015,222,0.120117,-0.002280,29.12,31.14,0.183740,11.589293,23.996123,8.695007,19.700001,1752.627563,40.078119,-112.361396,3.56868,0.490566,37.700001,42.700001,0.000853,0.092000
1,2010,177,0.515586,-0.004072,19.55,8.68,0.685505,14.058996,21.684099,14.624732,22.799999,153.939468,48.726702,3.203102,6.65464,0.509434,10.400000,66.800003,0.000849,0.001500
2,2012,79,0.238310,-0.002077,8.04,10.50,0.410580,14.444198,8.660008,4.510628,23.799999,73.877228,43.717169,3.857831,6.98220,0.490566,29.799999,46.400002,0.321031,0.123435
3,2013,95,0.188224,-0.002522,12.32,29.50,0.352700,15.731341,7.477071,13.977669,31.600000,213.627564,40.052801,-88.372904,6.39604,0.471698,8.800000,59.599998,0.000435,0.394130
4,2007,299,0.435490,-0.001802,8.66,7.38,0.882154,12.428805,7.207212,52.144912,40.299999,192.723587,45.249999,-123.280000,16.34352,0.539623,15.800000,43.799999,0.001720,0.387500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
469429,2016,197,0.309898,-0.001307,25.57,15.68,0.588824,13.894586,20.975506,1.797110,14.300000,2563.126221,35.700000,-105.809995,11.55080,0.581132,59.900002,25.799999,0.126868,0.001000
469430,2011,250,0.157988,-0.000601,22.23,32.08,0.256247,11.722256,22.917156,0.250635,17.000000,661.658142,47.759130,-118.745460,4.18932,0.516981,31.100000,51.900002,0.000858,0.072000
469431,2013,147,0.030116,-0.000081,36.09,19.04,0.047282,11.334223,29.584159,0.885216,25.400000,27.083851,36.602000,-117.144897,0.99992,0.520755,42.900002,31.700001,0.000852,0.020667
469432,2013,192,0.518430,-0.003983,25.18,13.34,0.684706,12.998422,24.711874,5.491154,19.799999,246.750183,39.866798,-93.147005,5.56852,0.464151,9.900000,70.199997,0.000854,0.181333


## Construction of the feature matrix (X) and the dependent variable vector (y)

In [3]:
# Every column except the last is a predictor; the last column is the target.
# Both are converted to plain NumPy arrays for scikit-learn.
X, y = dataset.iloc[:, :-1].to_numpy(), dataset.iloc[:, -1].to_numpy()

## Division of the dataset into the Training Set and the Test Set

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [5]:
# Inspect the (still unscaled) training feature matrix returned by train_test_split.
X_train

array([[2.01200000e+03, 2.18000000e+02, 1.30008333e-01, ...,
        2.87999992e+01, 4.39000015e+01, 4.41074000e-04],
       [2.01100000e+03, 2.26000000e+02, 4.82789881e-01, ...,
        3.97000008e+01, 4.40999985e+01, 4.35114000e-04],
       [2.01300000e+03, 2.47000000e+02, 2.82306548e-01, ...,
        3.50999985e+01, 4.17999992e+01, 2.63536535e+00],
       ...,
       [2.01300000e+03, 2.03000000e+02, 1.10332738e-01, ...,
        5.77000008e+01, 2.45000000e+01, 0.00000000e+00],
       [2.01000000e+03, 1.09000000e+02, 4.05214290e-02, ...,
        4.29000015e+01, 3.17000008e+01, 8.52346000e-04],
       [2.01000000e+03, 2.26000000e+02, 4.27702679e-01, ...,
        2.41000004e+01, 6.64000015e+01, 4.26173000e-04]])

In [6]:
# Inspect the (still unscaled) test feature matrix returned by train_test_split.
X_test

array([[2.01500000e+03, 2.18000000e+02, 2.38563095e-01, ...,
        1.28999996e+01, 6.25999985e+01, 1.29044000e-03],
       [2.01400000e+03, 2.39000000e+02, 4.99024987e-01, ...,
        1.89999998e+00, 7.96999969e+01, 4.26173000e-04],
       [2.01400000e+03, 2.05000000e+02, 1.63733333e-01, ...,
        3.89000015e+01, 3.82000008e+01, 6.76512700e-03],
       ...,
       [2.01200000e+03, 5.10000000e+01, 2.85629762e-01, ...,
        9.30000019e+00, 6.46999969e+01, 0.00000000e+00],
       [2.01100000e+03, 2.72000000e+02, 2.73874998e-01, ...,
        3.79000015e+01, 3.92999992e+01, 8.71447000e-04],
       [2.01200000e+03, 2.48000000e+02, 1.39112502e-01, ...,
        6.24000015e+01, 2.05000000e+01, 8.51363000e-04]])

## Very important: Feature scaling of X (standardization to zero mean and unit variance; most values then fall roughly between -3 and 3)

### Note: `fit_transform` is applied only to X_train; X_test is transformed with the statistics learned from the training set, to prevent data leakage

In [7]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training rows only,
# then apply that same transformation to both splits so no test-set statistics
# leak into the scaler.
# NOTE: X_train / X_test are overwritten here, so this cell must be run exactly
# once per kernel session (re-running it would standardize already-scaled data).
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [8]:
# Inspect the training feature matrix after standardization by the scaler above.
X_train

array([[ 3.43512648e-01,  1.99929146e-01, -1.07440073e+00, ...,
        -7.51134991e-01,  5.36323062e-01, -2.27753198e-01],
       [ 6.06565046e-02,  3.07526129e-01,  1.73509821e+00, ...,
        -1.16448126e-01,  5.51958448e-01, -2.27758769e-01],
       [ 6.26368792e-01,  5.89968210e-01,  1.38479243e-01, ...,
        -3.84297725e-01,  3.72148820e-01,  2.23496060e+00],
       ...,
       [ 6.26368792e-01, -1.81519770e-03, -1.23109419e+00, ...,
         9.31658477e-01, -9.80332682e-01, -2.28165445e-01],
       [-2.22199639e-01, -1.26607975e+00, -1.78706090e+00, ...,
         6.98819817e-02, -4.17450123e-01, -2.27368806e-01],
       [-2.22199639e-01,  3.07526129e-01,  1.29639206e+00, ...,
        -1.02480720e+00,  2.29533087e+00, -2.27767126e-01]])

## Loading the previously trained regression model and scoring it on the test set


In [9]:
import pickle
from pathlib import Path

# Location of the serialized, already-fitted regressor, relative to the
# notebook's working directory.
model_path = Path('GB.pkl')

# To (re)create the file after fitting a model in this session:
#     with model_path.open('wb') as f:
#         pickle.dump(regressor, f)

# Fail early with an actionable message instead of a bare "No such file" error.
if not model_path.is_file():
    raise FileNotFoundError(
        "Trained model file '{}' was not found in '{}'. Fit a regressor and "
        "pickle it to this path before running the cells below.".format(
            model_path, Path.cwd()
        )
    )

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load a model file that you created yourself or fully trust.
with model_path.open('rb') as f:
    regressor = pickle.load(f)

# Score the loaded model on the held-out, scaled test set
# (presumably R^2 if this is a scikit-learn regressor -- confirm the model type).
regressor.score(X_test, y_test)

FileNotFoundError: [Errno 2] No such file or directory: 'GB.pkl'

## HOW TO: Applying k-Fold Cross Validation after gridSearchCV

In [44]:
import datetime
# Record the wall-clock start time so the total duration of the
# cross-validation section can be reported once it has finished.
starttime_cv = datetime.datetime.now()
starttime_cv  # show the start timestamp as the cell output

datetime.datetime(2022, 5, 9, 18, 57, 29, 275561)

In [46]:
from sklearn.model_selection import cross_validate


def pearson_r(y_true, y_pred):
    """Return the Pearson correlation coefficient between targets and predictions.

    make_scorer calls this with the true target values and the estimator's
    predictions for one validation fold (not with the feature matrix).
    """
    return np.corrcoef(y_true, y_pred)[0, 1]


# 5-fold cross-validation of the loaded regressor on the scaled training set.
# Three metrics are collected per fold:
#   'r'     -- Pearson correlation between targets and predictions
#   'r2'    -- coefficient of determination
#   'nrmse' -- NEGATIVE root mean squared error (scikit-learn's
#              "higher is better" convention), not a normalized RMSE
scores = cross_validate(
    estimator=regressor,
    X=X_train,
    y=y_train,
    cv=5,
    verbose=10,
    n_jobs=-1,  # use all available cores
    scoring={
        'r': make_scorer(pearson_r),
        'r2': 'r2',
        'nrmse': 'neg_root_mean_squared_error',
    },
    return_train_score=False,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 72 concurrent workers.


[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] START .....................................................................
[CV] END  nrmse: (test=-0.050) r: (test=0.895) r2: (test=0.801) total time= 4.5min
[CV] END  nrmse: (test=-0.050) r: (test=0.893) r2: (test=0.797) total time= 5.1min


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  5.2min remaining:  7.8min


[CV] END  nrmse: (test=-0.050) r: (test=0.893) r2: (test=0.797) total time= 5.2min


[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  5.2min remaining:  3.5min


[CV] END  nrmse: (test=-0.050) r: (test=0.893) r2: (test=0.797) total time= 5.2min
[CV] END  nrmse: (test=-0.050) r: (test=0.893) r2: (test=0.798) total time= 5.2min


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  5.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  5.2min finished


In [48]:
# Show the raw per-fold results returned by cross_validate (timings and the three test metrics).
scores

{'fit_time': array([312.05717206, 309.39451051, 271.42549491, 308.04315376,
        311.22366142]),
 'score_time': array([0.16477537, 0.22839642, 0.18713021, 0.19690824, 0.16417074]),
 'test_r': array([0.89327327, 0.89268181, 0.89535656, 0.89306977, 0.89310487]),
 'test_r2': array([0.79782692, 0.79678087, 0.80145246, 0.79739217, 0.79748117]),
 'test_nrmse': array([-0.05007081, -0.05017134, -0.04977445, -0.05000088, -0.05011428])}

In [49]:
# Pull the per-fold arrays for each metric out of the cross-validation results.
r, r2, nrmse = scores['test_r'], scores['test_r2'], scores['test_nrmse']

# Report the mean of each metric over the folds, one per line.
# 'nrmse' is the negated RMSE, so its mean is printed as a negative number.
print(
    "r: {:.2f}".format(r.mean()),
    "r2: {:.2f}".format(r2.mean()),
    "nrmse: {:.2f}".format(nrmse.mean()),
    sep="\n",
)

r: 0.89
r2: 0.80
nrmse: -0.05


In [50]:
# Stop the wall-clock timer started before cross-validation and report,
# one item per line: the end timestamp, a label, and the elapsed time.
endtime_cv = datetime.datetime.now()
print(
    endtime_cv,
    "running time of CV before gridSearchCV",
    endtime_cv - starttime_cv,
    sep="\n",
)

2022-05-09 19:05:35.060329
running time of CV before gridSearchCV
0:08:05.784768


In [6]:
# Mean of the five per-fold R^2 values (copied from scores['test_r2'] above).
np.asarray([0.79782692, 0.79678087, 0.80145246, 0.79739217, 0.79748117]).mean()

0.798186718

In [7]:
# Population standard deviation (ddof=0) of the five per-fold R^2 values
# (copied from scores['test_r2'] above).
np.asarray([0.79782692, 0.79678087, 0.80145246, 0.79739217, 0.79748117]).std()

0.0016673355100327238