In [1]:
import pandas as pd
import numpy as np

from util import CombinedPreprocessor
from cross_validation import RandomCV, GroupKFoldCV, SpatialPlusCV, MSpatialPlusCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from IPython.display import display
# Seed NumPy's legacy global RNG so that anything drawing from np.random is
# repeatable across runs. The RandomForestRegressor instances created in the
# cells below are seeded separately through their own random_state=41 argument.
np.random.seed(41)

### 6/18/2020, 6/16/2021, 6/15/2022 LOFO

In [3]:
# Load the combined index tables for the mid-June acquisition of each year.
indices_6182020 = pd.read_csv("Datasets/Indices_Combined/2020/June_18_2020_New.csv")
indices_6162021 = pd.read_csv("Datasets/Indices_Combined/2021/June_16_2021.csv")
indices_6152022 = pd.read_csv("Datasets/Indices_Combined/2022/June_15_2022.csv")

# Turn the three yearly frames into train/test features, targets and coordinates.
# NOTE(review): frames are passed in the order 2021, 2022, 2020 -- presumably the
# last one is the held-out test set; confirm against CombinedPreprocessor.transform.
processor = CombinedPreprocessor()
(
    X_train, X_test,
    y_train, y_test,
    train_coordinates, test_coordinates,
) = processor.transform(indices_6162021, indices_6152022, indices_6182020)

# One validator per CV strategy, built in the order random -> group k-fold ->
# spatial+. The model dict is re-created on every iteration, so each validator
# owns its own fresh LR and RF estimator instances.
random_validator_lofo, gkf_validator, sp_validator = (
    validator_cls(models={'LR': LinearRegression(n_jobs=-1), 'RF': RandomForestRegressor(n_jobs=-1, random_state=41)})
    for validator_cls in (RandomCV, GroupKFoldCV, SpatialPlusCV)
)

# Evaluate each validator on the same split, in the same order as above.
rndm_results, gkf_results, sp_results = (
    validator.results(X_train, X_test, y_train, y_test)
    for validator in (random_validator_lofo, gkf_validator, sp_validator)
)

# Render the three result tables one after another for comparison.
display(rndm_results, gkf_results, sp_results)

Unnamed: 0,MODEL,CV_METHOD,RMSE_AVG,R2_AVG
0,LR,RNDM_CV_LOFO,0.049405,0.924081
1,LR,RNDM_TEST_LOFO,0.16068,0.231601
2,RF,RNDM_CV_LOFO,0.037438,0.956292
3,RF,RNDM_TEST_LOFO,0.154125,0.293014


Unnamed: 0,MODEL,CV_METHOD,RMSE_AVG,R2_AVG
0,LR,GKF_CV_LOFO,0.073062,0.394848
1,LR,GKF_TEST_LOFO,0.16068,0.231601
2,RF,GKF_CV_LOFO,0.052261,0.804798
3,RF,GKF_TEST_LOFO,0.154125,0.293014


Unnamed: 0,MODEL,CV_METHOD,RMSE_AVG,R2_AVG
0,LR,SP_CV_LOFO,0.132388,-0.729231
1,LR,SP_TEST_LOFO,0.16068,0.231601
2,RF,SP_CV_LOFO,0.055237,0.701445
3,RF,SP_TEST_LOFO,0.154125,0.293014


### 6/23/2020, 6/22/2021, 6/23/2022 LOFO

In [5]:
# Load the combined index tables for the late-June (22nd/23rd) acquisition of each year.
indices_6232020 = pd.read_csv("Datasets/Indices_Combined/2020/June_23_2020_New.csv")
indices_6222021 = pd.read_csv("Datasets/Indices_Combined/2021/June_22_2021.csv")
indices_6232022 = pd.read_csv("Datasets/Indices_Combined/2022/June_23_2022.csv")

# Turn the three yearly frames into train/test features, targets and coordinates.
# NOTE(review): frames are passed in the order 2021, 2022, 2020 -- presumably the
# last one is the held-out test set; confirm against CombinedPreprocessor.transform.
processor2 = CombinedPreprocessor()
(
    X_train2, X_test2,
    y_train2, y_test2,
    train_coordinates2, test_coordinates2,
) = processor2.transform(indices_6222021, indices_6232022, indices_6232020)

# One validator per CV strategy, built in the order random -> group k-fold ->
# spatial+. The model dict is re-created on every iteration, so each validator
# owns its own fresh LR and RF estimator instances.
random_validator2, gkf_validator2, sp_validator2 = (
    validator_cls(models={'LR': LinearRegression(n_jobs=-1), 'RF': RandomForestRegressor(n_jobs=-1, random_state=41)})
    for validator_cls in (RandomCV, GroupKFoldCV, SpatialPlusCV)
)

# Evaluate each validator on the same split, in the same order as above.
rndm_results2, gkf_results2, sp_results2 = (
    validator.results(X_train2, X_test2, y_train2, y_test2)
    for validator in (random_validator2, gkf_validator2, sp_validator2)
)

# Render the three result tables one after another for comparison.
display(rndm_results2, gkf_results2, sp_results2)

Unnamed: 0,MODEL,CV_METHOD,RMSE_AVG,R2_AVG
0,LR,RNDM_CV_LOFO,0.049863,0.922642
1,LR,RNDM_TEST_LOFO,0.122318,0.554706
2,RF,RNDM_CV_LOFO,0.038276,0.954213
3,RF,RNDM_TEST_LOFO,0.186583,-0.036117


Unnamed: 0,MODEL,CV_METHOD,RMSE_AVG,R2_AVG
0,LR,GKF_CV_LOFO,0.095548,-0.749393
1,LR,GKF_TEST_LOFO,0.122318,0.554706
2,RF,GKF_CV_LOFO,0.066305,0.767563
3,RF,GKF_TEST_LOFO,0.186583,-0.036117


Unnamed: 0,MODEL,CV_METHOD,RMSE_AVG,R2_AVG
0,LR,SP_CV_LOFO,0.058817,0.826936
1,LR,SP_TEST_LOFO,0.122318,0.554706
2,RF,SP_CV_LOFO,0.051718,0.845608
3,RF,SP_TEST_LOFO,0.186583,-0.036117


### 6/30/2020, 6/30/2021, 6/28/2022 LOFO

In [4]:
# Load the combined index tables for the end-of-June acquisition of each year.
indices_6302020 = pd.read_csv("Datasets/Indices_Combined/2020/June_30_2020_New.csv")
indices_6302021 = pd.read_csv("Datasets/Indices_Combined/2021/June_30_2021.csv")
indices_6282022 = pd.read_csv("Datasets/Indices_Combined/2022/June_28_2022.csv")

# Turn the three yearly frames into train/test features, targets and coordinates.
# NOTE(review): frames are passed in the order 2021, 2022, 2020 -- presumably the
# last one is the held-out test set; confirm against CombinedPreprocessor.transform.
processor3 = CombinedPreprocessor()
(
    X_train3, X_test3,
    y_train3, y_test3,
    train_coordinates3, test_coordinates3,
) = processor3.transform(indices_6302021, indices_6282022, indices_6302020)

# One validator per CV strategy, built in the order random -> group k-fold ->
# spatial+. The model dict is re-created on every iteration, so each validator
# owns its own fresh LR and RF estimator instances.
random_validator3, gkf_validator3, sp_validator3 = (
    validator_cls(models={'LR': LinearRegression(n_jobs=-1), 'RF': RandomForestRegressor(n_jobs=-1, random_state=41)})
    for validator_cls in (RandomCV, GroupKFoldCV, SpatialPlusCV)
)

# Evaluate each validator on the same split, in the same order as above.
rndm_results3, gkf_results3, sp_results3 = (
    validator.results(X_train3, X_test3, y_train3, y_test3)
    for validator in (random_validator3, gkf_validator3, sp_validator3)
)

# Render the three result tables one after another for comparison.
display(rndm_results3, gkf_results3, sp_results3)

Unnamed: 0,MODEL,CV_METHOD,RMSE_AVG,R2_AVG
0,LR,RNDM_CV_LOFO,0.051312,0.918108
1,LR,RNDM_TEST_LOFO,0.113453,0.616909
2,RF,RNDM_CV_LOFO,0.037967,0.95503
3,RF,RNDM_TEST_LOFO,0.136847,0.442642


Unnamed: 0,MODEL,CV_METHOD,RMSE_AVG,R2_AVG
0,LR,GKF_CV_LOFO,0.10827,-2.328714
1,LR,GKF_TEST_LOFO,0.113453,0.616909
2,RF,GKF_CV_LOFO,0.051881,0.803299
3,RF,GKF_TEST_LOFO,0.136847,0.442642


Unnamed: 0,MODEL,CV_METHOD,RMSE_AVG,R2_AVG
0,LR,SP_CV_LOFO,0.058285,0.707317
1,LR,SP_TEST_LOFO,0.113453,0.616909
2,RF,SP_CV_LOFO,0.042211,0.85435
3,RF,SP_TEST_LOFO,0.136847,0.442642
