In [1]:
import pandas as pd
import numpy as np

from util import CombinedPreprocessor
from cross_validation import RandomCV, GroupKFoldCV, SpatialPlusCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import GridSearchCV

### 6/18/2020, 6/16/2021, 6/15/2022

In [2]:
indices_6182020 = pd.read_csv("../Datasets/Crop_Yield/Indices_Combined/2020/June_18_2020_New.csv")
indices_6162021 = pd.read_csv("../Datasets/Crop_Yield/Indices_Combined/2021/June_16_2021.csv")
indices_6152022 = pd.read_csv("../Datasets/Crop_Yield/Indices_Combined/2022/June_15_2022.csv")
processor = CombinedPreprocessor()

X_train, X_test, y_train, y_test = processor.transform(indices_6162021, indices_6152022, indices_6182020)

random_validator = RandomCV(models={'LR': LinearRegression(n_jobs=-1), 'RF': RandomForestRegressor(n_jobs=-1, random_state=41)})
gkf_validator = GroupKFoldCV(models={'LR': LinearRegression(n_jobs=-1), 'RF': RandomForestRegressor(n_jobs=-1, random_state=41)})

random_validator.results(X_train, X_test, y_train, y_test)
gkf_validator.results(X_train, X_test, y_train, y_test)

+-----------------+---------------------+---------------------+
|     Random CV   |       RMSE_AVG      |        R2_AVG       |
+-----------------+---------------------+---------------------+
| LR CV Predicted | 0.04940520775923781 |  0.9240810332670986 |
|     LR Test     | 0.16067952817116346 | 0.23160052195703396 |
| RF CV Predicted | 0.03743845447724569 |  0.9562921369126892 |
|     RF Test     | 0.15412481077557874 | 0.29301356921650845 |
+-----------------+---------------------+---------------------+
+-----------------+----------------------+---------------------+
|   GroupKFold CV |       RMSE_AVG       |        R2_AVG       |
+-----------------+----------------------+---------------------+
| LR CV Predicted | 0.07306249534749319  |  0.3948480350051345 |
|     LR Test     | 0.16067952817116346  | 0.23160052195703396 |
| RF CV Predicted | 0.052260898413444196 |  0.8047977164480936 |
|     RF Test     |  0.1541248107755787  | 0.29301356921650845 |
+-----------------+--------------

### 6/23/2020, 6/22/2021, 6/23/2022

In [3]:
indices_6232020 = pd.read_csv("../Datasets/Crop_Yield/Indices_Combined/2020/June_23_2020_New.csv")
indices_6222021 = pd.read_csv("../Datasets/Crop_Yield/Indices_Combined/2021/June_22_2021.csv")
indices_6232022 = pd.read_csv("../Datasets/Crop_Yield/Indices_Combined/2022/June_23_2022.csv")

processor2 = CombinedPreprocessor()

X_train2, X_test2, y_train2, y_test2 = processor2.transform(indices_6222021, indices_6232022, indices_6232020)

random_validator2 = RandomCV(models={'LR': LinearRegression(n_jobs=-1), 'RF': RandomForestRegressor(n_jobs=-1, random_state=41)})
gkf_validator2 = GroupKFoldCV(models={'LR': LinearRegression(n_jobs=-1), 'RF': RandomForestRegressor(n_jobs=-1, random_state=41)})

random_validator2.results(X_train2, X_test2, y_train2, y_test2)
gkf_validator2.results(X_train2, X_test2, y_train2, y_test2)

+-----------------+---------------------+----------------------+
|     Random CV   |       RMSE_AVG      |        R2_AVG        |
+-----------------+---------------------+----------------------+
| LR CV Predicted | 0.04986251605278534 |  0.9226424689585976  |
|     LR Test     |  0.1223180728087902 |  0.5547056082681129  |
| RF CV Predicted | 0.03827614696624009 |  0.9542127687238174  |
|     RF Test     | 0.18658273543238524 | -0.03611718588588886 |
+-----------------+---------------------+----------------------+
+-----------------+---------------------+---------------------+
|   GroupKFold CV |       RMSE_AVG      |        R2_AVG       |
+-----------------+---------------------+---------------------+
| LR CV Predicted |  0.0955478208942627 | -0.7493929793572522 |
|     LR Test     |  0.1223180728087902 |  0.5547056082681129 |
| RF CV Predicted | 0.06630454525176592 |  0.7675631869216328 |
|     RF Test     | 0.18658273543238524 | -0.0361171858858893 |
+-----------------+-------------

### 6/30/2020, 6/30/2021, 6/28/2022

In [4]:
indices_6302020 = pd.read_csv("../Datasets/Crop_Yield/Indices_Combined/2020/June_30_2020_New.csv")
indices_6302021 = pd.read_csv("../Datasets/Crop_Yield/Indices_Combined/2021/June_30_2021.csv")
indices_6282022 = pd.read_csv("../Datasets/Crop_Yield/Indices_Combined/2022/June_28_2022.csv")

processor3 = CombinedPreprocessor()

X_train3, X_test3, y_train3, y_test3 = processor3.transform(indices_6302021, indices_6282022, indices_6302020)

random_validator3 = RandomCV(models={'LR': LinearRegression(n_jobs=-1), 'RF': RandomForestRegressor(n_jobs=-1)})
gkf_validator3 = GroupKFoldCV(models={'LR': LinearRegression(n_jobs=-1), 'RF': RandomForestRegressor(n_jobs=-1, random_state=41)})

random_validator3.results(X_train3, X_test3, y_train3, y_test3)
gkf_validator3.results(X_train3, X_test3, y_train3, y_test3)

+-----------------+---------------------+---------------------+
|     Random CV   |       RMSE_AVG      |        R2_AVG       |
+-----------------+---------------------+---------------------+
| LR CV Predicted | 0.05131233615997181 |  0.9181083318243839 |
|     LR Test     | 0.11345348056217205 |  0.6169092962417637 |
| RF CV Predicted | 0.03816569603249538 |  0.9545742144572322 |
|     RF Test     | 0.13878527511399522 | 0.42673853801208805 |
+-----------------+---------------------+---------------------+
+-----------------+---------------------+--------------------+
|   GroupKFold CV |       RMSE_AVG      |       R2_AVG       |
+-----------------+---------------------+--------------------+
| LR CV Predicted | 0.10826996332180994 | -2.32871395399151  |
|     LR Test     | 0.11345348056217205 | 0.6169092962417637 |
| RF CV Predicted | 0.05188141176706019 | 0.8032991581982384 |
|     RF Test     | 0.13684665464058968 | 0.4426418759013948 |
+-----------------+---------------------+------