In [1]:
from sklearn.model_selection import GroupKFold, cross_val_predict, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import pandas
import numpy
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# Seed NumPy's global random number generator so stochastic steps are repeatable
# between runs. seed() has to be *called*: assigning to it
# (`numpy.random.seed = 41`) replaces the function with an int and leaves the
# generator unseeded.
numpy.random.seed(41)

In [2]:
# Columns removed from every yearly yield file before modelling.
drop_cols = [
    'Unnamed: 0',
    'Latitude',
    'Longitude',
    'Dataset',
    'Obj. Id',
    'Swth Wdth(ft)',
    'Distance(ft)',
    'Duration(s)',
    'Track(deg)',
    'Elevation(ft)',
    'Pass Num',
    'Speed(mph)',
    'Yld Mass(Wet)(lb/ac)',
    'Yld Vol(Wet)(bu/ac)',
    'Yld Vol(Dry)(bu/ac)',
    'Prod(ac/h)',
    'Date',
]


def _load_plot_yield(csv_path):
    """Read one yearly yield CSV and return its cleaned rows.

    The columns listed in ``drop_cols`` are removed, rows whose ``Plot`` value
    is -1 are discarded (presumably samples that fall outside any plot --
    confirm against the preprocessing step), and the index is renumbered
    from zero.
    """
    trimmed = pandas.read_csv(csv_path).drop(columns=drop_cols)
    has_plot = trimmed['Plot'] != -1
    return trimmed[has_plot].copy(deep=True).reset_index(drop=True)


yield_2020 = _load_plot_yield('../Datasets/Crop_Yield/Preprocessed/2020/Production_Yield_PLOTS_2020.csv')
yield_2021 = _load_plot_yield('../Datasets/Crop_Yield/Preprocessed/2021/Production_Yield_PLOTS_2021.csv')
yield_2022 = _load_plot_yield('../Datasets/Crop_Yield/Preprocessed/2022/Production_Yield_PLOTS_2022.csv')

# 2021 and 2022 are stacked into a single frame with a fresh 0..n-1 index.
yield_2021_2022 = pandas.concat([yield_2021, yield_2022]).reset_index(drop=True)

In [3]:
# Column groups used to split each frame into model inputs and regression target.
FEATURE_COLS = ['Moisture(%)', 'Crop Flw(M)(lb/s)', 'Crop Flw(V)(bu/h)']
TARGET_COLS = ['Yld Mass(Dry)(lb/ac)']

# 2020 is the hold-out set; 2021 + 2022 are used for training / cross-validation.
# 'Plot' is removed from both X and Y and kept only as the grouping label.
X_test = yield_2020.drop(columns=TARGET_COLS + ['Plot'])
Y_test = yield_2020.drop(columns=FEATURE_COLS + ['Plot'])

X_train_validate = yield_2021_2022.drop(columns=TARGET_COLS + ['Plot'])
Y_train_validate = yield_2021_2022.drop(columns=FEATURE_COLS + ['Plot'])
# NOTE: despite its name this holds the plot labels of the 2021-2022 frame.
# The name is left unchanged because the cross-validation cell reads it.
plot_labels_2020_2021 = yield_2021_2022['Plot'].values

# Fit the scalers on the training data only, then apply the *same* mapping to
# the hold-out set. Fitting a separate scaler on the 2020 data (as was done
# before) maps test and training values to [0, 1] with different min/max, so a
# model trained on one scale would be evaluated on another, and the hold-out
# statistics would leak into preprocessing.
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

X_train_validate[FEATURE_COLS] = feature_scaler.fit_transform(X_train_validate[FEATURE_COLS])
Y_train_validate[TARGET_COLS] = target_scaler.fit_transform(Y_train_validate[TARGET_COLS])

# transform() only: hold-out values may fall slightly outside [0, 1] when 2020
# contains values beyond the 2021-2022 range, which is expected.
X_test[FEATURE_COLS] = feature_scaler.transform(X_test[FEATURE_COLS])
Y_test[TARGET_COLS] = target_scaler.transform(Y_test[TARGET_COLS])

# Same layout as before ([feature scaler, target scaler] per key). Both keys
# now refer to the scalers fitted on the training data, so
# Scalers['test'][1].inverse_transform(...) still works and converts
# predictions back with the mapping the model was trained under.
Scalers = {
    'test': [feature_scaler, target_scaler],
    'train': [feature_scaler, target_scaler],
}


In [4]:
# Splitter and estimators, all with their default settings.
gkf = GroupKFold()
LRModel = LinearRegression()
RFModel = RandomForestRegressor()

# Grouped folds: rows sharing a plot label never straddle train and validation.
# The generator is materialised once so both scoring runs see identical folds.
folds = gkf.split(X_train_validate, Y_train_validate, plot_labels_2020_2021)
yield_cv = list(folds)
train_indices = [fit_rows for fit_rows, _ in yield_cv]
test_indices = [held_out_rows for _, held_out_rows in yield_cv]

# scikit-learn reports RMSE negated (higher is better); flip the sign back.
LR_RMSE = -cross_val_score(
    LRModel,
    X_train_validate,
    Y_train_validate,
    cv=yield_cv,
    scoring='neg_root_mean_squared_error',
)
LR_r2 = cross_val_score(
    LRModel,
    X_train_validate,
    Y_train_validate,
    cv=yield_cv,
    scoring='r2',
)

# Per-fold scores followed by their unweighted mean, for each metric.
print("LR CV RMSE Scores: ", LR_RMSE)
print("LR CV Average RMSE Score: ", numpy.mean(LR_RMSE))

print("LR CV R^2 Scores: ", LR_r2)
print("lR CV Average R^2 Score: ", numpy.mean(LR_r2))



LR CV RMSE Scores:  [0.04183555 0.04449897 0.06110071 0.04936273 0.04926401]
LR CV Average RMSE Score:  0.04921239391154908
LR CV R^2 Scores:  [0.78004019 0.7468989  0.58519655 0.72606951 0.69863893]
lR CV Average R^2 Score:  0.7073688159247814
