In [1]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from tqdm import tqdm

In [2]:
#import data
df = pd.read_csv('vegetation_data.csv')

#create x and y variables
y = df["N"].astype(int)
X = df.drop(["N_level", "N","POINTID"], axis=1)

#fixing problem where EVI data has "inf" values
X = np.where(np.isinf(X), np.nan, X)
X = np.where(np.isnan(X), np.nanmean(X), X)



In [13]:
#load model
avg_score_model = []
best_params = []
inner_cv = KFold(n_splits=4)
outer_cv = KFold(n_splits=4)

#defining models and parameters to iterate over
models = [GradientBoostingRegressor(random_state=3,n_estimators=1000), RandomForestRegressor(n_estimators=1000, random_state=3)]
parameters = [
    {"max_depth": [2, 3, 5, 8, 10, 50, None], "max_features": [1, 2, 3, 5, 8]},
    {"max_depth": [3, 5, 10, 50, None],"min_samples_split": [2, 5, 10], "max_features": [1, 2, 3, 5, 8]}
]

# zip the models and parameters into a single iterable
for model, params in tqdm(zip(models, parameters)):
    # run the inner loop to optimize hyperparameters
    inner_fold_optim = GridSearchCV(model, params, cv=inner_cv, scoring='neg_mean_squared_error')
    print(inner_fold_optim.fit(X, y).best_params_)
    best_params.append(inner_fold_optim.best_params_)
    # estimate error of the optimized model
    outer_fold_score = cross_val_score(inner_fold_optim, X, y, cv=outer_cv, scoring='neg_mean_squared_error')

    # get the mean MSE across each of outer_cv's 4 folds
    average_score = np.mean(outer_fold_score)
    avg_score_model.append(average_score)

    # results across the folds
    print( "model:" , model)
    print("MSE in the 4 outer folds: ", outer_fold_score)
    print("Average error: ", average_score)

print("Average score for each model: ", avg_score_model)




0it [00:00, ?it/s]

{'max_depth': 50, 'max_features': 5}


1it [07:36, 456.03s/it]

model: GradientBoostingRegressor(n_estimators=1000, random_state=3)
MSE in the 4 outer folds:  [-1.01420743 -0.59448365 -1.07528801 -0.34913301]
Average error:  -0.758278025954738
{'max_depth': 5, 'max_features': 8, 'min_samples_split': 5}


2it [30:53, 926.71s/it] 

model: RandomForestRegressor(n_estimators=1000, random_state=3)
MSE in the 4 outer folds:  [-0.88855235 -0.50390065 -1.04821071 -0.27542431]
Average error:  -0.6790220059513844
Average score for each model:  [-0.758278025954738, -0.6790220059513844]





In [15]:
best_model = models[np.argmax(avg_score_model)]
params = best_params[np.argmax(avg_score_model)]
#fitting the best model
final_regressor = best_model.set_params(**params)
final_regressor.fit(X, y)


# Next step is testing the model on the 2015 data as an independent dataset