In [1]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from tqdm import tqdm
from sklearn.dummy import DummyRegressor
import xgboost as xgb

In [4]:
#import data
df = pd.read_csv('vegetation_data.csv')

#create x and y variables
y = df["N"].astype(int)
X = df.drop(["N_level", "N","POINTID"], axis=1)

#fixing problem where EVI data has "inf" values
X = np.where(np.isinf(X), np.nan, X)
X = np.where(np.isnan(X), np.nanmean(X), X)



In [13]:
#load model
avg_score_model = []
best_params = []
inner_cv = KFold(n_splits=5,random_state=3, shuffle=True)
outer_cv = KFold(n_splits=5,random_state=10, shuffle=True)

#baseline model
baseline_model = DummyRegressor(strategy="mean")


#defining models and parameters to iterate over
#models = [GradientBoostingRegressor(random_state=3,n_estimators=100), RandomForestRegressor(n_estimators=100, random_state=3),xgb.XGBRegressor(random_state=3,n_estimators=100)]
#parameters = [
    #{"max_depth":  [3, 5, 20, 50, 100, None], "max_features": [2, 5, 10], "min_samples_split": [2, 5, 10, 50], "learning_rate": [0.01, 0.05, 0.1, 1],"min_samples_leaf": [1, 2, 5, 20]},
    #{"max_depth": [3, 5, 20, 50, 100, None],"min_samples_split": [2, 5, 10, 50], "max_features": [2, 5, 10], "min_samples_leaf": [1, 2, 5, 20], "bootstrap": [True, False]},
    #{"max_depth":  [3, 5, 20, 50, 100, None],"learning_rate": [0.01, 0.05, 0.1, 1],"subsample": [0.5, 0.7, 1]}
#]


#defining models and parameters to iterate over
models =  [RandomForestRegressor(max_depth=3, max_features=2, n_estimators=100, random_state=3), GradientBoostingRegressor(random_state=3,n_estimators=100), xgb.XGBRegressor(random_state=3,n_estimators=100)]
parameters = [
    #{"max_depth": [3, 5, 50, 100], "max_features": [2, 5, 10], "min_samples_leaf": [1, 5, 20, 50]},
    { "min_samples_leaf": [1, 5, 20, 50,100]},
    {"max_depth":  [3, 5, 50, 100], "max_features": [2, 5, 10], "learning_rate": [ 0.05, 0.1, 1],"min_samples_leaf": [1, 5, 20, 50]},
    {"max_depth":  [3, 5, 50, 100],"learning_rate": [ 0.05, 0.1, 1],"subsample": [0.5, 0.7, 1]}
]

# zip the models and parameters into a single iterable
for model, params in tqdm(zip(models, parameters)):
    # run the inner loop to optimize hyperparameters
    inner_fold_optim = GridSearchCV(model, params, cv=inner_cv, scoring='neg_root_mean_squared_error')
    print(inner_fold_optim.fit(X, y).best_params_)
    best_params.append(inner_fold_optim.best_params_)
    # estimate error of the optimized model
    outer_fold_score = cross_val_score(inner_fold_optim, X, y, cv=outer_cv, scoring='neg_root_mean_squared_error')
    baseline_scores = cross_val_score(baseline_model, X, y, cv=outer_cv, scoring='neg_root_mean_squared_error')


    # get the mean MSE across each of outer_cv's 4 folds
    average_score = np.mean(outer_fold_score)
    average_score_baseline = np.mean(baseline_scores)
    avg_score_model.append(average_score)

    # results across the folds
    print( "model:" , model)
    print("MSE in the 4 outer folds: ", outer_fold_score)
    print("Average error: ", average_score)

print("Average score for each model: ", avg_score_model)
print("Baseline Mean MAE: " , average_score_baseline)





0it [00:00, ?it/s]

{'min_samples_leaf': 20}


1it [00:15, 15.81s/it]

model: RandomForestRegressor(max_depth=3, max_features=2, random_state=3)
MSE in the 4 outer folds:  [-1.31926733 -1.09177339 -0.75044579 -0.68833868 -1.59342868]
Average error:  -1.0886507750301568


1it [00:30, 30.54s/it]


KeyboardInterrupt: 

In [9]:
best_model = models[np.argmax(avg_score_model)]
params = best_params[np.argmax(avg_score_model)]
#fitting the best model
final_regressor = best_model.set_params(**params)
final_regressor.fit(X, y)


# Next step is testing the model on the 2015 data as an independent dataset