### Import packages and data ###

In [15]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

RSEED = 42

In [16]:
df = pd.read_csv('data/train_modelling.csv')
df.head(2)

Unnamed: 0,mean_temp,mean_precip,mean_rel_humidity,mean_wind_dir,mean_wind_spd,target
0,24.679063,0.007025,0.758058,177.109855,0.899208,79.131702
1,20.845273,1.127273,0.898326,259.973977,1.365202,53.850238


### Select columns and ###
### Define X and y and ###
### Train-test-split ###

In [17]:
# select columns for X
cols = ['mean_temp', 'mean_precip', 'mean_rel_humidity', 'mean_wind_dir', 'mean_wind_spd']

In [18]:
# define X and y
X = df[cols]
y = df['target']

In [19]:
# train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RSEED)

## Linear Regression ##

In [20]:
# fitting the linear regression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression()

In [21]:
# predicting and RMSE
y_pred_lin = lin_reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred_lin))

40.191645410733344

## Decision Tree Regression ##

In [22]:
# Fitting a Decision Tree Regression
dtr =  DecisionTreeRegressor()
dtr.fit(X_train, y_train)

DecisionTreeRegressor()

In [23]:
#predicting and RMSE
y_pred_dt = dtr.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred_dt))

36.00615915260986

### Random Forest Regressor ###

In [24]:
# fitting a Random Forest Regression
rfr = RandomForestRegressor(random_state=RSEED)
rfr.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [25]:
# predicting and RMSE
y_pred_rfr = rfr.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred_rfr))

27.91935458724813

In [26]:
# Return the coefficient of determination of the prediction (R-squared)
rfr.score(X_test, y_test)

0.5486622011620583

## Randomized Search CV ##

In [27]:
# import pprint (Data pretty printer) and use it to print current parameters of Random Forest
from pprint import pprint

print('Parameters currently in use:\n')
pprint(rfr.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [28]:
# create a parameter grid for RandomSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# No. of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10,110,num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2,5,10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1,2,4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [None]:
# Use the random grid search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validaition,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, verbose=2, random_state=RSEED, n_jobs=-1)

# fit the random search model
rf_random.fit(X_train, y_train)

In [33]:
# best parameters after Search
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [70]:
# fitting Random Forest with best parameters
best_random = RandomForestRegressor(n_estimators=400, min_samples_split=2, min_samples_leaf=1,
                                    max_features='sqrt', max_depth=None, bootstrap=False, random_state=RSEED)
best_random.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, max_features='sqrt', n_estimators=400,
                      random_state=42)

In [73]:
# predicting and RMSE
y_pred = best_random.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred)).round(2)

27.44

In [55]:
# Evaluate Random Search with accuracy and RMSE comparison

def evaluate (model, test_features, test_labels) :
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

base_model = RandomForestRegressor(random_state=RSEED)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)
y_base = base_model.predict(X_test)
base_rmse = np.sqrt(mean_squared_error(y_test, y_base)).round(2)
print(base_rmse)

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)
y_random = best_random.predict(X_test)
random_rmse = np.sqrt(mean_squared_error(y_test, y_random)).round(2)
print(random_rmse)
print('Improvement in Accuracy of {:0.2f}%.'.format(100 * (random_accuracy - base_accuracy) / base_accuracy))
print('Improvement in RMSE of {:0.2f}%.'.format(abs(100*(random_rmse-base_rmse)/base_rmse)))
print('Improvement in RMSE of {:0.2f} total.'.format(abs(random_rmse-base_rmse)))

Model Performance
Average Error: 16.9689 degrees.
Accuracy = 65.01%.
27.92
Model Performance
Average Error: 15.9977 degrees.
Accuracy = 68.55%.
27.35
Improvement in Accuracy of 5.45%.
Improvement in RMSE of 2.04%.
Improvement in RMSE of 0.57 total.


## GridSearchCV ##

In [57]:
# again the best parameters after RandomSearchCV
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [58]:
# create the parameter grid based on the results of random search

param_grid = {
    'bootstrap': [False],
    'max_depth': [80, 90, 100, 110, None],
    'max_features': [2, 3, 'sqrt'],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [2, 4],
    'n_estimators': [100, 200, 300, 400, 1000]
}

rf = RandomForestRegressor()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

grid_search.best_params_

In [63]:
# fitting Random Forest with best parameters (here: the same as before)
best_grid = RandomForestRegressor(n_estimators=400, min_samples_split=2, min_samples_leaf=1,
                                    max_features='sqrt', max_depth=None, bootstrap=False, random_state=RSEED)
best_grid.fit(X_train, y_train)

RandomForestRegressor(bootstrap=False, max_features='sqrt', n_estimators=400,
                      random_state=42)

In [64]:
# predicting and RMSE
y_pred = best_random.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred)).round(2)

27.35

In [65]:
# Evaluate GridSearchCV
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)

print('Improvement of {:0.2f}%.'.format(100*(grid_accuracy-base_accuracy)/base_accuracy))

Model Performance
Average Error: 16.0313 degrees.
Accuracy = 68.40%.
Improvement of 5.22%.
