# __Random Forest__

## __Import Packages and Tools__

import numpy as np

import pandas as pd


import metpy.calc as mpcalc

from metpy.units import units


from math import sqrt


from sklearn.preprocessing import MinMaxScaler,StandardScaler

from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import SGDRegressor

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix


from yellowbrick.regressor import ResidualsPlot

import matplotlib.pyplot as plt

import itertools

---

## __Import Dataset(s)__

# Read the raw train and test tables from disk. No columns are selected or
# dropped here, so both frames hold every column present in the CSV files.
X_train = pd.read_csv('../data/file_train.csv')


X_test = pd.read_csv('../data/file_test.csv')

---

## __Overview Dataset(s)__

# Preview the first rows of the training frame (shown as the cell output
# when run in a notebook).
X_train.head()


# Same preview for the test frame.
X_test.head()

<br />

# Report the (rows, columns) shape of the two frames read from CSV.
# The frames are bound to X_train / X_test when they are loaded; the names
# `file_train` / `file_test` are never assigned in this file and raised a
# NameError here.
print('Train:', X_train.shape)

print('Test:', X_test.shape)

---

## __Define y_train and y_test__

y_train = train_data['target']

y_test = test_data['target']

<br />

X_train = train_data.drop(['target','Unnamed: 0','Unnamed: 0.1','ID'], axis = 1)

X_test = test_data.drop(['target','Unnamed: 0','Unnamed: 0.1','ID'], axis = 1)

<br />

# Shape report for the targets and the feature frames after the split,
# printed in the order: y train, y test, X train, X test.
for _label, _obj in (
    ('y Train:', y_train),
    ('y Test:', y_test),
    ('X Train:', X_train),
    ('X Test:', X_test),
):
    print(_label, _obj.shape)

---

## __Create Dummy Features if needed__

# One-hot encode the non-numeric columns of the training frame. With
# drop_first=True the first level of every encoded column is dropped and
# acts as the implicit reference level.
X_train = pd.get_dummies(X_train, drop_first=True)


# Encode the test frame with *all* levels and then align it to the training
# columns. Encoding both frames independently with drop_first=True breaks
# as soon as their category levels differ: the column sets diverge, and a
# different reference level may be dropped in the test frame. Reindexing
# keeps exactly the training columns (in training order), discards levels
# the model never saw, and fills levels absent from the test data with 0.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)


print('X Train:', X_train.shape)


print('X Test:', X_test.shape)

---
---

# __Random Forest with scaled met features and log transform__

---

## __Scaling with Min Max scaler__

# Learn the min-max scaling from the training features only, then apply the
# same fitted scaler to both splits so the test data never influences it.
# NOTE(review): X_train_met / X_test_met are not assigned anywhere in the
# visible part of this file -- presumably the meteorological feature subset
# of X_train / X_test; confirm where they are built.
scaler = MinMaxScaler().fit(X_train_met)

X_train_met_scaled_min_max = scaler.transform(X_train_met)

X_test_met_scaled_min_max = scaler.transform(X_test_met)

---

## __Log Transform__

# Natural-log transform of the train and test targets.
# NOTE(review): np.log yields -inf for 0 and nan for negative values, so
# this assumes the target is strictly positive -- confirm against the data.
y_train_log, y_test_log = (np.log(target) for target in (y_train, y_test))

---

## __Train model log__

#### Create the model with 200 trees


# Random forest regressor for the log-transformed target: 200 trees, depth
# capped at 10, sqrt(n_features) candidate features per split, fixed seed,
# all CPU cores, progress messages enabled.
rf_log = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Fit on the min-max scaled training features against the log target.
rf_log.fit(X_train_met_scaled_min_max, y_train_log)

---

## __Predictions__

#### log transformed dataset

# Predict on the log scale for the training split first, then the test split.
y_pred_train_rf_log, y_pred_test_rf_log = (
    rf_log.predict(features)
    for features in (X_train_met_scaled_min_max, X_test_met_scaled_min_max)
)

<br />

# Root-mean-squared error on the log scale, train first and then test.
for _label, _actual, _predicted in (
    ('RMSE Train log:', y_train_log, y_pred_train_rf_log),
    ('RMSE Test log:', y_test_log, y_pred_test_rf_log),
):
    print(_label, sqrt(mean_squared_error(_actual, _predicted)))

#### original/not transformed dataset

# Undo the log transform: exponentiate the log-scale predictions so they
# are comparable with the untransformed targets.
y_pred_train_rf_unlog, y_pred_test_rf_unlog = (
    np.exp(log_prediction)
    for log_prediction in (y_pred_train_rf_log, y_pred_test_rf_log)
)

<br />

# Root-mean-squared error on the original target scale, train then test.
for _label, _actual, _predicted in (
    ('RMSE Train unlog:', y_train, y_pred_train_rf_unlog),
    ('RMSE Test unlog:', y_test, y_pred_test_rf_unlog),
):
    print(_label, sqrt(mean_squared_error(_actual, _predicted)))


---
---

# __GridSearch__

# Local import: the file-level imports only bring in GridSearchCV, but this
# cell runs a randomized search, so RandomizedSearchCV was an unresolved name.
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameter values. `None` must be the Python object, not the
# string 'None': max_depth accepts an int or None (unlimited depth), and the
# string is rejected when a candidate with that value is fitted.
param_forest = {'max_depth' : [2, 5, 10, None],
                'min_samples_split' : [2, 5, 10, 20],
                'min_samples_leaf' : [1, 2, 5, 10]}



# Randomized search with 5-fold CV, scored by negative RMSE (higher is
# better). n_iter is left at the scikit-learn default, so only a sample of
# the 4 x 4 x 4 grid is evaluated.
# rf_log is the template estimator (the name `rf` is never assigned in this
# file); the search clones it for every candidate, so the fitted rf_log
# itself is left untouched.
grid_forest = RandomizedSearchCV(rf_log, param_distributions = param_forest, cv = 5, scoring = 'neg_root_mean_squared_error',
                            verbose = 2, n_jobs = -1)

# Search on the same scaled features the baseline forest was trained on
# (`X_train_scaled` is never assigned in this file).
grid_forest.fit(X_train_met_scaled_min_max, y_train_log)

#### Best Estimator, Score and Parameter

grid_forest.best_estimator_

# Mean cross-validated score of the best candidate; negative RMSE on the log scale.
grid_forest.best_score_

grid_forest.best_params_

best_model_forest = grid_forest.best_estimator_

best_model_forest

# With the default refit=True the search has already refit best_estimator_
# on the full training data; this explicit fit repeats that step.
best_model_forest.fit(X_train_met_scaled_min_max, y_train_log)

y_bestpred_train = best_model_forest.predict(X_train_met_scaled_min_max)

y_bestpred_test = best_model_forest.predict(X_test_met_scaled_min_max)

<br />

# Log-scale RMSE of the tuned model: training split first, then test split.
for _actual, _predicted in (
    (y_train_log, y_bestpred_train),
    (y_test_log, y_bestpred_test),
):
    print(sqrt(mean_squared_error(_actual, _predicted)))

---
---

# __Error Analysis / Residuals Plot__

---

## __Residuals Plot with log transformed dataset(s)__

# Wrap the random forest trained on the log target in a yellowbrick
# residuals visualizer.
visualizer = ResidualsPlot(rf_log)
<br />

#### Fit the training data to the visualizer
# Uses the same scaled training features and log target the model was fit on.
visualizer.fit(X_train_met_scaled_min_max, y_train_log)

#### Evaluate the model on the test data
# Scores the wrapped model on the scaled test features against the log target.
visualizer.score(X_test_met_scaled_min_max, y_test_log)

# Finalize and display the figure.
visualizer.show()

#### Actual vs. predicted scatter plot

# Actual vs. predicted values on the log scale, drawn on a 6x6 inch figure.
fig, ax = plt.subplots(figsize=(6, 6))

# Reference line through (1, 1) and (2, 2), i.e. y = x: points on it are
# perfect predictions.
ax.axline((1, 1), (2, 2), color='lightgrey')

# Training points are drawn first, test points second.
ax.scatter(y_train_log, y_pred_train_rf_log)
ax.scatter(y_test_log, y_pred_test_rf_log)

# Identical tick positions (0 to 7 in steps of 1) on both axes.
_log_ticks = np.arange(0, 7.1, 1)
ax.set_xticks(_log_ticks)
ax.set_yticks(_log_ticks)

ax.set_xlabel("log(PM2.5) actual")
ax.set_ylabel("log(PM2.5) predicted");

---

## __Residuals Plot with unlog transformed dataset(s)__

#### Actual vs. predicted scatter plot

# Actual vs. predicted values on the original (back-transformed) scale,
# drawn on a 6x6 inch figure.
fig, ax = plt.subplots(figsize=(6, 6))

# Reference line through (1, 1) and (2, 2), i.e. y = x: points on it are
# perfect predictions.
ax.axline((1, 1), (2, 2), color='lightgrey')

# Training points are drawn first, test points second.
ax.scatter(y_train, y_pred_train_rf_unlog)
ax.scatter(y_test, y_pred_test_rf_unlog)

# Identical tick positions (0 to 500 in steps of 100) on both axes.
_raw_ticks = np.arange(0, 501, 100)
ax.set_xticks(_raw_ticks)
ax.set_yticks(_raw_ticks)

ax.set_xlabel("PM2.5 actual")
ax.set_ylabel("PM2.5 predicted");