# Importing libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

import matplotlib.pyplot as plt
from matplotlib.dates import YearLocator, DateFormatter
from matplotlib.ticker import AutoMinorLocator
import seaborn as sns

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, root_mean_squared_error

import warnings
# Install a blanket "ignore" filter: every warning category is suppressed
# from this point on in the notebook.
warnings.filterwarnings("ignore")

In [2]:
# Read the interim training set; the DATE column is parsed and used as the index.
df = pd.read_csv(
    "../data/interim/train_set.csv",
    index_col="DATE",
    parse_dates=["DATE"],
)
print(df.head(5))

# Keep only the California series, still as a single-column DataFrame.
california_df = df.loc[:, ["California"]]
print(california_df.head(5))


            California  Florida  NewYork  Texas
DATE                                           
1976-01-01        10.4     10.4     11.2    6.3
1976-02-01        10.1      9.8     11.2    6.2
1976-03-01         9.4      9.2     10.5    5.9
1976-04-01         8.8      8.6      9.9    5.2
1976-05-01         7.9      8.1      9.0    5.1
            California
DATE                  
1976-01-01        10.4
1976-02-01        10.1
1976-03-01         9.4
1976-04-01         8.8
1976-05-01         7.9


In [3]:
# Build the supervised frame: the target column plus 54 lagged copies of it
# ("month1" = value one row earlier, ..., "month54" = value 54 rows earlier).
california = california_df.assign(
    **{
        f"month{lag}": california_df["California"].shift(lag)
        for lag in range(1, 55)
    }
)

# Rows that lack a full lag history contain NaN and are dropped; the index is
# then renumbered from zero.
sup_california = california.dropna().reset_index(drop=True)
sup_california.head()


Unnamed: 0,California,month1,month2,month3,month4,month5,month6,month7,month8,month9,...,month45,month46,month47,month48,month49,month50,month51,month52,month53,month54
0,7.6,7.2,6.5,6.6,6.6,6.8,6.8,5.8,6.0,5.9,...,8.8,8.8,9.1,9.5,9.0,7.9,8.8,9.4,10.1,10.4
1,7.3,7.6,7.2,6.5,6.6,6.6,6.8,6.8,5.8,6.0,...,9.3,8.8,8.8,9.1,9.5,9.0,7.9,8.8,9.4,10.1
2,6.8,7.3,7.6,7.2,6.5,6.6,6.6,6.8,6.8,5.8,...,9.0,9.3,8.8,8.8,9.1,9.5,9.0,7.9,8.8,9.4
3,6.8,6.8,7.3,7.6,7.2,6.5,6.6,6.6,6.8,6.8,...,9.9,9.0,9.3,8.8,8.8,9.1,9.5,9.0,7.9,8.8
4,7.0,6.8,6.8,7.3,7.6,7.2,6.5,6.6,6.6,6.8,...,9.9,9.9,9.0,9.3,8.8,8.8,9.1,9.5,9.0,7.9


### Splitting and preparing the data

In [4]:
# Chronological split: the last 55 rows are held out, everything earlier trains.
train = np.array(sup_california.iloc[:-55])
test = np.array(sup_california.iloc[-55:])

# Column 0 holds the target ("California"); the remaining columns are the lags.
X_train, X_test = train[:, 1:], test[:, 1:]
y_train = train[:, :1].ravel()
y_test = test[:, :1].ravel()

### Selecting the model

In [5]:
# Dates and observed values for the 55 held-out rows, used to label forecasts.
forecast_df = california_df.reset_index()  # DATE becomes an ordinary column

held_out = forecast_df.iloc[-55:]
act_test_date = held_out["DATE"].reset_index(drop=True)
act_test_df = held_out["California"].to_list()

In [6]:
# Candidate regressors, keyed by the short name that identifies each one.
models = dict(
    XGB=XGBRegressor(),
    RF=RandomForestRegressor(random_state=42),
    DT=DecisionTreeRegressor(random_state=42),
    SGD=SGDRegressor(random_state=42),
    GB=GradientBoostingRegressor(random_state=42, subsample=0.8),
    SVR=SVR(),
)

In [7]:
def model_selection(models, X1, y1, X2, y2, actual_date, actual_value, cv=5):
    """Grid-search every candidate model and score it on a hold-out set.

    For each entry in ``models`` a ``GridSearchCV`` is fitted on ``(X1, y1)``
    with the parameter grid registered below for that model name. The fitted
    search then predicts ``X2`` and MAE, MAPE and RMSE are computed against
    ``y2``. Progress, best parameters and best CV score are printed per model,
    and the head of the forecast table (dates, actual values and one
    ``<name>_predict`` column per model) is printed at the end.

    Parameters
    ----------
    models : dict
        Maps a model name to an unfitted estimator. Every name must have a
        parameter grid below ("XGB", "RF", "DT", "SGD", "GB", "SVR").
    X1, y1 : array-like
        Training features and target passed to ``GridSearchCV.fit``.
    X2, y2 : array-like
        Hold-out features and target used for the forecast metrics.
    actual_date : array-like
        Dates of the hold-out rows; used to build the forecast table.
    actual_value : array-like
        Observed hold-out values, stored in the "actual" column.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``GridSearchCV``. The default keeps the
        original behaviour; a splitter object can be supplied when a
        different validation scheme is wanted.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns "Model", "MAE", "MAPE" and "RMSE".
        Metric values are strings formatted to two decimals.

    Raises
    ------
    KeyError
        If a model name has no parameter grid.
    """
    results = []
    forecast = pd.DataFrame(actual_date)
    forecast["actual"] = actual_value

    # Setting up the parameters.
    param_grid = {
        "XGB": {
            "n_estimators": [100, 200, 250, 350],
            "learning_rate": [0.01, 0.05, 0.1, 1.0],
            "max_depth": [3, 5, 7, 10],
        },
        "RF": [
            {"n_estimators": [3, 10, 15, 30], "max_features": [2, 4, 8, 10]},
            {"bootstrap": [False], "n_estimators": [3, 10, 15], "max_features": [2, 6, 10]},
        ],
        "DT": {
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5, 7],
            "min_samples_leaf": [1, 2, 5, 7, 10],
        },
        "SGD": {
            "penalty": ["l2", "l1", "elasticnet"],
            "max_iter": [50, 100, 200, 300],
            "eta0": [0.0005, 0.001, 0.005, 0.01],
            "tol": [1e-10, 1e-3],
        },
        "GB": {
            "n_estimators": [100, 200, 300, 350, 500],
            "learning_rate": [0.01, 0.05, 0.1, 0.5, 0.75, 1],
            "max_depth": [2, 3, 5, 7],
        },
        "SVR": [
            {"kernel": ["linear"], "C": [0.01, 0.1, 1, 10, 100, 1000]},
            {"kernel": ["rbf"], "C": [0.01, 0.1, 1, 10, 100, 1000],
             "gamma": [0.0001, 0.001, 0.01, 0.1, 1]},
        ],
    }

    # Running the gridsearch.
    for name, model in models.items():
        if name not in param_grid:
            raise KeyError(f"No parameter grid defined for model {name!r}")

        print("="*75)
        print(f"Performing grid search for {name}:")
        grid_search = GridSearchCV(
            model, param_grid[name], cv=cv,
            scoring="neg_mean_squared_error", n_jobs=-1
        )

        # Fitting the model.
        search_result = grid_search.fit(X1, y1)
        print(f"Best parameters for {name}: {search_result.best_params_}")
        print(f"Best score for {name}: {search_result.best_score_}")
        print("=" *75, "\n")

        # Predicting with the best estimator.
        y_pred = grid_search.predict(X2)

        # Forecast dataframe.
        forecast[name + "_predict"] = y_pred.round(1)

        # Metrics from forecast. All three are computed against the y2
        # argument; MAPE previously read the notebook-level y_test instead.
        mae_value = f"{mean_absolute_error(y2, y_pred):.2f}"
        mape_value = f"{mean_absolute_percentage_error(y2, y_pred):.2f}"
        rmse_value = f"{root_mean_squared_error(y2, y_pred):.2f}"

        results.append(
            {
                "Model": name,
                "MAE": mae_value,
                "MAPE": mape_value,
                "RMSE": rmse_value
            }
        )

    # Result to a dataframe.
    metrics_result = pd.DataFrame(results)

    print(forecast.head())

    return metrics_result

In [8]:
# Run the grid searches on the training split and score on the hold-out split.
model_selection(
    models=models,
    X1=X_train,
    y1=y_train,
    X2=X_test,
    y2=y_test,
    actual_date=act_test_date,
    actual_value=act_test_df,
)

Performing grid search for XGB:
Best parameters for XGB: {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 350}
Best score for XGB: -0.5711355167431417

Performing grid search for RF:
Best parameters for RF: {'max_features': 10, 'n_estimators': 30}
Best score for RF: -1.0646291566265058

Performing grid search for DT:
Best parameters for DT: {'max_depth': None, 'min_samples_leaf': 10, 'min_samples_split': 2}
Best score for DT: -0.48688185115444665

Performing grid search for SGD:
Best parameters for SGD: {'eta0': 0.001, 'max_iter': 200, 'penalty': 'l2', 'tol': 1e-10}
Best score for SGD: -0.12240476809178162

Performing grid search for GB:
Best parameters for GB: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 500}
Best score for GB: -0.3499795551124027

Performing grid search for SVR:
Best parameters for SVR: {'C': 0.1, 'kernel': 'linear'}
Best score for SVR: -0.04772028693544846

        DATE  actual  XGB_predict  RF_predict  DT_predict  SGD_predict  \
0 2015-06-01     6

Unnamed: 0,Model,MAE,MAPE,RMSE
0,XGB,0.74,0.17,0.92
1,RF,0.73,0.17,0.95
2,DT,0.6,0.14,0.76
3,SGD,0.16,0.03,0.2
4,GB,0.47,0.11,0.59
5,SVR,0.15,0.03,0.18
