In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from datetime import timedelta
import dataframe_image as dfi
import matplotlib.pyplot as plt
import sklearn.metrics as mtr
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import pickle


# Widen text columns in rendered DataFrames so long string cells are not cut short.
pd.set_option("display.max_colwidth", 200)

In [None]:
# Read the cleaned CSV for route 32893 (path is relative to the notebook's
# working directory) and show it as the cell output.
# NOTE(review): judging by the file name this is trip data joined with weather
# data -- confirm against the cleaning notebook.
df = pd.read_csv(filepath_or_buffer="../Bases/Cleaned/rota32893_weather.csv")
df

In [None]:
# Train/test split (positional, no shuffling): the first 30% of the rows form
# the test set and the remaining 70% form the training set.
# NOTE(review): if the rows are in chronological order this evaluates on data
# that is older than the training data -- confirm this is intended.
shape = int(df.shape[0] * 0.3)

# Take explicit copies: later cells add prediction columns to `df_teste`, and
# assigning into a plain slice of `df` raises SettingWithCopyWarning (and is
# ambiguous about whether `df` itself is modified).
df_teste = df.iloc[:shape].copy()
df_treino = df.iloc[shape:].copy()

In [None]:
# Running "best so far" trackers filled in by the scenario loop. Position i in
# each list refers to the same model family, in the order [xgb, rf, svr, lr].
best_models = None            # best fitted estimator per model family
best_metrics = None           # test RMSE of each estimator in `best_models`
best_cenario = [0, 0, 0, 0]   # scenario index that produced each best estimator

In [None]:
# Feature sets evaluated, indexed by scenario number. Every scenario shares the
# same base columns and differs only in the extra columns appended to them.
FEATURES_BASE = ["partidaTimeStamp", "dia_semana", "tipo_dia", "hora_dia"]
FEATURES_POR_CENARIO = [
    FEATURES_BASE,                                  # scenario 0
    FEATURES_BASE + ["turno_dia"],                  # scenario 1
    FEATURES_BASE + ["qtdDiasAno"],                 # scenario 2
    FEATURES_BASE + ["turno_dia", "qtdDiasAno"],    # scenario 3
]
ALVO = "tempo_viagem"  # regression target column

# The hyperparameter search spaces do not depend on the scenario, so they are
# built once, outside the loop.

# XGBoost hyperparameters
params_xgb = {
    "n_estimators": list(range(100, 1100, 100)),  # number of boosting rounds (trees)
    "max_depth": list(range(2, 15)),  # maximum tree depth for base learners
    "min_child_weight": list(range(1, 11)),  # minimum sum of instance weight (hessian) needed in a child
    "learning_rate": [0.3, 0.2, 0.1, 0.05, 0.01, 0.005],  # boosting learning rate (xgb's "eta")
    "gamma": np.arange(0, 0.7, 0.1),  # minimum loss reduction required to split a leaf further
}

# Random Forest hyperparameters
params_rf = {
    "n_estimators": list(range(100, 1100, 100)),  # number of trees in the forest
    "bootstrap": [True, False],  # whether bootstrap samples are used; if False each tree sees the whole dataset
    "max_depth": list(range(2, 15)),  # maximum depth of each tree
    "max_features": [1.0, "sqrt", "log2"],  # number of features considered when looking for the best split
    "min_samples_leaf": list(range(1, 11)),  # minimum number of samples required at a leaf node
    "min_samples_split": list(range(2, 11)),  # minimum number of samples required to split an internal node
}

# SVR hyperparameters (5 x 2 = 10 combinations, so n_iter=10 below covers the grid)
params_svr = {
    "C": [0.1, 1, 10, 100, 1000],  # regularization parameter; regularization strength is inversely proportional to C
    "gamma": ["scale", "auto"],  # kernel coefficient for 'rbf', 'poly' and 'sigmoid'
}

# Linear Regression has no hyperparameters to tune.

# Settings shared by every randomized search: 10-fold CV scored by negative MSE,
# all cores, fixed seed so the sampled candidates are reproducible.
busca_kwargs = dict(scoring="neg_mean_squared_error", verbose=True, cv=10, n_jobs=-1, random_state=123)

for cenario, colunas in enumerate(FEATURES_POR_CENARIO):
    # Select this scenario's feature columns; the target is the same for all.
    x_treino, y_treino = df_treino[colunas], df_treino[ALVO]
    x_teste, y_teste = df_teste[colunas], df_teste[ALVO]

    # --- XGBoost: randomized search, then refit of the best configuration ---
    # The original passed `early_stop_rounds=100`, which is not an XGBRegressor
    # parameter (the real name is `early_stopping_rounds`); it was ignored apart
    # from an "unused parameter" warning, so it is dropped here. Turning on real
    # early stopping would require an eval_set in every CV fit of the search.
    modelo_xgb = xgb.XGBRegressor()
    xgb_rand_search = RandomizedSearchCV(modelo_xgb, params_xgb, n_iter=40, **busca_kwargs)
    xgb_rand_search.fit(x_treino, y_treino)
    modelo_xgb = xgb_rand_search.best_estimator_
    # best_estimator_ is already refit on the full training set (refit=True is
    # the RandomizedSearchCV default); this extra fit is kept from the original
    # and additionally evaluates the model on the training data via eval_set.
    modelo_xgb.fit(x_treino, y_treino, eval_set=[(x_treino, y_treino)])

    # --- Random Forest ---
    modelo_rf = RandomForestRegressor()
    rf_rand_search = RandomizedSearchCV(modelo_rf, params_rf, n_iter=40, **busca_kwargs)
    rf_rand_search.fit(x_treino, y_treino)
    modelo_rf = rf_rand_search.best_estimator_
    modelo_rf.fit(x_treino, y_treino)  # kept from the original (see note on the XGBoost refit)

    # --- SVR (RBF kernel) ---
    modelo_svr = SVR(kernel='rbf')
    svr_rand_search = RandomizedSearchCV(modelo_svr, params_svr, n_iter=10, **busca_kwargs)
    svr_rand_search.fit(x_treino, y_treino)
    modelo_svr = svr_rand_search.best_estimator_
    modelo_svr.fit(x_treino, y_treino)  # kept from the original (see note on the XGBoost refit)

    # --- Linear Regression ---
    modelo_lr = LinearRegression()
    modelo_lr.fit(x_treino, y_treino)

    # Attach this scenario's test-set predictions. `assign` returns a new frame
    # (overwriting the columns written by the previous scenario), which avoids
    # assigning into a slice of `df` and the resulting SettingWithCopyWarning.
    df_teste = df_teste.assign(
        predicao_xgb=modelo_xgb.predict(x_teste),
        predicao_lr=modelo_lr.predict(x_teste),
        predicao_rf=modelo_rf.predict(x_teste),
        predicao_svr=modelo_svr.predict(x_teste),
    )

    df_teste.to_csv(f"../Bases/Predictions/rota32893_predictions_cen_{cenario}.csv", index=False)

    # Test-set RMSE per model. Computed as sqrt(MSE) because the `squared=False`
    # argument of mean_squared_error is deprecated in recent scikit-learn
    # releases and removed in later ones.
    RMSE_xgb = np.sqrt(mtr.mean_squared_error(df_teste[ALVO], df_teste["predicao_xgb"]))
    RMSE_rf = np.sqrt(mtr.mean_squared_error(df_teste[ALVO], df_teste["predicao_rf"]))
    RMSE_svr = np.sqrt(mtr.mean_squared_error(df_teste[ALVO], df_teste["predicao_svr"]))
    RMSE_lr = np.sqrt(mtr.mean_squared_error(df_teste[ALVO], df_teste["predicao_lr"]))

    models = [modelo_xgb, modelo_rf, modelo_svr, modelo_lr]
    metrics = [RMSE_xgb, RMSE_rf, RMSE_svr, RMSE_lr]

    # Keep, for each model family, the estimator with the lowest test RMSE seen
    # so far and remember which scenario produced it.
    if best_models is None:
        best_models = list(models)
        best_metrics = list(metrics)
        best_cenario = [cenario] * len(models)
    else:
        for i in range(len(models)):
            if metrics[i] < best_metrics[i]:
                best_models[i] = models[i]
                best_metrics[i] = metrics[i]
                best_cenario[i] = cenario


In [None]:
# Persist the best estimator of each model family with pickle. The file name
# records the model's short name and the scenario index it came from.
models_name = ['xgb', 'rf', 'svr', 'lr']
for nome, cen, modelo in zip(models_name, best_cenario, best_models):
    caminho = f'Models/rota32893_{nome}_cen_{cen}.pkl'
    with open(caminho, 'wb') as arquivo:
        pickle.dump(modelo, arquivo)