In [260]:
import xgboost as xgb
import pandas as pd
import numpy as np
from datetime import timedelta,datetime
import dataframe_image as dfi
import matplotlib.pyplot as plt
import sklearn.metrics as mtr
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import pickle


# Primeira Execução

In [261]:
df = pd.read_csv('rota33642.csv')
df.head()

Unnamed: 0,data_partida,data_chegada,linha,tempo_viagem,equipamento,dia_semana,partidaTimeStamp,qtdDiasAno,direcao,tipo_dia,turno_dia,chegadaTimeStamp,hora,hora_dia
0,2019-07-02 05:35:51,2019-07-02 05:47:44,33642,713,1083,2,1562056551,183,2,1,2,1562057264,,5
1,2019-07-02 06:11:07,2019-07-02 06:25:15,33642,848,1083,2,1562058667,183,2,1,2,1562059515,,6
2,2019-07-02 06:53:26,2019-07-02 07:08:02,33642,876,1083,2,1562061206,183,2,1,2,1562062082,,6
3,2019-07-02 07:36:44,2019-07-02 07:55:03,33642,1099,1083,2,1562063804,183,2,1,2,1562064903,,7
4,2019-07-02 08:16:37,2019-07-02 08:30:55,33642,858,1083,2,1562066197,183,2,1,2,1562067055,,8


In [262]:
# Dropamos:
# A data_chegada e a chegadaTimeStamp pois com esses valores daria para prever o tempo de viagem(Y)
# linha e equipamento por se tratar apenas de IDs
# hora por conter apenas valores nulos
# direcao por conter apenas o valor 2

df = df.drop(['data_chegada','linha','equipamento','direcao','chegadaTimeStamp','hora'], axis=1)

In [263]:
corr = df.corr()
corr

  corr = df.corr()


Unnamed: 0,tempo_viagem,dia_semana,partidaTimeStamp,qtdDiasAno,tipo_dia,turno_dia,hora_dia
tempo_viagem,1.0,-0.091237,0.036642,0.033283,-0.041862,0.360651,0.375852
dia_semana,-0.091237,1.0,0.018559,0.018371,0.044735,0.018066,0.020579
partidaTimeStamp,0.036642,0.018559,1.0,0.99996,-0.012245,0.006308,0.005832
qtdDiasAno,0.033283,0.018371,0.99996,1.0,-0.012287,-0.002152,-0.003087
tipo_dia,-0.041862,0.044735,-0.012245,-0.012287,1.0,0.001882,0.004642
turno_dia,0.360651,0.018066,0.006308,-0.002152,0.001882,1.0,0.948907
hora_dia,0.375852,0.020579,0.005832,-0.003087,0.004642,0.948907,1.0


In [264]:
df[['hora_dia','turno_dia']].corr()

Unnamed: 0,hora_dia,turno_dia
hora_dia,1.0,0.948907
turno_dia,0.948907,1.0


In [265]:
df[['partidaTimeStamp','qtdDiasAno']].corr()

Unnamed: 0,partidaTimeStamp,qtdDiasAno
partidaTimeStamp,1.0,0.99996
qtdDiasAno,0.99996,1.0


In [266]:
#Vamos tirar hora_dia pois tem uma alta correlação com turno dia, escolhemos essa coluna pois os valores são apenas de 1 a 4 enquanto hora é de 0 a 23
df = df.drop(['qtdDiasAno'], axis=1)

In [267]:
#Ordenando pela Data
df = df.sort_values(by=["data_partida"], ascending=False)

In [268]:
#Transformamos data_partida em index
df.index = df['data_partida']
df = df.drop(['data_partida'], axis=1)

In [269]:
#Separamos treino e teste
shape = int(df.shape[0] * 0.3)
df_teste = df[:shape]
df_treino = df[shape:]
print(f'Treino: {df_treino.shape}      Teste:{df_teste.shape}')

Treino: (6520, 6)      Teste:(2794, 6)


In [270]:
# Separamos a classe dos atributos 
x_teste = df_teste[['dia_semana','partidaTimeStamp','tipo_dia','hora_dia']]
y_teste = df_teste[['tempo_viagem']]
x_treino = df_treino[['dia_semana','partidaTimeStamp','tipo_dia','hora_dia']]
y_treino = df_treino[['tempo_viagem']]

In [271]:
params_xgb = {
        "n_estimators": list(range(100, 1100, 100)), #Number of gradient boosted trees. Equivalent to number of boosting rounds
        "max_depth": list(range(2, 15)),#Maximum tree depth for base learners.
        "min_child_weight": list(range(1, 11)),#Minimum sum of instance weight(hessian) needed in a child.
        "learning_rate": [0.3, 0.2, 0.1, 0.05, 0.01, 0.005],#Boosting learning rate (xgb’s “eta”)
        "gamma": np.arange(0, 0.7, 0.1)#Minimum loss reduction required to make a further partition on a leaf node of the tree.
        }

# Hyperparâmetros do Random Forest
params_rf = {
        "n_estimators": list(range(100, 1100, 100)),#The number of trees in the forest.
        "bootstrap": [True, False],#Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.
        "max_depth": list(range(2, 15)),#The maximum depth of the tree.
        "max_features": [1.0, "sqrt", "log2"],#The number of features to consider when looking for the best split:
        "min_samples_leaf": list(range(1, 11)),#The minimum number of samples required to split an internal node
        "min_samples_split": list(range(2, 11)),#The minimum number of samples required to be at a leaf node.
        }

In [272]:
xgb_rand_search.best_estimator_

In [273]:
rf_rand_search.best_estimator_

In [274]:
#Criação e fit do modelo XGB
start_xgb = datetime.now()
modelo_xgb = xgb.XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stop_rounds=100, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, gamma=0.0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.005, max_bin=256,
             max_cat_to_onehot=4, max_delta_step=0, max_depth=3, max_leaves=0,
             min_child_weight=5, monotone_constraints='()',
             n_estimators=900, n_jobs=0, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=0)
#modelo_xgb = xgb.XGBRegressor(early_stop_rounds = 100)
#xgb_rand_search = RandomizedSearchCV(modelo_xgb, params_xgb, scoring="neg_mean_squared_error", n_iter=40, verbose=True, cv=10, n_jobs=-1, random_state=123)
#xgb_rand_search.fit(x_treino, y_treino)
#modelo_xgb = xgb_rand_search.best_estimator_
modelo_xgb.fit(x_treino, y_treino, eval_set = [(x_treino, y_treino)],verbose=False)
end_xgb = datetime.now()

#Criação e fit do modelo Random Forest
start_rf = datetime.now()
modelo_rf = RandomForestRegressor(max_depth=10, max_features='log2', min_samples_leaf=4,
                      min_samples_split=7, n_estimators=900)
#modelo_rf = RandomForestRegressor()
#rf_rand_search = RandomizedSearchCV(modelo_rf, params_rf, scoring="neg_mean_squared_error", n_iter=40, verbose=True, cv=10, n_jobs=-1, random_state=123)
#rf_rand_search.fit(x_treino, y_treino)
#modelo_rf = rf_rand_search.best_estimator_
modelo_rf.fit(x_treino, y_treino)
end_rf = datetime.now()

#Criação e fit do modelo Linear Regression
start_lr = datetime.now()
modelo_lr = LinearRegression()
modelo_lr.fit(x_treino, y_treino)
end_lr = datetime.now()


tempo_xgb = end_xgb - start_xgb
tempo_rf = end_rf - start_rf
tempo_lr = end_lr - start_lr

Parameters: { "early_stop_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  modelo_rf.fit(x_treino, y_treino)


In [275]:
# Fizemos as predições e juntamos no DataFrame
df_teste["predicao_xgb"] = modelo_xgb.predict(x_teste)
df_teste["predicao_lr"] = modelo_lr.predict(x_teste)
df_teste["predicao_rf"] = modelo_rf.predict(x_teste)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste["predicao_xgb"] = modelo_xgb.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste["predicao_lr"] = modelo_lr.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste["predicao_rf"] = modelo_rf.predict(x_teste)


In [276]:
# Pesquisar os motivos de se usar apenas métricas relativas - Flávia disse que provavelmente por conta de outlier/valor distantes 
RRSE_xgb = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste["predicao_xgb"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
MAPE_xgb = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste["predicao_xgb"])

RRSE_rf = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste["predicao_rf"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
MAPE_rf = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste["predicao_rf"])

RRSE_lr = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste["predicao_lr"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
MAPE_lr = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste["predicao_lr"])

In [277]:
metrics = []
metrics.append(['Primeira execução','RRSE', RRSE_xgb, RRSE_rf, RRSE_lr])
metrics.append(['Primeira execução','MAPE', MAPE_xgb, MAPE_rf, MAPE_lr]) 
metrics.append(['Primeira execução','Tempo', tempo_xgb, tempo_rf, tempo_lr]) 
metrics = pd.DataFrame(metrics, columns=['Descrição','Métrica', 'XGBoosting','Random Forest','Linear Regression'])
metrics.to_csv('metrica_primeira_execução.csv',index=False)
metrics

Unnamed: 0,Descrição,Métrica,XGBoosting,Random Forest,Linear Regression
0,Primeira execução,RRSE,0.692901,0.721236,0.939445
1,Primeira execução,MAPE,0.103641,0.108859,0.153585
2,Primeira execução,Tempo,0:00:01.427022,0:00:02.708563,0:00:00.002999


In [278]:
df_teste.to_csv('previsão_primeira_execução.csv',index=False)
df_teste.head()

Unnamed: 0_level_0,tempo_viagem,dia_semana,partidaTimeStamp,tipo_dia,turno_dia,hora_dia,predicao_xgb,predicao_lr,predicao_rf
data_partida,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-10-01 00:40:28,680,2,1569901228,1,1,0,831.875977,978.708272,839.670296
2019-10-01 00:16:27,849,2,1569899787,1,1,0,831.875977,978.703918,839.670296
2019-10-01 00:00:45,852,2,1569898845,1,1,0,831.875977,978.701072,839.670296
2019-09-30 23:46:49,824,1,1569898009,1,4,23,1042.120117,1373.619877,1114.098827
2019-09-30 23:30:14,993,1,1569897014,1,4,23,1042.120117,1373.616871,1114.098827


# Teste com Clima

In [279]:
#Lendo dados de clima e convertendo a data pra datetime (na leitura veio como objeto)
clima = pd.read_csv('clima.csv')
clima['Datetime'] = pd.to_datetime(clima['Datetime'])

In [280]:
#copiando o DAtaframe original para não alterar o primeiro
aux = df.copy()
#Mudanças na data_partida para conseguir fazer o merge
aux["data_partida2"] = pd.to_datetime(aux.index)
aux["data_partida2"] = aux["data_partida2"].dt.round("H")
#Merge dos dados e exclusão da coluna auxiliar
df_clima = aux.merge(clima, left_on="data_partida2", right_on="Datetime", how="left")
df_clima.drop(['data_partida2','Datetime'],axis=1,inplace=True)
df_clima.index = aux.index
df_clima.head()

Unnamed: 0_level_0,tempo_viagem,dia_semana,partidaTimeStamp,tipo_dia,turno_dia,hora_dia,Descricao Chuva,Calor
data_partida,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-10-01 00:40:28,680,2,1569901228,1,1,0,0,1
2019-10-01 00:16:27,849,2,1569899787,1,1,0,0,1
2019-10-01 00:00:45,852,2,1569898845,1,1,0,0,1
2019-09-30 23:46:49,824,1,1569898009,1,4,23,0,1
2019-09-30 23:30:14,993,1,1569897014,1,4,23,0,1


In [281]:
#Separamos treino e teste
shape = int(df_clima.shape[0] * 0.3)
df_teste = df_clima[:shape]
df_treino = df_clima[shape:]
print(f'Treino: {df_treino.shape}      Teste:{df_teste.shape}')

Treino: (6520, 8)      Teste:(2794, 8)


In [282]:
# Separamos a classe dos atributos 
x_teste = df_teste[['dia_semana','partidaTimeStamp','tipo_dia','hora_dia','Descricao Chuva','Calor']]
y_teste = df_teste[['tempo_viagem']]
x_treino = df_treino[['dia_semana','partidaTimeStamp','tipo_dia','hora_dia','Descricao Chuva','Calor']]
y_treino = df_treino[['tempo_viagem']]

In [283]:
#Criação e fit do modelo XGB
start_xgb = datetime.now()
modelo_xgb = xgb.XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stop_rounds=100, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, gamma=0.0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.005, max_bin=256,
             max_cat_to_onehot=4, max_delta_step=0, max_depth=3, max_leaves=0,
             min_child_weight=5, monotone_constraints='()',
             n_estimators=900, n_jobs=0, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=0)
#modelo_xgb = xgb.XGBRegressor(early_stop_rounds = 100)
#xgb_rand_search = RandomizedSearchCV(modelo_xgb, params_xgb, scoring="neg_mean_squared_error", n_iter=40, verbose=True, cv=10, n_jobs=-1, random_state=123)
#xgb_rand_search.fit(x_treino, y_treino)
#modelo_xgb = xgb_rand_search.best_estimator_
modelo_xgb.fit(x_treino, y_treino, eval_set = [(x_treino, y_treino)],verbose=False)
end_xgb = datetime.now()

#Criação e fit do modelo Random Forest
start_rf = datetime.now()
modelo_rf = RandomForestRegressor(max_depth=10, max_features='log2', min_samples_leaf=4,
                      min_samples_split=7, n_estimators=900)
#modelo_rf = RandomForestRegressor()
#rf_rand_search = RandomizedSearchCV(modelo_rf, params_rf, scoring="neg_mean_squared_error", n_iter=40, verbose=True, cv=10, n_jobs=-1, random_state=123)
#rf_rand_search.fit(x_treino, y_treino)
#modelo_rf = rf_rand_search.best_estimator_
modelo_rf.fit(x_treino, y_treino)
end_rf = datetime.now()

#Criação e fit do modelo Linear Regression
start_lr = datetime.now()
modelo_lr = LinearRegression()
modelo_lr.fit(x_treino, y_treino)
end_lr = datetime.now()


tempo_xgb = end_xgb - start_xgb
tempo_rf = end_rf - start_rf
tempo_lr = end_lr - start_lr

Parameters: { "early_stop_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  modelo_rf.fit(x_treino, y_treino)


In [284]:
# Fizemos as predições e juntamos no DataFrame
df_teste["predicao_xgb_clima"] = modelo_xgb.predict(x_teste)
df_teste["predicao_lr_clima"] = modelo_lr.predict(x_teste)
df_teste["predicao_rf_clima"] = modelo_rf.predict(x_teste)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste["predicao_xgb_clima"] = modelo_xgb.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste["predicao_lr_clima"] = modelo_lr.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste["predicao_rf_clima"] = modelo_rf.predict(x_teste)


In [285]:
# Pesquisar os motivos de se usar apenas métricas relativas - Flávia disse que provavelmente por conta de outlier/valor distantes 
RRSE_xgb = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste["predicao_xgb_clima"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
MAPE_xgb = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste["predicao_xgb_clima"])

RRSE_rf = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste["predicao_rf_clima"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
MAPE_rf = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste["predicao_rf_clima"])

RRSE_lr = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste["predicao_lr_clima"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
MAPE_lr = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste["predicao_lr_clima"])

In [286]:
metrics = []
metrics.append(['Teste Clima','RRSE', RRSE_xgb, RRSE_rf, RRSE_lr])
metrics.append(['Teste Clima','MAPE', MAPE_xgb, MAPE_rf, MAPE_lr]) 
metrics.append(['Teste Clima','Tempo', tempo_xgb, tempo_rf, tempo_lr]) 
metrics = pd.DataFrame(metrics, columns=['Descrição','Métrica', 'XGBoosting','Random Forest','Linear Regression'])
metrics.to_csv('metrica_teste_clime.csv',index=False)
metrics

Unnamed: 0,Descrição,Métrica,XGBoosting,Random Forest,Linear Regression
0,Teste Clima,RRSE,0.6943,0.722351,0.916085
1,Teste Clima,MAPE,0.103104,0.109225,0.151943
2,Teste Clima,Tempo,0:00:01.420000,0:00:02.321000,0:00:00.003000


In [287]:
df_teste.to_csv('previsão_teste_clima.csv',index=False)
df_teste.head()

Unnamed: 0_level_0,tempo_viagem,dia_semana,partidaTimeStamp,tipo_dia,turno_dia,hora_dia,Descricao Chuva,Calor,predicao_xgb_clima,predicao_lr_clima,predicao_rf_clima
data_partida,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-10-01 00:40:28,680,2,1569901228,1,1,0,0,1,841.116333,1026.636094,885.107957
2019-10-01 00:16:27,849,2,1569899787,1,1,0,0,1,841.116333,1026.633915,885.107957
2019-10-01 00:00:45,852,2,1569898845,1,1,0,0,1,841.116333,1026.632491,885.107957
2019-09-30 23:46:49,824,1,1569898009,1,4,23,0,1,1021.690552,1371.491567,1075.692143
2019-09-30 23:30:14,993,1,1569897014,1,4,23,0,1,1021.690552,1371.490062,1075.692143


# Tempos Anteriores

In [288]:
df_tempos = df.copy()
df_tempos["tempo_viagem_1"] = df_tempos["tempo_viagem"]
df_tempos["tempo_viagem_2"] = df_tempos["tempo_viagem"]
df_tempos["tempo_viagem_3"] = df_tempos["tempo_viagem"]
df_tempos["tempo_viagem_4"] = df_tempos["tempo_viagem"]

df_tempos["tempo_viagem_1"] = df_tempos["tempo_viagem_1"].shift(1)
df_tempos["tempo_viagem_2"] = df_tempos["tempo_viagem_2"].shift(2)
df_tempos["tempo_viagem_3"] = df_tempos["tempo_viagem_3"].shift(3)
df_tempos["tempo_viagem_4"] = df_tempos["tempo_viagem_4"].shift(4)
df_tempos.head()

Unnamed: 0_level_0,tempo_viagem,dia_semana,partidaTimeStamp,tipo_dia,turno_dia,hora_dia,tempo_viagem_1,tempo_viagem_2,tempo_viagem_3,tempo_viagem_4
data_partida,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-10-01 00:40:28,680,2,1569901228,1,1,0,,,,
2019-10-01 00:16:27,849,2,1569899787,1,1,0,680.0,,,
2019-10-01 00:00:45,852,2,1569898845,1,1,0,849.0,680.0,,
2019-09-30 23:46:49,824,1,1569898009,1,4,23,852.0,849.0,680.0,
2019-09-30 23:30:14,993,1,1569897014,1,4,23,824.0,852.0,849.0,680.0


In [289]:
shape = int(df_tempos.shape[0] * 0.3)
df_teste = df_tempos[:shape]
df_treino = df_tempos[shape:]
print(f'Treino: {df_treino.shape}      Teste:{df_teste.shape}')

Treino: (6520, 10)      Teste:(2794, 10)


In [290]:
metrics = []
for cenario in range(1, 5):
        if cenario == 1:
                df_teste = df_teste.iloc[1:]
                df_treino = df_treino.iloc[1:]

                x_teste = df_teste[["partidaTimeStamp", "dia_semana", "tipo_dia", "hora_dia", "tempo_viagem_1"]]
                y_teste = df_teste["tempo_viagem"]
                x_treino = df_treino[["partidaTimeStamp", "dia_semana", "tipo_dia", "hora_dia", "tempo_viagem_1"]]
                y_treino = df_treino["tempo_viagem"]

        elif cenario == 2:
                df_teste = df_teste.iloc[2:]
                df_treino = df_treino.iloc[2:]

                x_teste = df_teste[["partidaTimeStamp", "dia_semana", "tipo_dia", "hora_dia", "tempo_viagem_1", "tempo_viagem_2"]]
                y_teste = df_teste["tempo_viagem"]
                x_treino = df_treino[["partidaTimeStamp", "dia_semana", "tipo_dia", "hora_dia", "tempo_viagem_1", "tempo_viagem_2"]]
                y_treino = df_treino["tempo_viagem"]

        elif cenario == 3:
                df_teste = df_teste.iloc[3:]
                df_treino = df_treino.iloc[3:]

                x_teste = df_teste[["partidaTimeStamp", "dia_semana", "tipo_dia", "hora_dia", "tempo_viagem_1", "tempo_viagem_2", "tempo_viagem_3"]]
                y_teste = df_teste["tempo_viagem"]
                x_treino = df_treino[["partidaTimeStamp", "dia_semana", "tipo_dia", "hora_dia", "tempo_viagem_1", "tempo_viagem_2", "tempo_viagem_3"]]
                y_treino = df_treino["tempo_viagem"]
        else:
                df_teste = df_teste.iloc[4:]
                df_treino = df_treino.iloc[4:]

                x_teste = df_teste[["partidaTimeStamp", "dia_semana", "tipo_dia", "hora_dia", "tempo_viagem_1", "tempo_viagem_2", "tempo_viagem_3","tempo_viagem_4"]]
                y_teste = df_teste["tempo_viagem"]
                x_treino = df_treino[["partidaTimeStamp", "dia_semana", "tipo_dia", "hora_dia", "tempo_viagem_1", "tempo_viagem_2", "tempo_viagem_3","tempo_viagem_4"]]
                y_treino = df_treino["tempo_viagem"]

        #Criação e fit do modelo XGB
        start_xgb = datetime.now()
        modelo_xgb = xgb.XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
                colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
                early_stop_rounds=100, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, gamma=0.0, gpu_id=-1,
                grow_policy='depthwise', importance_type=None,
                interaction_constraints='', learning_rate=0.005, max_bin=256,
                max_cat_to_onehot=4, max_delta_step=0, max_depth=3, max_leaves=0,
                min_child_weight=5, monotone_constraints='()',
                n_estimators=900, n_jobs=0, num_parallel_tree=1, predictor='auto',
                random_state=0, reg_alpha=0)
        #modelo_xgb = xgb.XGBRegressor(early_stop_rounds = 100)
        #xgb_rand_search = RandomizedSearchCV(modelo_xgb, params_xgb, scoring="neg_mean_squared_error", n_iter=40, verbose=True, cv=10, n_jobs=-1, random_state=123)
        #xgb_rand_search.fit(x_treino, y_treino)
        #modelo_xgb = xgb_rand_search.best_estimator_
        modelo_xgb.fit(x_treino, y_treino, eval_set = [(x_treino, y_treino)],verbose=False)
        end_xgb = datetime.now()

        #Criação e fit do modelo Random Forest
        start_rf = datetime.now()
        modelo_rf = RandomForestRegressor(max_depth=10, max_features='log2', min_samples_leaf=4,
                        min_samples_split=7, n_estimators=900)
        #modelo_rf = RandomForestRegressor()
        #rf_rand_search = RandomizedSearchCV(modelo_rf, params_rf, scoring="neg_mean_squared_error", n_iter=40, verbose=True, cv=10, n_jobs=-1, random_state=123)
        #rf_rand_search.fit(x_treino, y_treino)
        #modelo_rf = rf_rand_search.best_estimator_
        modelo_rf.fit(x_treino, y_treino)
        end_rf = datetime.now()

        #Criação e fit do modelo Linear Regression
        start_lr = datetime.now()
        modelo_lr = LinearRegression()
        modelo_lr.fit(x_treino, y_treino)
        end_lr = datetime.now()


        tempo_xgb = end_xgb - start_xgb
        tempo_rf = end_rf - start_rf
        tempo_lr = end_lr - start_lr

        # Fizemos as predições e juntamos no DataFrame
        df_teste[f"predicao_xgb_{cenario}"] = modelo_xgb.predict(x_teste)
        df_teste[f"predicao_lr_{cenario}"] = modelo_lr.predict(x_teste)
        df_teste[f"predicao_rf_{cenario}"] = modelo_rf.predict(x_teste)

        # Pesquisar os motivos de se usar apenas métricas relativas - Flávia disse que provavelmente por conta de outlier/valor distantes 
        RRSE_xgb = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste[f"predicao_xgb_{cenario}"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
        MAPE_xgb = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste[f"predicao_xgb_{cenario}"])

        RRSE_rf = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste[f"predicao_rf_{cenario}"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
        MAPE_rf = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste[f"predicao_rf_{cenario}"])

        RRSE_lr = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste[f"predicao_lr_{cenario}"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
        MAPE_lr = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste[f"predicao_lr_{cenario}"])

        metrics.append([f'Teste Tempos {cenario}','RRSE', RRSE_xgb, RRSE_rf, RRSE_lr])
        metrics.append([f'Teste Tempos {cenario}','MAPE', MAPE_xgb, MAPE_rf, MAPE_lr]) 
        metrics.append([f'Teste Tempos {cenario}','Tempo', tempo_xgb, tempo_rf, tempo_lr]) 

Parameters: { "early_stop_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "early_stop_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "early_stop_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "early_stop_ro

In [291]:
metrics = pd.DataFrame(metrics, columns=['Descrição','Métrica', 'XGBoosting','Random Forest','Linear Regression'])
metrics.to_csv('metrica_teste_tempos.csv',index=False)
metrics


Unnamed: 0,Descrição,Métrica,XGBoosting,Random Forest,Linear Regression
0,Teste Tempos 1,RRSE,0.702625,0.708871,0.80014
1,Teste Tempos 1,MAPE,0.104589,0.105831,0.126999
2,Teste Tempos 1,Tempo,0:00:01.440001,0:00:03.416001,0:00:00.001999
3,Teste Tempos 2,RRSE,0.689807,0.691295,0.743545
4,Teste Tempos 2,MAPE,0.103228,0.103715,0.116678
5,Teste Tempos 2,Tempo,0:00:01.436000,0:00:03.817999,0:00:00.002004
6,Teste Tempos 3,RRSE,0.686384,0.687347,0.724626
7,Teste Tempos 3,MAPE,0.102821,0.103229,0.113276
8,Teste Tempos 3,Tempo,0:00:01.561999,0:00:04.130001,0:00:00.003000
9,Teste Tempos 4,RRSE,0.685212,0.678742,0.713791


In [292]:
df_teste.to_csv('previsão_teste_tempos.csv',index=False)
df_teste.head()

Unnamed: 0_level_0,tempo_viagem,dia_semana,partidaTimeStamp,tipo_dia,turno_dia,hora_dia,tempo_viagem_1,tempo_viagem_2,tempo_viagem_3,tempo_viagem_4,...,predicao_rf_1,predicao_xgb_2,predicao_lr_2,predicao_rf_2,predicao_xgb_3,predicao_lr_3,predicao_rf_3,predicao_xgb_4,predicao_lr_4,predicao_rf_4
data_partida,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-09-30 22:06:32,1055,1,1569891992,1,4,22,950.0,932.0,1063.0,930.0,...,1081.459195,1033.455322,1083.7813,1034.012511,1047.306152,1094.909011,1029.848724,1047.920166,1080.839335,1028.017928
2019-09-30 21:50:37,1428,1,1569891037,1,4,21,1055.0,950.0,932.0,1063.0,...,1135.812845,1037.720459,1120.008162,1068.219441,1027.292603,1089.611186,1042.043907,1057.315063,1102.53667,1065.58985
2019-09-30 21:45:34,986,1,1569890734,1,4,21,1428.0,1055.0,950.0,932.0,...,1169.312274,1097.488281,1286.584728,1144.466186,1093.381592,1218.850773,1108.82901,1089.99707,1181.511792,1100.874328
2019-09-30 21:32:06,1086,1,1569889926,1,4,21,986.0,1428.0,1055.0,950.0,...,1131.706525,1090.69397,1259.291068,1189.834515,1088.927979,1219.926858,1135.339426,1106.074097,1181.558921,1095.319638
2019-09-30 21:21:11,1287,1,1569889271,1,4,21,1086.0,986.0,1428.0,1055.0,...,1137.02623,1060.024414,1143.176976,1081.397921,1147.653198,1224.46919,1177.015707,1159.932739,1203.169402,1195.228876


# Testes Intervalos de Teste_Treino

In [293]:
#Separamos treino e teste
df_intervalos = df.copy()
df_intervalos["data_partida2"] = pd.to_datetime(df_intervalos.index)

df_teste.head()

Unnamed: 0_level_0,tempo_viagem,dia_semana,partidaTimeStamp,tipo_dia,turno_dia,hora_dia,tempo_viagem_1,tempo_viagem_2,tempo_viagem_3,tempo_viagem_4,...,predicao_rf_1,predicao_xgb_2,predicao_lr_2,predicao_rf_2,predicao_xgb_3,predicao_lr_3,predicao_rf_3,predicao_xgb_4,predicao_lr_4,predicao_rf_4
data_partida,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-09-30 22:06:32,1055,1,1569891992,1,4,22,950.0,932.0,1063.0,930.0,...,1081.459195,1033.455322,1083.7813,1034.012511,1047.306152,1094.909011,1029.848724,1047.920166,1080.839335,1028.017928
2019-09-30 21:50:37,1428,1,1569891037,1,4,21,1055.0,950.0,932.0,1063.0,...,1135.812845,1037.720459,1120.008162,1068.219441,1027.292603,1089.611186,1042.043907,1057.315063,1102.53667,1065.58985
2019-09-30 21:45:34,986,1,1569890734,1,4,21,1428.0,1055.0,950.0,932.0,...,1169.312274,1097.488281,1286.584728,1144.466186,1093.381592,1218.850773,1108.82901,1089.99707,1181.511792,1100.874328
2019-09-30 21:32:06,1086,1,1569889926,1,4,21,986.0,1428.0,1055.0,950.0,...,1131.706525,1090.69397,1259.291068,1189.834515,1088.927979,1219.926858,1135.339426,1106.074097,1181.558921,1095.319638
2019-09-30 21:21:11,1287,1,1569889271,1,4,21,1086.0,986.0,1428.0,1055.0,...,1137.02623,1060.024414,1143.176976,1081.397921,1147.653198,1224.46919,1177.015707,1159.932739,1203.169402,1195.228876


In [294]:
data_atual = df_intervalos["data_partida2"].dt.date.min() + timedelta(days=14)
dias_treino = 14
dias_teste = 79


metrics = []
while data_atual < (df_intervalos["data_partida2"].dt.date.max() - timedelta(days=14)):
        df_teste = df_intervalos[df_intervalos["data_partida2"].dt.date > data_atual]
        df_treino = df_intervalos[df_intervalos["data_partida2"].dt.date <= data_atual]

        #Separamos a classe dos atributos 
        x_teste = df_teste[['dia_semana','partidaTimeStamp','tipo_dia','hora_dia']]
        y_teste = df_teste[['tempo_viagem']]
        x_treino = df_treino[['dia_semana','partidaTimeStamp','tipo_dia','hora_dia']]
        y_treino = df_treino[['tempo_viagem']]

        #Criação e fit do modelo XGB
        start_xgb = datetime.now()
        modelo_xgb = xgb.XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
                colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
                early_stop_rounds=100, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, gamma=0.0, gpu_id=-1,
                grow_policy='depthwise', importance_type=None,
                interaction_constraints='', learning_rate=0.005, max_bin=256,
                max_cat_to_onehot=4, max_delta_step=0, max_depth=3, max_leaves=0,
                min_child_weight=5, monotone_constraints='()',
                n_estimators=900, n_jobs=0, num_parallel_tree=1, predictor='auto',
                random_state=0, reg_alpha=0)
        #modelo_xgb = xgb.XGBRegressor(early_stop_rounds = 100)
        #xgb_rand_search = RandomizedSearchCV(modelo_xgb, params_xgb, scoring="neg_mean_squared_error", n_iter=40, verbose=True, cv=10, n_jobs=-1, random_state=123)
        #xgb_rand_search.fit(x_treino, y_treino)
        #modelo_xgb = xgb_rand_search.best_estimator_
        modelo_xgb.fit(x_treino, y_treino, eval_set = [(x_treino, y_treino)],verbose=False)
        end_xgb = datetime.now()

        #Criação e fit do modelo Random Forest
        start_rf = datetime.now()
        modelo_rf = RandomForestRegressor(max_depth=10, max_features='log2', min_samples_leaf=4,
                        min_samples_split=7, n_estimators=900)
        #modelo_rf = RandomForestRegressor()
        #rf_rand_search = RandomizedSearchCV(modelo_rf, params_rf, scoring="neg_mean_squared_error", n_iter=40, verbose=True, cv=10, n_jobs=-1, random_state=123)
        #rf_rand_search.fit(x_treino, y_treino)
        #modelo_rf = rf_rand_search.best_estimator_
        modelo_rf.fit(x_treino, y_treino)
        end_rf = datetime.now()

        #Criação e fit do modelo Linear Regression
        start_lr = datetime.now()
        modelo_lr = LinearRegression()
        modelo_lr.fit(x_treino, y_treino)
        end_lr = datetime.now()


        tempo_xgb = end_xgb - start_xgb
        tempo_rf = end_rf - start_rf
        tempo_lr = end_lr - start_lr

        # Fizemos as predições e juntamos no DataFrame 
        df_teste[f"predicao_xgb {dias_treino} {dias_teste}"] = modelo_xgb.predict(x_teste)
        df_teste[f"predicao_lr {dias_treino} {dias_teste}"] = modelo_lr.predict(x_teste)
        df_teste[f"predicao_rf {dias_treino} {dias_teste}"] = modelo_rf.predict(x_teste)

        # Pesquisar os motivos de se usar apenas métricas relativas - Flávia disse que provavelmente por conta de outlier/valor distantes 
        RRSE_xgb = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste[f"predicao_xgb {dias_treino} {dias_teste}"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
        MAPE_xgb = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste[f"predicao_xgb {dias_treino} {dias_teste}"])

        RRSE_rf = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste[f"predicao_rf {dias_treino} {dias_teste}"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
        MAPE_rf = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste[f"predicao_rf {dias_treino} {dias_teste}"])

        RRSE_lr = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste[f"predicao_lr {dias_treino} {dias_teste}"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
        MAPE_lr = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste[f"predicao_lr {dias_treino} {dias_teste}"])

        metrics.append([f'Teste Intervalos {dias_treino} {dias_teste}','RRSE', RRSE_xgb, RRSE_rf, RRSE_lr])
        metrics.append([f'Teste Intervalos {dias_treino} {dias_teste}','MAPE', MAPE_xgb, MAPE_rf, MAPE_lr]) 
        metrics.append([f'Teste Intervalos {dias_treino} {dias_teste}','Tempo', tempo_xgb, tempo_rf, tempo_lr])

        df_intervalos = df_intervalos.merge(df_teste[[f'predicao_xgb {dias_treino} {dias_teste}',f'predicao_rf {dias_treino} {dias_teste}',f'predicao_lr {dias_treino} {dias_teste}']], how="left", left_index=True, right_index=True)
        

        dias_teste -= 14
        dias_treino += 14

        data_atual += timedelta(days=14)



Parameters: { "early_stop_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  modelo_rf.fit(x_treino, y_treino)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste[f"predicao_xgb {dias_treino} {dias_teste}"] = modelo_xgb.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste[f"predicao_lr {dias_treino} {dias_teste}"] = modelo_lr.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

Parameters: { "early_stop_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  modelo_rf.fit(x_treino, y_treino)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste[f"predicao_xgb {dias_treino} {dias_teste}"] = modelo_xgb.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste[f"predicao_lr {dias_treino} {dias_teste}"] = modelo_lr.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

Parameters: { "early_stop_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  modelo_rf.fit(x_treino, y_treino)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste[f"predicao_xgb {dias_treino} {dias_teste}"] = modelo_xgb.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste[f"predicao_lr {dias_treino} {dias_teste}"] = modelo_lr.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

Parameters: { "early_stop_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  modelo_rf.fit(x_treino, y_treino)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste[f"predicao_xgb {dias_treino} {dias_teste}"] = modelo_xgb.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste[f"predicao_lr {dias_treino} {dias_teste}"] = modelo_lr.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

Parameters: { "early_stop_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  modelo_rf.fit(x_treino, y_treino)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste[f"predicao_xgb {dias_treino} {dias_teste}"] = modelo_xgb.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste[f"predicao_lr {dias_treino} {dias_teste}"] = modelo_lr.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

In [295]:
metrics = pd.DataFrame(metrics, columns=['Descrição','Métrica', 'XGBoosting','Random Forest','Linear Regression'])
metrics.to_csv('metrica_teste_Intervalos.csv',index=False)
metrics

Unnamed: 0,Descrição,Métrica,XGBoosting,Random Forest,Linear Regression
0,Teste Intervalos 14 79,RRSE,0.716661,0.773616,1.272785
1,Teste Intervalos 14 79,MAPE,0.105299,0.110718,0.197197
2,Teste Intervalos 14 79,Tempo,0:00:00.544000,0:00:01.364783,0:00:00.002000
3,Teste Intervalos 28 65,RRSE,0.715926,0.711839,1.015333
4,Teste Intervalos 28 65,MAPE,0.104665,0.104905,0.15685
5,Teste Intervalos 28 65,Tempo,0:00:00.783999,0:00:01.627002,0:00:00.002000
6,Teste Intervalos 42 51,RRSE,0.687216,0.720278,0.953827
7,Teste Intervalos 42 51,MAPE,0.107236,0.111892,0.160238
8,Teste Intervalos 42 51,Tempo,0:00:01.029970,0:00:02.088001,0:00:00.003001
9,Teste Intervalos 56 37,RRSE,0.700957,0.715523,0.940524


In [296]:
df_teste.to_csv('previsão_teste_intervalos.csv',index=False)
df_teste.head()

Unnamed: 0_level_0,tempo_viagem,dia_semana,partidaTimeStamp,tipo_dia,turno_dia,hora_dia,data_partida2,predicao_xgb 14 79,predicao_rf 14 79,predicao_lr 14 79,...,predicao_lr 28 65,predicao_xgb 42 51,predicao_rf 42 51,predicao_lr 42 51,predicao_xgb 56 37,predicao_rf 56 37,predicao_lr 56 37,predicao_xgb 70 23,predicao_lr 70 23,predicao_rf 70 23
data_partida,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-09-10 00:05:40,722,2,1568084740,1,1,0,2019-09-10 00:05:40,861.536194,884.392796,695.343552,...,856.565389,843.985168,862.615252,1004.378035,842.732544,868.819929,981.478063,1069.533813,985.156797,955.188919
2019-09-10 00:16:03,839,2,1568085363,1,1,0,2019-09-10 00:16:03,861.536194,884.392796,695.316902,...,856.555564,843.985168,862.615252,1004.38602,842.732544,868.819929,981.481796,1069.533813,985.160006,955.188919
2019-09-10 00:40:23,746,2,1568086823,1,1,0,2019-09-10 00:40:23,861.536194,884.392796,695.254446,...,856.532541,843.985168,862.615252,1004.404732,842.732544,868.819929,981.490544,1069.533813,985.167526,955.188919
2019-09-10 04:24:52,783,2,1568100292,1,1,4,2019-09-10 04:24:52,810.25769,831.959242,767.16392,...,925.891714,821.052185,856.146944,1075.280632,816.322876,818.587946,1049.680381,1042.127563,1050.572068,915.822976
2019-09-10 05:00:16,780,2,1568102416,1,2,5,2019-09-10 05:00:16,819.322693,857.361716,785.194472,...,943.251112,866.643738,940.057369,1092.983673,866.805298,938.985607,1066.72039,1092.401489,1066.9168,968.626159


# Merges e Concats

In [302]:
m1 = pd.read_csv('metrica_primeira_execução.csv')
m2 = pd.read_csv('metrica_teste_clime.csv')
m3 = pd.read_csv('metrica_teste_tempos.csv')
m4 = pd.read_csv('metrica_teste_Intervalos.csv')
metricas = pd.concat([m1,m2,m3,m4])
metricas.to_csv('metricas.csv',index=False)

In [304]:
d1 = pd.read_csv('previsão_primeira_execução.csv')
d2 = pd.read_csv('previsão_teste_clima.csv')
d3 = pd.read_csv('previsão_teste_tempos.csv')
d4 = pd.read_csv('previsão_teste_intervalos.csv')

In [307]:
d1 = d1.merge(d2[['Descricao Chuva', 'Calor','predicao_xgb_clima', 'predicao_lr_clima', 'predicao_rf_clima']], how="left", left_index=True, right_index=True)

In [310]:
d1 = d1.merge(d3[['tempo_viagem_1', 'tempo_viagem_2',
       'tempo_viagem_3', 'tempo_viagem_4', 'predicao_xgb_1', 'predicao_lr_1',
       'predicao_rf_1', 'predicao_xgb_2', 'predicao_lr_2', 'predicao_rf_2',
       'predicao_xgb_3', 'predicao_lr_3', 'predicao_rf_3', 'predicao_xgb_4',
       'predicao_lr_4', 'predicao_rf_4']], how="left", left_index=True, right_index=True)

In [312]:
d1 = d1.merge(d4[['predicao_xgb 14 79',
       'predicao_rf 14 79', 'predicao_lr 14 79', 'predicao_xgb 28 65',
       'predicao_rf 28 65', 'predicao_lr 28 65', 'predicao_xgb 42 51',
       'predicao_rf 42 51', 'predicao_lr 42 51', 'predicao_xgb 56 37',
       'predicao_rf 56 37', 'predicao_lr 56 37', 'predicao_xgb 70 23',
       'predicao_lr 70 23', 'predicao_rf 70 23']], how="left", left_index=True, right_index=True)

In [315]:
d1.to_csv('previsoes.csv',index=False)