In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
from datetime import timedelta,datetime
import dataframe_image as dfi
import matplotlib.pyplot as plt
import sklearn.metrics as mtr
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import pickle


# Primeira Execução

In [2]:
df = pd.read_csv('rota33642.csv')
df.head()

Unnamed: 0,data_partida,data_chegada,linha,tempo_viagem,equipamento,dia_semana,partidaTimeStamp,qtdDiasAno,direcao,tipo_dia,turno_dia,chegadaTimeStamp,hora,hora_dia
0,2019-07-02 05:35:51,2019-07-02 05:47:44,33642,713,1083,2,1562056551,183,2,1,2,1562057264,,5
1,2019-07-02 06:11:07,2019-07-02 06:25:15,33642,848,1083,2,1562058667,183,2,1,2,1562059515,,6
2,2019-07-02 06:53:26,2019-07-02 07:08:02,33642,876,1083,2,1562061206,183,2,1,2,1562062082,,6
3,2019-07-02 07:36:44,2019-07-02 07:55:03,33642,1099,1083,2,1562063804,183,2,1,2,1562064903,,7
4,2019-07-02 08:16:37,2019-07-02 08:30:55,33642,858,1083,2,1562066197,183,2,1,2,1562067055,,8


In [3]:
# Dropamos:
# A data_chegada e a chegadaTimeStamp pois com esses valores daria para prever o tempo de viagem(Y)
# linha e equipamento por se tratar apenas de IDs
# hora por conter apenas valores nulos
# direcao por conter apenas o valor 2

df = df.drop(['data_chegada','linha','equipamento','direcao','chegadaTimeStamp','hora'], axis=1)

In [4]:
corr = df.corr()
corr

  corr = df.corr()


Unnamed: 0,tempo_viagem,dia_semana,partidaTimeStamp,qtdDiasAno,tipo_dia,turno_dia,hora_dia
tempo_viagem,1.0,-0.091237,0.036642,0.033283,-0.041862,0.360651,0.375852
dia_semana,-0.091237,1.0,0.018559,0.018371,0.044735,0.018066,0.020579
partidaTimeStamp,0.036642,0.018559,1.0,0.99996,-0.012245,0.006308,0.005832
qtdDiasAno,0.033283,0.018371,0.99996,1.0,-0.012287,-0.002152,-0.003087
tipo_dia,-0.041862,0.044735,-0.012245,-0.012287,1.0,0.001882,0.004642
turno_dia,0.360651,0.018066,0.006308,-0.002152,0.001882,1.0,0.948907
hora_dia,0.375852,0.020579,0.005832,-0.003087,0.004642,0.948907,1.0


In [5]:
df[['hora_dia','turno_dia']].corr()

Unnamed: 0,hora_dia,turno_dia
hora_dia,1.0,0.948907
turno_dia,0.948907,1.0


In [6]:
df[['partidaTimeStamp','qtdDiasAno']].corr()

Unnamed: 0,partidaTimeStamp,qtdDiasAno
partidaTimeStamp,1.0,0.99996
qtdDiasAno,0.99996,1.0


In [7]:
#Vamos tirar hora_dia pois tem uma alta correlação com turno dia, escolhemos essa coluna pois os valores são apenas de 1 a 4 enquanto hora é de 0 a 23
df = df.drop(['hora_dia','qtdDiasAno'], axis=1)

In [17]:
#Ordenando pela Data
df = df.sort_values(by=["data_partida"], ascending=False)

In [9]:
#Transformamos data_partida em index
df.index = df['data_partida']
df = df.drop(['data_partida'], axis=1)

In [10]:
#Separamos treino e teste
shape = int(df.shape[0] * 0.3)
df_teste = df[:shape]
df_treino = df[shape:]
print(f'Treino: {df_treino.shape}      Teste:{df_teste.shape}')

Treino: (6520, 5)      Teste:(2794, 5)


In [11]:
# Separamos a classe dos atributos 
x_teste = df_teste[['dia_semana','partidaTimeStamp','tipo_dia','turno_dia']]
y_teste = df_teste[['tempo_viagem']]
x_treino = df_teste[['dia_semana','partidaTimeStamp','tipo_dia','turno_dia']]
y_treino = df_teste[['tempo_viagem']]

In [12]:
# Treinamos os modelos e salvamos o tempo de execução
start_xgb = datetime.now()

#Criação e fit do modelo XGB
start_xgb = datetime.now()
modelo_xgb = xgb.XGBRegressor(early_stop_rounds = 100)
modelo_xgb.fit(x_treino, y_treino, eval_set = [(x_treino, y_treino)])
end_xgb = datetime.now()

#Criação e fit do modelo Random Forest
start_rf = datetime.now()
modelo_rf = RandomForestRegressor(n_estimators = 200)
modelo_rf.fit(x_treino, y_treino)
end_rf = datetime.now()

#Criação e fit do modelo Linear Regression
start_lr = datetime.now()
modelo_lr = LinearRegression()
modelo_lr.fit(x_treino, y_treino)
end_lr = datetime.now()

tempo_xgb = end_xgb - start_xgb
tempo_rf = end_rf - start_rf
tempo_lr = end_lr - start_lr

Parameters: { "early_stop_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-rmse:853.29634
[1]	validation_0-rmse:614.60942
[2]	validation_0-rmse:453.27272
[3]	validation_0-rmse:346.27430
[4]	validation_0-rmse:277.33126
[5]	validation_0-rmse:234.19713
[6]	validation_0-rmse:209.22531
[7]	validation_0-rmse:193.94198
[8]	validation_0-rmse:184.53124
[9]	validation_0-rmse:179.52859
[10]	validation_0-rmse:175.35359
[11]	validation_0-rmse:169.13970
[12]	validation_0-rmse:168.14845
[13]	validation_0-rmse:167.05183
[14]	validation_0-rmse:164.41886
[15]	validation_0-rmse:161.38067
[16]	validation_0-rmse:159.33325
[17]	validation_0-rmse:158.29349
[18]	validation_0-rmse:157.89885
[19]	validation_0-rmse:154.34953
[20]	validation_0-rmse:153

  modelo_rf.fit(x_treino, y_treino)


In [13]:
# Fizemos as predições e juntamos no DataFrame
df_teste["predicao_xgb"] = modelo_xgb.predict(x_teste)
df_teste["predicao_lr"] = modelo_lr.predict(x_teste)
df_teste["predicao_rf"] = modelo_rf.predict(x_teste)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste["predicao_xgb"] = modelo_xgb.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste["predicao_lr"] = modelo_lr.predict(x_teste)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_teste["predicao_rf"] = modelo_rf.predict(x_teste)


In [14]:
# Pesquisar os motivos de se usar apenas métricas relativas - Flávia disse que provavelmente por conta de outlier/valor distantes 
RRSE_xgb = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste["predicao_xgb"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
MAPE_xgb = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste["predicao_xgb"])

RRSE_rf = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste["predicao_rf"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
MAPE_rf = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste["predicao_rf"])

RRSE_lr = np.sqrt(sum((df_teste["tempo_viagem"] - df_teste["predicao_lr"]) ** 2) / sum((df_teste["tempo_viagem"] - np.mean(df_teste["tempo_viagem"])) ** 2))
MAPE_lr = mtr.mean_absolute_percentage_error(df_teste["tempo_viagem"], df_teste["predicao_lr"])

In [15]:
metrics = []
metrics.append(['Primeira execução','RRSE', RRSE_xgb, RRSE_rf, RRSE_lr])
metrics.append(['Primeira execução','MAPE', MAPE_xgb, MAPE_rf, MAPE_lr]) 
metrics.append(['Primeira execução','Tempo', tempo_xgb, tempo_rf, tempo_lr]) 
metrics = pd.DataFrame(metrics, columns=['Descrição','Métrica', 'XGBoosting','Random Forest','Linear Regression'])
metrics.to_csv('metrica_primeira_execução.csv',index=False)

In [16]:
df_teste.to_csv('previsão_primeira_execução.csv',index=False)

# Teste com Clima

In [47]:
#Lendo dados de clima e convertendo a data pra datetime (na leitura veio como objeto)
clima = pd.read_csv('clima.csv')
clima['Datetime'] = pd.to_datetime(clima['Datetime'])

In [None]:
#copiando o DAtaframe original para não alterar o primeiro
aux = df.copy()
#Mudanças na data_partida para conseguir fazer o merge
aux["data_partida2"] = pd.to_datetime(aux.index)
aux["data_partida2"] = aux["data_partida2"].dt.round("H")
#Merge dos dados e exclusão da coluna auxiliar
df_clima = aux.merge(clima, left_on="data_partida2", right_on="Datetime", how="left")
df_clima.drop(['data_partida2','Datetime'],axis=1,inplace=True)
df_clima.index = aux.index
df_clima.head()

In [48]:
df_clima

Unnamed: 0_level_0,tempo_viagem,dia_semana,partidaTimeStamp,tipo_dia,turno_dia,Descricao Chuva,Calor
data_partida,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-10-01 00:40:28,680,2,1569901228,1,1,0,1
2019-10-01 00:16:27,849,2,1569899787,1,1,0,1
2019-10-01 00:00:45,852,2,1569898845,1,1,0,1
2019-09-30 23:46:49,824,1,1569898009,1,4,0,1
2019-09-30 23:30:14,993,1,1569897014,1,4,0,1
...,...,...,...,...,...,...,...
2019-07-01 05:16:42,787,1,1561969002,1,2,0,1
2019-07-01 05:07:10,781,1,1561968430,1,2,0,1
2019-07-01 04:58:33,781,1,1561967913,1,1,0,1
2019-07-01 04:49:57,801,1,1561967397,1,1,0,1


Unnamed: 0_level_0,tempo_viagem,dia_semana,partidaTimeStamp,tipo_dia,turno_dia,Descricao Chuva,Calor
data_partida,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-10-01 00:40:28,680,2,1569901228,1,1,0,1
2019-10-01 00:16:27,849,2,1569899787,1,1,0,1
2019-10-01 00:00:45,852,2,1569898845,1,1,0,1
2019-09-30 23:46:49,824,1,1569898009,1,4,0,1
2019-09-30 23:30:14,993,1,1569897014,1,4,0,1
...,...,...,...,...,...,...,...
2019-07-01 05:16:42,787,1,1561969002,1,2,0,1
2019-07-01 05:07:10,781,1,1561968430,1,2,0,1
2019-07-01 04:58:33,781,1,1561967913,1,1,0,1
2019-07-01 04:49:57,801,1,1561967397,1,1,0,1
