In [448]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from datetime import time
import matplotlib.pyplot as pplot
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
import math

In [2]:
# Descrição das features:
# time           datetime  Data e Hora em que o veículo passa pelo pedágio;
# tollgate_id    string    Identificador do pedágio;
# direction      string    0: entra na rodovia pelo pedágio; 1: sai da rodovia pelo pedágio;
# vehicle_model  int       Um número que indica a capacidade do veículo;
# has_etc        string    Indica se o veículo possui ou não o sistema ETC; 0 - NÃO, 1 - SIM
# vehicle_type   string    0: veículo de passageiro; 1: veículo de carga
# weekday        int       Representa os dias da semana
# weekend        int       1: Para quando for fim de semana; 0: Para quando não for fim de semana

In [476]:
pd_volume_train = pd.read_csv('processed_train_volume2.csv')
#pd_volume_test = pd.read_csv('processed_test_volume2.csv')

In [477]:
pd_volume_train.head()
#pd_volume_test.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type,weekday,weekend,hour,date,holiday,time_window
0,2016-09-19 23:09:25,2,0,1,0,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"
1,2016-09-19 23:11:53,2,0,1,0,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"
2,2016-09-19 23:13:54,2,0,1,0,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"
3,2016-09-19 23:17:48,1,0,1,1,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"
4,2016-09-19 23:16:07,2,0,1,0,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"


In [478]:
pd_volume_train['time'] =  pd.to_datetime(pd_volume_train['time'] , format='%Y-%m-%d %H:%M:%S')
#pd_volume_train = pd_volume_train.set_index(['time_window'])

# 车流量
pd_volume_train = pd_volume_train.groupby([pd.Grouper(freq='20T', key='time'), 'tollgate_id', 'direction', 'time_window', 'date', 'hour']).size()\
       .reset_index().rename(columns = {0:'volume'})

In [479]:
pd_volume_train.head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume
0,2016-09-19,1,0,"[00:00:00,00:20:00)",2016-09-19,0,13
1,2016-09-19,1,1,"[00:00:00,00:20:00)",2016-09-19,0,140
2,2016-09-19,2,0,"[00:00:00,00:20:00)",2016-09-19,0,2
3,2016-09-19,3,0,"[00:00:00,00:20:00)",2016-09-19,0,17
4,2016-09-19,3,1,"[00:00:00,00:20:00)",2016-09-19,0,181


In [480]:
pd_volume_train['weekday'] = pd_volume_train['time'].dt.dayofweek + 1

In [481]:
pd_volume_train[pd_volume_train['weekday'] == 3].head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday
694,2016-09-21,1,0,"[00:00:00,00:20:00)",2016-09-21,0,12,3
695,2016-09-21,1,1,"[00:00:00,00:20:00)",2016-09-21,0,97,3
696,2016-09-21,2,0,"[00:00:00,00:20:00)",2016-09-21,0,2,3
697,2016-09-21,3,0,"[00:00:00,00:20:00)",2016-09-21,0,19,3
698,2016-09-21,3,1,"[00:00:00,00:20:00)",2016-09-21,0,124,3


In [482]:
pd.get_dummies(pd_volume_train['time_window']).head()

Unnamed: 0,"[00:00:00,00:20:00)","[00:20:00,00:40:00)","[00:40:00,01:00:00)","[01:00:00,01:20:00)","[01:20:00,01:40:00)","[01:40:00,02:00:00)","[02:00:00,02:20:00)","[02:20:00,02:40:00)","[02:40:00,03:00:00)","[03:00:00,03:20:00)",...,"[20:40:00,21:00:00)","[21:00:00,21:20:00)","[21:20:00,21:40:00)","[21:40:00,22:00:00)","[22:00:00,22:20:00)","[22:20:00,22:40:00)","[22:40:00,23:00:00)","[23:00:00,23:20:00)","[23:20:00,23:40:00)","[23:40:00,00:00:00)"
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
# Converte array em matriz
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return numpy.array(dataX), numpy.array(dataY)

In [483]:
# Transpõe
#df_transp = df.T

# Seta o nome das colunas para os valores da primeira linha
#df_transp.columns = df_transp.iloc[0]

# Define os dados começando a partir da segunda linha
#df_transp = df_transp[1:]

#df_transp.loc[:,'media'] = df_transp.mean(numeric_only=True, axis=0).values

#df_transf = pd_volume_train.set_index(['time'])

# 车流量
#t6_train = t6_train.groupby([pd.TimeGrouper('20Min'), 'tollgate_id', 'direction']).size()\
#       .reset_index().rename(columns = {0:'volume'})

#t6_train = t6_train.set_index(['time'])

# 车流量
#t6_train = t6_train.groupby([pd.TimeGrouper('20Min'), 'tollgate_id', 'direction']).size()\
#       .reset_index().rename(columns = {0:'volume'})

#df_transf = pd_volume_train.groupby(['time_window','weekday','tollgate_id', 'direction', 'date']).size()\
 #      .reset_index().rename(columns = {0:'volume'})

df_transf = pd_volume_train
df_transf.head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday
0,2016-09-19,1,0,"[00:00:00,00:20:00)",2016-09-19,0,13,1
1,2016-09-19,1,1,"[00:00:00,00:20:00)",2016-09-19,0,140,1
2,2016-09-19,2,0,"[00:00:00,00:20:00)",2016-09-19,0,2,1
3,2016-09-19,3,0,"[00:00:00,00:20:00)",2016-09-19,0,17,1
4,2016-09-19,3,1,"[00:00:00,00:20:00)",2016-09-19,0,181,1


In [484]:
df_transf['media_volume'] = df_transf.groupby(['time_window', 'weekday', 'direction', 'tollgate_id'])["volume"].transform(np.mean)

In [485]:
#df_transf['media_volume'] = df_transf['soma'].mean
df_transf.head()
#del df_transf['soma']

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday,media_volume
0,2016-09-19,1,0,"[00:00:00,00:20:00)",2016-09-19,0,13,1,14.0
1,2016-09-19,1,1,"[00:00:00,00:20:00)",2016-09-19,0,140,1,102.0
2,2016-09-19,2,0,"[00:00:00,00:20:00)",2016-09-19,0,2,1,2.333333
3,2016-09-19,3,0,"[00:00:00,00:20:00)",2016-09-19,0,17,1,17.6
4,2016-09-19,3,1,"[00:00:00,00:20:00)",2016-09-19,0,181,1,127.6


In [379]:
values = np.array(df_transf['volume'])

In [382]:
values

array([ 13,  37,  24,  30,  23,  17,  47,  36,  42,  29,  21,  72,  46,
        85,  50,  31,  68,  83, 114,  56,  28,  94,  87, 131,  77,  47,
       105,  97, 164,  91,  52,  72,  74, 116,  81,  38, 102,  78,  87,
        91,  35, 116,  53, 106,  83,  57, 102,  62, 109,  84,  45,  84,
        63, 117,  92,  53, 113,  83, 118,  83,  12,  48,  26,  24,  21,
        16,  28,  45,  57,  26,  17,  57,  47,  75,  43,  22,  67,  87,
       119,  53,  38,  98,  95, 137,  63,  41, 103,  99, 152, 104,  40,
        83,  70, 107,  99,  34,  99,  69,  96, 108,  50,  92,  58,  95,
        79,  39,  82,  55, 109,  82,  40,  80,  61, 105,  89,  36,  85,
        64,  95,  75,   8,  46,  21,  37,  16,  16,  27,  23,  44,  49,
        23,  55,  42,  73,  37,  22,  71,  76, 114,  47,  33,  88,  89,
       123,  60,  37, 111, 112, 120, 104,  40,  97,  72,  92, 100,  41,
       115,  79, 107, 105,  42, 120,  70, 108,  74,  46,  90,  66, 121,
        92,  36,  73,  67, 106,  84,  37, 119,  75, 147,  88,   

In [486]:
df_transf['desvio_padrao'] = df_transf.groupby(['time_window', 'weekday', 'direction', 'tollgate_id'])["volume"].transform(np.std)

In [487]:
df_transf.head()
#del df_transf['desvio_padrao']

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday,media_volume,desvio_padrao
0,2016-09-19,1,0,"[00:00:00,00:20:00)",2016-09-19,0,13,1,14.0,4.795832
1,2016-09-19,1,1,"[00:00:00,00:20:00)",2016-09-19,0,140,1,102.0,45.912961
2,2016-09-19,2,0,"[00:00:00,00:20:00)",2016-09-19,0,2,1,2.333333,0.57735
3,2016-09-19,3,0,"[00:00:00,00:20:00)",2016-09-19,0,17,1,17.6,2.607681
4,2016-09-19,3,1,"[00:00:00,00:20:00)",2016-09-19,0,181,1,127.6,58.734998


In [489]:
#df_transf['mediaArredondada'] = df_transf['media_volume'].sum()
#df_transf['mediaArrendodada'] = df_transf['media_volume'].mean()
df_transf['desvio_padrao'].fillna(df_transf.groupby(['time_window', 'weekday', 'direction', 'tollgate_id'])["volume"].transform(np.mean), inplace=True)
#del df_transf['mediaArredondada']

In [490]:
df_transf.isnull().sum()

time             0
tollgate_id      0
direction        0
time_window      0
date             0
hour             0
volume           0
weekday          0
media_volume     0
desvio_padrao    0
dtype: int64

In [491]:
df_transf.to_csv('dados_com_media_desvio_treino_volume.csv', index = False)

In [446]:
#df_transf.isnull().sum()
medi

time              0
tollgate_id       0
direction         0
time_window       0
date              0
hour              0
volume            0
weekday           0
media_volume      0
desvio_padrao    27
dtype: int64

In [407]:
pd_volume_test = pd.read_csv('processed_test_volume2.csv')

In [408]:
pd_volume_test.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type,weekday,weekend,hour,date,holiday,time_window
0,2016-10-18 07:59:04,2,0,1,1,0,2,0,7,2016-10-18,False,"[07:40:00,08:00:00)"
1,2016-10-18 07:59:31,2,0,1,1,0,2,0,7,2016-10-18,False,"[07:40:00,08:00:00)"
2,2016-10-18 07:59:50,2,0,1,1,0,2,0,7,2016-10-18,False,"[07:40:00,08:00:00)"
3,2016-10-18 07:32:33,3,0,1,1,0,2,0,7,2016-10-18,False,"[07:20:00,07:40:00)"
4,2016-10-18 07:32:46,3,0,1,1,0,2,0,7,2016-10-18,False,"[07:20:00,07:40:00)"


In [409]:
pd_volume_test['time'] =  pd.to_datetime(pd_volume_test['time'] , format='%Y-%m-%d %H:%M:%S')
#pd_volume_train = pd_volume_train.set_index(['time_window'])

# 车流量
pd_volume_test = pd_volume_test.groupby([pd.Grouper(freq='20T', key='time'), 'tollgate_id', 'direction', 'time_window', 'date', 'hour']).size()\
       .reset_index().rename(columns = {0:'volume'})

In [410]:
pd_volume_test.head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume
0,2016-10-18 06:00:00,1,0,"[06:00:00,06:20:00)",2016-10-18,6,13
1,2016-10-18 06:00:00,1,1,"[06:00:00,06:20:00)",2016-10-18,6,37
2,2016-10-18 06:00:00,2,0,"[06:00:00,06:20:00)",2016-10-18,6,24
3,2016-10-18 06:00:00,3,0,"[06:00:00,06:20:00)",2016-10-18,6,30
4,2016-10-18 06:00:00,3,1,"[06:00:00,06:20:00)",2016-10-18,6,23


In [412]:
pd_volume_test['weekday'] = pd_volume_test['time'].dt.dayofweek + 1

In [414]:
pd_volume_test[pd_volume_test['weekday'] == 3].head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday
60,2016-10-19 06:00:00,1,0,"[06:00:00,06:20:00)",2016-10-19,6,12,3
61,2016-10-19 06:00:00,1,1,"[06:00:00,06:20:00)",2016-10-19,6,48,3
62,2016-10-19 06:00:00,2,0,"[06:00:00,06:20:00)",2016-10-19,6,26,3
63,2016-10-19 06:00:00,3,0,"[06:00:00,06:20:00)",2016-10-19,6,24,3
64,2016-10-19 06:00:00,3,1,"[06:00:00,06:20:00)",2016-10-19,6,21,3


In [423]:
pd_volume_test['media_volume'] = pd_volume_test.groupby(['time_window', 'direction', 'tollgate_id'])["volume"].transform(np.mean)

In [424]:
pd_volume_test.head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday,media_volume
0,2016-10-18 06:00:00,1,0,"[06:00:00,06:20:00)",2016-10-18,6,13,2,11.285714
1,2016-10-18 06:00:00,1,1,"[06:00:00,06:20:00)",2016-10-18,6,37,2,36.0
2,2016-10-18 06:00:00,2,0,"[06:00:00,06:20:00)",2016-10-18,6,24,2,20.571429
3,2016-10-18 06:00:00,3,0,"[06:00:00,06:20:00)",2016-10-18,6,30,2,28.428571
4,2016-10-18 06:00:00,3,1,"[06:00:00,06:20:00)",2016-10-18,6,23,2,18.857143


In [425]:
pd_volume_test['desvio_padrao'] = pd_volume_test.groupby(['time_window', 'direction', 'tollgate_id'])["volume"].transform(np.std)

In [444]:
pd_volume_test.isnull().sum()

time             0
tollgate_id      0
direction        0
time_window      0
date             0
hour             0
volume           0
weekday          0
media_volume     0
desvio_padrao    0
dtype: int64

In [427]:
pd_volume_test.to_csv('dados_com_media_desvio_sem_weekday_treino_volume.csv', index = False)

In [492]:
def feature_format():
    v_train = pd.read_csv('dados_com_media_desvio_treino_volume.csv')
    v_test = pd.read_csv('dados_com_media_desvio_sem_weekday_treino_volume.csv')
    #pd_volume_train = pd_volume_train.set_index(['time'])
    #pd_volume_test = pd_volume_test.set_index(['time'])
    #volume_train = v_train.groupby(['time_window','tollgate_id','direction','date', 'hour']).size().reset_index().rename(columns = {0:'volume'})
    #volume_test = v_test.groupby(['time_window','tollgate_id','direction','date', 'hour']).size().reset_index().rename(columns = {0:'volume'})
    #print(volume_train)                
    x = pd.Series(v_train['time_window'].unique())
    s = pd.Series(range(len(x)),index = x.values)
    v_train['window_n'] = v_train['time_window'].map(s)
    v_test['window_n'] = v_test['time_window'].map(s)
#        print vol_test.tail()
    #volume_train['weekday'] = v_train['weekday']
    #volume_test['weekday'] = v_test['weekday']
    
    feature_train = v_train.drop('volume', axis = 1)
    feature_test = v_test.drop('volume',axis = 1)
    values_train = v_train['volume'].values
    values_test = v_test['volume'].values
    
    return feature_train, feature_test, values_train, values_test

In [493]:
feature_train, feature_test, values_train, values_test = feature_format()

In [494]:
feature_train.isnull().sum()
#pd_volume_train[pd_volume_train['weekday'] == 3].head()

time             0
tollgate_id      0
direction        0
time_window      0
date             0
hour             0
weekday          0
media_volume     0
desvio_padrao    0
window_n         0
dtype: int64

In [495]:
values_train

array([ 13, 140,   2, ...,  18,  15,  22])

In [496]:
rng = np.random.RandomState(1)
regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth = 15),
                         n_estimators=20, random_state = rng)

In [501]:
regr.fit(feature_train[['window_n', 'tollgate_id', 'direction', 'weekday', 'media_volume', 'desvio_padrao']], values_train)

y_pred = regr.predict(feature_test[['window_n', 'tollgate_id', 'direction', 'weekday', 'media_volume', 'desvio_padrao']])

mape = np.mean(np.abs((y_pred - values_test)/values_test))

#print (feature_test)
print (mape)


0.20885738574397447


In [500]:
#Função que calcula o MAPE
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [335]:
mean_absolute_percentage_error(y_pred, values_test)

13.20529099656337