In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from datetime import time
import matplotlib.pyplot as pplot
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
import math

  from collections import Sequence
  from numpy.core.umath_tests import inner1d


In [2]:
# Descrição das features:
# time           datetime  Data e Hora em que o veículo passa pelo pedágio;
# tollgate_id    string    Identificador do pedágio;
# direction      string    0: entra na rodovia pelo pedágio; 1: sai da rodovia pelo pedágio;
# vehicle_model  int       Um número que indica a capacidade do veículo;
# has_etc        string    Indica se o veículo possui ou não o sistema ETC; 0 - NÃO, 1 - SIM
# vehicle_type   string    0: veículo de passageiro; 1: veículo de carga
# weekday        int       Representa os dias da semana
# weekend        int       1: Para quando for fim de semana; 0: Para quando não for fim de semana

In [2]:
pd_volume_train = pd.read_csv('processed_train_volume2.csv')
#pd_volume_test = pd.read_csv('processed_test_volume2.csv')

In [3]:
pd_volume_train.head()
#pd_volume_test.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type,weekday,weekend,hour,date,holiday,time_window
0,2016-09-19 23:09:25,2,0,1,0,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"
1,2016-09-19 23:11:53,2,0,1,0,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"
2,2016-09-19 23:13:54,2,0,1,0,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"
3,2016-09-19 23:17:48,1,0,1,1,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"
4,2016-09-19 23:16:07,2,0,1,0,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"


In [4]:
pd_volume_train['time'] =  pd.to_datetime(pd_volume_train['time'] , format='%Y-%m-%d %H:%M:%S')
#pd_volume_train = pd_volume_train.set_index(['time_window'])

# 车流量
pd_volume_train = pd_volume_train.groupby([pd.Grouper(freq='20T', key='time'), 'tollgate_id', 'direction', 'time_window', 'date', 'hour']).size()\
       .reset_index().rename(columns = {0:'volume'})

In [5]:
pd_volume_train.head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume
0,2016-09-19,1,0,"[00:00:00,00:20:00)",2016-09-19,0,13
1,2016-09-19,1,1,"[00:00:00,00:20:00)",2016-09-19,0,140
2,2016-09-19,2,0,"[00:00:00,00:20:00)",2016-09-19,0,2
3,2016-09-19,3,0,"[00:00:00,00:20:00)",2016-09-19,0,17
4,2016-09-19,3,1,"[00:00:00,00:20:00)",2016-09-19,0,181


In [6]:
pd_volume_train['weekday'] = pd_volume_train['time'].dt.dayofweek + 1

In [7]:
pd_volume_train[pd_volume_train['weekday'] == 3]

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday
694,2016-09-21 00:00:00,1,0,"[00:00:00,00:20:00)",2016-09-21,0,12,3
695,2016-09-21 00:00:00,1,1,"[00:00:00,00:20:00)",2016-09-21,0,97,3
696,2016-09-21 00:00:00,2,0,"[00:00:00,00:20:00)",2016-09-21,0,2,3
697,2016-09-21 00:00:00,3,0,"[00:00:00,00:20:00)",2016-09-21,0,19,3
698,2016-09-21 00:00:00,3,1,"[00:00:00,00:20:00)",2016-09-21,0,124,3
699,2016-09-21 00:20:00,1,0,"[00:20:00,00:40:00)",2016-09-21,0,6,3
700,2016-09-21 00:20:00,1,1,"[00:20:00,00:40:00)",2016-09-21,0,64,3
701,2016-09-21 00:20:00,2,0,"[00:20:00,00:40:00)",2016-09-21,0,2,3
702,2016-09-21 00:20:00,3,0,"[00:20:00,00:40:00)",2016-09-21,0,17,3
703,2016-09-21 00:20:00,3,1,"[00:20:00,00:40:00)",2016-09-21,0,116,3


In [8]:
#Adicionando valor da janela de tempo anterior na janela de tempo atual
pd_volume_train["volume_anterior"] = pd_volume_train.groupby(['direction', 'tollgate_id'])["volume"].transform("shift")
#condition = ~(df.groupby("time_window")['time'].transform("shift") == df['time'])
#df.loc[ condition,"volume_anterior" ] = None
pd_volume_train["volume_anterior"] =pd_volume_train.groupby(['direction', 'tollgate_id'])["volume_anterior"].fillna(method="ffill").fillna(0)
pd_volume_train

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday,volume_anterior
0,2016-09-19 00:00:00,1,0,"[00:00:00,00:20:00)",2016-09-19,0,13,1,0.0
1,2016-09-19 00:00:00,1,1,"[00:00:00,00:20:00)",2016-09-19,0,140,1,0.0
2,2016-09-19 00:00:00,2,0,"[00:00:00,00:20:00)",2016-09-19,0,2,1,0.0
3,2016-09-19 00:00:00,3,0,"[00:00:00,00:20:00)",2016-09-19,0,17,1,0.0
4,2016-09-19 00:00:00,3,1,"[00:00:00,00:20:00)",2016-09-19,0,181,1,0.0
5,2016-09-19 00:20:00,1,0,"[00:20:00,00:40:00)",2016-09-19,0,6,1,13.0
6,2016-09-19 00:20:00,1,1,"[00:20:00,00:40:00)",2016-09-19,0,56,1,140.0
7,2016-09-19 00:20:00,3,0,"[00:20:00,00:40:00)",2016-09-19,0,16,1,17.0
8,2016-09-19 00:20:00,3,1,"[00:20:00,00:40:00)",2016-09-19,0,112,1,181.0
9,2016-09-19 00:40:00,1,0,"[00:40:00,01:00:00)",2016-09-19,0,9,1,6.0


In [19]:
#Adicionando valor da janela de tempo (2x) anterior na janela de tempo atual
pd_volume_train["volume_anterior_2"] = pd_volume_train.groupby(['direction', 'tollgate_id'])["volume"].transform("shift", 2)
#condition = ~(df.groupby("time_window")['time'].transform("shift") == df['time'])
#df.loc[ condition,"volume_anterior" ] = None
pd_volume_train["volume_anterior_2"] =pd_volume_train.groupby(['direction', 'tollgate_id'])["volume_anterior_2"].fillna(method="ffill").fillna(0)
pd_volume_train

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday,volume_anterior,volume_anterior_2
0,2016-09-19 00:00:00,1,0,"[00:00:00,00:20:00)",2016-09-19,0,13,1,0.0,0.0
1,2016-09-19 00:00:00,1,1,"[00:00:00,00:20:00)",2016-09-19,0,140,1,0.0,0.0
2,2016-09-19 00:00:00,2,0,"[00:00:00,00:20:00)",2016-09-19,0,2,1,0.0,0.0
3,2016-09-19 00:00:00,3,0,"[00:00:00,00:20:00)",2016-09-19,0,17,1,0.0,0.0
4,2016-09-19 00:00:00,3,1,"[00:00:00,00:20:00)",2016-09-19,0,181,1,0.0,0.0
5,2016-09-19 00:20:00,1,0,"[00:20:00,00:40:00)",2016-09-19,0,6,1,13.0,0.0
6,2016-09-19 00:20:00,1,1,"[00:20:00,00:40:00)",2016-09-19,0,56,1,140.0,0.0
7,2016-09-19 00:20:00,3,0,"[00:20:00,00:40:00)",2016-09-19,0,16,1,17.0,0.0
8,2016-09-19 00:20:00,3,1,"[00:20:00,00:40:00)",2016-09-19,0,112,1,181.0,0.0
9,2016-09-19 00:40:00,1,0,"[00:40:00,01:00:00)",2016-09-19,0,9,1,6.0,13.0


In [8]:
pd_volume_train.head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday,volume_anterior
0,2016-09-19,1,0,"[00:00:00,00:20:00)",2016-09-19,0,13,1,0.0
1,2016-09-19,1,1,"[00:00:00,00:20:00)",2016-09-19,0,140,1,0.0
2,2016-09-19,2,0,"[00:00:00,00:20:00)",2016-09-19,0,2,1,0.0
3,2016-09-19,3,0,"[00:00:00,00:20:00)",2016-09-19,0,17,1,0.0
4,2016-09-19,3,1,"[00:00:00,00:20:00)",2016-09-19,0,181,1,0.0


In [10]:
# Converte array em matriz
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return numpy.array(dataX), numpy.array(dataY)

In [9]:
# Transpõe
#df_transp = df.T

# Seta o nome das colunas para os valores da primeira linha
#df_transp.columns = df_transp.iloc[0]

# Define os dados começando a partir da segunda linha
#df_transp = df_transp[1:]

#df_transp.loc[:,'media'] = df_transp.mean(numeric_only=True, axis=0).values

#df_transf = pd_volume_train.set_index(['time'])

# 车流量
#t6_train = t6_train.groupby([pd.TimeGrouper('20Min'), 'tollgate_id', 'direction']).size()\
#       .reset_index().rename(columns = {0:'volume'})

#t6_train = t6_train.set_index(['time'])

# 车流量
#t6_train = t6_train.groupby([pd.TimeGrouper('20Min'), 'tollgate_id', 'direction']).size()\
#       .reset_index().rename(columns = {0:'volume'})

#df_transf = pd_volume_train.groupby(['time_window','weekday','tollgate_id', 'direction', 'date']).size()\
 #      .reset_index().rename(columns = {0:'volume'})

df_transf = pd_volume_train
df_transf.head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday,volume_anterior
0,2016-09-19,1,0,"[00:00:00,00:20:00)",2016-09-19,0,13,1,0.0
1,2016-09-19,1,1,"[00:00:00,00:20:00)",2016-09-19,0,140,1,0.0
2,2016-09-19,2,0,"[00:00:00,00:20:00)",2016-09-19,0,2,1,0.0
3,2016-09-19,3,0,"[00:00:00,00:20:00)",2016-09-19,0,17,1,0.0
4,2016-09-19,3,1,"[00:00:00,00:20:00)",2016-09-19,0,181,1,0.0


In [10]:
df_transf['media_volume'] = df_transf.groupby(['time_window', 'weekday', 'direction', 'tollgate_id'])["volume"].transform(np.mean)

In [11]:
#df_transf['media_volume'] = df_transf['soma'].mean
df_transf.head()
#del df_transf['soma']

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday,volume_anterior,media_volume
0,2016-09-19,1,0,"[00:00:00,00:20:00)",2016-09-19,0,13,1,0.0,14.0
1,2016-09-19,1,1,"[00:00:00,00:20:00)",2016-09-19,0,140,1,0.0,102.0
2,2016-09-19,2,0,"[00:00:00,00:20:00)",2016-09-19,0,2,1,0.0,2.333333
3,2016-09-19,3,0,"[00:00:00,00:20:00)",2016-09-19,0,17,1,0.0,17.6
4,2016-09-19,3,1,"[00:00:00,00:20:00)",2016-09-19,0,181,1,0.0,127.6


In [12]:
values = np.array(df_transf['volume'])

In [13]:
values

array([ 13, 140,   2, ...,  18,  15,  22])

In [12]:
df_transf['desvio_padrao'] = df_transf.groupby(['time_window', 'weekday', 'direction', 'tollgate_id'])["volume"].transform(np.std)

In [13]:
df_transf.head()
#del df_transf['desvio_padrao']

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday,volume_anterior,media_volume,desvio_padrao
0,2016-09-19,1,0,"[00:00:00,00:20:00)",2016-09-19,0,13,1,0.0,14.0,4.795832
1,2016-09-19,1,1,"[00:00:00,00:20:00)",2016-09-19,0,140,1,0.0,102.0,45.912961
2,2016-09-19,2,0,"[00:00:00,00:20:00)",2016-09-19,0,2,1,0.0,2.333333,0.57735
3,2016-09-19,3,0,"[00:00:00,00:20:00)",2016-09-19,0,17,1,0.0,17.6,2.607681
4,2016-09-19,3,1,"[00:00:00,00:20:00)",2016-09-19,0,181,1,0.0,127.6,58.734998


In [14]:
#df_transf['mediaArredondada'] = df_transf['media_volume'].sum()
#df_transf['mediaArrendodada'] = df_transf['media_volume'].mean()
df_transf['desvio_padrao'].fillna(df_transf.groupby(['time_window', 'weekday', 'direction', 'tollgate_id'])["volume"].transform(np.mean), inplace=True)
#del df_transf['mediaArredondada']

In [15]:
df_transf.isnull().sum()

time               0
tollgate_id        0
direction          0
time_window        0
date               0
hour               0
volume             0
weekday            0
volume_anterior    0
media_volume       0
desvio_padrao      0
dtype: int64

In [18]:
df_transf.to_csv('dados_treino_volume_com_valor_anterior.csv', index = False)

In [446]:
#df_transf.isnull().sum()
medi

time              0
tollgate_id       0
direction         0
time_window       0
date              0
hour              0
volume            0
weekday           0
media_volume      0
desvio_padrao    27
dtype: int64

In [19]:
pd_volume_test = pd.read_csv('processed_test_volume2.csv')

In [20]:
pd_volume_test.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type,weekday,weekend,hour,date,holiday,time_window
0,2016-10-18 07:59:04,2,0,1,1,0,2,0,7,2016-10-18,False,"[07:40:00,08:00:00)"
1,2016-10-18 07:59:31,2,0,1,1,0,2,0,7,2016-10-18,False,"[07:40:00,08:00:00)"
2,2016-10-18 07:59:50,2,0,1,1,0,2,0,7,2016-10-18,False,"[07:40:00,08:00:00)"
3,2016-10-18 07:32:33,3,0,1,1,0,2,0,7,2016-10-18,False,"[07:20:00,07:40:00)"
4,2016-10-18 07:32:46,3,0,1,1,0,2,0,7,2016-10-18,False,"[07:20:00,07:40:00)"


In [21]:
pd_volume_test['time'] =  pd.to_datetime(pd_volume_test['time'] , format='%Y-%m-%d %H:%M:%S')
#pd_volume_train = pd_volume_train.set_index(['time_window'])

# 车流量
pd_volume_test = pd_volume_test.groupby([pd.Grouper(freq='20T', key='time'), 'tollgate_id', 'direction', 'time_window', 'date', 'hour']).size()\
       .reset_index().rename(columns = {0:'volume'})

In [22]:
pd_volume_test.head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume
0,2016-10-18 06:00:00,1,0,"[06:00:00,06:20:00)",2016-10-18,6,13
1,2016-10-18 06:00:00,1,1,"[06:00:00,06:20:00)",2016-10-18,6,37
2,2016-10-18 06:00:00,2,0,"[06:00:00,06:20:00)",2016-10-18,6,24
3,2016-10-18 06:00:00,3,0,"[06:00:00,06:20:00)",2016-10-18,6,30
4,2016-10-18 06:00:00,3,1,"[06:00:00,06:20:00)",2016-10-18,6,23


In [23]:
pd_volume_test['weekday'] = pd_volume_test['time'].dt.dayofweek + 1

In [24]:
pd_volume_test[pd_volume_test['weekday'] == 3].head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday
60,2016-10-19 06:00:00,1,0,"[06:00:00,06:20:00)",2016-10-19,6,12,3
61,2016-10-19 06:00:00,1,1,"[06:00:00,06:20:00)",2016-10-19,6,48,3
62,2016-10-19 06:00:00,2,0,"[06:00:00,06:20:00)",2016-10-19,6,26,3
63,2016-10-19 06:00:00,3,0,"[06:00:00,06:20:00)",2016-10-19,6,24,3
64,2016-10-19 06:00:00,3,1,"[06:00:00,06:20:00)",2016-10-19,6,21,3


In [25]:
pd_volume_test["volume_anterior"] = pd_volume_test.groupby(['direction', 'tollgate_id'])["volume"].transform("shift")
#condition = ~(df.groupby("time_window")['time'].transform("shift") == df['time'])
#df.loc[ condition,"volume_anterior" ] = None
pd_volume_test["volume_anterior"] =pd_volume_test.groupby("time_window")["volume_anterior"].fillna(method="ffill").fillna(0)
pd_volume_test.head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday,volume_anterior
0,2016-10-18 06:00:00,1,0,"[06:00:00,06:20:00)",2016-10-18,6,13,2,0.0
1,2016-10-18 06:00:00,1,1,"[06:00:00,06:20:00)",2016-10-18,6,37,2,0.0
2,2016-10-18 06:00:00,2,0,"[06:00:00,06:20:00)",2016-10-18,6,24,2,0.0
3,2016-10-18 06:00:00,3,0,"[06:00:00,06:20:00)",2016-10-18,6,30,2,0.0
4,2016-10-18 06:00:00,3,1,"[06:00:00,06:20:00)",2016-10-18,6,23,2,0.0


In [26]:
pd_volume_test["volume_anterior_2"] = pd_volume_test.groupby(['direction', 'tollgate_id'])["volume"].transform("shift", 2)
#condition = ~(df.groupby("time_window")['time'].transform("shift") == df['time'])
#df.loc[ condition,"volume_anterior" ] = None
pd_volume_test["volume_anterior_2"] =pd_volume_test.groupby("time_window")["volume_anterior_2"].fillna(method="ffill").fillna(0)
pd_volume_test.head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday,volume_anterior,volume_anterior_2
0,2016-10-18 06:00:00,1,0,"[06:00:00,06:20:00)",2016-10-18,6,13,2,0.0,0.0
1,2016-10-18 06:00:00,1,1,"[06:00:00,06:20:00)",2016-10-18,6,37,2,0.0,0.0
2,2016-10-18 06:00:00,2,0,"[06:00:00,06:20:00)",2016-10-18,6,24,2,0.0,0.0
3,2016-10-18 06:00:00,3,0,"[06:00:00,06:20:00)",2016-10-18,6,30,2,0.0,0.0
4,2016-10-18 06:00:00,3,1,"[06:00:00,06:20:00)",2016-10-18,6,23,2,0.0,0.0


In [27]:
pd_volume_test['media_volume'] = pd_volume_test.groupby(['time_window', 'direction', 'tollgate_id'])["volume"].transform(np.mean)

In [28]:
pd_volume_test.tail()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,volume,weekday,volume_anterior,volume_anterior_2,media_volume
415,2016-10-24 16:40:00,1,0,"[16:40:00,17:00:00)",2016-10-24,16,44,1,50.0,49.0,50.571429
416,2016-10-24 16:40:00,1,1,"[16:40:00,17:00:00)",2016-10-24,16,103,1,108.0,99.0,102.285714
417,2016-10-24 16:40:00,2,0,"[16:40:00,17:00:00)",2016-10-24,16,80,1,75.0,69.0,72.857143
418,2016-10-24 16:40:00,3,0,"[16:40:00,17:00:00)",2016-10-24,16,130,1,100.0,115.0,125.428571
419,2016-10-24 16:40:00,3,1,"[16:40:00,17:00:00)",2016-10-24,16,81,1,80.0,101.0,88.857143


In [29]:
pd_volume_test['desvio_padrao'] = pd_volume_test.groupby(['time_window', 'direction', 'tollgate_id'])["volume"].transform(np.std)

In [30]:
pd_volume_test.isnull().sum()

time                 0
tollgate_id          0
direction            0
time_window          0
date                 0
hour                 0
volume               0
weekday              0
volume_anterior      0
volume_anterior_2    0
media_volume         0
desvio_padrao        0
dtype: int64

In [16]:
ve_train = pd.read_csv('dados_treino_volume_com_valor_anterior.csv')
ve_test = pd.read_csv('dados_teste_volume_com_valor_anterior.csv')
ve_train.count()

time               10063
tollgate_id        10063
direction          10063
time_window        10063
date               10063
hour               10063
volume             10063
weekday            10063
volume_anterior    10063
media_volume       10063
desvio_padrao      10063
dtype: int64

In [32]:
pd_volume_test.to_csv('dados_teste_volume_com_valor_anterior.csv', index = False)

In [45]:
v_train = pd.read_csv('dados_treino_volume_com_valor_anterior.csv')
v_test = pd.read_csv('dados_teste_volume_com_valor_anterior.csv')

In [46]:
df_remove = v_train.loc[(v_train['date'] >= '2016-10-01') 
                         & (v_train['date'] <= '2016-10-07') 
                        ]

v_train = v_train.drop(df_remove.index)

In [47]:
def feature_format():
    #pd_volume_train = pd_volume_train.set_index(['time'])
    #pd_volume_test = pd_volume_test.set_index(['time'])
    #volume_train = v_train.groupby(['time_window','tollgate_id','direction','date', 'hour']).size().reset_index().rename(columns = {0:'volume'})
    #volume_test = v_test.groupby(['time_window','tollgate_id','direction','date', 'hour']).size().reset_index().rename(columns = {0:'volume'})
    #print(volume_train)                
    x = pd.Series(v_train['time_window'].unique())
    s = pd.Series(range(len(x)),index = x.values)
    v_train['window_n'] = v_train['time_window'].map(s)
    v_test['window_n'] = v_test['time_window'].map(s)
#        print vol_test.tail()
    #volume_train['weekday'] = v_train['weekday']
    #volume_test['weekday'] = v_test['weekday']
    
    feature_train = v_train.drop('volume', axis = 1)
    feature_test = v_test.drop('volume',axis = 1)
    values_train = v_train['volume'].values
    values_test = v_test['volume'].values
    
    return feature_train, feature_test, values_train, values_test

In [48]:
feature_train, feature_test, values_train, values_test = feature_format()

In [49]:
feature_test.head()
#pd_volume_train[pd_volume_train['weekday'] == 3].head()

Unnamed: 0,time,tollgate_id,direction,time_window,date,hour,weekday,volume_anterior,volume_anterior_2,media_volume,desvio_padrao,window_n
0,2016-10-18 06:00:00,1,0,"[06:00:00,06:20:00)",2016-10-18,6,2,0.0,0.0,11.285714,3.9036,18
1,2016-10-18 06:00:00,1,1,"[06:00:00,06:20:00)",2016-10-18,6,2,0.0,0.0,36.0,8.205689,18
2,2016-10-18 06:00:00,2,0,"[06:00:00,06:20:00)",2016-10-18,6,2,0.0,0.0,20.571429,6.629659,18
3,2016-10-18 06:00:00,3,0,"[06:00:00,06:20:00)",2016-10-18,6,2,0.0,0.0,28.428571,8.22308,18
4,2016-10-18 06:00:00,3,1,"[06:00:00,06:20:00)",2016-10-18,6,2,0.0,0.0,18.857143,4.980916,18


In [36]:
len(values_train)

10063

In [50]:
rng = np.random.RandomState(1)
regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth = 50),
                         n_estimators=300, random_state = rng)

In [52]:
regr.fit(feature_train[['window_n', 'weekday', 'volume_anterior','media_volume', 'desvio_padrao']], values_train)

y_pred = regr.predict(feature_test[['window_n', 'weekday', 'volume_anterior', 'media_volume', 'desvio_padrao']])

mape = np.mean(np.abs((y_pred - values_test)/values_test))

#print (feature_test)
print(mape)
#regr.score(feature_train[['window_n','tollgate_id', 'direction', 'weekday', 'volume_anterior','volume_anterior_2','media_volume', 'desvio_padrao']], values_train) 


0.19263590849286105


In [67]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1500, random_state = 42)
# Train the model on training data
rf.fit(feature_train[['weekday','volume_anterior','media_volume', 'desvio_padrao', 'window_n']], values_train);

In [68]:
# Use the forest's predict method on the test data
predictions = rf.predict(feature_test[['weekday', 'volume_anterior', 'media_volume', 'desvio_padrao', 'window_n']])
# Calculate the absolute errors
errors = abs(predictions - values_test)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 9.39 degrees.


In [69]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / values_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 82.28 %.


In [70]:
#Função que calcula o MAPE
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [71]:
mean_absolute_percentage_error(values_test, predictions)

17.718611807255595

In [78]:
def rmse(predictions, targets):
    return np.sqrt(np.mean((predictions-targets)**2))

In [80]:
rmse(values_test, predictions)

12.594800897836254

In [74]:
values_test

array([ 13,  37,  24,  30,  23,  17,  47,  36,  42,  29,  21,  72,  46,
        85,  50,  31,  68,  83, 114,  56,  28,  94,  87, 131,  77,  47,
       105,  97, 164,  91,  52,  72,  74, 116,  81,  38, 102,  78,  87,
        91,  35, 116,  53, 106,  83,  57, 102,  62, 109,  84,  45,  84,
        63, 117,  92,  53, 113,  83, 118,  83,  12,  48,  26,  24,  21,
        16,  28,  45,  57,  26,  17,  57,  47,  75,  43,  22,  67,  87,
       119,  53,  38,  98,  95, 137,  63,  41, 103,  99, 152, 104,  40,
        83,  70, 107,  99,  34,  99,  69,  96, 108,  50,  92,  58,  95,
        79,  39,  82,  55, 109,  82,  40,  80,  61, 105,  89,  36,  85,
        64,  95,  75,   8,  46,  21,  37,  16,  16,  27,  23,  44,  49,
        23,  55,  42,  73,  37,  22,  71,  76, 114,  47,  33,  88,  89,
       123,  60,  37, 111, 112, 120, 104,  40,  97,  72,  92, 100,  41,
       115,  79, 107, 105,  42, 120,  70, 108,  74,  46,  90,  66, 121,
        92,  36,  73,  67, 106,  84,  37, 119,  75, 147,  88,   

In [75]:
predictions

array([ 11.49427381,  29.65311111,  18.44033333,  20.96661111,
        15.86296667,  15.69378333,  44.83952222,  30.96304444,
        40.74986667,  33.18321111,  19.44950317,  64.6414    ,
        42.54427619,  74.33266667,  39.10413333,  24.16549048,
        71.36678889,  77.47077778, 106.47744444,  61.25816667,
        38.55916667,  94.04733333,  93.81245714, 123.6416873 ,
        71.45114444,  36.79311111, 101.00777778, 106.79727778,
       139.088     ,  96.39666667,  51.256     ,  95.32132222,
        76.68711111, 130.56483333,  97.07862222,  53.43272222,
        94.822     ,  75.93855556,  98.50333333,  97.11266667,
        47.47972222, 110.78816667,  67.28868889, 107.4011    ,
        87.19888889,  51.15022222,  99.22802222,  71.05477778,
       112.38155556,  96.10705556,  56.03040952,  98.06717778,
        71.88844444, 112.29127778,  93.98692222,  47.86668571,
        97.37216667,  69.49653333, 118.72808889,  92.38195238,
        13.07216667,  47.55333333,  22.336     ,  33.20