In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from datetime import time
import matplotlib.pyplot as pplot
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

  from collections import Sequence
  from numpy.core.umath_tests import inner1d


In [2]:
# Descrição das features:
# time           datetime  Data e Hora em que o veículo passa pelo pedágio;
# tollgate_id    string    Identificador do pedágio;
# direction      string    0: entra na rodovia pelo pedágio; 1: sai da rodovia pelo pedágio;
# vehicle_model  int       Um número que indica a capacidade do veículo;
# has_etc        string    Indica se o veículo possui ou não o sistema ETC; 0 - NÃO, 1 - SIM
# vehicle_type   string    0: veículo de passageiro; 1: veículo de carga
# weekday        int       Representa os dias da semana
# weekend        int       1: Para quando for fim de semana; 0: Para quando não for fim de semana

In [15]:
volume_df = pd.read_csv("dataset/volume(table 6)_test1.csv")
volume_df.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
0,2016-10-18 07:59:04,2,0,1,1,
1,2016-10-18 07:59:31,2,0,1,1,
2,2016-10-18 07:59:50,2,0,1,1,
3,2016-10-18 07:32:33,3,0,1,1,
4,2016-10-18 07:32:46,3,0,1,1,


In [5]:
#Caracteristicas dos dados
volume_df.info()
volume_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 543699 entries, 0 to 543698
Data columns (total 6 columns):
time             543699 non-null object
tollgate_id      543699 non-null int64
direction        543699 non-null int64
vehicle_model    543699 non-null int64
has_etc          543699 non-null int64
vehicle_type     212710 non-null float64
dtypes: float64(1), int64(4), object(1)
memory usage: 24.9+ MB


Unnamed: 0,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
count,543699.0,543699.0,543699.0,543699.0,212710.0
mean,2.086138,0.391227,1.089689,0.22142,0.22597
std,0.925612,0.488026,0.838314,0.415203,0.41822
min,1.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,1.0,0.0,0.0
50%,2.0,0.0,1.0,0.0,0.0
75%,3.0,1.0,1.0,0.0,0.0
max,3.0,1.0,7.0,1.0,1.0


In [16]:
#Retirando os valores nulos da coluna vehicle_type pelo modelo do veículo.
    #No vehicle_type indica 0 para veículo de passageiros e 1 para carga.
    #Poderíamos verificar a partir do modelo do veiculo, para veiculo com capacidade de até 4
    #Ficou definido que sera para passageiro, sendo maior que 4 será veiculo de carga

In [17]:
#volume_df['vehicle_type'] = volume_df['vehicle_model'].apply(lambda x: 0 if x < 5 else 1)

In [18]:
volume_df.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
0,2016-10-18 07:59:04,2,0,1,1,0
1,2016-10-18 07:59:31,2,0,1,1,0
2,2016-10-18 07:59:50,2,0,1,1,0
3,2016-10-18 07:32:33,3,0,1,1,0
4,2016-10-18 07:32:46,3,0,1,1,0


In [19]:
volume_df.tail()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
29436,2016-10-24 16:27:39,3,1,1,0,0
29437,2016-10-24 16:27:18,3,1,1,0,0
29438,2016-10-24 16:37:42,3,1,5,0,1
29439,2016-10-24 16:47:48,3,1,5,0,1
29440,2016-10-24 16:48:40,3,1,2,0,0


In [20]:
#Função que será usada para obter a janela de tempo de 20 minutos
def get_timewindow(t):
        time_window = 20
        if t.minute < time_window:
            window = [time(t.hour, 0), time(t.hour,20)]
        elif t.minute < time_window*2:
            window = [time(t.hour, 20), time(t.hour, 40)]
        else:
            try:
                window = [time(t.hour, 40), time(t.hour + 1, 0)]
            except ValueError:
                window = [time(t.hour, 40), time(0,0,0)]
        s_window = '[' + str(window[0]) + ',' + str(window[1]) + ')'
        return s_window

def get_hour(t):
        return t.hour

In [21]:
#Ajustando o formato da coluna time
volume_df['time'] = pd.to_datetime(volume_df['time'], format = '%Y-%m-%d %H:%M:%S')

#Representam os feriados
holiday1 = [pd.Timestamp('2016-10-1'), pd.Timestamp('2016-10-8')]
holiday2 = [pd.Timestamp('2016-9-15'), pd.Timestamp('2016-9-18')]

#Adiciona valores para os dias da semana
volume_df['weekday'] = volume_df['time'].dt.dayofweek + 1

#Classificar cada atributo de time aplicando a janela de tempo de vinte minutos
volume_df['t'] = volume_df['time'].dt.time

#Adicionando valores para saber se é referente a um fim de semana ou não
volume_df['weekend'] = volume_df['weekday'].apply(lambda x: 0 if x < 6 else 1)

volume_df['hour'] = volume_df['t'].apply(get_hour)

volume_df['date'] = volume_df['time'].dt.date

volume_df['holiday'] = volume_df['time'].between(holiday1[0],holiday1[1])\
                            | volume_df['time'].between(holiday2[0],holiday2[1])

volume_df['time_window'] = volume_df['t'].apply(get_timewindow)
del volume_df['t']

volume_df.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type,weekday,weekend,hour,date,holiday,time_window
0,2016-10-18 07:59:04,2,0,1,1,0,2,0,7,2016-10-18,False,"[07:40:00,08:00:00)"
1,2016-10-18 07:59:31,2,0,1,1,0,2,0,7,2016-10-18,False,"[07:40:00,08:00:00)"
2,2016-10-18 07:59:50,2,0,1,1,0,2,0,7,2016-10-18,False,"[07:40:00,08:00:00)"
3,2016-10-18 07:32:33,3,0,1,1,0,2,0,7,2016-10-18,False,"[07:20:00,07:40:00)"
4,2016-10-18 07:32:46,3,0,1,1,0,2,0,7,2016-10-18,False,"[07:20:00,07:40:00)"


In [22]:
#Salvando dados de treino
volume_df.to_csv('processed_test_volume2.csv', index = False)


In [46]:
#Fazendo o mesmo processo para os dados de teste

In [41]:
pd_volume_train = pd.read_csv('processed_training_volume2.csv')
pd_volume_test = pd.read_csv('processed_test_volume2.csv')
pd_volume_train = pd_volume_train.set_index(['time'])
pd_volume_test = pd_volume_test.set_index(['time'])
volume_train = pd_volume_train.groupby(['time_window','tollgate_id','direction', 'hour', 'date']).size().reset_index().rename(columns = {0:'volume'})
volume_test = pd_volume_test.groupby(['time_window','tollgate_id','direction', 'hour', 'date']).size().reset_index().rename(columns = {0:'volume'})

x = pd.Series(volume_train['time_window'].unique())
s = pd.Series(range(len(x)),index = x.values)
volume_train['window_n'] = volume_train['time_window'].map(s)
volume_test['window_n'] = volume_test['time_window'].map(s)
#volume_test.tail()
#volume_train.tail()
#volume_train.head()
volume_test.head()

FileNotFoundError: File b'processed_training_volume2.csv' does not exist

In [100]:
def feature_format():
    pd_volume_train = pd.read_csv('processed_train_volume2.csv')
    pd_volume_test = pd.read_csv('processed_test_volume2.csv')
    #pd_volume_train = pd_volume_train.set_index(['time'])
    #pd_volume_test = pd_volume_test.set_index(['time'])
    volume_train = pd_volume_train.groupby(['time_window','tollgate_id','direction','date', 'hour']).size().reset_index().rename(columns = {0:'volume'})
    volume_test = pd_volume_test.groupby(['time_window','tollgate_id','direction','date', 'hour']).size().reset_index().rename(columns = {0:'volume'})
                    
    x = pd.Series(volume_train['time_window'].unique())
    s = pd.Series(range(len(x)),index = x.values)
    volume_train['window_n'] = volume_train['time_window'].map(s)
    volume_test['window_n'] = volume_test['time_window'].map(s)
#        print vol_test.tail()
    volume_train['weekday'] = pd_volume_train['weekday']
    volume_test['weekday'] = pd_volume_test['weekday']
    
    feature_train = volume_train.drop('volume', axis = 1)
    feature_test = volume_test.drop('volume',axis = 1)
    values_train = volume_train['volume'].values
    values_test = volume_test['volume'].values
    
    return feature_train, feature_test, values_train, values_test

In [104]:
pd_volume_train = pd.read_csv('processed_train_volume2.csv')
pd_volume_train.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type,weekday,weekend,hour,date,holiday,time_window
0,2016-09-19 23:09:25,2,0,1,0,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"
1,2016-09-19 23:11:53,2,0,1,0,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"
2,2016-09-19 23:13:54,2,0,1,0,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"
3,2016-09-19 23:17:48,1,0,1,1,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"
4,2016-09-19 23:16:07,2,0,1,0,0,1,0,23,2016-09-19,False,"[23:00:00,23:20:00)"


In [101]:
feature_train, feature_test, values_train, values_test = feature_format()

In [103]:
print(feature_train)

               time_window  tollgate_id  direction        date  hour  \
0      [00:00:00,00:20:00)            1          0  2016-09-19     0   
1      [00:00:00,00:20:00)            1          0  2016-09-20     0   
2      [00:00:00,00:20:00)            1          0  2016-09-21     0   
3      [00:00:00,00:20:00)            1          0  2016-09-22     0   
4      [00:00:00,00:20:00)            1          0  2016-09-23     0   
5      [00:00:00,00:20:00)            1          0  2016-09-24     0   
6      [00:00:00,00:20:00)            1          0  2016-09-25     0   
7      [00:00:00,00:20:00)            1          0  2016-09-26     0   
8      [00:00:00,00:20:00)            1          0  2016-09-27     0   
9      [00:00:00,00:20:00)            1          0  2016-09-28     0   
10     [00:00:00,00:20:00)            1          0  2016-09-29     0   
11     [00:00:00,00:20:00)            1          0  2016-09-30     0   
12     [00:00:00,00:20:00)            1          0  2016-10-01  

In [93]:
feature_train[['window_n', 'tollgate_id', 'direction', 'weekday']].head

<bound method NDFrame.head of        window_n  tollgate_id  direction  weekday
0             0            1          0        1
1             0            1          0        1
2             0            1          0        1
3             0            1          0        1
4             0            1          0        1
5             0            1          0        1
6             0            1          0        1
7             0            1          0        1
8             0            1          0        1
9             0            1          0        1
10            0            1          0        1
11            0            1          0        1
12            0            1          0        1
13            0            1          0        1
14            0            1          0        1
15            0            1          0        1
16            0            1          0        1
17            0            1          0        1
18            0            1          0

In [84]:
print(values_train)

[13 14 12 ...  9 11 22]


In [85]:
print(feature_test)

             time_window  tollgate_id  direction        date  hour  window_n  \
0    [06:00:00,06:20:00)            1          0  2016-10-18     6        18   
1    [06:00:00,06:20:00)            1          0  2016-10-19     6        18   
2    [06:00:00,06:20:00)            1          0  2016-10-20     6        18   
3    [06:00:00,06:20:00)            1          0  2016-10-21     6        18   
4    [06:00:00,06:20:00)            1          0  2016-10-22     6        18   
5    [06:00:00,06:20:00)            1          0  2016-10-23     6        18   
6    [06:00:00,06:20:00)            1          0  2016-10-24     6        18   
7    [06:00:00,06:20:00)            1          1  2016-10-18     6        18   
8    [06:00:00,06:20:00)            1          1  2016-10-19     6        18   
9    [06:00:00,06:20:00)            1          1  2016-10-20     6        18   
10   [06:00:00,06:20:00)            1          1  2016-10-21     6        18   
11   [06:00:00,06:20:00)            1   

In [86]:
rng = np.random.RandomState(1)
regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth = 15),
                         n_estimators=20, random_state = rng)

In [87]:
regr.fit(feature_train[['window_n', 'tollgate_id', 'direction', 'weekday']], values_train)

y_pred = regr.predict(feature_test[['window_n', 'tollgate_id', 'direction', 'weekday']])

mape = np.mean(np.abs((y_pred - values_test)/values_test))

#print (feature_test)
print (mape)

#print (y_pred,values_test)

0.6641250895650788


In [26]:
x_test.head()

Unnamed: 0_level_0,tollgate_id,direction,weekday,hour,window_n
time_window,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"[06:00:00,06:20:00)",1,0,1,6,18
"[06:00:00,06:20:00)",1,0,2,6,18
"[06:00:00,06:20:00)",1,0,3,6,18
"[06:00:00,06:20:00)",1,0,4,6,18
"[06:00:00,06:20:00)",1,0,5,6,18


In [27]:
y_test

array([ 18,  13,  12,   8,   8,   7,  13,  31,  37,  48,  46,  33,  31,
        26,  27,  24,  26,  21,  16,  22,   8,  41,  30,  24,  37,  22,
        27,  18,  23,  23,  21,  16,  21,  19,   9,  21,  17,  16,  16,
        13,   8,  10,  57,  47,  28,  27,  39,  36,  50,  39,  36,  45,
        23,  24,  22,  11,  53,  42,  57,  44,  49,  37,  29,  41,  29,
        26,  49,  28,  18,  19,  24,  21,  17,  23,  21,  22,   8,  59,
        72,  57,  55,  54,  56,  33,  52,  46,  47,  42,  39,  43,  25,
       105,  85,  75,  73,  66,  51,  43,  60,  50,  43,  37,  42,  25,
        33,  35,  31,  22,  22,  32,  14,  17,  76,  68,  67,  71,  65,
        49,  45, 104,  83,  87,  76,  66,  24,  29, 150, 114, 119, 114,
       116,  76,  51,  75,  56,  53,  47,  54,  38,  36,  34,  28,  38,
        33,  41,  27,  24, 116,  94,  98,  88, 102,  44,  41, 111,  87,
        95,  89,  77,  49,  35, 148, 131, 137, 123, 135,  89,  71, 101,
        77,  63,  60,  76,  39,  53,  43,  47,  41,  37,  38,  3

In [77]:
#Função que calcula a média quadrática
def rmse(predictions, targets):
    differences = predictions - targets
    differences_squared = differences ** 2
    mean_of_differences_squared = differences_squared.mean()
    rmse_val = np.sqrt(mean_of_differences_squared)
    return rmse_val

In [90]:
#Função que calcula o MAPE
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [89]:
#Executando algoritmo de regressão linear
model1 = LinearRegression()
model1.fit(feature_train[['window_n', 'tollgate_id', 'direction', 'weekday']], values_train)
pred_y1 = model1.predict(feature_test[['window_n', 'tollgate_id', 'direction', 'weekday']])
pred_y1

array([46.51090942, 46.51090942, 46.51090942, 46.51090942, 46.51090942,
       46.51090942, 46.51090942, 41.65792359, 41.65792359, 41.65792359,
       41.65792359, 41.65792359, 41.65792359, 41.65792359, 52.1068759 ,
       52.1068759 , 52.1068759 , 52.1068759 , 52.1068759 , 52.1068759 ,
       52.1068759 , 57.70284237, 57.70284237, 57.70284237, 57.70284237,
       57.70284237, 57.70284237, 57.70284237, 52.84985654, 52.84985654,
       52.84985654, 52.84985654, 52.84985654, 52.84985654, 52.84985654,
       46.72588268, 46.72588268, 46.72588268, 46.72588268, 46.72588268,
       46.72588268, 46.72588268, 41.87289684, 41.87289684, 41.87289684,
       41.87289684, 41.87289684, 41.87289684, 41.87289684, 52.32184915,
       52.32184915, 52.32184915, 52.32184915, 52.32184915, 52.32184915,
       52.32184915, 57.91781563, 57.91781563, 57.91781563, 57.91781563,
       57.91781563, 57.91781563, 57.91781563, 53.06482979, 53.06482979,
       53.06482979, 53.06482979, 53.06482979, 53.06482979, 53.06

In [81]:
#Algoritmo Regressão Linear com Gradiente Descendente
class LinearRegressionGD(object):

    def __init__(self, eta=0.001, n_iter=20):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        self.w_ = np.zeros(1 + X.shape[1])
        self.cost_ = []

        for i in range(self.n_iter):
            output = self.net_input(X)
            errors = (y - output)
            self.w_[1:] += self.eta * X.T.dot(errors)
            self.w_[0] += self.eta * errors.sum()
            cost = (errors**2).sum() / 2.0
            self.cost_.append(cost)
        return self

    def net_input(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def predict(self, X):
        return self.net_input(X)

In [89]:
model2 = LinearRegressionGD(eta = 0.0009, n_iter = 40)
model2.fit(x_train, y_train)
pred_y2 = model2.predict(x_test)

In [32]:
#Algoritmo Regressão Linear com Gradiente Descendente Estocrástico
model3 = SGDRegressor(eta0=0.0009, max_iter=40)
model3.fit(x_train, y_train)
pred_y3 = model3.predict(x_test)

In [91]:
mean_absolute_percentage_error(values_test, pred_y1)

56.9775897514345

In [102]:
print(  rmse(pred_y1, y_test),"\n", #Algoritmo 1
        rmse(pred_y2, y_test),"\n", #Algoritmo 2
        rmse(pred_y3, y_test) #Algoritmo 3
     )

150.6964566871464 
 5.701458141552023e+145 
 187.39084578228082


In [33]:
mean_squared_error(pred_y3, y_test)

28344.218948382113

In [40]:
rng = np.random.RandomState(1)
regr = AdaBoostRegressor(DecisionTreeRegressor(max_depth = 21),
                         n_estimators=11, random_state = rng)

regr.fit(x_train, y_train)

y_pred = regr.predict(x_test)

mape = np.mean(np.abs((y_pred - y_test)/y_test))

print (mape)

4.002390009178407
