In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from datetime import time
import matplotlib.pyplot as pplot

  from collections import Sequence
  from numpy.core.umath_tests import inner1d


In [5]:
# Descrição das features:
# time           datetime  Data e Hora em que o veículo passa pelo pedágio;
# tollgate_id    string    Identificador do pedágio;
# direction      string    0: entra na rodovia pelo pedágio; 1: sai da rodovia pelo pedágio;
# vehicle_model  int       Um número que indica a capacidade do veículo;
# has_etc        string    Indica se o veículo possui ou não o sistema ETC; 0 - NÃO, 1 - SIM
# vehicle_type   string    0: veículo de passageiro; 1: veículo de carga
# weekday        int       Representa os dias da semana
# weekend        int       1: Para quando for fim de semana; 0: Para quando não for fim de semana

In [2]:
volume_df = pd.read_csv("dataset/volume(table 6)_training.csv")
volume_df.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
0,2016-09-19 23:09:25,2,0,1,0,
1,2016-09-19 23:11:53,2,0,1,0,
2,2016-09-19 23:13:54,2,0,1,0,
3,2016-09-19 23:17:48,1,0,1,1,
4,2016-09-19 23:16:07,2,0,1,0,


In [3]:
#Caracteristicas dos dados
volume_df.info()
volume_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 543699 entries, 0 to 543698
Data columns (total 6 columns):
time             543699 non-null object
tollgate_id      543699 non-null int64
direction        543699 non-null int64
vehicle_model    543699 non-null int64
has_etc          543699 non-null int64
vehicle_type     212710 non-null float64
dtypes: float64(1), int64(4), object(1)
memory usage: 24.9+ MB


Unnamed: 0,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
count,543699.0,543699.0,543699.0,543699.0,212710.0
mean,2.086138,0.391227,1.089689,0.22142,0.22597
std,0.925612,0.488026,0.838314,0.415203,0.41822
min,1.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,1.0,0.0,0.0
50%,2.0,0.0,1.0,0.0,0.0
75%,3.0,1.0,1.0,0.0,0.0
max,3.0,1.0,7.0,1.0,1.0


In [12]:
#Retirando os valores nulos da coluna vehicle_type pelo modelo do veículo.
    #No vehicle_type indica 0 para veículo de passageiros e 1 para carga.
    #Poderíamos verificar a partir do modelo do veiculo, para veiculo com capacidade de até 4
    #Ficou definido que sera para passageiro, sendo maior que 4 será veiculo de carga

In [4]:
volume_df['vehicle_type'] = volume_df['vehicle_model'].apply(lambda x: 0 if x < 5 else 1)

In [5]:
volume_df.head()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
0,2016-09-19 23:09:25,2,0,1,0,0
1,2016-09-19 23:11:53,2,0,1,0,0
2,2016-09-19 23:13:54,2,0,1,0,0
3,2016-09-19 23:17:48,1,0,1,1,0
4,2016-09-19 23:16:07,2,0,1,0,0


In [6]:
volume_df.tail()

Unnamed: 0,time,tollgate_id,direction,vehicle_model,has_etc,vehicle_type
543694,2016-10-17 18:18:22,1,1,1,1,0
543695,2016-10-17 18:18:49,1,1,1,1,0
543696,2016-10-17 18:20:03,1,1,1,1,0
543697,2016-10-17 17:20:20,1,1,1,1,0
543698,2016-10-17 17:21:52,1,1,1,1,0


In [7]:
#Função que será usada para obter a janela de tempo de 20 minutos
def get_timewindow(t):
        time_window = 20
        if t.minute < time_window:
            window = [time(t.hour, 0), time(t.hour,20)]
        elif t.minute < time_window*2:
            window = [time(t.hour, 20), time(t.hour, 40)]
        else:
            try:
                window = [time(t.hour, 40), time(t.hour + 1, 0)]
            except ValueError:
                window = [time(t.hour, 40), time(0,0,0)]
        s_window = '[' + str(window[0]) + ',' + str(window[1]) + ')'
        return s_window

In [15]:
#Ajustando o formato da coluna time
volume_df['time'] = pd.to_datetime(volume_df['time'], format = '%Y-%m-%d %H:%M:%S')

#Adiciona valores para os dias da semana
volume_df['weekday'] = volume_df['time'].dt.dayofweek + 1

#Adicionando valores para saber se é referente a um fim de semana ou não
volume_df['weekend'] = volume_df['weekday'].apply(lambda x: 0 if x < 6 else 1)

#Classificar cada atributo de time aplicando a janela de tempo de vinte minutos
volume_df['t'] = volume_df['time'].dt.time
volume_df['time_window'] = volume_df['t'].apply(get_timewindow)
del volume_df['t']

volume_df.head()
volume_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 543699 entries, 0 to 543698
Data columns (total 9 columns):
time             543699 non-null datetime64[ns]
tollgate_id      543699 non-null int64
direction        543699 non-null int64
vehicle_model    543699 non-null int64
has_etc          543699 non-null int64
vehicle_type     543699 non-null int64
weekday          543699 non-null int64
weekend          543699 non-null int64
time_window      543699 non-null object
dtypes: datetime64[ns](1), int64(7), object(1)
memory usage: 37.3+ MB


In [55]:
#Salvando dados de treino
volume_df.to_csv('processed_test_volume.csv', index = False)


In [46]:
#Fazendo o mesmo processo para os dados de teste

In [2]:
pd_volume_train = pd.read_csv('processed_train_volume.csv')
pd_volume_test = pd.read_csv('processed_test_volume.csv')
pd_volume_train = pd_volume_train.set_index(['time'])
pd_volume_test = pd_volume_test.set_index(['time'])
volume_train = pd_volume_train.groupby(['time_window','tollgate_id','direction','weekday', 'hour']).size().reset_index().rename(columns = {0:'volume'})
volume_test = pd_volume_test.groupby(['time_window','tollgate_id','direction','weekday', 'hour']).size().reset_index().rename(columns = {0:'volume'})

x = pd.Series(volume_train['time_window'].unique())
s = pd.Series(range(len(x)),index = x.values)
volume_train['window_n'] = volume_train['time_window'].map(s)
volume_test['window_n'] = volume_test['time_window'].map(s)
#volume_test.tail()
#volume_train.tail()
#volume_train.head()
volume_test.head()

KeyError: 'hour'

In [10]:
def feature_format():
    pd_volume_train = pd.read_csv('processed_train_volume.csv')
    pd_volume_test = pd.read_csv('processed_test_volume.csv')
    pd_volume_train = pd_volume_train.set_index(['time'])
    pd_volume_test = pd_volume_test.set_index(['time'])
    volume_train = pd_volume_train.groupby(['time_window','tollgate_id','direction','weekday']).size().reset_index().rename(columns = {0:'volume'})
    volume_test = pd_volume_test.groupby(['time_window','tollgate_id','direction','weekday']).size().reset_index().rename(columns = {0:'volume'})
                    
    x = pd.Series(volume_train['time_window'].unique())
    s = pd.Series(range(len(x)),index = x.values)
    volume_train['window_n'] = volume_train['time_window'].map(s)
    volume_test['window_n'] = volume_test['time_window'].map(s)
    #volume_test.tail()
    
    feature_train = volume_train.drop('volume', axis = 1).set_index(['time_window'])
    feature_test = volume_test.drop('volume',axis = 1).set_index(['time_window'])
    values_train = volume_train['volume'].values
    values_test = volume_test['volume'].values
    
    return feature_train, feature_test, values_train, values_test

In [11]:
x_train, x_test, y_train, y_test = feature_format()

In [12]:
x_train

Unnamed: 0_level_0,tollgate_id,direction,weekday,window_n
time_window,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"[00:00:00,00:20:00)",1,0,1,0
"[00:00:00,00:20:00)",1,0,2,0
"[00:00:00,00:20:00)",1,0,3,0
"[00:00:00,00:20:00)",1,0,4,0
"[00:00:00,00:20:00)",1,0,5,0
"[00:00:00,00:20:00)",1,0,6,0
"[00:00:00,00:20:00)",1,0,7,0
"[00:00:00,00:20:00)",1,1,1,0
"[00:00:00,00:20:00)",1,1,2,0
"[00:00:00,00:20:00)",1,1,3,0


In [74]:
y_train

array([70, 48, 58, ..., 43, 31, 42])

In [75]:
x_test.head()

Unnamed: 0_level_0,tollgate_id,direction,weekday,window_n
time_window,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"[06:00:00,06:20:00)",1,0,1,18
"[06:00:00,06:20:00)",1,0,2,18
"[06:00:00,06:20:00)",1,0,3,18
"[06:00:00,06:20:00)",1,0,4,18
"[06:00:00,06:20:00)",1,0,5,18


In [76]:
y_test

array([ 18,  13,  12,   8,   8,   7,  13,  31,  37,  48,  46,  33,  31,
        26,  27,  24,  26,  21,  16,  22,   8,  41,  30,  24,  37,  22,
        27,  18,  23,  23,  21,  16,  21,  19,   9,  21,  17,  16,  16,
        13,   8,  10,  57,  47,  28,  27,  39,  36,  50,  39,  36,  45,
        23,  24,  22,  11,  53,  42,  57,  44,  49,  37,  29,  41,  29,
        26,  49,  28,  18,  19,  24,  21,  17,  23,  21,  22,   8,  59,
        72,  57,  55,  54,  56,  33,  52,  46,  47,  42,  39,  43,  25,
       105,  85,  75,  73,  66,  51,  43,  60,  50,  43,  37,  42,  25,
        33,  35,  31,  22,  22,  32,  14,  17,  76,  68,  67,  71,  65,
        49,  45, 104,  83,  87,  76,  66,  24,  29, 150, 114, 119, 114,
       116,  76,  51,  75,  56,  53,  47,  54,  38,  36,  34,  28,  38,
        33,  41,  27,  24, 116,  94,  98,  88, 102,  44,  41, 111,  87,
        95,  89,  77,  49,  35, 148, 131, 137, 123, 135,  89,  71, 101,
        77,  63,  60,  76,  39,  53,  43,  47,  41,  37,  38,  3

In [77]:
#Função que calcula a média quadrática
def rmse(predictions, targets):
    differences = predictions - targets
    differences_squared = differences ** 2
    mean_of_differences_squared = differences_squared.mean()
    rmse_val = np.sqrt(mean_of_differences_squared)
    return rmse_val

In [78]:
#Função que calcula o MAPE
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [108]:
#Executando algoritmo de regressão linear
model1 = LinearRegression()
model1.fit(x_train, y_train)
pred_y1 = model1.predict(x_test)
pred_y1

array([198.04889684, 192.81530042, 187.581704  , 182.34810758,
       177.11451116, 171.88091474, 166.64731832, 185.01983928,
       179.78624286, 174.55264644, 169.31905002, 164.0854536 ,
       158.85185718, 153.61826076, 221.2795516 , 216.04595518,
       210.81235876, 205.57876234, 200.34516593, 195.11156951,
       189.87797309, 244.51020636, 239.27660994, 234.04301353,
       228.80941711, 223.57582069, 218.34222427, 213.10862785,
       231.4811488 , 226.24755238, 221.01395596, 215.78035954,
       210.54676312, 205.3131667 , 200.07957028, 199.10444659,
       193.87085017, 188.63725375, 183.40365733, 178.17006091,
       172.93646449, 167.70286807, 186.07538903, 180.84179261,
       175.60819619, 170.37459977, 165.14100335, 159.90740693,
       154.67381051, 222.33510135, 217.10150493, 211.86790851,
       206.63431209, 201.40071567, 196.16711926, 190.93352284,
       245.56575611, 240.33215969, 235.09856328, 229.86496686,
       224.63137044, 219.39777402, 214.1641776 , 232.53

In [81]:
#Algoritmo Regressão Linear com Gradiente Descendente
class LinearRegressionGD(object):

    def __init__(self, eta=0.001, n_iter=20):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        self.w_ = np.zeros(1 + X.shape[1])
        self.cost_ = []

        for i in range(self.n_iter):
            output = self.net_input(X)
            errors = (y - output)
            self.w_[1:] += self.eta * X.T.dot(errors)
            self.w_[0] += self.eta * errors.sum()
            cost = (errors**2).sum() / 2.0
            self.cost_.append(cost)
        return self

    def net_input(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def predict(self, X):
        return self.net_input(X)

In [89]:
model2 = LinearRegressionGD(eta = 0.0009, n_iter = 40)
model2.fit(x_train, y_train)
pred_y2 = model2.predict(x_test)

In [93]:
#Algoritmo Regressão Linear com Gradiente Descendente Estocrástico
model3 = SGDRegressor(eta0=0.0009, max_iter=40)
model3.fit(x_train, y_train)
pred_y3 = model3.predict(x_test)

In [103]:
mean_absolute_percentage_error(y_test, pred_y1)

344.6219144002485

In [102]:
print(  rmse(pred_y1, y_test),"\n", #Algoritmo 1
        rmse(pred_y2, y_test),"\n", #Algoritmo 2
        rmse(pred_y3, y_test) #Algoritmo 3
     )

150.6964566871464 
 5.701458141552023e+145 
 187.39084578228082
