# ARIMA model on history travel time

In [None]:
%matplotlib inline
import lab.setup
import functools
import pandas as pd
import numpy as np
import numba

from pandas.tseries.offsets import *

import statsmodels.api as sm
import statsmodels.tsa.arima_model as arima_model

from statsmodels.graphics.api import qqplot

g_region_temporal = 5
g_region_spatial  = 1

DATA_PATH = 'dataset'

## load prepared data from linear.ipynb&& build features

In [2]:
ds_train_full = pd.read_csv('dataset/ds_filled_s1.csv', dtype={'link_ID':'uint64'}, low_memory=False)

In [3]:
ds_train_full.head(1)

Unnamed: 0,link_ID,time_intv,date,time_interval,travel_time,in_links,out_links,filled,uplink_0,uplink_1,uplink_2,uplink_3,downlink_0,downlink_1,downlink_2,downlink_3,uplink_mean_tt,downlink_mean_tt
0,3377906280028510514,2016-03-03 00:00:00,,,5.1,4377906282541600514,4377906280763800514,True,4377906282541600514,0,0,0,4377906280763800514,0,0,0,55.4,8.4


In [4]:
TRAIN_SET_RATIO = 0.80

def get_df(df):
    tmp = df[['travel_time', 'uplink_mean_tt', 'downlink_mean_tt']]
    tmp.index = df['time_intv']
    tmp.index = pd.to_datetime(tmp.index)
    return tmp
    
link_no = ds_train_full.link_ID.unique().shape[0]
frames_train = []
frames_valid = []
counter = 0
for link_ID, link_ds in ds_train_full.groupby('link_ID'):
    counter += 1
    if counter < link_no * TRAIN_SET_RATIO:
        frames_train.append(get_df(link_ds))
    else:
        frames_valid.append(get_df(link_ds))

get_df(link_ds).head(5)

Unnamed: 0_level_0,travel_time,uplink_mean_tt,downlink_mean_tt
time_intv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-03-03 00:00:00,2.2,5.5,37.2
2016-03-03 00:02:00,2.4,6.3,34.8
2016-03-03 00:04:00,2.2,5.7,28.0
2016-03-03 00:06:00,3.0,6.3,28.0
2016-03-03 00:08:00,3.5,8.8,22.3


In [17]:
import matplotlib.pyplot as plt
import warnings
import multiprocessing.pool as pool
import os

best_args = (3, 0, 4)
start = '2016-03-03 01:00:00'
end = '2016-03-03 23:59:59'
series_limit = 10

def mape(y_hat, y):
    """Compute root mean squared error"""
    return np.mean(((y - y_hat) / y).abs())

def train_multi_series(series, p, q):
    params = None
    running_loss = 0.0
    counter = 0
    for s in series[:series_limit]:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                arma_mod = sm.tsa.VARMAX(s, order=(p, q))
                arma_res = arma_mod.fit(start_params=params)
            except Exception as e:
                # raise
                continue
                
            params = arma_res.params
            yhat = arma_res.predict(start='2016-03-03 01:00:00', end='2016-03-03 23:59:59', dynamic=False)
            loss = mape(yhat, s['2016-03-03 01:00:00':'2016-03-03 23:59:59'])
            
            running_loss += loss
            counter += 1
            
            if counter % 10 == 0:
                print('progress: {}/{}'.format(counter, len(series)))
                print('mean running loss:', running_loss['travel_time'] / counter)
    
    # fig, ax = plt.subplots(figsize=(5,4))
    # fig = arma_res.plot_predict(start='2016-03-03 00:00:00', end='2016-03-04 02:00:00', ax=ax)
    # legend = ax.legend(loc='upper left')
    return running_loss / counter, arma_res

def validate(series, trained_params, p, q):
    running_loss = 0.0
    counter = 0
    for s in series[:series_limit]:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            try:
                arma_mod = sm.tsa.VARMAX(s, order=(p, q))
                res = arma_mod.fit(start_params=trained_params)
                
            except Exception as e:
                raise
                continue
                
            yhat = arma_mod.predict(
                trained_params, start='2016-03-03 01:00:00', end='2016-03-03 23:59:59', dynamic=False)
            loss = mape(yhat, s['2016-03-03 01:00:00':'2016-03-03 23:59:59'])
            running_loss += loss
            counter += 1
            # print('progress: {}/{}'.format(counter, len(series)))
    
    if counter == 0:
        # failed to converge
        return 1e6
        
    return running_loss / counter

def train_pq(p, q):
    loss, arma_res = train_multi_series(frames_train, p, q)
    valid_loss = validate(frames_valid, arma_res.params, p, q)
    print("===== process:{}, qp:{}, train-loss: {}, valid-loss: {} =====".format(
        os.getpid(), (p, q), loss, valid_loss))
    
    return loss, valid_loss, (p, q)

def grid_search():
    exec_pool = pool.Pool()
    results = []
    best = 1e6
    best_valid = 1e6
    for p in range(1, 5 + 1):
        for q in range(1, 5 + 1):
            results.append(exec_pool.apply_async(train_pq, (p, q)))
    exec_pool.close()
    exec_pool.join()

    for result in results:
        r = result.get()
        valid_loss = r[1]
        if valid_loss < best_valid:
            best_valid = valid_loss
            best = r[0]
            best_args = r[2]

    print("===== best train-loss: {}, valid-loss: {} =====".format(best, best_valid))

# grid_search()
train_pq(3, 4)

ValueError: Given a pandas object and the index does not contain dates

In [None]:
print(best, best_valid, best_args)

In [16]:
frames_train[0].transpose().head(5)

time_intv,2016-03-03 00:00:00,2016-03-03 00:02:00,2016-03-03 00:04:00,2016-03-03 00:06:00,2016-03-03 00:08:00,2016-03-03 00:10:00,2016-03-03 00:12:00,2016-03-03 00:14:00,2016-03-03 00:16:00,2016-03-03 00:18:00,...,2016-03-03 23:40:00,2016-03-03 23:42:00,2016-03-03 23:44:00,2016-03-03 23:46:00,2016-03-03 23:48:00,2016-03-03 23:50:00,2016-03-03 23:52:00,2016-03-03 23:54:00,2016-03-03 23:56:00,2016-03-03 23:58:00
travel_time,5.1,5.1,5.1,5.1,5.1,5.1,5.1,5.1,5.1,4.9,...,3.2,3.2,3.2,3.2,5.6,4.5,4.4,4.2,4.4,4.6
uplink_mean_tt,55.4,67.0,67.0,33.2,36.8,31.7,32.4,34.4,29.4,25.0,...,52.5,50.9,38.9,55.4,37.3,29.6,25.7,26.4,20.6,20.0
downlink_mean_tt,8.4,8.8,7.5,8.1,8.7,8.9,9.5,9.9,10.6,8.8,...,7.1,9.3,9.3,20.5,24.5,14.2,14.1,7.5,7.7,8.3
