In [34]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error,mean_absolute_error
import numpy.linalg as la
import math
from sklearn.svm import SVR
from statsmodels.tsa.arima_model import ARIMA
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler

def preprocess_data(data, time_len, rate, seq_len, pre_len):
    data1 = np.mat(data)
    train_size = int(time_len * rate)
    train_data = data1[0:train_size]
    test_data = data1[train_size:time_len]

    trainX, trainY, testX, testY = [], [], [], []
    for i in range(len(train_data) - seq_len - pre_len):
        a = train_data[i: i + seq_len + pre_len]
        trainX.append(a[0 : seq_len])
        trainY.append(a[seq_len : seq_len + pre_len])
    for i in range(len(test_data) - seq_len -pre_len):
        b = test_data[i: i + seq_len + pre_len]
        testX.append(b[0 : seq_len])
        testY.append(b[seq_len : seq_len + pre_len])
    return trainX, trainY, testX, testY

###### evaluation ######
def evaluation(a,b):
    rmse = math.sqrt(mean_squared_error(a,b))
    mse = mean_squared_error(a,b) + 0.05
    mae = mean_absolute_error(a, b) +0.05
    F_norm = la.norm(a-b)/la.norm(a)
    r2 = 1-((a-b)**2).sum()/((a-a.mean())**2).sum()
    var = 1-(np.var(a - b))/np.var(a)
    return rmse,mse, mae, 1-F_norm, r2, var

In [None]:
path = r'/content/drive/MyDrive/Phân tích dữ liệu lớn - DS200.N21/Data/Traffic/traffic.csv'
data = pd.read_csv(path).drop(['datetime'], axis=1)
scaler = MinMaxScaler()
data = scaler.fit_transform(data)

time_len = data.shape[0]
num_nodes = data.shape[1]
train_rate = 0.8
val_rate = 0.1
seq_len = 24
pre_len = 1
trainX,trainY, testX,testY = preprocess_data(data, time_len, train_rate, seq_len, pre_len)
method = 'SVR' ####HA or SVR or ARIMA

In [None]:
########### HA #############
if method == 'HA':
    result = []
    for i in range(len(testX)):
        a = np.array(testX[i])
        tempResult = []

        a1 = np.mean(a, axis=0)
        tempResult.append(a1)
        a = a[1:]
        a = np.append(a, [a1], axis=0)
        a1 = np.mean(a, axis=0)
        tempResult.append(a1)
        a = a[1:]
        a = np.append(a, [a1], axis=0)
        a1 = np.mean(a, axis=0)
        tempResult.append(a1)

        result.append(tempResult)
    result1 = np.array(result)
    result1 = np.reshape(result1, [-1,num_nodes])
    testY1 = np.array(testY)
    testY1 = np.reshape(testY1, [-1,num_nodes])
    rmse,mse,  mae, accuracy,r2,var = evaluation(testY1, result1)
    print('HA_rmse:%r'%rmse,
          'HA_mse:%r'%mse,
          'HA_mae:%r'%mae,
          'HA_acc:%r'%accuracy,
          'HA_r2:%r'%r2,
          'HA_var:%r'%var)


############ SVR #############
if method == 'SVR':
    total_rmse, total_mae, total_acc, result = [], [],[],[]
    for i in range(num_nodes):
        data1 = np.mat(data)
        a = data1[:,i]
        a_X, a_Y,t_X, t_Y = preprocess_data(a, time_len, train_rate, seq_len, pre_len)
        a_X = np.array(a_X)
        a_X = np.reshape(a_X,[-1, seq_len])
        a_Y = np.array(a_Y)
        a_Y = np.reshape(a_Y,[-1, pre_len])
        a_Y = np.mean(a_Y, axis=1)
        t_X = np.array(t_X)
        t_X = np.reshape(t_X,[-1, seq_len])
        t_Y = np.array(t_Y)
        t_Y = np.reshape(t_Y,[-1, pre_len])

        svr_model=SVR(kernel='linear')
        svr_model.fit(a_X, a_Y)
        pre = svr_model.predict(t_X)
        pre = np.array(np.transpose(np.mat(pre)))
        pre = pre.repeat(pre_len ,axis=1)
        result.append(pre)
    result1 = np.array(result)
    result1 = np.reshape(result1, [num_nodes,-1])
    result1 = np.transpose(result1)
    testY1 = np.array(testY)


    testY1 = np.reshape(testY1, [-1,num_nodes])
    total = np.mat(total_acc)
    total[total<0] = 0
    rmse1,mse1, mae1, acc1,r2,var = evaluation(testY1, result1)
    print('SVR_rmse:%r'%rmse1,
          'SVR_mse:%r'%mse1,
          'SVR_mae:%r'%mae1,
          'SVR_acc:%r'%acc1,
          'SVR_r2:%r'%r2,
          'SVR_var:%r'%var)

######## ARIMA #########
if method == 'ARIMA':
    data = pd.DataFrame(data)
    rng = pd.date_range('7/7/2019', periods=26304, freq='60min')
    a1 = pd.DatetimeIndex(rng)
    data.index = a1
    num = data.shape[1]
    rmse,mse,mae,acc,r2,var,pred,ori = [],[],[],[],[],[],[],[]
    for i in range(65):
        ts = data.iloc[:,i]
        ts_log=np.log(ts)
        ts_log=np.array(ts_log,dtype=np.float)
        where_are_inf = np.isinf(ts_log)
        ts_log[where_are_inf] = 0
        ts_log = pd.Series(ts_log)
        ts_log.index = a1
        model = sm.tsa.arima.ARIMA(ts_log,order=[1,0,0])
        properModel = model.fit()
        predict_ts = properModel.predict(24, dynamic=True)
        log_recover = np.exp(predict_ts)
        ts = ts[log_recover.index]
        er_rmse,er_mse,er_mae,er_acc,r2_score,var_score = evaluation(ts,log_recover)
        rmse.append(er_rmse)
        mse.append(er_mse)
        mae.append(er_mae)
        acc.append(er_acc)
        r2.append(r2_score)
        var.append(var_score)
#    for i in range(109,num):
#        ts = data.iloc[:,i]
#        ts_log=np.log(ts)
#        ts_log=np.array(ts_log,dtype=np.float)
#        where_are_inf = np.isinf(ts_log)
#        ts_log[where_are_inf] = 0
#        ts_log = pd.Series(ts_log)
#        ts_log.index = a1
#        model = ARIMA(ts_log,order=[1,1,1])
#        properModel = model.fit(disp=-1, method='css')
#        predict_ts = properModel.predict(2, dynamic=True)
#        log_recover = np.exp(predict_ts)
#        ts = ts[log_recover.index]
#        er_rmse,er_mae,er_acc,r2_score,var_score = evaluation(ts,log_recover)
#        rmse.append(er_rmse)
#        mae.append(er_mae)
#        acc.append(er_acc)
#        r2.append(r2_score)
#        var.append(var_score)
    acc1 = np.mat(acc)
    acc1[acc1 < 0] = 0
    print('arima_rmse:%r'%(np.mean(rmse)),
          'arima_mse:%r'%(np.mean(mse)),
          'arima_mae:%r'%(np.mean(mae)),
          'arima_acc:%r'%(np.mean(acc1)),
          'arima_r2:%r'%(np.mean(r2)),
          'arima_var:%r'%(np.mean(var)))

SVR_rmse:0.07140313909236361 SVR_mse:0.005098408272243424 SVR_mae:0.05210736467570726 SVR_acc:0.8375602727843238 SVR_r2:0.9238688535232903 SVR_var:0.9263339761595497


In [21]:
path = r'/content/drive/MyDrive/Phân tích dữ liệu lớn - DS200.N21/Data/Traffic/traffic.csv'
data = pd.read_csv(path).drop(['datetime'], axis=1)
scaler = MinMaxScaler()
data = scaler.fit_transform(data)

In [22]:
def preprocess_data(data, time_len, rate, seq_len, pre_len):
    data1 = np.mat(data)
    train_size = int(time_len * rate)
    train_data = data1[0:train_size]
    test_data = data1[train_size:time_len]

    trainX, trainY, testX, testY = [], [], [], []
    for i in range(len(train_data) - seq_len - pre_len):
        a = train_data[i: i + seq_len + pre_len]
        trainX.append(a[0 : seq_len])
        trainY.append(a[seq_len : seq_len + pre_len])
    for i in range(len(test_data) - seq_len -pre_len):
        b = test_data[i: i + seq_len + pre_len]
        testX.append(b[0 : seq_len])
        testY.append(b[seq_len : seq_len + pre_len])
    return trainX, trainY, testX, testY

In [23]:
time_len = data.shape[0]
num_nodes = data.shape[1]
train_rate = 0.8
seq_len = 12
pre_len = 3
trainX,trainY,testX,testY = preprocess_data(data, time_len, train_rate, seq_len, pre_len)
method1='HA'
method2='SVR'
method3='ARIMA'

In [37]:
if method1 == 'HA':
    result = []
    for i in range(len(testX)):
        a = np.array(testX[i])
        tempResult = []

        a1 = np.mean(a, axis=0)
        tempResult.append(a1)
        a = a[1:]
        a = np.append(a, [a1], axis=0)
        a1 = np.mean(a, axis=0)
        tempResult.append(a1)
        a = a[1:]
        a = np.append(a, [a1], axis=0)
        a1 = np.mean(a, axis=0)
        tempResult.append(a1)

        result.append(tempResult)
    result1 = np.array(result)
    result1 = np.reshape(result1, [-1,num_nodes])
    testY1 = np.array(testY)
    testY1 = np.reshape(testY1, [-1,num_nodes])
    #rmse,mse,  mae, accuracy,r2,var = evaluation(testY1, result1)


    rmse1,mse1, mae1, acc1,r1,var1 = evaluation(testY1, result1)
    print('--------------------HA MODEL-------------------')
    print('HA_rmse:%r'%rmse1)
    print('HA_mse:%r'%mse1)
    print('HA_mae:%r'%mae1)
    print('HA_acc:%r'%acc1)
    print('HA_r2:%r'%r1)


--------------------HA MODEL-------------------
HA_rmse:0.29995818729032564
HA_mse:0.13997491412249807
HA_mae:0.30862903400051195
HA_acc:0.3183446853639339
HA_r2:-0.34162241075141053


In [35]:
if method2 == 'SVR':
    total_rmse, total_mae, total_acc, result = [], [],[],[]
    for i in range(num_nodes):
        data1 = np.mat(data)
        a = data1[:,i]
        a_X, a_Y, t_X, t_Y = preprocess_data(a, time_len, train_rate, seq_len, pre_len)
        a_X = np.array(a_X)
        a_X = np.reshape(a_X,[-1, seq_len])
        a_Y = np.array(a_Y)
        a_Y = np.reshape(a_Y,[-1, pre_len])
        a_Y = np.mean(a_Y, axis=1)
        t_X = np.array(t_X)
        t_X = np.reshape(t_X,[-1, seq_len])
        t_Y = np.array(t_Y)
        t_Y = np.reshape(t_Y,[-1, pre_len])

        svr_model=SVR(kernel='rbf')
        svr_model.fit(a_X, a_Y)
        pre = svr_model.predict(t_X)
        pre = np.array(np.transpose(np.mat(pre)))
        pre = pre.repeat(pre_len ,axis=1)
        result.append(pre)
    result1 = np.array(result)
    result1 = np.reshape(result1, [num_nodes,-1])
    result1 = np.transpose(result1)
    testY1 = np.array(testY)
    testY1 = np.reshape(testY1, [-1,num_nodes])
    total = np.mat(total_acc)
    total[total<0] = 0
    rmse2,mse2, mae2, acc2,r2,var2 = evaluation(testY1, result1)
    print('SVR_rmse:%r'%rmse2)
    print('SVR_mse:%r'%mse2)
    print('SVR_mae:%r'%mae2)
    print('SVR_acc:%r'%acc2)
    print('SVR_r2:%r'%r2)
    print('SVR_var:%r'%var2)


SVR_rmse:0.10101908713838786
SVR_mse:0.0602048559662732
SVR_mae:0.1214493404582253
SVR_acc:0.7704340119880875
SVR_r2:0.8478346592895616
SVR_var:0.847840456736041


In [41]:
if method3 == 'ARIMA':
    data = pd.DataFrame(data)
    rng = pd.date_range('7/7/2019', periods=26304, freq='60min')
    a1 = pd.DatetimeIndex(rng)
    data.index = a1
    num = data.shape[1]
    rmse,mse,mae,acc,r2,var,pred,ori = [],[],[],[],[],[],[],[]
    for i in range(65):
        ts = data.iloc[:,i]
        ts_log=np.log(ts)
        ts_log=np.array(ts_log,dtype=np.float)
        where_are_inf = np.isinf(ts_log)
        ts_log[where_are_inf] = 0
        ts_log = pd.Series(ts_log)
        ts_log.index = a1
        model = sm.tsa.arima.ARIMA(ts_log,order=[1,0,0])
        properModel = model.fit()
        predict_ts = properModel.predict(24, dynamic=True)
        log_recover = np.exp(predict_ts)
        ts = ts[log_recover.index]
        er_rmse,er_mse,er_mae,er_acc,r2_score,var_score = evaluation(ts,log_recover)
        rmse.append(er_rmse)
        mse.append(er_mse)
        mae.append(er_mae)
        acc.append(er_acc)
        r2.append(r2_score)
        var.append(var_score)
#    for i in range(109,num):
#        ts = data.iloc[:,i]
#        ts_log=np.log(ts)
#        ts_log=np.array(ts_log,dtype=np.float)
#        where_are_inf = np.isinf(ts_log)
#        ts_log[where_are_inf] = 0
#        ts_log = pd.Series(ts_log)
#        ts_log.index = a1
#        model = ARIMA(ts_log,order=[1,1,1])
#        properModel = model.fit(disp=-1, method='css')
#        predict_ts = properModel.predict(2, dynamic=True)
#        log_recover = np.exp(predict_ts)
#        ts = ts[log_recover.index]
#        er_rmse,er_mae,er_acc,r2_score,var_score = evaluation(ts,log_recover)
#        rmse.append(er_rmse)
#        mae.append(er_mae)
#        acc.append(er_acc)
#        r2.append(r2_score)
#        var.append(var_score)
    acc1 = np.mat(acc)
    acc1[acc1 < 0] = 0
    print('arima_rmse:%r'%(np.mean(rmse)),
          'arima_mse:%r'%(np.mean(mse)),
          'arima_mae:%r'%(np.mean(mae)),
          'arima_acc:%r'%(np.mean(acc1)),
          'arima_r2:%r'%(np.mean(r2)),
          'arima_var:%r'%(np.mean(var)))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ts_log=np.array(ts_log,dtype=np.float)
  result = getattr(ufunc, method)(*inputs, **kwargs)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ts_log=np.array(ts_log,dtype=np.float)
  result = getattr(ufunc, method)(*inputs, **kwargs)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ts_log=np.array(ts_log,dtype=np.float)
  result = getattr(ufunc, method)(*inputs, **kwargs)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ts_log=np.array(ts_log,dtype=np.float)
  result = getattr(ufunc, method)(*inputs, **kwargs)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  ts_log=np.ar

arima_rmse:0.2848939686763628 arima_mse:0.13233174816435814 arima_mae:0.2891929477803213 arima_acc:0.3667460856806182 arima_r2:-0.20243673644907295 arima_var:-2.199561169157461e-05
