In [42]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.cross_validation import train_test_split
from dbanalysis import stop_tools
from dbanalysis import headers as hd
import time
from sklearn.metrics import mean_absolute_error
import xgboost
from xgboost import XGBRegressor
from sklearn import metrics
#from sklearn.utils import check_arrays

pd.set_option('display.max_columns', 500)

In [43]:

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    MAPE=np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    del(y_true)
    del(y_pred)
    return MAPE

In [44]:
def neural_networks(df, df_traveltime):
    errors=[]
    x_train, x_test, y_train, y_test=train_test_split(df, df_traveltime, test_size=0.2, random_state=4)
    nn=MLPRegressor(activation='relu',solver='adam',hidden_layer_sizes=(100, 60,20),random_state=1, max_iter=1000)
    nn.fit(x_train,y_train.values.ravel())
    pred=nn.predict(x_test)
    errors.append(mean_absolute_error(y_test, pred))
    errors.append(mean_absolute_percentage_error(y_test, pred))
    errors.append(metrics.r2_score(y_test, pred))
    del(df)
    del(df_traveltime)
    return errors
    

In [65]:
def xgb(df, df_traveltime):
    errors=[]
    x_train, x_test, y_train, y_test=train_test_split(df, df_traveltime, test_size=0.2, random_state=4)
    xgb=XGBRegressor()
    xgb.fit(x_train,y_train.values.ravel())
    pred=xgb.predict(x_test)
    errors.append(mean_absolute_error(y_test, pred))
    errors.append(mean_absolute_percentage_error(y_test, pred))
    errors.append(metrics.r2_score(y_test, pred))
    del(df)
    del(df_traveltime)
    return errors

In [45]:
df_cluster= pd.read_csv('/data/cluser1.csv')

In [46]:
df_cluster.shape

(228097, 29)

In [47]:
df_cluster.head(10)

Unnamed: 0,index,dayofservice,tripid,plannedtime_arr_from,plannedtime_dep_from,actualtime_arr_from,actualtime_dep_from,plannedtime_arr_to,actualtime_arr_to,routeid,fromstop,tostop,traveltime,dwelltime,distance,speed,dt,date,day,month,hour,year,dewpt,msl,rain,rhum,temp,vappr,wetb
0,40,03-JAN-17 00:00:00,4089854,26003,26003,25729,25729,26023,25746,31_16,553,554,17,0,0.309288,65.496177,2017-01-03,2017-01-03,1,1,7,2017,0.5,1031.2,0.0,92,1.6,6.3,1.2
1,6693,03-JAN-17 00:00:00,4089907,27433,27433,27159,27172,27474,27200,31_15,553,554,28,13,0.309288,39.765536,2017-01-03,2017-01-03,1,1,7,2017,0.5,1031.2,0.0,92,1.6,6.3,1.2
2,16790,03-JAN-17 00:00:00,4089941,29098,29098,28591,28591,29120,28610,31_15,553,554,19,0,0.309288,58.601843,2017-01-03,2017-01-03,1,1,7,2017,0.5,1031.2,0.0,92,1.6,6.3,1.2
3,83,04-JAN-17 00:00:00,4089854,26003,26003,25890,25890,26023,25909,31_16,553,554,19,0,0.309288,58.601843,2017-01-04,2017-01-04,2,1,7,2017,4.0,1028.9,0.0,94,4.8,8.1,4.4
4,6745,04-JAN-17 00:00:00,4089907,27433,27433,27235,27245,27474,27270,31_15,553,554,25,10,0.309288,44.537401,2017-01-04,2017-01-04,2,1,7,2017,4.0,1028.9,0.0,94,4.8,8.1,4.4
5,16842,04-JAN-17 00:00:00,4089941,29098,29098,28575,28575,29120,28593,31_15,553,554,18,0,0.309288,61.857501,2017-01-04,2017-01-04,2,1,7,2017,4.0,1028.9,0.0,94,4.8,8.1,4.4
6,126,05-JAN-17 00:00:00,4089854,26003,26003,25863,25863,26023,25880,31_16,553,554,17,0,0.309288,65.496177,2017-01-05,2017-01-05,3,1,7,2017,-1.4,1031.7,0.0,95,-0.8,5.5,-1.0
7,6797,05-JAN-17 00:00:00,4089907,27433,27433,27428,27438,27474,27463,31_15,553,554,25,10,0.309288,44.537401,2017-01-05,2017-01-05,3,1,7,2017,-1.4,1031.7,0.0,95,-0.8,5.5,-1.0
8,16894,05-JAN-17 00:00:00,4089941,29098,29098,28579,28579,29120,28595,31_15,553,554,16,0,0.309288,69.589688,2017-01-05,2017-01-05,3,1,7,2017,-1.4,1031.7,0.0,95,-0.8,5.5,-1.0
9,169,06-JAN-17 00:00:00,4089854,26003,26003,25954,25968,26023,25996,31_16,553,554,28,14,0.309288,39.765536,2017-01-06,2017-01-06,4,1,7,2017,7.7,1027.8,0.1,86,9.9,10.5,8.8


In [48]:
df_cluster['fromstop'].unique()

array([ 553, 3276, 2167, 3578, 4552, 3576,  554, 1630, 3914, 3329, 4608,
       3725, 1210, 1551, 3912, 1550, 3277, 4323, 6119, 4559, 7389, 3393,
       7140, 3387, 6109, 2967])

In [49]:
df=df_cluster[[ 'dewpt', 'msl', 'rain', 'rhum', 'temp', 'vappr', 'wetb','distance']]

In [50]:
df_traveltime=df_cluster[['traveltime']]

In [51]:
day_dummies = pd.get_dummies(df_cluster.day, prefix='day')
hour_dummies = pd.get_dummies(df_cluster.hour, prefix='hour')

In [52]:
fromstop_dummies=pd.get_dummies(df_cluster.fromstop, prefix='fromstop')
tostop_dummies=pd.get_dummies(df_cluster.tostop, prefix='tostop')

In [53]:
df=pd.concat([df, day_dummies, hour_dummies], axis=1)

In [54]:
df.shape

(228097, 33)

In [60]:
df_reduced=df[:50000]

In [61]:
df_reduced.shape

(50000, 33)

In [62]:
df_reduced_traveltime=df_traveltime[:50000]

In [63]:
err=neural_networks(df_reduced,df_reduced_traveltime)

In [64]:
err

[4.970289862711456, 43.50231007082941, 0.16127288823820607]

In [66]:
err_xgb=xgb(df_reduced,df_reduced_traveltime)

In [67]:
err_xgb

[4.938751832008362, 48.21394805230225, 0.17746744303629391]