In [1]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.cross_validation import train_test_split
from dbanalysis import stop_tools
from dbanalysis import headers as hd
import time
from sklearn.metrics import mean_absolute_error
import xgboost
from xgboost import XGBRegressor
from sklearn import metrics
#from sklearn.utils import check_arrays

pd.set_option('display.max_columns', 500)



In [2]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    MAPE=np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    del(y_true)
    del(y_pred)
    return MAPE

In [3]:
def neural_networks(df, df_traveltime):
    errors=[]
    x_train, x_test, y_train, y_test=train_test_split(df, df_traveltime, test_size=0.2, random_state=4)
    nn=MLPRegressor(activation='relu',solver='adam',hidden_layer_sizes=(100, 60,20),random_state=1, max_iter=1000)
    nn.fit(x_train,y_train.values.ravel())
    pred=nn.predict(x_test)
    errors.append(mean_absolute_error(y_test, pred))
    errors.append(mean_absolute_percentage_error(y_test, pred))
    errors.append(metrics.r2_score(y_test, pred))
    del(df)
    del(df_traveltime)
    return errors
    

In [4]:
def xgb(df, df_traveltime):
    errors=[]
    x_train, x_test, y_train, y_test=train_test_split(df, df_traveltime, test_size=0.2, random_state=4)
    xgb=XGBRegressor()
    xgb.fit(x_train,y_train.values.ravel())
    pred=xgb.predict(x_test)
    errors.append(mean_absolute_error(y_test, pred))
    errors.append(mean_absolute_percentage_error(y_test, pred))
    errors.append(metrics.r2_score(y_test, pred))
    del(df)
    del(df_traveltime)
    return errors

In [7]:
avg_MAE_nn=0
avg_MAPE_nn=0
avg_MAE_xgb=0
avg_MAPE_xgb=0
count_links_processed_nn=0
count_links_processed_xgb=0
r2_scores_nn=0
r2_scores_xgb=0
for i in range(20):
    df_stop=stop_tools.random_stop_data()
    df=df_stop[[ 'dewpt', 'msl', 'rain', 'rhum', 'temp', 'vappr', 'wetb']]
    df_traveltime=df_stop[['traveltime']]
    day_dummies = pd.get_dummies(df_stop.day, prefix='day')
    hour_dummies = pd.get_dummies(df_stop.hour, prefix='hour')
    df=pd.concat([df, day_dummies, hour_dummies], axis=1)
    df_traveltime[df_traveltime['traveltime'] != 0]
    err_nn=neural_networks(df, df_traveltime)
    #print('Mean absolute error for NN : ', err_nn[0])
    print('Mean absolute percentage error for NN : ', err_nn[1])
    avg_MAE_nn+=err_nn[0]
    print('r2 score for nn : ', err_nn[2])
    if err_nn[1]<100:
        avg_MAPE_nn+=err_nn[1]
        count_links_processed_nn+=1
    r2_scores_nn+=err_nn[2]
    err_xgb=xgb(df,df_traveltime)
    #print('Mean absolute error for XGB : ', err_xgb[0])
    print('Mean absolute percentage error for XGB : ', err_xgb[1])
    avg_MAE_xgb+=err_xgb[0]
    if err_xgb[1]<100:
        avg_MAPE_xgb+=err_xgb[1]
        count_links_processed_xgb+=1
    r2_scores_xgb+=err_xgb[2]
    print('r2 score for xgb : ', err_xgb[2])
    del(df)
    
print('Errors for NN')    
#print('Average error ', avg_MAE_nn/10)
print('Average MAPE ', avg_MAPE_nn/count_links_processed_nn)
print('r2 :', r2_scores_nn/count_links_processed_nn)
print('Errors for XGB')
#print('Average error ', avg_MAE_xgb/10)
print('Average MAPE ', avg_MAPE_xgb/count_links_processed_xgb)
print('r2 :', r2_scores_xgb/count_links_processed_xgb)

Mean absolute percentage error for NN :  51.00045686295201
r2 score for nn :  -0.047260775521005716
Mean absolute percentage error for XGB :  55.04627571725654
r2 score for xgb :  0.047167920761142734
Mean absolute percentage error for NN :  14.582525862985815
r2 score for nn :  0.016612211550022682
Mean absolute percentage error for XGB :  15.352983754849573
r2 score for xgb :  0.06170512499206593
Mean absolute percentage error for NN :  16.390433579027274
r2 score for nn :  0.0700379851265801
Mean absolute percentage error for XGB :  16.477221102224394
r2 score for xgb :  0.17780300601420196
Mean absolute percentage error for NN :  235.73310083981474
r2 score for nn :  -0.008574818706121201
Mean absolute percentage error for XGB :  202.01834763824996
r2 score for xgb :  -0.1648547849389539
Mean absolute percentage error for NN :  38.76149801734364
r2 score for nn :  0.08271158123854294
Mean absolute percentage error for XGB :  42.28177173927749
r2 score for xgb :  0.1917526848034925


  This is separate from the ipykernel package so we can avoid doing imports until


Mean absolute percentage error for NN :  inf
r2 score for nn :  0.009864871527831442


  This is separate from the ipykernel package so we can avoid doing imports until


Mean absolute percentage error for XGB :  inf
r2 score for xgb :  0.03465224073476503
Mean absolute percentage error for NN :  48.87784959803932
r2 score for nn :  0.0020919218997544897
Mean absolute percentage error for XGB :  48.81051106408499
r2 score for xgb :  0.1864886427788549
Mean absolute percentage error for NN :  15.200301427065757
r2 score for nn :  -0.15037526738325768
Mean absolute percentage error for XGB :  15.413245567808914
r2 score for xgb :  -0.12927758151379498
Mean absolute percentage error for NN :  19.420611013320798
r2 score for nn :  0.06338122742314889
Mean absolute percentage error for XGB :  20.71669528243635
r2 score for xgb :  0.12777997637635063
Mean absolute percentage error for NN :  14.533991614601769
r2 score for nn :  -0.002176811681787516
Mean absolute percentage error for XGB :  15.12602432060359
r2 score for xgb :  -0.0004837364276186751
Errors for NN
Average MAPE  25.96068222035387
r2 : -0.013550297669661938
Errors for XGB
Average MAPE  27.46104

In [49]:
count_links_processed_nn

23

In [50]:
count_links_processed_xgb

23

In [7]:
df.columns

Index(['index', 'dayofservice', 'tripid', 'plannedtime_arr_from',
       'plannedtime_dep_from', 'actualtime_arr_from', 'actualtime_dep_from',
       'plannedtime_arr_to', 'actualtime_arr_to', 'routeid', 'fromstop',
       'tostop', 'traveltime', 'dwelltime', 'distance', 'speed', 'dt', 'date',
       'day', 'month', 'hour', 'year', 'dewpt', 'msl', 'rain', 'rhum', 'temp',
       'vappr', 'wetb'],
      dtype='object')