In [1]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.cross_validation import train_test_split
from dbanalysis import stop_tools
from dbanalysis import headers as hd
import time
from sklearn.metrics import mean_absolute_error
import xgboost
from xgboost import XGBRegressor
from sklearn import metrics
#from sklearn.utils import check_arrays

pd.set_option('display.max_columns', 500)



In [2]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    MAPE=np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    del(y_true)
    del(y_pred)
    return MAPE

In [3]:
def neural_networks(df, df_traveltime):
    errors=[]
    x_train, x_test, y_train, y_test=train_test_split(df, df_traveltime, test_size=0.2, random_state=4)
    nn=MLPRegressor(activation='relu',solver='adam',hidden_layer_sizes=(100, 60,20),random_state=1, max_iter=1000)
    nn.fit(x_train,y_train.values.ravel())
    pred=nn.predict(x_test)
    errors.append(mean_absolute_error(y_test, pred))
    errors.append(mean_absolute_percentage_error(y_test, pred))
    errors.append(metrics.r2_score(y_test, pred))
    del(df)
    del(df_traveltime)
    return errors
    

In [4]:
def xgb(df, df_traveltime):
    errors=[]
    x_train, x_test, y_train, y_test=train_test_split(df, df_traveltime, test_size=0.2, random_state=4)
    xgb=XGBRegressor()
    xgb.fit(x_train,y_train.values.ravel())
    pred=xgb.predict(x_test)
    errors.append(mean_absolute_error(y_test, pred))
    errors.append(mean_absolute_percentage_error(y_test, pred))
    errors.append(metrics.r2_score(y_test, pred))
    del(df)
    del(df_traveltime)
    return errors

In [5]:
avg_MAE_nn=0
avg_MAPE_nn=0
avg_MAE_xgb=0
avg_MAPE_xgb=0
count_links_processed_nn=0
count_links_processed_xgb=0
r2_scores_nn=0
r2_scores_xgb=0
for i in range(50):
    df_stop=stop_tools.random_stop_data()
    df=df_stop[[ 'dewpt', 'msl', 'rain', 'rhum', 'temp', 'vappr', 'wetb']]
    df_traveltime=df_stop[['traveltime']]
    day_dummies = pd.get_dummies(df_stop.day, prefix='day')
    hour_dummies = pd.get_dummies(df_stop.hour, prefix='hour')
    df=pd.concat([df, day_dummies, hour_dummies], axis=1)
    df_traveltime[df_traveltime['traveltime'] != 0]
    err_nn=neural_networks(df, df_traveltime)
    #print('Mean absolute error for NN : ', err_nn[0])
    print('Mean absolute percentage error for NN : ', err_nn[1])
    avg_MAE_nn+=err_nn[0]
    print('r2 score for nn : ', err_nn[2])
    if err_nn[1]<100:
        avg_MAPE_nn+=err_nn[1]
        count_links_processed_nn+=1
    r2_scores_nn+=err_nn[2]
    err_xgb=xgb(df,df_traveltime)
    #print('Mean absolute error for XGB : ', err_xgb[0])
    print('Mean absolute percentage error for XGB : ', err_xgb[1])
    avg_MAE_xgb+=err_xgb[0]
    if err_xgb[1]<100:
        avg_MAPE_xgb+=err_xgb[1]
        count_links_processed_xgb+=1
    r2_scores_xgb+=err_xgb[2]
    print('r2 score for xgb : ', err_xgb[2])
    del(df)
    
print('Errors for NN')    
#print('Average error ', avg_MAE_nn/10)
print('Average MAPE ', avg_MAPE_nn/count_links_processed_nn)
print('r2 :', r2_scores_nn/count_links_processed_nn)
print('Errors for XGB')
#print('Average error ', avg_MAE_xgb/10)
print('Average MAPE ', avg_MAPE_xgb/count_links_processed_xgb)
print('r2 :', r2_scores_xgb/count_links_processed_xgb)

Mean absolute percentage error for NN :  11.257964712485238
r2 score for nn :  0.0010903678413085593
Mean absolute percentage error for XGB :  11.903314153928058
r2 score for xgb :  0.09270000992148131
Mean absolute percentage error for NN :  16.225588001165356
r2 score for nn :  0.02156088915593024
Mean absolute percentage error for XGB :  18.646163928080004
r2 score for xgb :  -2.601787067640987
Mean absolute percentage error for NN :  25.340552380192793
r2 score for nn :  -0.016122889739528468
Mean absolute percentage error for XGB :  23.946410788812308
r2 score for xgb :  0.07440784636310271


  This is separate from the ipykernel package so we can avoid doing imports until


Mean absolute percentage error for NN :  inf
r2 score for nn :  -0.1241086031808809


  This is separate from the ipykernel package so we can avoid doing imports until


Mean absolute percentage error for XGB :  inf
r2 score for xgb :  0.016204693538447557
Mean absolute percentage error for NN :  12.602160063775003
r2 score for nn :  0.006045917401829581
Mean absolute percentage error for XGB :  14.783644378423036
r2 score for xgb :  0.21781837778520774
Mean absolute percentage error for NN :  19.710373534211463
r2 score for nn :  0.005549467017372134
Mean absolute percentage error for XGB :  19.907423158019842
r2 score for xgb :  0.035077688151492925
Mean absolute percentage error for NN :  15.772789974482462
r2 score for nn :  -0.004886268674727967
Mean absolute percentage error for XGB :  15.659391677549609
r2 score for xgb :  0.009926641539696135
Mean absolute percentage error for NN :  32.900540311647234
r2 score for nn :  -1.5308791601137912
Mean absolute percentage error for XGB :  32.17083351061204
r2 score for xgb :  -16.771216404064738
Mean absolute percentage error for NN :  27.311147823154453
r2 score for nn :  -1.7763166992227197
Mean abso

  This is separate from the ipykernel package so we can avoid doing imports until


Mean absolute percentage error for NN :  inf
r2 score for nn :  0.0014967852010450144


  This is separate from the ipykernel package so we can avoid doing imports until


Mean absolute percentage error for XGB :  inf
r2 score for xgb :  0.010814969350340542
Mean absolute percentage error for NN :  15.547719525503052
r2 score for nn :  -0.0015971444399660584
Mean absolute percentage error for XGB :  15.949473325100701
r2 score for xgb :  0.018718694796437174
Mean absolute percentage error for NN :  54.32923476827492
r2 score for nn :  -0.02734620641687968
Mean absolute percentage error for XGB :  67.43755041485296
r2 score for xgb :  0.10467599503076952
Mean absolute percentage error for NN :  35.723117142434816
r2 score for nn :  0.12024910977743875
Mean absolute percentage error for XGB :  37.91402109324667
r2 score for xgb :  0.12529512584143376
Mean absolute percentage error for NN :  20.32265045787947
r2 score for nn :  0.0412085389734671
Mean absolute percentage error for XGB :  20.804975825090956
r2 score for xgb :  0.09071806913230462
Mean absolute percentage error for NN :  10.93102941641439
r2 score for nn :  -0.00026586038624376584
Mean absolu

ValueError: Found array with 0 sample(s) (shape=(0, 7)) while a minimum of 1 is required.

In [49]:
count_links_processed_nn

23

In [50]:
count_links_processed_xgb

23

In [7]:
df.columns

Index(['index', 'dayofservice', 'tripid', 'plannedtime_arr_from',
       'plannedtime_dep_from', 'actualtime_arr_from', 'actualtime_dep_from',
       'plannedtime_arr_to', 'actualtime_arr_to', 'routeid', 'fromstop',
       'tostop', 'traveltime', 'dwelltime', 'distance', 'speed', 'dt', 'date',
       'day', 'month', 'hour', 'year', 'dewpt', 'msl', 'rain', 'rhum', 'temp',
       'vappr', 'wetb'],
      dtype='object')