In [1]:
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.cross_validation import train_test_split
from dbanalysis import stop_tools
from dbanalysis import headers as hd
import time
from sklearn.metrics import mean_absolute_error
import xgboost
from xgboost import XGBRegressor
from sklearn import metrics
#from sklearn.utils import check_arrays

pd.set_option('display.max_columns', 500)



In [2]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    MAPE=np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    del(y_true)
    del(y_pred)
    return MAPE

In [3]:
def neural_networks(df, df_traveltime):
    errors=[]
    x_train, x_test, y_train, y_test=train_test_split(df, df_traveltime, test_size=0.2, random_state=4)
    nn=MLPRegressor(activation='relu',solver='adam',hidden_layer_sizes=(100, 60,20),random_state=1, max_iter=1000)
    nn.fit(x_train,y_train.values.ravel())
    pred=nn.predict(x_test)
    errors.append(mean_absolute_error(y_test, pred))
    errors.append(mean_absolute_percentage_error(y_test, pred))
    errors.append(metrics.r2_score(y_test, pred))
    del(df)
    del(df_traveltime)
    return errors
    

In [4]:
def xgb(df, df_traveltime):
    errors=[]
    x_train, x_test, y_train, y_test=train_test_split(df, df_traveltime, test_size=0.2, random_state=4)
    xgb=XGBRegressor()
    xgb.fit(x_train,y_train.values.ravel())
    pred=xgb.predict(x_test)
    errors.append(mean_absolute_error(y_test, pred))
    errors.append(mean_absolute_percentage_error(y_test, pred))
    errors.append(metrics.r2_score(y_test, pred))
    del(df)
    del(df_traveltime)
    return errors

In [5]:
avg_MAE_nn=0
avg_MAPE_nn=0
avg_MAE_xgb=0
avg_MAPE_xgb=0
count_links_processed_nn=0
count_links_processed_xgb=0
r2_scores_nn=0
r2_scores_xgb=0
for i in range(20):
    df_stop=stop_tools.random_stop_data()
    df=df_stop[[ 'dewpt', 'msl', 'rain', 'rhum', 'temp', 'vappr', 'wetb']]
    df_traveltime=df_stop[['traveltime']]
    day_dummies = pd.get_dummies(df_stop.day, prefix='day')
    hour_dummies = pd.get_dummies(df_stop.hour, prefix='hour')
    df=pd.concat([df, day_dummies, hour_dummies], axis=1)
    df_traveltime[df_traveltime['traveltime'] != 0]
    err_nn=neural_networks(df, df_traveltime)
    #print('Mean absolute error for NN : ', err_nn[0])
    print('Mean absolute percentage error for NN : ', err_nn[1])
    avg_MAE_nn+=err_nn[0]
    print('r2 score for nn : ', err_nn[2])
    if err_nn[1]<100:
        avg_MAPE_nn+=err_nn[1]
        count_links_processed_nn+=1
    r2_scores_nn+=err_nn[2]
    err_xgb=xgb(df,df_traveltime)
    #print('Mean absolute error for XGB : ', err_xgb[0])
    print('Mean absolute percentage error for XGB : ', err_xgb[1])
    avg_MAE_xgb+=err_xgb[0]
    if err_xgb[1]<100:
        avg_MAPE_xgb+=err_xgb[1]
        count_links_processed_xgb+=1
    r2_scores_xgb+=err_xgb[2]
    print('r2 score for xgb : ', err_xgb[2])
    del(df)
    
print('Errors for NN')    
#print('Average error ', avg_MAE_nn/10)
print('Average MAPE ', avg_MAPE_nn/count_links_processed_nn)
print('r2 :', r2_scores_nn/count_links_processed_nn)
print('Errors for XGB')
#print('Average error ', avg_MAE_xgb/10)
print('Average MAPE ', avg_MAPE_xgb/count_links_processed_xgb)
print('r2 :', r2_scores_xgb/count_links_processed_xgb)

Mean absolute percentage error for NN :  7.857400913622467
r2 score for nn :  -0.1225524718262283
Mean absolute percentage error for XGB :  7.707005193340809
r2 score for xgb :  0.04225566466111996
Mean absolute percentage error for NN :  15.712383569816316
r2 score for nn :  0.06454155176740273
Mean absolute percentage error for XGB :  17.01155512721498
r2 score for xgb :  0.18621801766244306
Mean absolute percentage error for NN :  14.639382899451142
r2 score for nn :  0.0052774719589694374
Mean absolute percentage error for XGB :  14.509118110883943
r2 score for xgb :  0.2943935160146919
Mean absolute percentage error for NN :  7.9994962386339905
r2 score for nn :  -0.005418023659189197
Mean absolute percentage error for XGB :  8.506641784578715
r2 score for xgb :  0.03305675901671212
Mean absolute percentage error for NN :  24.46069719671816
r2 score for nn :  0.2282339702694891
Mean absolute percentage error for XGB :  25.92053368100026
r2 score for xgb :  0.31221990526419785
Mean

MemoryError: 

In [49]:
count_links_processed_nn

23

In [50]:
count_links_processed_xgb

23

In [7]:
df.columns

Index(['index', 'dayofservice', 'tripid', 'plannedtime_arr_from',
       'plannedtime_dep_from', 'actualtime_arr_from', 'actualtime_dep_from',
       'plannedtime_arr_to', 'actualtime_arr_to', 'routeid', 'fromstop',
       'tostop', 'traveltime', 'dwelltime', 'distance', 'speed', 'dt', 'date',
       'day', 'month', 'hour', 'year', 'dewpt', 'msl', 'rain', 'rhum', 'temp',
       'vappr', 'wetb'],
      dtype='object')