In [211]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import shapiro
from sqlalchemy import create_engine

from sklearn import metrics
from sklearn import model_selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, RobustScaler
from sklearn.svm import SVR

### Dataset

In [2]:
# The connection URL can be overridden through the GPS_DB_URL environment
# variable so credentials do not have to be edited into the notebook.
# The dialect is spelled 'postgresql': the old 'postgres' alias is rejected
# by SQLAlchemy >= 1.4.
# NOTE(review): the fallback literal still embeds a username/password; it is
# kept only so the notebook keeps running unchanged. Remove it (and rotate
# the password) before sharing or committing this notebook.
DB_URL = os.environ.get(
    'GPS_DB_URL',
    'postgresql+psycopg2://jcds:pwdk2020@127.0.0.1:5432/gpstrajectory',
)
engine = create_engine(DB_URL)

In [4]:
# Read the 'trip_train_after_mm_2' table through the SQLAlchemy engine into a DataFrame.
df = pd.read_sql('trip_train_after_mm_2', engine)

In [5]:
# Preview the first rows.
# NOTE(review): the rendered output contains driver names and license plates;
# clear or redact it before sharing the notebook.
df.head()

Unnamed: 0,level_0,index,device_id,license_plate,driver,vehicle_group,departure_time,arrival_time,distance,interval,...,n_left_turns,n_right_turns,n_u_turns,n_go_straight,matched_distance,mapbox_est_duration,matched_trajectory,repeat_mapmatch,day_of_week,hour_of_day
0,0,0,1019939,B9338SDB,ASEP BACHTIAR,DC Cikarang,2020-04-01 03:19:41+00:00,2020-04-01 06:55:25+00:00,74.41,12944.0,...,9,2,0,0,45.6323,5040.3,"[[107.326558, -6.353522], [107.332499, -6.3547...",0,2,3
1,1,1,1019939,B9338SDB,ASEP BACHTIAR,DC Cikarang,2020-04-02 01:48:12+00:00,2020-04-02 03:58:57+00:00,49.02,7845.0,...,25,16,0,2,30.6105,5093.7,"[[107.288255, -6.329157], [107.289817, -6.3291...",0,3,1
2,2,2,1019939,B9338SDB,ASEP BACHTIAR,DC Cikarang,2020-04-02 04:23:46+00:00,2020-04-02 07:09:46+00:00,54.49,9960.0,...,19,7,2,2,35.5462,4061.5,"[[107.274384, -6.310544], [107.27816, -6.30377...",0,3,4
3,3,3,1019939,B9338SDB,ASEP BACHTIAR,DC Cikarang,2020-04-03 03:03:34+00:00,2020-04-03 07:07:19+00:00,77.42,14625.0,...,17,16,3,2,65.0469,13290.6,"[[107.151024, -6.364126], [107.153708, -6.3601...",0,4,3
4,4,4,1019939,B9338SDB,ASEP BACHTIAR,DC Cikarang,2020-04-06 01:30:51+00:00,2020-04-06 03:22:00+00:00,42.3,6669.0,...,23,2,1,1,23.1868,2376.0,"[[107.281357, -6.332143], [107.281196, -6.3320...",0,0,1


In [8]:
# Remove leftover index/bookkeeping columns.
# Re-binding `df` (instead of inplace=True) with errors='ignore' makes this
# cell safe to re-run: a second execution no longer raises KeyError because
# the columns are already gone.
df = df.drop(columns=['level_0', 'index', 'repeat_mapmatch'], errors='ignore')

In [9]:
# Show column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8550 entries, 0 to 8549
Data columns (total 33 columns):
device_id              8550 non-null object
license_plate          8550 non-null object
driver                 8550 non-null object
vehicle_group          8550 non-null object
departure_time         8550 non-null datetime64[ns, UTC]
arrival_time           8550 non-null datetime64[ns, UTC]
distance               8550 non-null float64
interval               8550 non-null float64
origin_region          8550 non-null object
destination_region     8550 non-null object
departure_hour         8550 non-null float64
trip_time_cat          8550 non-null object
trip_type              8550 non-null object
average_speed          8550 non-null float64
max_speed              8550 non-null float64
trajectory_arr         8550 non-null object
timestamps             8550 non-null object
trajectory_size        8550 non-null int64
n_intersections        8550 non-null int64
n_tolls                8550 

### Feature Description

- device_id : identifier of the gps receiver installed in the vehicle
- license_plate : license plate of the vehicle
- driver : driver name
- vehicle_group : vehicle group name; usually indicates which distribution center the vehicle belongs to
- departure_time : start time of the trip
- arrival_time : end time of the trip
- distance : distance covered in the trip
- interval : time duration of the trip
- origin_region : Origin point of the trip (predefined by company)
- destination_region : destination point of the trip
- departure_hour : start time of the trip (in hour)
- trip_time_cat : label for departure time (morning, noon, afternoon, out-of-office hour)
- trip_type: round-trip or point-to-point
- average_speed : average speed during the trip (kmh)
- max_speed : maximum speed reached during the trip
- trajectory_size : number of trajectory data recorded during the trip
- n_intersections : number of road intersections passed during the trip
- n_tolls : number of tolls leg in the route
- n_motorways : number of motorways leg in the route (intercity road)
- n_bridges : number of bridges leg in route
- n_tunnels : number of tunnels leg in route
- n_steps: number of direction steps (turns, uturn, go straight) in the matched route
- n_left_turns: number of left turns
- n_right_turns: number of right turns
- n_u_turns: number of u turns
- n_go_straight: number of go straight steps in intersection or fork

In [10]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,distance,interval,departure_hour,average_speed,max_speed,trajectory_size,n_intersections,n_tolls,n_bridges,n_tunnels,n_motorways,n_steps,n_left_turns,n_right_turns,n_u_turns,n_go_straight,matched_distance,mapbox_est_duration,day_of_week,hour_of_day
count,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0,8550.0
mean,9.157731,2696.330058,10.952164,21.901743,42.930292,130.333099,75.925146,0.657661,0.0,0.166316,1.52924,14.052398,5.102573,3.432047,0.78538,0.359064,8.250141,1264.428877,2.150994,3.546316
std,14.933954,4322.817671,2.02951,9.640708,20.803422,194.112116,118.741223,3.323635,0.0,0.750989,6.412828,19.561674,7.837251,5.474171,2.229471,0.908601,13.650744,1960.261851,1.50618,2.322797
min,0.001,22.0,4.97,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.349,92.0,9.38,15.158421,27.0,9.0,5.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.256425,59.125,1.0,2.0
50%,1.2495,345.0,10.67,22.206923,41.0,24.0,14.0,0.0,0.0,0.0,0.0,4.0,2.0,1.0,0.0,0.0,1.0517,252.25,2.0,3.0
75%,14.3975,4817.0,12.33,28.11074,58.0,216.0,115.0,0.0,0.0,0.0,0.0,21.0,7.0,5.0,1.0,0.0,12.6798,1949.375,3.0,5.0
max,145.67,51544.0,20.88,72.25,123.0,1561.0,1291.0,63.0,0.0,11.0,81.0,149.0,107.0,57.0,40.0,12.0,118.5774,15368.1,6.0,23.0


**Attributes Column**: ['distance', 'departure_hour', 'average_speed', 'max_speed', 'n_intersections', 'n_tolls', 'n_motorways', 'n_tunnels', 'n_steps', 'n_left_turns', 'n_right_turns', 'n_u_turns', 'n_go_straight', 'day_of_week']
       
**Target Column**: ['interval']

In [24]:
# Feature columns fed to every model in this notebook.
# NOTE(review): 'n_bridges' is not listed; the describe() output above shows
# it is 0 for every row, which is presumably why it was left out - confirm.
attr_cols = [
    'distance', 
    'departure_hour', 
    'average_speed', 
    'max_speed', 
    'n_intersections', 
    'n_tolls', 
    'n_motorways',
    'n_tunnels', 
    'n_steps',
    'n_left_turns', 
    'n_right_turns', 
    'n_u_turns', 
    'n_go_straight',
    'day_of_week'
]
# Name of the target column (a single string, not a list, despite the plural name).
target_cols = 'interval'

## Preparing Dataset

In [12]:
# check type of distribution in each attributes

def quick_check_n_dist(df, columns=None, ci=.05):
    """Run a Shapiro-Wilk normality test on each column and tabulate the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to test.
    columns : list of str, optional
        Columns to test. Defaults to the numeric columns, i.e. the columns
        reported by ``df.describe()``.
    ci : float, default .05
        Threshold compared against the p-value. Despite the name it is used
        as a significance level: a column is labelled normal when its
        p-value is strictly greater than ``ci``.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with columns ``'p-val'`` (Shapiro-Wilk
        p-value) and ``'normal dist'`` (``'y'`` or ``'n'``). An empty
        ``columns`` list yields an empty frame with those two columns.

    Notes
    -----
    The scipy documentation warns that Shapiro-Wilk p-values may be
    inaccurate for samples larger than 5000 observations, so treat the labels
    with care on large frames.
    """
    if columns is None:
        columns = list(df.describe().columns)  # check only numerical columns

    p_values = [shapiro(df[col])[1] for col in columns]

    # Build the table with its final shape up front so an empty column list
    # does not break, and do not rebind the `df` argument to the result.
    table = pd.DataFrame({'p-val': p_values}, index=list(columns), dtype=float)
    # Vectorised labelling; NaN p-values compare False and fall to 'n'.
    table['normal dist'] = np.where(table['p-val'] > ci, 'y', 'n')
    return table

In [13]:
# routine to remove all outliers from all attributes columns

def get_outlier_idx_iqr(x):
    """Return the index labels of values outside the 1.5 * IQR fences.

    ``x`` is a pandas Series. A value is an outlier when it lies strictly
    below ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``.
    When both quartiles coincide (IQR == 0) both fences collapse onto that
    quartile, so every value different from it is reported.
    """
    q1 = x.quantile(.25)
    q3 = x.quantile(.75)
    fence = 1.5 * (q3 - q1)
    below = x < (q1 - fence)
    above = x > (q3 + fence)
    return x.loc[below | above].index

def get_outlier_idx_stdev(x):
    """Return the index labels of values more than 3 standard deviations from the mean.

    ``x`` is a pandas Series; mean and standard deviation are taken from
    ``x`` itself (``Series.std``, i.e. the sample standard deviation).
    """
    center = x.mean()
    spread = x.std()
    lower_bound = center - 3 * spread
    upper_bound = center + 3 * spread
    flagged = (x < lower_bound) | (x > upper_bound)
    return x.loc[flagged].index

def get_outlier_idx_compound(d, columns=None):
    """Collect index labels of rows that are outliers in at least one column.

    Each column is first tested for normality with ``quick_check_n_dist``.
    Columns labelled normal use the 3-standard-deviation rule
    (``get_outlier_idx_stdev``); all others use the 1.5 * IQR rule
    (``get_outlier_idx_iqr``). The per-column results are unioned.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame to inspect.
    columns : list of str, optional
        Columns to inspect. Defaults to the numeric columns of ``d``.

    Returns
    -------
    pandas.Index
        Labels of the flagged rows, in no particular order.
    """
    if columns is None:
        # Fixed: this used to read the notebook-global `df` instead of the
        # frame that was passed in.
        columns = list(d.describe().columns)

    norm_table = quick_check_n_dist(d, columns)

    outlier_idx = set()
    for c in columns:
        if norm_table.loc[c, 'normal dist'] == 'y':
            # roughly normal column: flag by standard deviation
            detector = get_outlier_idx_stdev
        else:
            # non-normal column: flag by interquartile range
            detector = get_outlier_idx_iqr
        outlier_idx.update(detector(d[c]))

    return pd.Index(list(outlier_idx))

In [14]:
# df_no_outlier = df.iloc[df.index.difference(get_outlier_idx_compound(df, attr_cols))].copy()

In [15]:
# Module-level RobustScaler instance; cells further down call fit/fit_transform
# on this same object, so its fitted state always reflects the most recent call.
RScaler = RobustScaler()

In [28]:
def create_set_testing_dataset(df, attr_cols, prefix):
    '''
        Return a dictionary of dataset variants built from ``df``.

        Keys and treatments:
        - ``prefix``                        : untouched (index reset only)
        - ``prefix + '_scaled'``            : ``attr_cols`` robust-scaled
        - ``prefix + '_no_outlier'``        : rows flagged by
          ``get_outlier_idx_compound`` removed
        - ``prefix + '_no_outlier_scaled'`` : outliers removed, then
          ``attr_cols`` robust-scaled

        ``df`` is not modified. The index is reset with ``reset_index()``,
        so the previous index is kept as a regular column in every variant.

        Scaling uses a fresh ``RobustScaler`` per variant instead of refitting
        the notebook-level ``RScaler`` once per column; the scaled values are
        the same (the scaler works column by column) but the shared scaler is
        no longer left in an arbitrary fitted state as a side effect.

        NOTE(review): outlier removal and scaling are computed on the whole
        frame, i.e. before any train/test split done by the caller, so test
        rows influence the scaling statistics.
    '''

    def _robust_scaled(frame):
        # Copy of `frame` with every attribute column robust-scaled.
        scaled = frame.copy()
        values = RobustScaler().fit_transform(scaled[attr_cols])
        for pos, col in enumerate(attr_cols):
            # per-column assignment replaces the column, so integer
            # columns become float instead of being cast back
            scaled[col] = values[:, pos]
        return scaled

    df = df.reset_index()

    # Drop flagged rows by label; after reset_index labels and positions
    # coincide, so this selects the same rows as the former iloc lookup.
    outliers = get_outlier_idx_compound(df, attr_cols)
    df_no_outlier = df.drop(index=outliers)

    ret_dict = dict()
    ret_dict[prefix] = df  # untouched
    ret_dict[prefix+'_scaled'] = _robust_scaled(df)
    ret_dict[prefix+'_no_outlier'] = df_no_outlier
    ret_dict[prefix+'_no_outlier_scaled'] = _robust_scaled(df_no_outlier)

    return ret_dict
    
    

1. df_ptp

In [176]:
# Dataset variants restricted to rows whose trip_type is 'point-to-point'.
ptp = create_set_testing_dataset(df[df['trip_type'] == 'point-to-point'], attr_cols, 'df_ptp')



3. df_base

In [177]:
# Work on a copy so the one-hot encoding below does not alter `df`.
df_base = df.copy()

In [178]:
# One-hot encode trip_type into 'triptype_<value>' columns; the whole-data
# evaluation loop selects 'triptype_point-to-point' and 'triptype_round-trip'.
df_base = pd.get_dummies(df_base, prefix=['triptype'], columns=['trip_type'])

In [179]:
# Dataset variants for all trips (both trip types).
base = create_set_testing_dataset(df_base, attr_cols, 'df_base')



### Model Testing

In [180]:
def mean_absolute_percentage_error(y_true, y_pred): 
    """Mean absolute percentage error, expressed in percent.

    Both arguments are converted to NumPy arrays. The error of each sample is
    divided by the corresponding true value, so a true value of 0 produces a
    non-finite result.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.abs(relative_error).mean() * 100

In [194]:
# Collects one metrics dict per (dataset, model) pair from the loop below.
# Re-run this cell before re-running the loop, otherwise rows are duplicated.
eval_matrix_wholedata = []

In [None]:
# prepare models
polyReg = make_pipeline(
        PolynomialFeatures(3, include_bias=False),
        LinearRegression()
    )

# (label, estimator) pairs; each estimator instance is re-fitted for every
# dataset variant in the loop below.
models = [
    ('MLR', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('SVR', SVR()),
    ('PolynomialRegression', polyReg),
]

# evaluate each model in turn
# Fixed: the frames are now taken from `base` itself. The loop used to index
# a `dfs` mapping that is not defined anywhere in the visible notebook and
# raises NameError on a fresh kernel.
for dfname, dataset in base.items():

    X = dataset[attr_cols + ['triptype_point-to-point', 'triptype_round-trip']].values
    y = dataset[target_cols].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        eval_ = {
            'name': dfname+'_'+name,
            'MAE': metrics.mean_absolute_error(y_test, y_pred),
            '%MAPE': mean_absolute_percentage_error(y_test, y_pred),
            'RMSE': np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
            # root mean squared percentage error, in percent
            '%RMSPE': (np.sqrt(np.mean(np.square((y_test - y_pred) / y_test)))) * 100,
            'r2': round(metrics.r2_score(y_test, y_pred), 2)
        }

        eval_matrix_wholedata.append(eval_)

        print(eval_)
        

In [196]:
# Turn the collected metric dicts into a table indexed by '<dataset>_<model>'.
eval_mtrx_wholedata = pd.DataFrame(eval_matrix_wholedata).set_index('name')

In [197]:
# Lowest RMSE first.
eval_mtrx_wholedata.sort_values('RMSE', ascending=True)

Unnamed: 0_level_0,MAE,%MAPE,RMSE,%RMSPE,r2
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df_base_no_outlier_PolynomialRegression,625.33554,283.770549,1637.489011,614.068233,0.47
df_base_no_outlier_scaled_PolynomialRegression,625.335543,283.770552,1637.489114,614.068142,0.47
df_base_no_outlier_Lasso,593.466297,369.268411,1689.221604,724.371848,0.43
df_base_no_outlier_Ridge,593.870577,369.850664,1689.365139,726.614735,0.43
df_base_no_outlier_scaled_MLR,593.890389,369.88293,1689.36955,726.792782,0.43
df_base_no_outlier_MLR,593.890389,369.88293,1689.36955,726.792782,0.43
df_base_no_outlier_scaled_Ridge,593.613634,369.378312,1689.467378,725.851341,0.43
df_base_no_outlier_scaled_Lasso,591.885934,366.457676,1689.83551,719.688125,0.43
df_base_no_outlier_ElasticNet,583.970483,347.620562,1697.113897,636.994084,0.43
df_base_no_outlier_scaled_ElasticNet,549.006196,219.790578,1745.357653,375.488278,0.39


The RMSE is still too high to be considered an accurate prediction: 1689 s is more than 28 minutes.
The probable reasons are:

- There are many round-trip records in the data
- For round trips, the model does not account for in-between stops; more variables need to be added to the current prediction model

Now let's try to separate the data and use point-to-point trips only

In [214]:
# Collects one metrics dict per (dataset, model) pair for the point-to-point runs.
# Re-run this cell before re-running the loops that append to it.
eval_matrix_ptp = []

In [None]:
# prepare models
polyReg = make_pipeline(
        PolynomialFeatures(3, include_bias=False),
        LinearRegression()
    )

# (label, estimator) pairs; each estimator instance is re-fitted for every
# dataset variant in the loop below.
models = [
    ('MLR', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('SVR', SVR()),
    ('PolynomialRegression', polyReg),
]

# evaluate each model in turn
# Fixed: the frames are now taken from `ptp` itself. The loop used to index
# a `dfs` mapping that is not defined anywhere in the visible notebook and
# raises NameError on a fresh kernel.
for dfname, dataset in ptp.items():

    X = dataset[attr_cols].values
    y = dataset[target_cols].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        eval_ = {
            'name': dfname+'_'+name,
            'MAE': metrics.mean_absolute_error(y_test, y_pred),
            '%MAPE': mean_absolute_percentage_error(y_test, y_pred),
            'RMSE': np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
            # root mean squared percentage error, in percent
            '%RMSPE': (np.sqrt(np.mean(np.square((y_test - y_pred) / y_test)))) * 100,
            'r2': round(metrics.r2_score(y_test, y_pred), 2)
        }

        eval_matrix_ptp.append(eval_)

        print(eval_)

In [242]:
# Metrics table for the point-to-point runs, indexed by run name and
# ordered from lowest to highest RMSE.
eval_mtrx_ptp = (
    pd.DataFrame(eval_matrix_ptp)
    .set_index('name')
    .sort_values('RMSE', ascending=True)
)
eval_mtrx_ptp

Unnamed: 0_level_0,MAE,%MAPE,RMSE,%RMSPE,r2
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df_ptp_no_outlier_scaled_Lasso,311.723418,354.204308,608.258196,585.180145,0.12
df_ptp_no_outlier_Lasso,313.084564,356.989332,608.605151,589.729861,0.12
df_ptp_no_outlier_scaled_Ridge,313.690669,357.32624,608.755243,590.38769,0.12
df_ptp_no_outlier_Ridge,313.86882,357.734702,608.803726,591.06068,0.12
df_ptp_no_outlier_scaled_MLR,313.915856,357.775631,608.812118,591.126898,0.12
df_ptp_no_outlier_MLR,313.915856,357.775631,608.812118,591.126898,0.12
df_ptp_no_outlier_ElasticNet,297.613444,338.877243,611.448255,560.187984,0.11
df_ptp_no_outlier_scaled_ElasticNet,259.63379,248.642173,615.391757,390.20534,0.1
df_ptp_no_outlier_scaled_SGDRegressor,346.937379,423.43194,630.77326,708.34314,0.05
df_ptp_no_outlier_scaled_SVR,170.131969,49.580496,649.789128,81.584954,-0.0


The RMSE is far better than on the whole data, although it is still too high and the model is considered underfit for production

In [245]:
# Display the feature list used by the models.
attr_cols

['distance',
 'departure_hour',
 'average_speed',
 'max_speed',
 'n_intersections',
 'n_tolls',
 'n_motorways',
 'n_tunnels',
 'n_steps',
 'n_left_turns',
 'n_right_turns',
 'n_u_turns',
 'n_go_straight',
 'day_of_week']

#### SGD Regressor

In [217]:
# Only the scaled point-to-point variants are evaluated with SGDRegressor.
ptp_scaled = {
    key: ptp[key]
    for key in ('df_ptp_scaled', 'df_ptp_no_outlier_scaled')
}

In [218]:
# prepare models
# random_state is pinned so the stochastic SGD fit gives the same metrics on
# every run; the numbers will differ slightly from unseeded earlier runs.
models = [
    ('SGDRegressor', SGDRegressor(max_iter=1000, random_state=0)),
]

# evaluate each model in turn
# Fixed: the frames are now taken from `ptp_scaled` itself. The loop used to
# index a `dfs` mapping that is not defined anywhere in the visible notebook
# and raises NameError on a fresh kernel.
for dfname, dataset in ptp_scaled.items():

    X = dataset[attr_cols].values
    y = dataset[target_cols].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        eval_ = {
            'name': dfname+'_'+name,
            'MAE': metrics.mean_absolute_error(y_test, y_pred),
            '%MAPE': mean_absolute_percentage_error(y_test, y_pred),
            'RMSE': np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
            # root mean squared percentage error, in percent
            '%RMSPE': (np.sqrt(np.mean(np.square((y_test - y_pred) / y_test)))) * 100,
            'r2': round(metrics.r2_score(y_test, y_pred), 2)
        }

        # appended to the same list as the other point-to-point runs
        eval_matrix_ptp.append(eval_)

        print(eval_)

{'name': 'df_ptp_scaled_SGDRegressor', 'MAE': 431.3279965432428, '%MAPE': 296.65820295665833, 'RMSE': 1109.0822109880958, '%RMSPE': 523.070153748394, 'r2': 0.53}
{'name': 'df_ptp_no_outlier_scaled_SGDRegressor', 'MAE': 346.93737936851375, '%MAPE': 423.43194022889134, 'RMSE': 630.7732599496927, '%RMSPE': 708.3431404286952, 'r2': 0.05}


In [220]:
# Rebuild the point-to-point metrics table (now including the SGDRegressor
# rows appended above) and display it with the lowest RMSE first.
eval_mtrx_ptp = pd.DataFrame(eval_matrix_ptp).set_index('name')
eval_mtrx_ptp.sort_values('RMSE', ascending=True)

Unnamed: 0_level_0,MAE,%MAPE,RMSE,%RMSPE,r2
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
df_ptp_no_outlier_scaled_Lasso,311.723418,354.204308,608.258196,585.180145,0.12
df_ptp_no_outlier_Lasso,313.084564,356.989332,608.605151,589.729861,0.12
df_ptp_no_outlier_scaled_Ridge,313.690669,357.32624,608.755243,590.38769,0.12
df_ptp_no_outlier_Ridge,313.86882,357.734702,608.803726,591.06068,0.12
df_ptp_no_outlier_scaled_MLR,313.915856,357.775631,608.812118,591.126898,0.12
df_ptp_no_outlier_MLR,313.915856,357.775631,608.812118,591.126898,0.12
df_ptp_no_outlier_ElasticNet,297.613444,338.877243,611.448255,560.187984,0.11
df_ptp_no_outlier_scaled_ElasticNet,259.63379,248.642173,615.391757,390.20534,0.1
df_ptp_no_outlier_scaled_SGDRegressor,346.937379,423.43194,630.77326,708.34314,0.05
df_ptp_no_outlier_scaled_SVR,170.131969,49.580496,649.789128,81.584954,-0.0


However, because there are many outliers in our dataset, we should also evaluate the models with cross-validation

## Cross Validation

In [227]:
# Collects one cross-validation summary dict per (dataset, model) pair.
# Re-run this cell before re-running the loop below to avoid duplicate rows.
cv_ptp = []

In [228]:
# prepare configuration for cross validation test harness
seed = 7

# prepare models
models = [
    ('MLR', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('ElasticNet', ElasticNet()),
    ('SVR', SVR()),
]

# evaluate each model in turn

results = []
names = []
# scikit-learn reports this score negated (higher is better), so the stored
# 'RMSE_' values are negative.
scoring = 'neg_root_mean_squared_error'

# One splitter for all runs: with a fixed integer seed it yields the same
# folds on every use, so it does not need to be rebuilt per model.
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)

# Fixed: the frames are now taken from `ptp` itself. The loop used to index
# a `dfs` mapping that is not defined anywhere in the visible notebook and
# raises NameError on a fresh kernel.
for dfname, dataset in ptp.items():

    X = dataset[attr_cols]
    Y = dataset[target_cols]

    for name, model in models:
        cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(dfname+'_'+name)

        cv_ptp.append({
            'name': dfname+'_'+name,
            'RMSE_': cv_results.mean(),
            'RMSE_std': cv_results.std()
        })

        msg = "%s: %f (%f)" % (name+'_'+dfname, cv_results.mean(), cv_results.std())
        print(msg)

MLR_df_ptp: -1169.200763 (588.422137)
Ridge_df_ptp: -1169.192543 (588.427882)
Lasso_df_ptp: -1168.451092 (588.935052)
ElasticNet_df_ptp: -1166.118090 (590.617431)
SVR_df_ptp: -1570.158888 (491.683518)
MLR_df_ptp_scaled: -1169.200763 (588.422137)
Ridge_df_ptp_scaled: -1169.169819 (588.470132)
Lasso_df_ptp_scaled: -1168.304453 (589.264605)
ElasticNet_df_ptp_scaled: -1182.745985 (597.156305)
SVR_df_ptp_scaled: -1562.477482 (492.998658)
MLR_df_ptp_no_outlier: -1074.363731 (731.812177)
Ridge_df_ptp_no_outlier: -1074.355771 (731.822129)
Lasso_df_ptp_no_outlier: -1074.152469 (731.949717)
ElasticNet_df_ptp_no_outlier: -1078.099948 (732.842356)
SVR_df_ptp_no_outlier: -1123.896050 (730.124165)
MLR_df_ptp_no_outlier_scaled: -1074.363731 (731.812177)
Ridge_df_ptp_no_outlier_scaled: -1074.336293 (731.842938)
Lasso_df_ptp_no_outlier_scaled: -1074.019047 (732.067201)
ElasticNet_df_ptp_no_outlier_scaled: -1086.284198 (737.178779)
SVR_df_ptp_no_outlier_scaled: -1120.337229 (732.003884)


In [229]:
# 'RMSE_' holds negated RMSE values, so sorting in descending order lists the
# smallest error first.
pd.DataFrame(cv_ptp).set_index('name').sort_values('RMSE_', ascending=False)

Unnamed: 0_level_0,RMSE_,RMSE_std
name,Unnamed: 1_level_1,Unnamed: 2_level_1
df_ptp_no_outlier_scaled_Lasso,-1074.019047,732.067201
df_ptp_no_outlier_Lasso,-1074.152469,731.949717
df_ptp_no_outlier_scaled_Ridge,-1074.336293,731.842938
df_ptp_no_outlier_Ridge,-1074.355771,731.822129
df_ptp_no_outlier_MLR,-1074.363731,731.812177
df_ptp_no_outlier_scaled_MLR,-1074.363731,731.812177
df_ptp_no_outlier_ElasticNet,-1078.099948,732.842356
df_ptp_no_outlier_scaled_ElasticNet,-1086.284198,737.178779
df_ptp_no_outlier_scaled_SVR,-1120.337229,732.003884
df_ptp_no_outlier_SVR,-1123.89605,730.124165


It seems that Lasso with scaled attributes is the best among others for now

Train on all set

In [231]:
# Final training frame: point-to-point trips, outliers removed, features scaled.
df_train = ptp['df_ptp_no_outlier_scaled']

In [232]:
# Fit the final Lasso model on every row of the training frame (no hold-out).
# The features in X are already robust-scaled.
X = df_train[attr_cols]
y = df_train[target_cols]

model_lasso = Lasso()
model_lasso.fit(X, y)


Lasso()

In [238]:
# Fit the scaler that is saved next to the model on the *unscaled*
# outlier-free point-to-point features.
# Fixed: it used to be fitted on X, which is already robust-scaled, so the
# saved scaler did not reproduce the transform the training data went through.
# RobustScaler works column by column, so one joint fit on the raw columns
# matches the per-column scaling used to build the training frame.
RScaler.fit(ptp['df_ptp_no_outlier'][attr_cols])

RobustScaler()

## Analysis

The model evaluation results suggest the model is still **underfit** and not ready to be used in production

Recommendations to improve:

- Reduce round-trip samples; customer premises should be tagged, or we can cluster the raw trajectories (although that will take a long time)
- Increase trip sampling to at least a year to see any correlation with the time of year (public holidays, important events)
- Add more trip features that can be inferred from historical GPS data (lane count, congestion, pavement type)
- Apply the model to each segment of the road.
- Add traffic data

### Dump Model

In [237]:
# Serialize the fitted Lasso model to the file 'model_lasso' in the working directory.
joblib.dump(model_lasso, 'model_lasso')

['model_lasso']

In [240]:
# Serialize the RobustScaler, in whatever state its last fit left it, to the
# file 'robust_scaler' in the working directory.
joblib.dump(RScaler, 'robust_scaler')

['robust_scaler']

In [247]:
# List the columns available for export.
df.columns

Index(['device_id', 'license_plate', 'driver', 'vehicle_group',
       'departure_time', 'arrival_time', 'distance', 'interval',
       'origin_region', 'destination_region', 'departure_hour',
       'trip_time_cat', 'trip_type', 'average_speed', 'max_speed',
       'trajectory_arr', 'timestamps', 'trajectory_size', 'n_intersections',
       'n_tolls', 'n_bridges', 'n_tunnels', 'n_motorways', 'n_steps',
       'n_left_turns', 'n_right_turns', 'n_u_turns', 'n_go_straight',
       'matched_distance', 'mapbox_est_duration', 'matched_trajectory',
       'day_of_week', 'hour_of_day'],
      dtype='object')

In [248]:
# Export the listed columns to 'dataset.json' as a JSON array of records.
# The selection leaves out 'trajectory_arr', 'mapbox_est_duration' and
# 'matched_trajectory'.
# NOTE(review): the export still contains 'driver' and 'license_plate'.
df.loc[:, [
    'device_id', 'license_plate', 'driver', 'vehicle_group',
    'departure_time', 'arrival_time', 'distance', 'interval',
    'origin_region', 'destination_region', 'departure_hour',
    'trip_time_cat', 'trip_type', 'average_speed', 'max_speed',
    'timestamps', 'trajectory_size', 'n_intersections',
    'n_tolls', 'n_bridges', 'n_tunnels', 'n_motorways', 'n_steps',
    'n_left_turns', 'n_right_turns', 'n_u_turns', 'n_go_straight',
    'matched_distance',
    'day_of_week', 'hour_of_day',
]].to_json(path_or_buf='dataset.json', orient='records')