In [1]:
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme.expressions import Beta, Variable, log, exp
from biogeme import models
from biogeme import results as res

In [2]:
# Load the tab-separated trip data from a local file.
# Alternative source kept for reference: "http://transp-or.epfl.ch/data/lpmc.dat"
data_file = 'lpmc10.dat'
lpmc = pd.read_csv(data_file, delimiter='\t')
lpmc

Unnamed: 0,trip_id,household_id,person_n,trip_n,travel_mode,purpose,fueltype,faretype,bus_scale,survey_year,...,dur_pt_access,dur_pt_rail,dur_pt_bus,dur_pt_int,pt_interchanges,dur_driving,cost_transit,cost_driving_fuel,cost_driving_ccharge,driving_traffic_percent
0,20,5,1,0,4,3,1,5,0.0,1,...,0.381667,0.000000,0.062222,0.000000,0,0.117222,0.00,0.41,0.0,0.097156
1,41,9,3,0,4,3,1,5,0.0,1,...,0.146944,0.000000,0.225000,0.000000,0,0.200833,0.00,0.48,0.0,0.378976
2,69,13,2,1,4,3,1,1,1.0,1,...,0.029444,0.083333,0.735833,0.398056,3,0.716944,6.00,2.16,0.0,0.582720
3,102,20,2,0,2,3,1,1,1.0,1,...,0.339722,0.183333,0.116667,0.266667,1,0.250833,3.00,0.89,0.0,0.170543
4,105,21,0,1,4,3,1,1,1.0,1,...,0.126389,0.000000,0.150000,0.000000,0,0.125833,1.50,0.37,0.0,0.154525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,80998,17591,0,5,4,3,1,5,0.0,3,...,0.189167,0.000000,0.117778,0.000000,0,0.082500,0.00,0.22,10.5,0.239057
4996,81000,17592,0,0,3,3,6,5,0.0,3,...,0.105278,0.000000,0.220278,0.000000,0,0.213611,0.00,0.52,0.0,0.412224
4997,81015,17597,0,3,4,3,1,5,0.0,3,...,0.343056,0.000000,0.177500,0.000000,0,0.189444,0.00,0.76,0.0,0.086510
4998,81041,17604,2,4,3,1,1,2,0.0,3,...,0.344444,0.316667,0.000000,0.083333,1,0.386111,1.05,0.98,0.0,0.340288


In [3]:
# Wrap the trip DataFrame in a Biogeme Database object named 'trips';
# this object is passed to every BIOGEME estimator created below.
database = db.Database('trips', lpmc)

In [6]:
# Bind one Biogeme Variable per data column to a module-level name equal to
# the column name, so the model specifications below can refer to columns
# directly (e.g. dur_walking, cost_transit, travel_mode).
globals().update({name: Variable(name) for name in lpmc.columns})

# Model 0

In [39]:
# Alternative-specific constants. Walking is the reference alternative: its
# constant is created with the last argument set to 1 and does not appear
# among the estimated parameters; the other three constants are estimated.
asc_walk = Beta('asc_walk', 0, None, None, 1)
asc_cycling, asc_public, asc_driving = (
    Beta(label, 0, None, None, 0)
    for label in ('asc_cycling', 'asc_public', 'asc_driving')
)

In [40]:
# Total cost of driving: sum of the two driving cost columns.
cost_driving = (
    cost_driving_fuel
    + cost_driving_ccharge
)

# Total duration of public transportation: sum of the four pt duration columns.
dur_public = (
    dur_pt_access
    + dur_pt_rail
    + dur_pt_bus
    + dur_pt_int
)

In [41]:
# Generic coefficients of the base model: one cost coefficient and one
# travel-time coefficient shared by all alternatives. Both are estimated.
beta_cost, beta_time = (
    Beta(label, 0, None, None, 0)
    for label in ('beta_cost', 'beta_time')
)

In [42]:
# Utility functions of model 0.
# Walking and cycling depend on travel time only.
V_walk = (
    asc_walk
    + beta_time * dur_walking
)
V_cycling = (
    asc_cycling
    + beta_time * dur_cycling
)

# Driving and public transport depend on travel time and on cost.
V_driving = (
    asc_driving
    + beta_time * dur_driving
    + beta_cost * cost_driving
)
V_public = (
    asc_public
    + beta_time * dur_public
    + beta_cost * cost_transit
)

In [59]:
# Utility of each alternative, keyed by the alternative code that is
# compared with the travel_mode column.
V = {
    1: V_walk,
    2: V_cycling,
    3: V_public,
    4: V_driving,
}

# Log of the logit choice probability of the chosen mode. No availability
# conditions are supplied (second argument is None); a licence-based choice
# set was considered but is not used:
#   av = {1: 1, 2: 1, 3: 1, 4: driving_license}
logprob = models.loglogit(V, None, travel_mode)

In [60]:
# Build the estimator for model 0 from the database and the log-probability
# expression. The model name presumably determines the output file names
# (an output further down reports 'model_base~01.html').
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'model_base'

In [61]:
# Estimate model 0; the results object is reused in the model comparison below.
results0 = biogeme.estimate()

In [63]:
# Goodness-of-fit summary of model 0 (log likelihood, AIC, BIC, ...).
print(results0.printGeneralStatistics())

Number of estimated parameters:	5
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-4722.972
Final log likelihood:	-4722.972
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.00106
Akaike Information Criterion:	9455.944
Bayesian Information Criterion:	9488.53
Final gradient norm:	3.2270E-03
Nbr of threads:	16



In [62]:
# Table of estimated coefficients of model 0 with robust standard errors,
# t-tests and p-values.
results0.getEstimatedParameters()

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
asc_cycling,-3.66082,0.100795,-36.319479,0.0
asc_driving,-1.335133,0.076403,-17.474928,0.0
asc_public,-0.559065,0.05262,-10.624619,0.0
beta_cost,-0.14985,0.01378,-10.874275,0.0
beta_time,-5.410243,0.188334,-28.726794,0.0


In [58]:
# results0.data.htmlFileName

'model_base~01.html'

# Model 1


In [64]:
# Alternative-specific constants, redefined so that the model 1 section can
# be read on its own. The walking constant is created with the last argument
# set to 1 (it is not among the estimated parameters); the others are estimated.
asc_walk = Beta('asc_walk', 0, None, None, 1)
asc_cycling, asc_public, asc_driving = (
    Beta(label, 0, None, None, 0)
    for label in ('asc_cycling', 'asc_public', 'asc_driving')
)

In [65]:
# Total cost of driving: sum of the two driving cost columns.
cost_driving = (
    cost_driving_fuel
    + cost_driving_ccharge
)

# Total duration of public transportation: sum of the four pt duration columns.
dur_public = (
    dur_pt_access
    + dur_pt_rail
    + dur_pt_bus
    + dur_pt_int
)

In [66]:
# Model 1 replaces the generic travel-time coefficient of model 0 with one
# travel-time coefficient per alternative. All four are estimated.
beta_time_walk, beta_time_cycling, beta_time_driving, beta_time_public = (
    Beta(label, 0, None, None, 0)
    for label in (
        'beta_time_walk',
        'beta_time_cycling',
        'beta_time_driving',
        'beta_time_public',
    )
)

In [67]:
# Utility functions of model 1: alternative-specific travel-time
# coefficients, generic cost coefficient (beta_cost from the model 0 section).
V_walk = (
    asc_walk
    + beta_time_walk * dur_walking
)
V_cycling = (
    asc_cycling
    + beta_time_cycling * dur_cycling
)
V_driving = (
    asc_driving
    + beta_time_driving * dur_driving
    + beta_cost * cost_driving
)
V_public = (
    asc_public
    + beta_time_public * dur_public
    + beta_cost * cost_transit
)

In [68]:
# Utility of each alternative, keyed by the alternative code that is
# compared with the travel_mode column.
V = {
    1: V_walk,
    2: V_cycling,
    3: V_public,
    4: V_driving,
}

# Log of the logit choice probability of the chosen mode; no availability
# conditions are supplied (second argument is None).
logprob = models.loglogit(V, None, travel_mode)

In [69]:
# Build the estimator for model 1. This rebinds `biogeme`, so the model 0
# estimator is no longer reachable under that name.
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'model_1'

In [70]:
# Estimate model 1; the results object is reused in the model comparisons below.
results1 = biogeme.estimate()

In [71]:
# Goodness-of-fit summary of model 1 (log likelihood, AIC, BIC, ...).
print(results1.printGeneralStatistics())

Number of estimated parameters:	8
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-4412.008
Final log likelihood:	-4412.008
Likelihood ratio test for the init. model:	2.305769e-07
Rho-square for the init. model:	2.61e-11
Rho-square-bar for the init. model:	-0.00181
Akaike Information Criterion:	8840.016
Bayesian Information Criterion:	8892.154
Final gradient norm:	2.1274E-02
Nbr of threads:	16



In [72]:
# Table of estimated coefficients of model 1 with robust standard errors,
# t-tests and p-values.
results1.getEstimatedParameters()

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
asc_cycling,-4.590558,0.179038,-25.640137,0.0
asc_driving,-2.07217,0.120171,-17.243491,0.0
asc_public,-2.439714,0.121955,-20.004987,0.0
beta_cost,-0.141794,0.015193,-9.332676,0.0
beta_time_cycling,-5.195468,0.423895,-12.256504,0.0
beta_time_driving,-5.874442,0.359744,-16.329512,0.0
beta_time_public,-3.199848,0.230718,-13.869103,0.0
beta_time_walk,-8.367751,0.36013,-23.235347,0.0


In [27]:
# results1.data.htmlFileName

'model_1~01.html'

## Comparing Models 0 and 1

In [28]:
# Keep the general statistics of model 0 as an object and print the
# formatted summary for side-by-side comparison with model 1.
# NOTE(review): general_statistics_model_0 is not referenced again in the
# visible cells.
general_statistics_model_0 = results0.getGeneralStatistics()
print(results0.printGeneralStatistics())

Number of estimated parameters:	5
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-4722.972
Final log likelihood:	-4722.972
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.00106
Akaike Information Criterion:	9455.944
Bayesian Information Criterion:	9488.53
Final gradient norm:	3.2270E-03
Nbr of threads:	16



In [29]:
# Keep the general statistics of model 1 as an object and print the
# formatted summary for side-by-side comparison with model 0.
# NOTE(review): general_statistics_model_1 is not referenced again in the
# visible cells.
general_statistics_model_1 = results1.getGeneralStatistics()
print(results1.printGeneralStatistics())

Number of estimated parameters:	8
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-4412.008
Final log likelihood:	-4412.008
Likelihood ratio test for the init. model:	2.373126e-07
Rho-square for the init. model:	2.69e-11
Rho-square-bar for the init. model:	-0.00181
Akaike Information Criterion:	8840.016
Bayesian Information Criterion:	8892.154
Final gradient norm:	2.1557E-02
Nbr of threads:	16



### Likelihood ratio test

$H_0$: $\beta_{time\_walk}=\beta_{time\_driving}=\beta_{time\_public}=\beta_{time\_cycling}$

Because model 0 is a restricted version of model 1, we can apply the likelihood ratio test.

In [32]:
# Likelihood ratio test of model 0 (restricted) against model 1 (unrestricted).
# The 5% significance level is passed explicitly: no `alpha` variable is
# defined by any cell above, so the previous call failed on a fresh kernel.
results1.likelihood_ratio_test(results0, 0.05)

LRTuple(message='H0 can be rejected at level 5.0%', statistic=621.9276592287497, threshold=7.814727903251179)

According to the test result, the null hypothesis is rejected at the 5% level.

# Model 2

1. One alternative attribute: `pt_interchanges` for public transport  
The assumption is that the number of interchanges is relevant to the overall experience of taking public transport. 
2. One socio-economic characteristic: `driving_license`, which interacts with ASC_driving  
The assumption is that holding a driving license changes the baseline utility of the driving alternative.

Model 2 specification:

$ V_{walk}=ASC_{walk}+\beta_{time\_walk}time_{walk} $

$ V_{cycling}=ASC_{cycling}+\beta_{time\_cycling}time_{cycling} $

$ V_{public}=ASC_{public}+\beta_{time\_public}time_{public}+\beta_{cost}cost_{public}+\beta_{interchange}pt\_interchanges$

$ V_{driving}=ASC_{driving}+\beta_{time\_driving}time_{driving}+\beta_{cost}cost_{driving}+\beta_{drivingliscence}driving\_license$


In [76]:
# Alternative-specific constants. asc_walk is normalized to 0 (last argument
# set to 1); the three remaining constants are estimated.
asc_walk = Beta('asc_walk', 0, None, None, 1)
asc_cycling, asc_public, asc_driving = (
    Beta(label, 0, None, None, 0)
    for label in ('asc_cycling', 'asc_public', 'asc_driving')
)

In [77]:
# Total cost of driving: sum of the two driving cost columns.
cost_driving = (
    cost_driving_fuel
    + cost_driving_ccharge
)

# Total duration of public transportation: sum of the four pt duration columns.
dur_public = (
    dur_pt_access
    + dur_pt_rail
    + dur_pt_bus
    + dur_pt_int
)

In [108]:
# Alternative-specific travel-time coefficients (as in model 1).
beta_time_walk, beta_time_cycling, beta_time_driving, beta_time_public = (
    Beta(label, 0, None, None, 0)
    for label in (
        'beta_time_walk',
        'beta_time_cycling',
        'beta_time_driving',
        'beta_time_public',
    )
)

# Generic cost coefficient.
beta_cost = Beta('beta_cost', 0, None, None, 0)

# Coefficients added in model 2: number of public transport interchanges and
# driving license. The spelling 'beta_drivingliscence' is kept because it is
# the parameter name that appears in the estimation results.
beta_interchange = Beta('beta_interchange', 0, None, None, 0)
beta_drivingliscence = Beta('beta_drivingliscence', 0, None, None, 0)

# NOTE(review): the two coefficients below are not referenced by any model
# specification visible in this notebook.
beta_faretype = Beta('beta_faretype', 0, None, None, 0)
beta_traffic = Beta('beta_traffic', 0, None, None, 0)

In [127]:
# model 2 specification 
V_walk = asc_walk + beta_time_walk * dur_walking
V_cycling = asc_cycling + beta_time_cycling * dur_cycling
V_public = asc_public + beta_time_public * dur_public + beta_cost * cost_transit + beta_interchange * pt_interchanges
# V_public = asc_public + beta_time_public * dur_public + beta_cost * cost_transit
V_driving = asc_driving + beta_time_driving * dur_driving + beta_cost * cost_driving + beta_drivingliscence * driving_license

V = {1: V_walk, 2: V_cycling, 3: V_public, 4: V_driving}

logprob = models.loglogit(V, None, travel_mode)

biogeme.modelName = 'model_2'

In [128]:
# Build the estimator for model 2 and name it. This rebinds `biogeme`, so the
# previous model's estimator is no longer reachable under that name.
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'model_2'

In [129]:
# Estimate model 2; the results object is reused in the model comparisons below.
results2 = biogeme.estimate()

In [130]:
# Goodness-of-fit summary of model 2 (log likelihood, AIC, BIC, ...).
print(results2.printGeneralStatistics())

Number of estimated parameters:	10
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-4692.366
Final log likelihood:	-4207.638
Likelihood ratio test for the init. model:	969.455
Rho-square for the init. model:	0.103
Rho-square-bar for the init. model:	0.101
Akaike Information Criterion:	8435.277
Bayesian Information Criterion:	8500.449
Final gradient norm:	1.6101E-02
Nbr of threads:	16



In [131]:
# Table of estimated coefficients of model 2 with robust standard errors,
# t-tests and p-values.
results2.getEstimatedParameters()

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
asc_cycling,-4.658131,0.185022,-25.176143,0.0
asc_driving,-2.925308,0.134302,-21.781559,0.0
asc_public,-2.495268,0.125829,-19.830644,0.0
beta_cost,-0.130477,0.013883,-9.398482,0.0
beta_drivingliscence,1.359167,0.069276,19.61966,0.0
beta_interchange,-0.030149,0.085147,-0.354087,0.723274
beta_time_cycling,-5.401868,0.443314,-12.185207,0.0
beta_time_driving,-6.437742,0.374058,-17.210535,0.0
beta_time_public,-3.394145,0.259028,-13.103413,0.0
beta_time_walk,-8.564794,0.370564,-23.112851,0.0


In [245]:
# results2.data.htmlFileName

'model_2.html'

## Compare model 2 and 1
Since model 1 is a restricted version of model 2, we perform a likelihood ratio test. Alternatively, we can look at the t-tests of the coefficients $\beta_{drivingliscence}$ and $\beta_{interchange}$ reported in the estimation table.

In [97]:
# Likelihood ratio test of model 1 (restricted) against model 2 (unrestricted).
# The 5% significance level is passed explicitly: no `alpha` variable is
# defined by any cell above, so the previous call failed on a fresh kernel.
results2.likelihood_ratio_test(results1, 0.05)

LRTuple(message='H0 can be rejected at level 5.0%', statistic=408.60707939159147, threshold=3.841458820694124)

Thus, model 2 is preferred to model 1.

# Model 3

We apply a Box-Cox transformation to the travel time of every alternative.

In [133]:
# define lambda
lambda_boxcox = Beta('lambda_boxcox', 1, None, None, 0)
boxcox_time_1 = models.boxcox(dur_walking, lambda_boxcox)
boxcox_time_2 = models.boxcox(dur_cycling, lambda_boxcox)
boxcox_time_3 = models.boxcox(dur_public, lambda_boxcox)
boxcox_time_4 = models.boxcox(dur_driving, lambda_boxcox)

In [134]:
# model 3 specification 
V_walk = asc_walk + beta_time_walk * boxcox_time_1
V_cycling = asc_cycling + beta_time_cycling * boxcox_time_2
V_public = asc_public + beta_time_public * boxcox_time_3+ beta_cost * cost_transit + beta_interchange * pt_interchanges
# V_public = asc_public + beta_time_public * dur_public + beta_cost * cost_transit
V_driving = asc_driving + beta_time_driving * boxcox_time_4 + beta_cost * cost_driving + beta_drivingliscence * driving_license

V = {1: V_walk, 2: V_cycling, 3: V_public, 4: V_driving}

logprob = models.loglogit(V, None, travel_mode)

biogeme.modelName = 'model_3'

In [135]:
# Build, name and estimate model 3, then print its goodness-of-fit summary.
# This rebinds `biogeme`; results3 is reused in the model comparison below.
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'model_3'
results3 = biogeme.estimate()
print(results3.printGeneralStatistics())


Number of estimated parameters:	11
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-9534.858
Final log likelihood:	-4135.707
Likelihood ratio test for the init. model:	10798.3
Rho-square for the init. model:	0.566
Rho-square-bar for the init. model:	0.565
Akaike Information Criterion:	8293.415
Bayesian Information Criterion:	8365.104
Final gradient norm:	2.1883E-02
Nbr of threads:	16



In [136]:
# Table of estimated coefficients of model 3 with robust standard errors,
# t-tests and p-values.
results3.getEstimatedParameters()

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
asc_cycling,-2.195416,0.276622,-7.936516,1.998401e-15
asc_driving,-1.025655,0.223989,-4.579032,4.67133e-06
asc_public,1.779871,0.172822,10.298884,0.0
beta_cost,-0.120164,0.01358,-8.848625,0.0
beta_drivingliscence,1.365801,0.069714,19.591526,0.0
beta_interchange,-0.087404,0.076757,-1.138699,0.2548289
beta_time_cycling,-3.383667,0.262838,-12.873591,0.0
beta_time_driving,-3.565112,0.268147,-13.29535,0.0
beta_time_public,-2.60359,0.187867,-13.85866,0.0
beta_time_walk,-5.625222,0.250279,-22.475829,0.0


## Compare model 3 and 2
Under the null hypothesis that $\lambda=1$, the statistic $$\frac{\widehat{\lambda}-1}{\widehat{\sigma}_\lambda}$$ approximately follows a $N(0, 1)$ distribution.  
Thus, we perform a t-test on $\lambda$.

In [141]:
# t statistic for H0: lambda = 1, i.e. (estimate - 1) / robust standard error,
# read from the lambda_boxcox row of the model 3 estimation table.
estimated_parameters_3 = results3.getEstimatedParameters()
lambda_row = estimated_parameters_3.loc['lambda_boxcox']
t_test_lambda = (lambda_row['Value'] - 1) / lambda_row['Rob. Std err']
t_test_lambda

-14.598611164329755

In [144]:
from scipy.stats import norm
from biogeme.results import calcPValue

In [145]:
# 95% quantile of the standard normal distribution, i.e. the critical value
# of a one-sided test at the 5% level.
# NOTE(review): H0: lambda = 1 is presumably tested two-sided, in which case
# the 5% critical value would be norm.ppf(0.975) — confirm. The conclusion
# below relies on the p-value, not on this number.
norm.ppf(.95)

1.6448536269514722

In [146]:
# p-value of the t statistic for H0: lambda = 1, computed with Biogeme's
# calcPValue helper.
calcPValue(t_test_lambda)

0.0

Because the $p$-value is below $0.05$, we reject the null hypothesis that $\lambda=1$. Thus, model 3 is preferred.

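Model 2 is obtained from model 3 by imposing the single restriction $\lambda=1$ (the Box-Cox transform then reduces to $x-1$, and the resulting constants are absorbed by the alternative specific constants), so we can also perform a likelihood ratio test as a check.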

In [147]:
# Likelihood ratio test of model 2 against model 3.
# The 5% significance level is passed explicitly: no `alpha` variable is
# defined by any cell above, so the previous call failed on a fresh kernel.
results3.likelihood_ratio_test(results2, 0.05)

LRTuple(message='H0 can be rejected at level 5.0%', statistic=143.86210105255668, threshold=3.841458820694124)

A more general approach, which also applies to non-nested models, is to perform a **Davidson and MacKinnon J test** on models 2 and 3. 