In [28]:
import pandas as pd
import biogeme.database as db
import biogeme.biogeme as bio
from biogeme.expressions import Beta, Variable, log, exp
from biogeme import models
from biogeme import results as res

In [29]:
#data_file = "http://transp-or.epfl.ch/data/lpmc.dat"
data_file='lpmc10.dat'
lpmc = pd.read_csv(data_file, sep='\t')
lpmc

Unnamed: 0,trip_id,household_id,person_n,trip_n,travel_mode,purpose,fueltype,faretype,bus_scale,survey_year,...,dur_pt_access,dur_pt_rail,dur_pt_bus,dur_pt_int,pt_interchanges,dur_driving,cost_transit,cost_driving_fuel,cost_driving_ccharge,driving_traffic_percent
0,20,5,1,0,4,3,1,5,0.0,1,...,0.381667,0.000000,0.062222,0.000000,0,0.117222,0.00,0.41,0.0,0.097156
1,41,9,3,0,4,3,1,5,0.0,1,...,0.146944,0.000000,0.225000,0.000000,0,0.200833,0.00,0.48,0.0,0.378976
2,69,13,2,1,4,3,1,1,1.0,1,...,0.029444,0.083333,0.735833,0.398056,3,0.716944,6.00,2.16,0.0,0.582720
3,102,20,2,0,2,3,1,1,1.0,1,...,0.339722,0.183333,0.116667,0.266667,1,0.250833,3.00,0.89,0.0,0.170543
4,105,21,0,1,4,3,1,1,1.0,1,...,0.126389,0.000000,0.150000,0.000000,0,0.125833,1.50,0.37,0.0,0.154525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,80998,17591,0,5,4,3,1,5,0.0,3,...,0.189167,0.000000,0.117778,0.000000,0,0.082500,0.00,0.22,10.5,0.239057
4996,81000,17592,0,0,3,3,6,5,0.0,3,...,0.105278,0.000000,0.220278,0.000000,0,0.213611,0.00,0.52,0.0,0.412224
4997,81015,17597,0,3,4,3,1,5,0.0,3,...,0.343056,0.000000,0.177500,0.000000,0,0.189444,0.00,0.76,0.0,0.086510
4998,81041,17604,2,4,3,1,1,2,0.0,3,...,0.344444,0.316667,0.000000,0.083333,1,0.386111,1.05,0.98,0.0,0.340288


In [30]:
database = db.Database('trips', lpmc)

In [31]:
# define variables
for col in lpmc.columns:
    globals()[col]=Variable(col)

# Model 0

In [32]:
asc_walk = Beta('asc_walk', 0, None, None, 1)
asc_cycling = Beta('asc_cycling', 0, None, None, 0)
asc_public = Beta('asc_public', 0, None, None, 0)
asc_driving = Beta('asc_driving', 0, None, None, 0)

In [33]:
cost_driving = cost_driving_fuel + cost_driving_ccharge #total costs of driving
dur_public = dur_pt_access + dur_pt_rail + dur_pt_bus + dur_pt_int #total duration of public transportation

In [34]:
beta_cost = Beta('beta_cost', 0, None, None, 0)
beta_time = Beta('beta_time', 0, None, None, 0)

In [35]:
V_walk = asc_walk + beta_time * dur_walking
V_cycling = asc_cycling + beta_time * dur_cycling
V_driving = asc_driving + beta_time * dur_driving + beta_cost * cost_driving
V_public = asc_public + beta_time * dur_public + beta_cost * cost_transit

In [36]:
V = {1: V_walk, 2: V_cycling, 3: V_public, 4: V_driving}

# define choice sets for individuals
# av = {1: 1, 2: 1, 3: 1, 4: driving_license}

logprob = models.loglogit(V, None, travel_mode)

In [37]:
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'model_base'

In [38]:
results0 = biogeme.estimate()

In [39]:
print(results0.printGeneralStatistics())

Number of estimated parameters:	5
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-4722.972
Final log likelihood:	-4722.972
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.00106
Akaike Information Criterion:	9455.944
Bayesian Information Criterion:	9488.53
Final gradient norm:	1.0691E-03
Nbr of threads:	16



In [40]:
results0.getEstimatedParameters()

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
asc_cycling,-3.660824,0.100795,-36.319469,0.0
asc_driving,-1.335138,0.076403,-17.474949,0.0
asc_public,-0.559066,0.05262,-10.624628,0.0
beta_cost,-0.14985,0.01378,-10.87426,0.0
beta_time,-5.410269,0.188336,-28.726735,0.0


In [41]:
# results0.data.htmlFileName

# Model 1


In [42]:
asc_walk = Beta('asc_walk', 0, None, None, 1)
asc_cycling = Beta('asc_cycling', 0, None, None, 0)
asc_public = Beta('asc_public', 0, None, None, 0)
asc_driving = Beta('asc_driving', 0, None, None, 0)

In [43]:
cost_driving = cost_driving_fuel + cost_driving_ccharge
dur_public = dur_pt_access + dur_pt_rail + dur_pt_bus + dur_pt_int #total duration of public transportation

In [44]:
beta_time_walk = Beta('beta_time_walk', 0, None, None, 0)
beta_time_cycling = Beta('beta_time_cycling', 0, None, None, 0)
beta_time_driving = Beta('beta_time_driving', 0, None, None, 0)
beta_time_public = Beta('beta_time_public', 0, None, None, 0)

In [45]:
V_walk = asc_walk + beta_time_walk * dur_walking
V_cycling = asc_cycling + beta_time_cycling * dur_cycling
V_driving = asc_driving + beta_time_driving * dur_driving + beta_cost * cost_driving
V_public = asc_public + beta_time_public * dur_public + beta_cost * cost_transit

In [46]:
V = {1: V_walk, 2: V_cycling, 3: V_public, 4: V_driving}

logprob = models.loglogit(V, None, travel_mode)

In [47]:
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'model_1'

In [48]:
results1 = biogeme.estimate()

In [49]:
print(results1.printGeneralStatistics())

Number of estimated parameters:	8
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-4412.008
Final log likelihood:	-4412.008
Likelihood ratio test for the init. model:	1.30538e-07
Rho-square for the init. model:	1.48e-11
Rho-square-bar for the init. model:	-0.00181
Akaike Information Criterion:	8840.016
Bayesian Information Criterion:	8892.153
Final gradient norm:	2.4664E-02
Nbr of threads:	16



In [50]:
results1.getEstimatedParameters()

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
asc_cycling,-4.590516,0.179036,-25.640195,0.0
asc_driving,-2.072022,0.120164,-17.243356,0.0
asc_public,-2.439476,0.121947,-20.004382,0.0
beta_cost,-0.141781,0.015193,-9.331745,0.0
beta_time_cycling,-5.195903,0.423908,-12.257152,0.0
beta_time_driving,-5.875437,0.359795,-16.329937,0.0
beta_time_public,-3.200595,0.230746,-13.870611,0.0
beta_time_walk,-8.367595,0.360101,-23.236797,0.0


In [51]:
# results1.data.htmlFileName

## Comparing Models 0 and 1

In [52]:
general_statistics_model_0 = results0.getGeneralStatistics()
print(results0.printGeneralStatistics())

Number of estimated parameters:	5
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-4722.972
Final log likelihood:	-4722.972
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	-0.00106
Akaike Information Criterion:	9455.944
Bayesian Information Criterion:	9488.53
Final gradient norm:	1.0691E-03
Nbr of threads:	16



In [53]:
general_statistics_model_1 = results1.getGeneralStatistics()
print(results1.printGeneralStatistics())

Number of estimated parameters:	8
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-4412.008
Final log likelihood:	-4412.008
Likelihood ratio test for the init. model:	1.30538e-07
Rho-square for the init. model:	1.48e-11
Rho-square-bar for the init. model:	-0.00181
Akaike Information Criterion:	8840.016
Bayesian Information Criterion:	8892.153
Final gradient norm:	2.4664E-02
Nbr of threads:	16



### Likelihood ratio test

$H_O$: $\beta\_time\_walk=\beta\_time\_driving=\beta\_time\_public=\beta\_time\_cycling$

Because model 0 is an restricted version of model 1, we can apply the likelihood ratio test.

In [54]:
alpha=0.05
results1.likelihood_ratio_test(results0, alpha)

LRTuple(message='H0 can be rejected at level 5.0%', statistic=621.927689194652, threshold=7.814727903251179)

According to the test result, the null hypothesis is rejected at the 5% level.

# Model 2

1. One alternative attribute: pt_interchanges for public transport  
The assumption is that the number of interchanges is relevant to the overall experience of taking public transport. 
2. One socio-economic characteristics: driving_liscence to interact with ASC_driving  
The assumption is that having a driving lisence changes the ...?

Model 2 specification:

$ V_{walk}=ASC_{walk}+\beta_{time\_walk}time_{walk} $

$ V_{cycling}=ASC_{cycling}+\beta_{time\_cycle}time_{cycle} $

$ V_{public}=ASC_{public}+\beta_{time\_public}time_{public}+\beta_{cost}cost_{public}+\beta_{traffic}driving\_traffic\_percent$

$ V_{driving}=ASC_{driving}+\beta_{time\_driving}time_{driving}+\beta_{cost}cost_{driving}+\beta_{driving_liscence}drivingliscence$


In [55]:
asc_walk = Beta('asc_walk', 0, None, None, 1) # Normalizing asc_walk to 0
asc_cycling = Beta('asc_cycling', 0, None, None, 0)
asc_public = Beta('asc_public', 0, None, None, 0)
asc_driving = Beta('asc_driving', 0, None, None, 0)

In [56]:
cost_driving = cost_driving_fuel + cost_driving_ccharge
dur_public = dur_pt_access + dur_pt_rail + dur_pt_bus + dur_pt_int # total duration of public transportation

In [57]:
beta_time_walk = Beta('beta_time_walk', 0, None, None, 0)
beta_time_cycling = Beta('beta_time_cycling', 0, None, None, 0)
beta_time_driving = Beta('beta_time_driving', 0, None, None, 0)
beta_time_public = Beta('beta_time_public', 0, None, None, 0)
beta_cost = Beta('beta_cost', 0, None, None, 0)
beta_interchange = Beta('beta_interchange',0, None, None,0)
beta_drivingliscence = Beta('beta_drivingliscence',0,None,None,0)
beta_faretype = Beta('beta_faretype',0, None, None, 0)
beta_traffic = Beta('beta_traffic',0, None, None, 0)

In [58]:
# model 2 specification 
V_walk = asc_walk + beta_time_walk * dur_walking
V_cycling = asc_cycling + beta_time_cycling * dur_cycling
V_public = asc_public + beta_time_public * dur_public + beta_cost * cost_transit + beta_interchange * pt_interchanges
# V_public = asc_public + beta_time_public * dur_public + beta_cost * cost_transit
V_driving = asc_driving + beta_time_driving * dur_driving + beta_cost * cost_driving + beta_drivingliscence * driving_license

V = {1: V_walk, 2: V_cycling, 3: V_public, 4: V_driving}

logprob = models.loglogit(V, None, travel_mode)

biogeme.modelName = 'model_2'

In [59]:
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'model_2'

In [60]:
results2 = biogeme.estimate()



In [61]:
print(results2.printGeneralStatistics())

Number of estimated parameters:	10
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-6931.472
Final log likelihood:	-4207.638
Likelihood ratio test for the init. model:	5447.667
Rho-square for the init. model:	0.393
Rho-square-bar for the init. model:	0.392
Akaike Information Criterion:	8435.277
Bayesian Information Criterion:	8500.449
Final gradient norm:	1.1338E-02
Nbr of threads:	16



In [62]:
results2.getEstimatedParameters()

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
asc_cycling,-4.658217,0.185019,-25.177022,0.0
asc_driving,-2.925344,0.134301,-21.781928,0.0
asc_public,-2.495342,0.125829,-19.831245,0.0
beta_cost,-0.130482,0.013882,-9.399229,0.0
beta_drivingliscence,1.359169,0.069275,19.619935,0.0
beta_interchange,-0.030145,0.085143,-0.354055,0.723298
beta_time_cycling,-5.400914,0.443235,-12.185221,0.0
beta_time_driving,-6.436766,0.374004,-17.210422,0.0
beta_time_public,-3.393491,0.259001,-13.102218,0.0
beta_time_walk,-8.564554,0.370559,-23.112555,0.0


In [63]:
# results2.data.htmlFileName

## Compare model 2 and 1
Since model 1 is a restricted version of model 2, we perform a likelihood ratio test. Or we can simply perform t-test on the coefficients $\beta_{driving\_liscence}$ and $\beta_{pt_interchanges}$, as given by the estimation table.

In [64]:
results2.likelihood_ratio_test(results1, alpha)

LRTuple(message='H0 can be rejected at level 5.0%', statistic=408.7390114093705, threshold=5.991464547107979)

Thus, model 2 is preferred to model 1.

# Model 3

We perform box-cox transformation on travel time for every alternatives.

In [65]:
# define lambda
lambda_boxcox = Beta('lambda_boxcox', 1, None, None, 0)
boxcox_time_1 = models.boxcox(dur_walking, lambda_boxcox)
boxcox_time_2 = models.boxcox(dur_cycling, lambda_boxcox)
boxcox_time_3 = models.boxcox(dur_public, lambda_boxcox)
boxcox_time_4 = models.boxcox(dur_driving, lambda_boxcox)

In [66]:
# model 3 specification 
V_walk = asc_walk + beta_time_walk * boxcox_time_1
V_cycling = asc_cycling + beta_time_cycling * boxcox_time_2
V_public = asc_public + beta_time_public * boxcox_time_3+ beta_cost * cost_transit + beta_interchange * pt_interchanges
# V_public = asc_public + beta_time_public * dur_public + beta_cost * cost_transit
V_driving = asc_driving + beta_time_driving * boxcox_time_4 + beta_cost * cost_driving + beta_drivingliscence * driving_license

V = {1: V_walk, 2: V_cycling, 3: V_public, 4: V_driving}

logprob = models.loglogit(V, None, travel_mode)

biogeme.modelName = 'model_3'

In [67]:
biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'model_3'
results3 = biogeme.estimate()
print(results3.printGeneralStatistics())


Number of estimated parameters:	11
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-6931.472
Final log likelihood:	-4135.707
Likelihood ratio test for the init. model:	5591.529
Rho-square for the init. model:	0.403
Rho-square-bar for the init. model:	0.402
Akaike Information Criterion:	8293.415
Bayesian Information Criterion:	8365.104
Final gradient norm:	2.2303E-02
Nbr of threads:	16



In [68]:
results3.getEstimatedParameters()

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
asc_cycling,-2.195568,0.27663,-7.936831,1.998401e-15
asc_driving,-1.025547,0.223982,-4.578692,4.678929e-06
asc_public,1.779555,0.172816,10.297425,0.0
beta_cost,-0.120173,0.01358,-8.849125,0.0
beta_drivingliscence,1.365768,0.069714,19.591041,0.0
beta_interchange,-0.087201,0.076757,-1.136069,0.2559277
beta_time_cycling,-3.383673,0.26283,-12.874011,0.0
beta_time_driving,-3.564961,0.268127,-13.295784,0.0
beta_time_public,-2.603897,0.187885,-13.859022,0.0
beta_time_walk,-5.625187,0.250271,-22.476356,0.0


## Compare model 3 and 2
Under the null hypothesis that $\lambda=1$, the statistic $$\frac{\widehat{\lambda}-1}{\widehat{\sigma}_\lambda}$$ follows approximatively a $N(0, 1)$.  
Thus, we perform a t-test on $\lambda$.

In [69]:
estimated_parameters_3=results3.getEstimatedParameters()
t_test_lambda = (estimated_parameters_3.loc['lambda_boxcox','Value'] - 1) / estimated_parameters_3.loc['lambda_boxcox', 'Rob. Std err']
t_test_lambda

-14.600459948421262

In [70]:
from scipy.stats import norm
from biogeme.results import calcPValue

In [71]:
norm.ppf(.95)

1.6448536269514722

In [72]:
calcPValue(t_test_lambda)

0.0

Because the $p-value<0.05$, we reject the null hypothesis that $\lambda=1$. Thus, model 3 is preferred.

Though not exactly relevant as one is not the linear restriction of the other, we also perform a likelihood ratio test to check.

In [73]:
results3.likelihood_ratio_test(results2, alpha)

LRTuple(message='H0 can be rejected at level 5.0%', statistic=143.8621227926742, threshold=3.841458820694124)

A more general way is to perform a **Davidson and McKinnon J test**  on model 2 and 3. 

### Model 4

We define Model 4N as a nested version of Model 3 where we regroup motorized (public transports and car) and non-motorized transportation modes (walking and cycling).

In [80]:
MU_motorized = Beta('MU_motorized', 1, 0, None, 0)

In [90]:
help(models.lognested)

Help on function lognested in module biogeme.models:

lognested(V, availability, nests, choice)
    Implements the log of a nested logit model as a MEV model.
    
    :param V: dict of objects representing the utility functions of
        each alternative, indexed by numerical ids.
    
    :type V: dict(int:biogeme.expressions.expr.Expression)
    
    :param availability: dict of objects representing the availability of each
        alternative (:math:`a_i` in the above formula), indexed
        by numerical ids. Must be consistent with V, or
        None. In this case, all alternatives are supposed to be
        always available.
    
    :type availability: dict(int:biogeme.expressions.expr.Expression)
    
    :param nests: A tuple containing as many items as nests.
        Each item is also a tuple containing two items:
    
        - an object of type biogeme.expressions. expr.Expression representing
          the nest parameter,
        - a list containing the list of identifi

In [92]:
# model 4N specification 
V_walk = asc_walk + beta_time_walk * boxcox_time_1
V_cycling = asc_cycling + beta_time_cycling * boxcox_time_2
V_public = asc_public + beta_time_public * boxcox_time_3+ beta_cost * cost_transit + beta_interchange * pt_interchanges + MU_motorized
# V_public = asc_public + beta_time_public * dur_public + beta_cost * cost_transit
V_driving = asc_driving + beta_time_driving * boxcox_time_4 + beta_cost * cost_driving + beta_drivingliscence * driving_license + MU_motorized

travel_mode = Variable('travel_mode')
PT_AV = Variable('PT_AV')
DRIVING_AV = Variable('DRIVING_AV')
WALK_AV = Variable('WALK_AV')
CYCL_AV = Variable('CYCLING_AV')
V = {1: V_walk, 2: V_cycling, 3: V_public, 4: V_driving}
av = {1: 1, 2: 1, 3: 1, 4: driving_license}

motorized = MU_motorized, [3, 4]
non_motorized = 1.0, [1, 2]
nests = motorized, non_motorized
logprob = models.lognested(V, av, nests, travel_mode)

biogeme = bio.BIOGEME(database, logprob)
biogeme.modelName = 'model_4N'
model_4N_results = biogeme.estimate(recycle=True)

#logprob = models.loglogit(V, None, travel_mode)





In [93]:
print(model_4N_results.printGeneralStatistics())

Number of estimated parameters:	12
Sample size:	5000
Excluded observations:	0
Init log likelihood:	-1.797693e+308
Final log likelihood:	-1.797693e+308
Likelihood ratio test for the init. model:	-0
Rho-square for the init. model:	0
Rho-square-bar for the init. model:	0
Akaike Information Criterion:	inf
Bayesian Information Criterion:	inf
Final gradient norm:	8.0732E+02
Nbr of threads:	16



In [94]:
model_4N_results.getEstimatedParameters()

Unnamed: 0,Value,Rob. Std err,Rob. t-test,Rob. p-value
MU_motorized,1.0,0.028003,35.71008,0.0
asc_cycling,-2.195568,0.600975,-3.653344,0.0002588472
asc_driving,-1.025547,0.099465,-10.310641,0.0
asc_public,1.779555,0.204024,8.722273,0.0
beta_cost,-0.120173,0.014658,-8.198503,2.220446e-16
beta_drivingliscence,1.365768,0.099465,13.73116,0.0
beta_interchange,-0.087201,0.092701,-0.940669,0.3468743
beta_time_cycling,-3.383673,0.448317,-7.547494,4.440892e-14
beta_time_driving,-3.564961,0.196813,-18.11349,0.0
beta_time_public,-2.603897,0.177448,-14.674137,0.0
