In [3]:
# PS2 - CE264
# GSI: Mustapha Harb - Mengqiao Yu - Andrew Campbell

# importing the required libraries
from collections import OrderedDict    # For recording the model specification 

import pandas as pd                    # For file input/output
import numpy as np                     # For vectorized math operations

import pylogit as pl                   # For MNL model estimation and
                                       # conversion from wide to long format

# Load the air-travel survey responses from the CSV file located in the
# working directory (a comma is already the default delimiter of read_csv).
data_01 = pd.read_csv("Air_Travel_Survey.csv")


In [4]:
# Look at the columns of the data: displaying the column index lists the
# name of every column that was loaded from the CSV file.
data_01.columns

Index([u'personID', u'gender', u'age', u'purpose', u'income', u'classTicket',
       u'payment', u'AA_FFP', u'CO_FFP', u'DL_FFP', u'B6_FFP', u'WN_FFP',
       u'UA_FFP', u'US_FFP', u'a1aircraft', u'a1departMAM', u'a1connections',
       u'a1travtime', u'a1arriveMAM', u'a1timediff', u'a1performance',
       u'a1fare', u'a1airline', u'a2aircraft', u'a2departMAM',
       u'a2connections', u'a2travtime', u'a2arriveMAM', u'a2timediff',
       u'a2performance', u'a2fare', u'a2airline', u'a1_AV', u'a2_AV',
       u'choice', u'choiceSituationID'],
      dtype='object')

In [5]:
# Preview the first rows of the wide-format data as loaded from the CSV.
data_01.head()

Unnamed: 0,personID,gender,age,purpose,income,classTicket,payment,AA_FFP,CO_FFP,DL_FFP,...,a2travtime,a2arriveMAM,a2timediff,a2performance,a2fare,a2airline,a1_AV,a2_AV,choice,choiceSituationID
0,1,1,5,1,10,3,1,1,1,1,...,895,1020,120,60,450,2,1,1,1,1
1,1,1,5,1,10,3,1,1,1,1,...,985,900,0,90,1050,7,1,1,1,2
2,1,1,5,1,10,3,1,1,1,1,...,980,1020,120,60,600,2,1,1,2,3
3,1,1,5,1,10,3,1,1,1,1,...,960,960,60,80,600,6,1,1,1,4
4,1,1,5,1,10,3,1,1,1,1,...,1130,900,0,90,900,7,1,1,2,5


In [6]:
# Converting the data from wide format to long format
# Create the list of individual specific variables, i.e. the columns whose
# values describe the respondent rather than one of the two alternatives
# (the respondent/trip attributes and the *_FFP columns).
#
# The columns are listed by name instead of being taken positionally as the
# first 14 columns of the dataframe: the list then stays correct if the column
# order of the CSV ever changes, and it documents exactly which variables are
# treated as individual-specific.
ind_variables = ['personID',
                 'gender',
                 'age',
                 'purpose',
                 'income',
                 'classTicket',
                 'payment',
                 'AA_FFP',
                 'CO_FFP',
                 'DL_FFP',
                 'B6_FFP',
                 'WN_FFP',
                 'UA_FFP',
                 'US_FFP']

In [7]:
# Alternative-varying variables.
# Outer keys: the column names to use in the long-format dataframe.
# Inner dictionaries: map each alternative id (1 or 2) to the wide-format
# column that holds that variable for the given alternative.
alt_varying_variables = {
    u'aircraft_type': {1: 'a1aircraft', 2: 'a2aircraft'},
    u'departure_time': {1: 'a1departMAM', 2: 'a2departMAM'},
    u'connections': {1: 'a1connections', 2: 'a2connections'},
    u'travel_time': {1: 'a1travtime', 2: 'a2travtime'},
    u'arrival_time': {1: 'a1arriveMAM', 2: 'a2arriveMAM'},
    u'time_diff': {1: 'a1timediff', 2: 'a2timediff'},
    u'performance': {1: 'a1performance', 2: 'a2performance'},
    u'fare': {1: 'a1fare', 2: 'a2fare'},
    u'airline': {1: 'a1airline', 2: 'a2airline'},
}


In [8]:
# Availability columns, keyed by alternative id: each value is the name of
# the wide-format column that flags the availability of that alternative.
availability_variables = dict([(1, 'a1_AV'), (2, 'a2_AV')])

In [9]:
##########
# Names of the choice, observation-id and alternative-id columns
##########
# Existing column that records the choice made in each choice situation.
choice_column = "choice"

# Existing column that identifies each choice situation. Using it as the
# observation id treats every choice situation as its own observation, which
# ignores the fact that this is a panel/repeated-observations dataset.
obs_id_column = "choiceSituationID"

# Name of the column that will be created in the long-format data to
# identify the alternative associated with each row.
custom_alt_id = "alternative_id"

In [10]:
# Reshape the wide-format survey data into long format. Every argument is
# passed by keyword so the role of each object is explicit at the call site.
data_long = pl.convert_wide_to_long(wide_data=data_01,
                                    ind_vars=ind_variables,
                                    alt_specific_vars=alt_varying_variables,
                                    availability_vars=availability_variables,
                                    obs_id_col=obs_id_column,
                                    choice_col=choice_column,
                                    new_alt_id_name=custom_alt_id)

# Preview the first ten rows of the resulting long-format dataframe
data_long.head(10)

Unnamed: 0,choiceSituationID,alternative_id,choice,personID,gender,age,purpose,income,classTicket,payment,...,US_FFP,connections,arrival_time,travel_time,aircraft_type,airline,performance,time_diff,fare,departure_time
0,1,1,1,1,1,5,1,10,3,1,...,1,0,960,980,3,6,60,60,1050,40
1,1,2,0,1,1,5,1,10,3,1,...,1,0,1020,895,4,2,60,120,450,185
2,2,1,1,1,1,5,1,10,3,1,...,1,2,900,1045,4,6,90,0,900,1355
3,2,2,0,1,1,5,1,10,3,1,...,1,1,900,985,4,7,90,0,1050,1415
4,3,1,0,1,1,5,1,10,3,1,...,1,1,900,985,4,7,70,0,900,1415
5,3,2,1,1,1,5,1,10,3,1,...,1,0,1020,980,3,2,60,120,600,100
6,4,1,1,1,1,5,1,10,3,1,...,1,0,840,1065,4,2,90,-60,600,1275
7,4,2,0,1,1,5,1,10,3,1,...,1,2,960,960,3,6,80,60,600,60
8,5,1,0,1,1,5,1,10,3,1,...,1,0,960,980,3,7,80,60,1050,40
9,5,2,1,1,1,5,1,10,3,1,...,1,2,900,1130,3,7,90,0,900,1270


In [11]:


##########
# Rescale variables so the estimated coefficients are of similar magnitudes
##########
# Travel time: raw units (minutes) -> hours
data_long["travel_time_hrs"] = data_long["travel_time"].div(60.0)

# Fare: raw units ($) -> hundreds of dollars
data_long["fare_100$"] = data_long["fare"].div(100.0)


# Specifying the utility equations
#
# NOTE: - The specification and the names must both be ordered dictionaries.
#       - Keys should be columns of the long-format dataframe. The sole
#         exception to this is the "intercept" key.
#       - Each specification value is a list of integers or a list of lists
#         of integers. The integers are the alternative IDs of the
#         alternatives whose utility equation the explanatory variable
#         enters. An inner list denotes alternatives that share a common
#         coefficient for the variable in question.
#       - Each names value holds the labels of the coefficients of that
#         variable, in the same order as the specification value.
#
# The intercept entries are left commented out, as in the original
# specification, so that no alternative specific constants are estimated.

basic_specification = OrderedDict([
    # ("intercept", [1, 2]),
    ("travel_time_hrs", [1, 2]),
    ("fare_100$", [1, 2]),
])

basic_names = OrderedDict([
    # ("intercept", ['ASC Alternative 1',
    #                'ASC Alternative 2']),
    ("travel_time_hrs", ['Travel Time, units:hrs Alternative 1',
                         'Travel Time, units:hrs Alternative 2']),
    ("fare_100$", ['Fare, units:hundredth Alternative 1',
                   'Fare, units:hundredth Alternative 2']),
])


In [12]:
# Estimate the binary logit model
air_travel_logit = pl.create_choice_model(data=data_long,
                                          alt_id_col=custom_alt_id,
                                          obs_id_col=obs_id_column,
                                          choice_col=choice_column,
                                          specification=basic_specification,
                                          model_type="MNL",
                                          names=basic_names)

# Count the parameters to be estimated from the coefficient names rather than
# hard-coding the total, so the initial-value vector keeps the right length
# when variables are added to or removed from the specification. A list of
# names holds one label per coefficient; any non-list entry is a single label
# and therefore counts as one coefficient. For the two variables with two
# labels each defined in basic_names, this gives 4.
num_params = sum(len(labels) if isinstance(labels, (list, tuple)) else 1
                 for labels in basic_names.values())

# Start the optimization from all-zero initial values.
air_travel_logit.fit_mle(np.zeros(num_params))



Log-likelihood at zero: -4,868.6658
Initial Log-likelihood: -4,868.6658
Estimation Time: 0.16 seconds.
Final log-likelihood: -4,057.1481




In [13]:
# Look at the estimation results: display the summary tables of the fitted
# model (fit statistics, then one row per estimated coefficient).
air_travel_logit.get_statsmodels_summary()

0,1,2,3
Dep. Variable:,choice,No. Observations:,7024.0
Model:,Multinomial Logit Model,Df Residuals:,7020.0
Method:,MLE,Df Model:,4.0
Date:,"Tue, 30 Jan 2018",Pseudo R-squ.:,0.167
Time:,10:58:31,Pseudo R-bar-squ.:,0.166
converged:,False,Log-Likelihood:,-4057.148
,,LL-Null:,-4868.666

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
"Travel Time, units:hrs Alternative 1",-0.5512,0.021,-26.079,0.000,-0.593,-0.510
"Travel Time, units:hrs Alternative 2",-0.5423,0.021,-25.982,0.000,-0.583,-0.501
"Fare, units:hundredth Alternative 1",-0.4692,0.021,-22.773,0.000,-0.510,-0.429
"Fare, units:hundredth Alternative 2",-0.5247,0.022,-24.112,0.000,-0.567,-0.482
