In [11]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import norm
from scipy.stats import chi2
import statsmodels.api as sm
from scipy.special import expit
np.set_printoptions(suppress=True)

In [12]:
# Folder that holds the assignment data.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after `address` is pointed at the local data folder.
address = 'C:/Users/moham/My Drive/Projects/Structural Econometrics Projects/Topics in Advanced Econometrics/4- GEV'
# Read the long-format choice data and expose the camper identifier as `id`,
# the column name the likelihood functions below group on.
camping = (
    pd.read_csv(address + '/camping.csv')
    .rename(columns={'camper_id': 'id'})
)

# Problem 1: Generalized Extreme Value Models

The data describe camping choices at the individual level. Each individual chooses among 5 alternative parks to camp in, and the chosen park is flagged in the column `visit`. The data are in long format (one row per individual-park pair). For each park we observe two attributes, `time` and `cost`. The parks also fall into two categories: beach parks and mountain parks.

In [3]:
# Preview the first five rows of the long-format choice data.
camping.head(5)

Unnamed: 0,id,park_id,park,visit,mountain,beach,cost,time
0,1,1,Mount Greylock,0,1,0,53.125051,204.096987
1,1,2,October Mountain,1,1,0,72.147801,262.922037
2,1,3,Horseneck Beach,0,0,1,97.150615,212.117346
3,1,4,Salisbury Beach,0,0,1,108.296025,181.254675
4,1,5,Scusset Beach,0,0,1,107.127049,151.80865


## a. Multinomial Logit model

Suppose the choice model is as follows:

$$
V_{nj} = \beta_1 C_{nj} + \beta_2 T_{nj} + \beta_3 M_j
$$

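Here $C_{nj}$ and $T_{nj}$ are the cost and time individual $n$ faces for park $j$, and $M_j$ is a binary indicator equal to 1 if park $j$ is a mountain park and 0 if it is a beach park.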
Here, we will use MLE to estimate the parameters. I will use the code from previous projects.

In [4]:
def loglikelihood(params, data):
    """Negative log-likelihood of the multinomial logit model.

    Utility is linear in the columns ``cost``, ``time`` and ``mountain``;
    choice probabilities are a softmax over the rows that share an ``id``.

    Parameters
    ----------
    params : array-like of length 3
        Coefficients on ``cost``, ``time`` and ``mountain`` (in that order).
    data : pandas.DataFrame
        Long-format data with columns ``id``, ``visit``, ``cost``, ``time``
        and ``mountain``. ``visit`` weights each row's log-probability
        (1 for the chosen alternative, 0 otherwise). The frame is not modified.

    Returns
    -------
    float
        Minus the summed log-probability of the observed choices, suitable
        for passing to a minimiser.
    """
    betas = np.asarray(params, dtype=float)
    X = data[['cost', 'time', 'mountain']].to_numpy(dtype=float)

    # Work on a small frame with a fresh index so that group-wise results
    # line up row by row regardless of the index carried by `data`.
    work = pd.DataFrame({'id': data['id'].to_numpy(), 'util': X @ betas})

    # Log-sum-exp trick: subtracting each individual's largest utility before
    # exponentiating leaves the probabilities unchanged but keeps np.exp from
    # overflowing to inf (which would turn the ratio into nan) when the
    # optimiser tries large parameter values.
    centered = work['util'] - work.groupby('id')['util'].transform('max')
    log_denominator = np.log(np.exp(centered).groupby(work['id']).transform('sum'))
    log_prob = (centered - log_denominator).to_numpy()

    # Return negative log-likelihood (for minimization)
    return -np.sum(data['visit'].to_numpy() * log_prob)

In [None]:
# Start the search from all-zero coefficients (cost, time, mountain).
initial_params = np.zeros(3)
# Minimise the negative log-likelihood with BFGS; only the columns the
# likelihood reads are handed over.
result = minimize(
    fun=loglikelihood,
    x0=initial_params,
    args=(camping[['id', 'visit', 'time', 'cost', 'mountain']],),
    method='BFGS',
    options={'disp': False},
)
# Point estimates, ordered as in the design matrix: cost, time, mountain.
parameters = result.x

In [6]:
# Show the estimated coefficients (order: cost, time, mountain).
parameters

array([-0.01478385, -0.00163202, -0.33104973])

In [7]:
# Standard errors from the diagonal of the inverse Hessian reported by BFGS.
# NOTE(review): BFGS returns an *approximation* of the inverse Hessian built up
# during the search, so these standard errors are approximate as well.
std = np.sqrt(result.hess_inv.diagonal())
z_stat = parameters / std
# Two-sided p-values via the survival function: 1 - cdf(|z|) loses all
# precision for large |z| (it rounds to exactly 0), whereas sf(|z|) does not.
p_values = 2 * norm.sf(np.abs(z_stat))
summary_table = pd.DataFrame({
    'Parameter': ['Cost', 'time', 'Mountain'],
    'Coefficient': parameters,
    'Standard Error': std,
    'Z-statistic': z_stat,
    'P-value': p_values
})
print(summary_table.round(3).to_string(index=False))

Parameter  Coefficient  Standard Error  Z-statistic  P-value
     Cost       -0.015           0.004       -3.914    0.000
     time       -0.002           0.000       -3.929    0.000
 Mountain       -0.331           0.180       -1.839    0.066


The campers in our dataset obtain less utility, ceteris paribus, from camping in the mountains than from camping at the beach. Also, as expected, the marginal utilities of both cost and time are negative.

### Dollar value of time and mountains

In [8]:
# Cost coefficient: the utility change per dollar, used as the numeraire below.
dollor_value = parameters[0]
# Ratio of the time coefficient to the cost coefficient, scaled by 60.
# NOTE(review): the factor 60 presumably converts per-minute to per-hour —
# confirm the unit of the `time` column.
value_time = parameters[1] / dollor_value * 60
# Ratio of the mountain coefficient to the cost coefficient.
value_mountain = parameters[2] / dollor_value
for label, amount in (('Value of Time:', value_time),
                      ('Value of Mountain:', value_mountain)):
    print(label, amount.round(3), '$')

Value of Time: 6.624 $
Value of Mountain: 22.393 $


So, campers value camping in the mountains about 22 dollars less than camping at a beach.

### Caveat of this model

Notice that, when using the simple logit, we assume that, for each individual, the error terms of all alternatives are i.i.d. We are therefore ruling out any correlation among these errors. But what if agents have an individual preference for the mountains or for the beach? That would create correlation among parks with the same setting. In other words, because some alternatives are inherently closer substitutes than others, the logit assumption may be problematic. That is why we switch to the nested logit, which allows for correlation within each group of parks.

## b. Nested Logit model

Again, the model is as follows:
$$
V_{nj} = \beta_1 C_{nj} + \beta_2 T_{nj} + \beta_3 M_j
$$

We remember that, in the nested logit model, the choice probabilities are as follows (for the case of only two nests as here):

$$
P_{ni}=\frac{e^{V_{ni}/\lambda_m}\left(\sum_{j\in B_m}e^{V_{nj}/\lambda_m}\right)^{\lambda_m-1}}{\left(\sum_{j\in B_b}e^{V_{nj}/\lambda_b}\right)^{\lambda_b} + \left(\sum_{j\in B_m}e^{V_{nj}/\lambda_m}\right)^{\lambda_m}}
$$

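Here, we have two nests $m,b$ (mountain and beach), and in the above equation we suppose that alternative $i$ is in nest $m$; the probability for an alternative in nest $b$ is obtained by swapping the roles of $m$ and $b$ in the numerator. Using this idea, we can see that the parameter vector is: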
$$
\theta = \left(\beta_1,\beta_2,\beta_3, \lambda_{b}, \lambda_{m}\right)
$$

Therefore, we can again use MLE to solve for the parameters.

In [None]:
def loglikelihood_nested(params, data):
    """Negative log-likelihood of a two-nest (beach / mountain) nested logit.

    For an alternative ``i`` in nest ``k`` the log choice probability is

        V_i / lambda_k + (lambda_k - 1) * log(S_k) - log(sum_l S_l ** lambda_l)

    with ``S_k = sum_{j in nest k} exp(V_j / lambda_k)`` and
    ``V = beta_1 * cost + beta_2 * time + beta_3 * mountain``.

    Parameters
    ----------
    params : array-like of length 5
        ``(beta_cost, beta_time, beta_mountain, lambda_beach, lambda_mountain)``.
        Rows with ``mountain == 0`` use ``lambda_beach``; all other rows use
        ``lambda_mountain``.
    data : pandas.DataFrame
        Long-format data with columns ``id``, ``visit``, ``cost``, ``time``
        and ``mountain``. The frame is not modified.

    Returns
    -------
    float
        Minus the summed log-probability of the observed choices. ``np.inf``
        is returned when a nest parameter is not strictly positive, because
        the probabilities are not defined there.

    Raises
    ------
    ValueError
        If ``params`` does not contain exactly five values.
    """
    params = np.asarray(params, dtype=float)
    if params.size != 5:
        raise ValueError('params must contain 3 utility coefficients and 2 nest parameters')
    params_util = params[:3]
    params_nest = params[3:]
    # Keep the optimiser out of the region where the model is undefined
    # (division by zero / negative dissimilarity parameters).
    if np.any(params_nest <= 0):
        return np.inf

    mountain = data['mountain'].to_numpy()
    lam = np.where(mountain == 0, params_nest[0], params_nest[1])
    X = data[['cost', 'time', 'mountain']].to_numpy(dtype=float)

    # Fresh frame (fresh index) holding only what the computation needs.
    df = pd.DataFrame({
        'id': data['id'].to_numpy(),
        'mountain': mountain,
        'lam': lam,
        'scaled_util': (X @ params_util) / lam,
    })
    nest_keys = ['id', 'mountain']

    # log S_k for every (individual, nest), computed with the log-sum-exp
    # trick so that large utilities cannot overflow np.exp.
    nest_max = df.groupby(nest_keys)['scaled_util'].transform('max')
    df['exp_centered'] = np.exp(df['scaled_util'] - nest_max)
    df['log_nest_sum'] = nest_max + np.log(
        df.groupby(nest_keys)['exp_centered'].transform('sum')
    )
    # lambda_k * log S_k: the log of the nest's contribution to the denominator.
    df['nest_term'] = df['lam'] * df['log_nest_sum']

    # One row per (individual, nest); log-sum-exp again across the nests of
    # each individual gives the log of the denominator.
    nests = df.drop_duplicates(nest_keys)
    top_max = nests.groupby('id')['nest_term'].transform('max')
    top_sum = np.exp(nests['nest_term'] - top_max).groupby(nests['id']).transform('sum')
    log_denominator = (top_max + np.log(top_sum)).groupby(nests['id']).first()
    df['log_denominator'] = df['id'].map(log_denominator)

    log_prob = (
        df['scaled_util']
        + (df['lam'] - 1.0) * df['log_nest_sum']
        - df['log_denominator']
    )
    # Return negative log-likelihood (for minimization)
    return -np.sum(data['visit'].to_numpy() * log_prob.to_numpy())

In [23]:
# Five trial values in [0, 1) for (beta_cost, beta_time, beta_mountain,
# lambda_beach, lambda_mountain). Drawn from a seeded generator so the scratch
# check below gives the same numbers on every run.
params = np.random.default_rng(seed=0).random(5)

In [None]:
# Scratch computation of the nested-logit building blocks on the full data set,
# using the trial parameter vector `params`.
X = camping[['cost', 'time', 'mountain']]
camping['Util_no_nest'] = np.dot(X, params[:3])
# Bound the utilities so that the exponential below cannot overflow.
camping['Util_no_nest'] = np.clip(camping['Util_no_nest'], -100, 100)
# Scale by the nest parameter: params[3] for beach rows (mountain == 0),
# params[4] for mountain rows.
camping['exp_Util_nest'] = np.exp(np.where(camping['mountain'] == 0,
                                      camping['Util_no_nest']/params[3],
                                      camping['Util_no_nest']/params[4]))
# Sum within each (individual, nest). The grouping keys must be passed as one
# list: a second positional argument to groupby is not treated as a key.
camping['sum_exp_Util'] = camping.groupby(['id', 'mountain'])['exp_Util_nest'].transform('sum')