In [74]:
# !pip install statsmodels linearmodels
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import norm
from scipy.stats import chi2
import statsmodels.api as sm
from scipy.special import expit
np.set_printoptions(suppress=True)

In [2]:
address = 'C:\\Users\\moham\\My Drive\\Projects\\Structural Econometrics Projects\\Topics in Advanced Econometrics\\2- Logit'

In [3]:
commute_data = pd.read_csv(address + '\\commute_multinomial.csv')

In this project, we will do some likelihood ratio tests. So, I will write a function here to use it later.

In [4]:
def likelihood_ratio_test(ll_U, ll_R, df):
    test_stat = 2 * (ll_U - ll_R)
    p_value = 1 - chi2.cdf(test_stat, df)
    
    if p_value < 0.01:
        result = f'Reject the null hypothesis (p-value: {p_value:.4f})'
    else:
        result = f'Fail to reject the null hypothesis (p-value: {p_value:.4f})'
    
    return result

# Problem 1: Maximum Likelihood Estimation

## a. 
Again, the problem is for commute. Each agent has four alternatives to choose from. The utility is as follows:

$$
V_{nj} = \beta C_{nj} + \gamma T_{nj}
$$
Therefore, given a multinomial logit model, the log-likelihood function is as follows:
$$
\ln L(\theta \mid y, \boldsymbol{X})=\sum_{n=1}^N \sum_{i=1}^J y_{n i} \ln \left[\frac{e^{\beta C_{n,i} + \gamma T_{n,i}}}{\sum_j e^{\beta C_{n,j} + \gamma T_{n,j}}}\right]
$$

So, we will define a function that, given the parameters and data as inputs, and solve for the loglikelihood function. But, first, we need to prepare the data.

In [155]:
# List of modes
modes = ['bike', 'bus', 'car', 'walk']

# Reshape the DataFrame
rows = []
for index, row in commute_data.iterrows():
    for mode in modes:
        rows.append({
            'id': row['id'],
            'mode': mode,
            'time': row[f'time.{mode}'],
            'cost': row[f'cost.{mode}'],
            'choice': 1 if row['mode'] == mode else 0
        })

commute_long = pd.DataFrame(rows)

In [156]:
commute_long.head(4)

Unnamed: 0,id,mode,time,cost,choice
0,1,bike,20,0.0,0
1,1,bus,20,0.0,1
2,1,car,16,0.82,0
3,1,walk,55,0.0,0


In [157]:
def loglikelihood(params, data):
    # Create a copy of the input DataFrame to avoid modifying the original one
    df = data.copy()
    # Independent variables as a NumPy array
    X = df[['time', 'cost']]
    # Calculating the exponential utility
    df['exp_Util'] = np.exp(np.dot(X, params))
    # Summing exp_Util for each id
    df['sum_exp_Util'] = df.groupby('id')['exp_Util'].transform('sum')
    # Probability calculation
    df['prob'] = df['exp_Util'] / df['sum_exp_Util']
    # Calculating the log-likelihood
    loglikelihood_value = np.sum(df['choice'] * np.log(df['prob']))
    # Return negative log-likelihood (for minimization)
    return -loglikelihood_value

Now, we will find the optimal parameters.

In [158]:
initial_params = np.zeros(2)
result = minimize(loglikelihood, 
                  initial_params, 
                  args=(commute_long[['id', 'choice', 'time', 'cost']],), 
                  method='BFGS', 
                  options={'disp': True})
parameters = result.x

Optimization terminated successfully.
         Current function value: 1219.844944
         Iterations: 11
         Function evaluations: 51
         Gradient evaluations: 17


Now, from MLE, we remember that the asymptotic distribution is as follows:

$$
\widehat{\boldsymbol{\theta}} \stackrel{a}{\sim} \mathcal{N}\left(\boldsymbol{\theta}_0, I\left(\boldsymbol{\theta}_0\right)^{-1}\right) \,\, ,\,\, I\left(\boldsymbol{\theta}_0\right)=-E_0\left[\frac{\partial^2 \ln L\left(\boldsymbol{\theta}_0\right)}{\partial \boldsymbol{\theta}_0 \partial \boldsymbol{\theta}_0^{\prime}}\right]
$$
Then:
$$
\widehat{\operatorname{Var}}(\widehat{\boldsymbol{\theta}})=\left\{-\left.\frac{\partial^2 \ln L(\boldsymbol{\theta})}{\partial \boldsymbol{\theta} \partial \boldsymbol{\theta}^{\prime}}\right|_{\boldsymbol{\theta}=\widehat{\boldsymbol{\theta}}}\right\}^{-1}
$$

In [159]:
std = np.sqrt(result.hess_inv.diagonal())
z_stat = parameters / std
p_values = 2 * (1 - norm.cdf(abs(z_stat)))
summary_table = pd.DataFrame({
    'Parameter': ['Time', 'Cost'],
    'Coefficient': parameters,
    'Standard Error': std,
    'Z-statistic': z_stat,
    'P-value': p_values
})
print(summary_table.round(3).to_string(index=False))

Parameter  Coefficient  Standard Error  Z-statistic  P-value
     Time       -0.126           0.009      -14.191      0.0
     Cost       -1.002           0.171       -5.854      0.0


## b. 

Now, we change the model to the following:

$$
V_{nj} = \alpha_{j} + \beta C_{nj} + \gamma T_{nj}
$$

So, we have alternative specific intercepts. Here, the parameter space is $\{\alpha_{bus},\alpha_{car}, \alpha_{walk},\beta, \gamma\}$. So, we need to modify the model to add dummies for alternatives. 

In [160]:
# List of modes
modes = ['bike', 'bus', 'car', 'walk']

# Reshape the DataFrame
rows = []
for index, row in commute_data.iterrows():
    for mode in modes:
        rows.append({
            'id': row['id'],
            'mode': mode,
            'time': row[f'time.{mode}'],
            'cost': row[f'cost.{mode}'],
            'choice': 1 if row['mode'] == mode else 0
        })

commute_long_b = pd.DataFrame(rows)

for mode in modes:
    commute_long_b[f'dummy_{mode}'] = (commute_long_b['mode'] == mode).astype(int)

In [161]:
commute_long_b.head(4)

Unnamed: 0,id,mode,time,cost,choice,dummy_bike,dummy_bus,dummy_car,dummy_walk
0,1,bike,20,0.0,0,1,0,0,0
1,1,bus,20,0.0,1,0,1,0,0
2,1,car,16,0.82,0,0,0,1,0
3,1,walk,55,0.0,0,0,0,0,1


In [162]:
def loglikelihood_b(params, data):
    # Create a copy of the input DataFrame to avoid modifying the original one
    df = data.copy()
    X = df[['dummy_bike', 'dummy_bus', 'dummy_car',
            'dummy_walk', 'time', 'cost']]
    N = X.shape[1]
    n = len(params)
    for i in range(N - n):
        params = np.insert(params, 0, 0)
    # Calculating the exponential utility
    df['exp_Util'] = np.exp(np.dot(X, params))
    # Summing exp_Util for each id
    df['sum_exp_Util'] = df.groupby('id')['exp_Util'].transform('sum')
    # Probability calculation
    df['prob'] = df['exp_Util'] / df['sum_exp_Util']
    # Calculating the log-likelihood
    loglikelihood_value = np.sum(df['choice'] * np.log(df['prob']))
    # Return negative log-likelihood (for minimization)
    return -loglikelihood_value

In [163]:
initial_params_b = np.zeros(5) # alpha_bike, alpha_bus, alpha_car, alpha_walk, beta, gamma
result = minimize(loglikelihood_b, 
                  initial_params_b, 
                  args=(commute_long_b[['dummy_bike', 'dummy_bus', 'dummy_car', 'dummy_walk',
                                      'id', 'choice', 'time', 'cost']],),
                  method='BFGS', 
                  options={'disp': False})
parameters = result.x
ll_U = -result.fun

In [164]:
std = np.sqrt(result.hess_inv.diagonal())
z_stat = parameters / std
p_values = 2 * (1 - norm.cdf(abs(z_stat)))
summary_table_b = pd.DataFrame({
    'Parameters': ['Bus_intecept', 'Car_intercept', 'Walk_intercept', 'Time', 'Cost'],
    'Coefficient': parameters,
    'Standard Error': std,
    'Z-statistic': z_stat,
    'P-value': p_values
})
print(summary_table_b.round(3).to_string(index=False))

    Parameters  Coefficient  Standard Error  Z-statistic  P-value
  Bus_intecept        1.760           0.253        6.958    0.000
 Car_intercept        2.925           0.202       14.511    0.000
Walk_intercept        3.172           0.735        4.316    0.000
          Time       -0.296           0.085       -3.469    0.001
          Cost       -6.055           1.027       -5.895    0.000


Based on these results, as the intercepts are positive, ceteris paribus, students prefer all three alternatives to biking (as it is the base case with zero intercept).

### Running a likelihood ratio test

Now, we want to test the following:

$$
H_0: \alpha_{bus} = \alpha_{car} = \alpha_{walk} = 0
$$

So, it is done as follows:

$$
-2 \ln \lambda \sim \chi^2(J) \,\, \,\, ,\,\,\,\, -2 \ln \lambda=2\left(\ln L\left(\hat{\boldsymbol{\theta}}_U\right)-\ln L\left(\widehat{\boldsymbol{\theta}}_R\right)\right)
$$

with $J = 3$.

In [165]:
initial_params_restricted = np.zeros(2) #beta, gamma
result = minimize(loglikelihood_b,
                  initial_params_restricted, 
                  args=(commute_long_b[['dummy_bike', 'dummy_bus', 'dummy_car', 'dummy_walk',
                                      'id', 'choice', 'time', 'cost']],),
                  method='BFGS', 
                  options={'disp': True})
ll_R = -result.fun

Optimization terminated successfully.
         Current function value: 1219.844944
         Iterations: 11
         Function evaluations: 51
         Gradient evaluations: 17


In [166]:
likelihood_ratio_test(ll_U, ll_R, 3)

'Reject the null hypothesis (p-value: 0.0000)'

## c: Intercept and time specific alternatives

Here, the model looks as follows:

$$
V_{nj}=\alpha_j+\beta C_{nj}+\gamma_jT_{nj}
$$

So, the number of paremeters is $3 + 1 + 4 = 8$.

In [167]:
# List of modes
modes = ['bike', 'bus', 'car', 'walk']

# Reshape the DataFrame
rows = []
for index, row in commute_data.iterrows():
    for mode in modes:
        rows.append({
            'id': row['id'],
            'mode': mode,
            'time': row[f'time.{mode}'],
            'cost': row[f'cost.{mode}'],
            'choice': 1 if row['mode'] == mode else 0
        })

commute_long_c = pd.DataFrame(rows)

for mode in modes:
    commute_long_c[f'dummy_{mode}'] = (commute_long_c['mode'] == mode).astype(int)
    commute_long_c[f'time_{mode}'] = commute_long_c['time'] * commute_long_c[f'dummy_{mode}']

In [168]:
def loglikelihood_c(params, data):
    # Create a copy of the input DataFrame to avoid modifying the original one
    df = data.copy()
    X = df[['dummy_bike', 'dummy_bus', 'dummy_car',
            'dummy_walk', 'time_bike', 'time_bus', 'time_car', 'time_walk',
            'cost']]
    N = X.shape[1]
    n = len(params)
    for i in range(N - n):
        params = np.insert(params, 0, 0)
        
    # Calculating the exponential utility
    df['exp_Util'] = np.exp(np.dot(X, params))
    # Summing exp_Util for each id
    df['sum_exp_Util'] = df.groupby('id')['exp_Util'].transform('sum')
    # Probability calculation
    df['prob'] = df['exp_Util'] / df['sum_exp_Util']
    # Calculating the log-likelihood
    loglikelihood_value = np.sum(df['choice'] * np.log(df['prob']))
    # Return negative log-likelihood (for minimization)
    return -loglikelihood_value

In [169]:
initial_params_c = np.zeros(8) # alpha_bike, alpha_bus, alpha_car, alpha_walk, beta, gamma
result = minimize(loglikelihood_c, 
                  initial_params_c, 
                  args=(commute_long_c[['dummy_bike', 'dummy_bus', 'dummy_car', 'dummy_walk',
                                      'id', 'choice', 'time_bike', 'time_bus',
                                      'time_car', 'time_walk', 'cost']],),
                  method='BFGS', 
                  options={'disp': True})
parameters = result.x
ll_U = -result.fun

         Current function value: 982.356064
         Iterations: 22
         Function evaluations: 270
         Gradient evaluations: 30


  res = _minimize_bfgs(fun, x0, args, jac, callback, **options)


In [170]:
std = np.sqrt(result.hess_inv.diagonal())
z_stat = parameters / std
p_values = 2 * (1 - norm.cdf(abs(z_stat)))
summary_table_b = pd.DataFrame({
    'Parameters': ['Bus_intecept', 'Car_intercept', 'Walk_intercept',
                   'time_bike', 'time_bus',
                    'time_car', 'time_walk',
                    'Cost'],
    'Coefficient': parameters,
    'Standard Error': std,
    'Z-statistic': z_stat,
    'P-value': p_values
})
print(summary_table_b.round(3).to_string(index=False))

    Parameters  Coefficient  Standard Error  Z-statistic  P-value
  Bus_intecept       -0.219           0.372       -0.589    0.556
 Car_intercept        2.746           0.366        7.510    0.000
Walk_intercept        2.975           1.083        2.747    0.006
     time_bike       -0.289           0.039       -7.488    0.000
      time_bus       -0.143           0.040       -3.616    0.000
      time_car       -0.405           0.039      -10.436    0.000
     time_walk       -0.297           0.043       -6.857    0.000
          Cost       -2.604           0.986       -2.640    0.008


These results suggest that, ceteris paribus, driving or walking would be preferred to taking the bus or biking. Also, we can see the heterogeneity in the time coefficients, depending on the alternatives.

### Running a likelihood ratio test
$$
H_0: \gamma_{bike}=\gamma_{bus}=\gamma_{car}=\gamma_{walk}
$$

In [None]:
initial_params_c_restricted = np.zeros(8) # alpha_bike, alpha_bus, alpha_car, alpha_walk, beta, gamma
constraints = ({'type': 'eq', 'fun': lambda x: x[4] - x[5]},
               {'type': 'eq', 'fun': lambda x: x[5] - x[6]},
               {'type': 'eq', 'fun': lambda x: x[6] - x[7]})
result = minimize(loglikelihood_c, 
                  initial_params_c_restricted, 
                  args=(commute_long_c[['dummy_bike', 'dummy_bus', 'dummy_car', 'dummy_walk',
                                      'id', 'choice', 'time_bike', 'time_bus',
                                      'time_car', 'time_walk', 'cost']],),
                  method='SLSQP',
                  constraints=constraints, 
                  options={'disp': True})
parameters = result.x
ll_R = -result.fun

In [174]:
likelihood_ratio_test(ll_U, ll_R, 3)

'Reject the null hypothesis (p-value: 0.0000)'

So, we have rejected the null. Therefore, having heterogeneity in the time coefficient of different alternatives is a better fit.

# Problem 2: Generalized Method of Moments

Now, we get back to the case of a binary logit model. We only have two alternatives, bus or car. The values are as follows:

$$
\begin{aligned}&V_{nc}=\alpha+\beta C_{nc}+\gamma_{car}T_{nc}\\&V_{nb}=\gamma_{bus}T_{nb}\end{aligned}
$$

So, we have both intercept specific and time specific parameters. We have:
$$
V_{nc}-V_{nb}=\alpha+\beta C_{nc}+\gamma_{car}T_{nc}-\gamma_{bus}T_{nb}
$$
And the probability of choosing to drive is:

$$
P_{nc}= \frac{e^{V_{nc}}}{e^{V_{nc}} + e^{V_{nb}}} = \frac1{1+e^{-(V_{nc}-V_{nb})}}
$$

Here, we have 4 parameters. Therefore, at least we need 4 moment conditions.

In MLE, we remember that the optimization problem is:

$$
\max \sum_{n=1}^N \sum_{i=1}^J y_{n i} \ln \left[\frac{e^{\beta'x_{ni}}}{\sum_j e^{\beta'x_{nj}}}\right]
$$

Then, the FOCs look as follows:

$$
\sum_{n=1}^N\sum_{i=1}^J\left(y_{ni}-P_{ni}(x_n,\beta)\right)x_{ni}=0
$$
This indeed can be a moment condition as follows:
$$
E[\left(y_{ni}-P_{ni}(x_n,\beta)\right)x_{ni}] = 0
$$
In other words, our data characteristics is orthogonal to the error. 


In the model here, $x_{ni} = \{1, C_{nc}, T_{nc}, -T_{nb}\}$. So, we will have four moment conditions and four parameters. So, this model is just-identified using GMM.

In [7]:
commute_binary = pd.read_csv(address + '\\commute_binary.csv')
commute_binary['choice'] = (commute_binary['mode'] == 'car').astype(int)

Here, I will setup the full two step GMM from scratch.

In [96]:
def mm_fn(params, data):
    X = data[['cost.car', 'time.car', 'time.bus']]
    X = sm.add_constant(X)
    X = X.values
    y = data['choice'].values
    prob = expit(np.dot(X, params))
    # prob = 1/(1 + np.exp(-np.dot(X, params)))
    error = y - prob
    return error[:, np.newaxis] * X

def gmm_obj(params, data, weight):
    moments = mm_fn(params, data)
    N = len(data)
    moments_sum = (np.sum(moments, axis=0) / N)[:, np.newaxis] ## N*4 -> 4*1, sum on axis = 0
    obj = np.dot(np.dot(moments_sum.T, weight), moments_sum) / N
    return obj

In [97]:
initial_params = np.zeros(4)

N = len(commute_binary)

weight = np.random.rand(4,4)

result = minimize(gmm_obj, initial_params, 
                  args=(commute_binary, weight,), 
                  method='Nelder-Mead',
                  options={'xatol': 1e-10, 'maxiter': 10000})
parameters_first = result.x

print("First stage done")
#print("First stage results")
#print(parameters_first)

moments = mm_fn(parameters_first, commute_binary)
covariance_matrix = np.cov(moments, rowvar=False)
try:
    inv_cov_matrix = np.linalg.inv(covariance_matrix)
    print("Inverse of covariance matrix of empirical moments is calculated")
    result = minimize(gmm_obj, initial_params, 
                  args=(commute_binary, inv_cov_matrix,), 
                  method='Nelder-Mead',
                  options={'xatol': 1e-10, 'maxiter': 10000})
    parameters_second = result.x
    value = gmm_obj(parameters_second, commute_binary, inv_cov_matrix)
    print("Second stage done")
    print("Second stage results")
    print(parameters_second)
    # print("Value of the GMM objective function at the second stage is:")
    # print(value)
except np.linalg.LinAlgError:
    print('Singular matrix')
    print('only results of first stage are available')

First stage done
Inverse of covariance matrix of empirical moments is calculated
Second stage done
Second stage results
[ 2.23326705 -2.07715637 -0.33221545  0.13257431]


Now, we need to find the asymptotic variance of this estimator. 

$$

\widehat{\boldsymbol{\theta}} \stackrel{a}{\sim} \mathcal{N}\left(\boldsymbol{\theta}_0,{Var}(\widehat{\boldsymbol{\theta}})\right) \\
\widehat{Var}(\widehat{\boldsymbol{\theta}})=\frac{1}{n}\left(\widehat{\boldsymbol{G}}^{\prime}\widehat{\boldsymbol{S}}^{-1}\widehat{\boldsymbol{G}}\right)^{-1} \\\begin{aligned}&\widehat{\boldsymbol{G}}=\frac1n\sum_{i=1}^n\left.\frac{\partial\boldsymbol{m}(y_i,\boldsymbol{x}_i,\boldsymbol{z}_i,\boldsymbol{\theta})}{\partial\boldsymbol{\theta}^{\prime}}\right|_{\theta=\widehat{\boldsymbol{\theta}}}\\&\widehat{\boldsymbol{S}}=\frac1n\sum_{i=1}^n\boldsymbol{m}(y_i,x_i,z_i,\widehat{\theta})\boldsymbol{m}(y_i,x_i,z_i,\widehat{\theta})^{\prime}\end{aligned}
$$

First, we write a function to find the first order derivate of moments with respect to parameters.

In [86]:
def compute_jacobian_and_sum(params, data):
    X = data[['cost.car', 'time.car', 'time.bus']]
    X = np.hstack([np.ones((X.shape[0], 1)), X])  # Add intercept term (constant)
    y = data['choice'].values
    prob = expit(np.dot(X, params))  # Sigmoid function to get probabilities
    # Initialize the sum of Jacobians as a zero 4x4 matrix
    sum_jacobian = np.zeros((4, 4))
    # Loop through each data point
    for i in range(len(y)):
        xi = X[i]  # 1x4 vector
        pi = prob[i]  # scalar
        error_i = y[i] - pi  # scalar
        # Gradient of the probability with respect to params: pi * (1 - pi) * xi
        gradient = pi * (1 - pi) * np.outer(xi, xi)  # 4x4 matrix
        # Contribution to the Jacobian from the i-th data point
        jacobian_i = -gradient  # 4x4 matrix
        # Sum the Jacobian
        sum_jacobian += jacobian_i
    
    return sum_jacobian

# Calculate the sum of Jacobians
Jacobian = compute_jacobian_and_sum(parameters_second, commute_binary)/N

In [90]:
moments = mm_fn(parameters_second, commute_binary)
hat_S = np.cov(moments, rowvar=False)
hat_S_inv = np.linalg.inv(hat_S)
Var_estimator = np.linalg.inv(Jacobian.T @ hat_S_inv @ Jacobian)/N

Therefore, the result of this model is as follows:

In [101]:
parameters_second[3] = -parameters_second[3]
std = np.sqrt(np.diag(Var_estimator))
z_stat = parameters_second / std
p_values = 2 * (1 - norm.cdf(abs(z_stat)))
summary_table_gmm_a = pd.DataFrame({
    'Parameters': ['Intercept_car', 'Cost', 'Time.car', 'Time.Bus'],
    'Coefficient': parameters_second,
    'Standard Error': std,
    'Z-statistic': z_stat,
    'P-value': p_values
})
print(summary_table_gmm_a.round(3).to_string(index=False))

   Parameters  Coefficient  Standard Error  Z-statistic  P-value
Intercept_car        2.233           0.375        5.949    0.000
         Cost       -2.077           0.718       -2.892    0.004
     Time.car       -0.332           0.038       -8.755    0.000
     Time.Bus       -0.133           0.032       -4.150    0.000


So, ceteris paribus, the positive intercept of car shows that, relatively, students likes to drive than to take the bus. Also, as expected, the marginal utilities of cost and time are negative and significant. Also, the negative MU of time of bus is less than time of car. Therefore, students prefer time on the bus rather than driving.

### b. Endogeneity

It might be the case that time and cost are endogenous in this model. For example, someone who prefers to drive has chosen to live far away from the destination because they do not mind the extra cost
and time spent driving, and a student who enjoys taking the bus is more likely to live close to a
bus stop so the bus commute time is less.

So, we need instruments for them. We have four other variables we can use here as instruments. price_gas, snowfall, construction, and bus_detour.
Price_gas only affects the choice through cost. In other words, fixing the cost, a change in the price of gas would not change the choice variable. Moreover, snowfall, construction and bus_detour only affect time of bus and time of car and then the output variable. Therefore, these variables can be taken as valid instruments.

Now, the exogenous terms are:

$$
Z = \{z_1 = \text{intercept}, z_2 = \text{price-gas}, z_3 = \text{snowfall}, z_4=\text{construction},
z_5 = \text{bus-detour}\}
$$

Therefore, we have four parameters to estimate (like before), yet, five moment conditions. The moment conditions are:
$$
E[\left(y_{ni}-P_{ni}(x_n,\beta)\right)z_{ni}] = 0
$$

In [116]:
def mm_fn_endog(params, data):
    Z = data[['price_gas', 'snowfall', 'construction', 'bus_detour']]
    Z = sm.add_constant(Z)
    X = data[['cost.car', 'time.car', 'time.bus']]
    X = sm.add_constant(X)
    X = X.values
    Z = Z.values
    y = data['choice'].values
    prob = expit(np.dot(X, params))
    # prob = 1/(1 + np.exp(-np.dot(X, params)))
    error = y - prob
    return error[:, np.newaxis] * Z

def gmm_obj_endog(params, data, weight):
    moments = mm_fn_endog(params, data)
    N = len(data)
    moments_sum = (np.sum(moments, axis=0) / N)[:, np.newaxis] ## N*4 -> 4*1, sum on axis = 0
    obj = np.dot(np.dot(moments_sum.T, weight), moments_sum) / N
    return obj

In [145]:
initial_params = np.zeros(4)

N = len(commute_binary)

weight = np.eye(5)

result = minimize(gmm_obj_endog, initial_params, 
                  args=(commute_binary, weight,), 
                  method='Nelder-Mead',
                  options={'xatol': 1e-10, 'maxiter': 10000})
parameters_first = result.x

print("First stage done")
#print("First stage results")
#print(parameters_first)

moments = mm_fn_endog(parameters_first, commute_binary)
covariance_matrix = np.cov(moments, rowvar=False)
try:
    inv_cov_matrix = np.linalg.inv(covariance_matrix)
    print("Inverse of covariance matrix of empirical moments is calculated")
    result = minimize(gmm_obj_endog, initial_params, 
                  args=(commute_binary, inv_cov_matrix,), 
                  method='Nelder-Mead',
                  options={'xatol': 1e-10, 'maxiter': 10000})
    parameters_second = result.x
    print("Second stage done")
    print("Second stage results")
    print(parameters_second)
    # print("Value of the GMM objective function at the second stage is:")
    # print(value)
except np.linalg.LinAlgError:
    print('Singular matrix')
    print('only results of first stage are available')

First stage done
Inverse of covariance matrix of empirical moments is calculated
Second stage done
Second stage results
[ 2.91199065 -3.98605088 -0.35094517  0.15029961]


In [146]:
def compute_jacobian_and_sum_endog(params, data):
    Z = data[['price_gas', 'snowfall', 'construction', 'bus_detour']]
    Z = np.hstack([np.ones((Z.shape[0], 1)), Z])
    X = data[['cost.car', 'time.car', 'time.bus']]
    X = np.hstack([np.ones((X.shape[0], 1)), X])  # Add intercept term (constant)
    y = data['choice'].values
    prob = expit(np.dot(X, params))  # Sigmoid function to get probabilities
    # Initialize the sum of Jacobians as a zero 4x4 matrix
    sum_jacobian = np.zeros((5, 4))
    # Loop through each data point
    for i in range(len(y)):
        zi = Z[i]  # 1x5 vector
        xi = X[i] # 1x4 vector
        pi = prob[i]  # scalar
        error_i = y[i] - pi  # scalar
        # Gradient of the probability with respect to params: pi * (1 - pi) * xi
        gradient = pi * (1 - pi) * np.outer(zi, xi)  # 5x4 matrix
        # Contribution to the Jacobian from the i-th data point
        jacobian_i = -gradient  # 4x4 matrix
        # Sum the Jacobian
        sum_jacobian += jacobian_i
    
    return sum_jacobian

# Calculate the sum of Jacobians
Jacobian = compute_jacobian_and_sum_endog(parameters_second, commute_binary)/N

In [148]:
moments = mm_fn_endog(parameters_second, commute_binary)
hat_S = np.cov(moments, rowvar=False)
hat_S_inv = np.linalg.inv(hat_S)
Var_estimator = np.linalg.inv(Jacobian.T @ hat_S_inv @ Jacobian)/N

In [149]:
parameters_second[3] = -parameters_second[3]
std = np.sqrt(np.diag(Var_estimator))
z_stat = parameters_second / std
p_values = 2 * (1 - norm.cdf(abs(z_stat)))
summary_table_gmm_a = pd.DataFrame({
    'Parameters': ['Intercept_car', 'Cost', 'Time.car', 'Time.Bus'],
    'Coefficient': parameters_second,
    'Standard Error': std,
    'Z-statistic': z_stat,
    'P-value': p_values
})
print(summary_table_gmm_a.round(3).to_string(index=False))

   Parameters  Coefficient  Standard Error  Z-statistic  P-value
Intercept_car        2.912           3.813        0.764    0.445
         Cost       -3.986           8.057       -0.495    0.621
     Time.car       -0.351           0.123       -2.855    0.004
     Time.Bus       -0.150           0.053       -2.859    0.004


The parameter estimates are roughly the same as those in the previous model. However,
the intercept and cost parameters now have much larger standard errors, rendering those
parameters not statistically significant. Using instruments can reduce the precision of our
parameter estimates, especially if they are not sufficiently correlated with the relevant variables,
which may be the case here.

#### Overidentification test

As the number of parameters is less than moments, we can run this test to see if empirical moments are actually close enough to zero. 

$$
OIR=\left[\frac{1}{n}\sum_{i=1}^{n}\boldsymbol{m}(y_{i},\boldsymbol{x}_{i},\boldsymbol{z}_{i},\widehat{\boldsymbol{\theta}})\right]^{\prime}\widetilde{\boldsymbol{S}}^{-1}\left[\frac{1}{n}\sum_{i=1}^{n}\boldsymbol{m}(y_{i},\boldsymbol{x}_{i},\boldsymbol{z}_{i},\widehat{\boldsymbol{\theta}})\right] \\ OIR\overset{a}{\operatorname*{\sim}}\chi^2(L-K)
$$

In [187]:
OIR = gmm_obj_endog(parameters_second, commute_binary, hat_S_inv)[0,0]*N
p_value = 1 - chi2.cdf(OIR, 1)
if p_value < 0.01:
    result = f'Reject the null hypothesis (p-value: {p_value:.4f})'
else:
    result = f'Fail to reject the null hypothesis (p-value: {p_value:.4f})'
print(result)

Fail to reject the null hypothesis (p-value: 0.3703)


So, we failed to reject the null. So we conclude that all
empirical moments are sufficiently close to zero.