In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Sample data generation (Replace with your actual data)
np.random.seed(0)  # fixed seed so the synthetic sample is reproducible
n_samples = 100

# Alternatives listed in the order later cells use for positional lookup
# (choices.index(...)): car=0, bus=1, bike=2, walk=3.
choice_order = ['car', 'bus', 'bike', 'walk']

# Generating random data. The draw order is individual_var, then cost/time for
# each alternative, then choice, so the seeded values match the original layout.
data = {'individual_var': np.random.rand(n_samples)}  # Individual-specific variable
for alternative in choice_order:
    data[f'cost_{alternative}'] = np.random.rand(n_samples)
    data[f'time_{alternative}'] = np.random.rand(n_samples)
data['choice'] = np.random.choice(choice_order, size=n_samples)

df = pd.DataFrame(data)

# Encoding the choice column to numerical values with an explicit category order.
# A bare astype('category') sorts the labels alphabetically (bike=0, bus=1,
# car=2, walk=3), which disagrees with the positional codes used when the
# design matrix is built and silently swaps car and bike.
df['choice'] = pd.Categorical(df['choice'], categories=choice_order).codes

In [2]:
# Alternatives and the per-alternative attribute column names.
# The position of a name in `choices` is used below as its integer code, so this
# assumes df['choice'] was encoded in the same order -- verify against the
# encoding cell.
choices = ['car', 'bus', 'bike', 'walk']
time_cols = ['time_' + name for name in choices]
cost_cols = ['cost_' + name for name in choices]

# Empty design matrix, filled column by column
X = pd.DataFrame()

# Cost and time of an alternative are kept only on rows where that alternative
# was chosen; every other row gets 0.
# NOTE(review): these regressors (and the intercepts below) are gated on
# df['choice'], which is also the dependent variable y -- confirm this is the
# intended specification before fitting.
for code, choice in enumerate(choices):
    chosen = df['choice'] == code
    for col in ('cost', 'time'):
        label = f'{col}_{choice}'
        X[label] = np.where(chosen, df[label], 0)

# Alternative-specific intercepts; the first alternative (car) is skipped so it
# serves as the baseline and the dummies are not collinear with the constant.
for code, choice in enumerate(choices[1:], start=1):
    is_chosen = df['choice'] == code
    X[f'intercept_{choice}'] = is_chosen.astype(int)

# Individual-specific variable (identical across alternatives)
X['individual_var'] = df['individual_var']

# Prepend the constant column for the baseline intercept
X = sm.add_constant(X)

# Dependent variable
y = df['choice']