This is the **Car: Stated Preferences for Car Choice** from the mlogit package in R.

information of the dataset is available [here](https://rdrr.io/cran/mlogit/man/Car.html)

In [31]:
import os
path = os.getcwd()
path

'C:\\Users\\thwai\\Mercor'

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from statsmodels.discrete.discrete_model import MNLogit
from statsmodels.discrete.conditional_models import ConditionalLogit
from scipy.stats import norm

In [20]:
# Avoids verbose outputs
warnings.filterwarnings("ignore")

In [21]:
path = r"C:\Users\thwai\OneDrive\Documents\Mercor\Task 9 Choice Based Conjoint\Car.csv"
wide = pd.read_csv(path)
wide['id'] = np.arange(len(wide))
wide['choice'] = wide['choice'].str.replace('choice','').astype(int)

# shorthand way of turning the dataset from wide to long format, without brute forcing it via pd.melt()
long = pd.wide_to_long(
    wide,
    stubnames=["price","range","acc","speed","pollution","size","space","cost","station","fuel","type"],
    i="id", j="alt_id", sep="", suffix=r"\d+"
).reset_index()

long['choice'] = np.where(long['choice'] == long['alt_id'],1,0)

# establish baselines: 'regcar' and 'gasoline'
long['type'] = pd.Categorical(
    long['type'],
    categories = ["regcar","sportuv","sportcar","stwagon","truck","van"],
    ordered=False
)
long['fuel'] = pd.Categorical(
    long['fuel'],
    categories = ["gasoline","methanol","cng","electric"],
    ordered=False
)

In [22]:
# generate random values of income from a normal distribution
income = norm(loc=77700,scale=23000).rvs(size=long.shape[0], random_state=42)
income = np.where(income < 10000,10000,income)
print(f'The average income in the population is ${round(income.mean(),0)}')
income_log = np.log(income)
print(f'The average log income in the population is {round(income_log.mean(),3)}')
long['income'] = income
long['ln_income'] = np.log(income)
long['price'] = long['price']*income_log*1000 
print(f'The average vehicle prince is ${round(long['price'].mean(),3)}')

The average income in the population is $77761.0
The average log income in the population is 11.21
The average vehicle prince is $47151.052


In [27]:
long.head()

Unnamed: 0,id,alt_id,choice,college,coml5,hsg2,price,range,acc,speed,pollution,size,space,cost,station,fuel,type,income,ln_income
0,0,1,1,0,0,0,47589.69783,250,4.0,95,0.6,3,0.7,4,0.1,cng,van,89124.425519,11.397789
1,1,1,0,1,1,1,37144.922025,125,2.5,85,0.0,3,0.7,4,0.0,methanol,regcar,74519.921073,11.218822
2,2,1,0,0,0,1,46196.61541,300,6.0,140,0.1,2,1.0,6,0.1,cng,regcar,92596.836376,11.43601
3,3,1,0,0,1,0,82196.62684,200,4.0,100,0.0,2,1.0,8,0.0,methanol,regcar,112729.686697,11.632748
4,4,1,0,0,0,1,64829.544286,75,4.0,85,0.1,1,0.7,6,0.3,cng,regcar,72314.472381,11.18878


In [23]:
# one hot encode type and fuel (drop baselines set above)
type_d = pd.get_dummies(long['type'], prefix='type', drop_first=True)
fuel_d = pd.get_dummies(long['fuel'], prefix='fuel', drop_first=True)  

# create X: the are the independent variables for the prediction
X = pd.concat([
    type_d, fuel_d,
    long[['price', 'range','acc', 'speed', 'pollution', 'size', 'space', 'cost', 'station']]
],
             axis=1)
# create df of the X and y variables together
model_df = pd.concat([
    long[['id','alt_id','choice']].reset_index(drop=True),
    X.reset_index(drop=True)
    ], axis=1)

# get the working dataframe: we don't want to alter the model_df because we are going to manipulate the values in est_df
est_df = model_df.copy()

# split out id from non-id columns: we don't want these to be the Independent Variables. We need the "id" to identify which choices are grouped together
id_cols = ["id","alt_id","choice"]

# get the actual columns we want for the Independent Variables (IVs)
X_cols = [c for c in est_df.columns if c not in id_cols]  # NOTE: no 'alt'/'alt_id' here

# force all columns to numeric
X_numeric = est_df[X_cols].apply(pd.to_numeric, errors="coerce")

# remove bad rows (if there are any)
bad = X_numeric.isna().any(axis=1) | ~np.isfinite(X_numeric).all(axis=1)
clean = est_df.loc[~bad].copy()
X = X_numeric.loc[~bad].astype(float).to_numpy()
y = clean["choice"].astype(int).to_numpy()

# group all the 6 choices a user saw together
groups = clean["id"].to_numpy()

# Drop any constant columns (variance==0)
col_vars = X.var(axis=0)

if np.any(col_vars == 0):
    keep = col_vars != 0
    X = X[:, keep]
    X_cols = [c for c,v in zip(X_cols, keep) if v]

    # Fit Conditional Logit
cl = ConditionalLogit(y, X, groups=groups)
res = cl.fit(method="bfgs", disp=True)
print(res.summary(xname=X_cols))

         Current function value: 0.298627
         Iterations: 0
         Function evaluations: 114
         Gradient evaluations: 102
                  Conditional Logit Model Regression Results                  
Dep. Variable:                      y   No. Observations:                27924
Model:               ConditionalLogit   No. groups:                       4654
Log-Likelihood:               -8338.8   Min group size:                      6
Method:                          bfgs   Max group size:                      6
Date:                Tue, 30 Sep 2025   Mean group size:                   6.0
Time:                        21:58:59                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
type_sportuv           0      0.114          0      1.000      -0.223       0.223
type_sportcar          0      0.119          0      1.000      -0.

In [26]:
pd.DataFrame(X).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
count,27924.0,27924.0,27924.0,27924.0,27924.0,27924.0,27924.0,27924.0,27924.0,27924.0,27924.0,27924.0,27924.0,27924.0,27924.0,27924.0,27924.0
mean,0.03753,0.031514,0.159218,0.201547,0.178771,0.248961,0.251253,0.250609,47151.051672,237.632503,4.166667,99.771523,0.364328,2.3526,0.925483,4.692881,0.432066
std,0.190061,0.174706,0.365886,0.401163,0.383167,0.432419,0.433742,0.433371,21313.262223,94.328979,1.433747,24.043677,0.30314,0.789627,0.129626,2.408741,0.400257
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6425.078985,50.0,2.5,55.0,0.0,0.0,0.7,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33807.688353,125.0,2.5,85.0,0.1,2.0,1.0,2.0,0.1
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45559.213257,250.0,4.0,95.0,0.25,3.0,1.0,4.0,0.3
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,58565.180304,300.0,6.0,110.0,0.6,3.0,1.0,6.0,0.7
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,203898.47543,400.0,6.0,140.0,1.0,3.0,1.0,8.0,1.0


In [None]:
# Make params 1-D and align names
params_1d = np.asarray(res.params).reshape(-1)

# Prefer names from the result (if exposed); otherwise fall back to X_cols
names = getattr(res, "xnames", None) or X_cols

# Safety check: handle any length mismatch defensively
if len(params_1d) != len(names):
    # Try to sync lengths; warn if they still disagree
    m = min(len(params_1d), len(names))
    print(f"[warn] params ({len(params_1d)}) and names ({len(names)}) differ; truncating to {m}.")
    params_1d = params_1d[:m]
    names = names[:m]

coefs = pd.Series(params_1d, index=names)

# Build readable part-worth tables
partworths = {}

# Vehicle type (baseline "regcar" = 0)
type_levels = ["regcar","sportuv","sportcar","stwagon","truck","van"]
type_pw = {"type_regcar": 0.0}
for lvl in type_levels[1:]:
    name = f"type_{lvl}"
    if name in coefs:
        type_pw[name] = coefs[name]
partworths["type"] = pd.Series(type_pw)

# Fuel (baseline "gasoline" = 0)
fuel_levels = ["gasoline","methanol","cng","electric"]
fuel_pw = {"fuel_gasoline": 0.0}
for lvl in fuel_levels[1:]:
    name = f"fuel_{lvl}"
    if name in coefs:
        fuel_pw[name] = coefs[name]
partworths["fuel"] = pd.Series(fuel_pw)

# Continuous attributes: utility per unit
cont_cols = ['price','range','acc','speed','pollution','size','space','cost','station']
partworths["continuous"] = coefs[coefs.index.isin(cont_cols)].sort_index()

for k,v in partworths.items():
    print(f"\n=== {k.upper()} PART-WORTHS ===")
    print(v.sort_index())

In [15]:
# --- after you've created `coefs` (Series indexed by names) and you still have `long` ---
beta_price = float(coefs['price'])

# 1) get log_income per person if available; else use a reference scalar
# Expect either a column named 'log_income' OR 'income' we can log.
log_income_col = None
for cand in ['log_income', 'ln_income', 'income_log', 'income_ln', 'income']:
    if cand in long.columns:
        log_income_col = cand
        break

if log_income_col is None:
    # Fallback: pick a reference log-income (e.g.,  median of an external value).
    # If you only have the transformed price, you cannot back out income,
    # so choose a sensible constant (say,  median U.S. household income ≈ $75k):
    ref_income = 75000.0
    log_income_ref = np.log(ref_income)
    long['_log_income_used_'] = log_income_ref
    print(f"[info] No income/log_income column found; using reference income=${ref_income:,.0f} "
          f"(log={log_income_ref:.3f}) for WTP & RI.")
else:
    if log_income_col == 'income':
        long['_log_income_used_'] = np.log(long['income'].astype(float))
    else:
        long['_log_income_used_'] = long[log_income_col].astype(float)

# 2) helper: WTP for a coefficient value b, given each person's log_income
def wtp_from_beta(b, log_inc, beta_p):
    return b * (log_inc / abs(beta_p))

# --- collect coefficients ---
cont_cols = ['range','acc','speed','pollution','size','space','cost','station']  # exclude 'price' itself
type_levels = ["regcar","sportuv","sportcar","stwagon","truck","van"]
fuel_levels = ["gasoline","methanol","cng","electric"]

# build categorical part-worths (baseline=0)
type_pw = {'type_regcar': 0.0}
for lvl in type_levels[1:]:
    nm = f"type_{lvl}"
    if nm in coefs: type_pw[nm] = float(coefs[nm])

fuel_pw = {'fuel_gasoline': 0.0}
for lvl in fuel_levels[1:]:
    nm = f"fuel_{lvl}"
    if nm in coefs: fuel_pw[nm] = float(coefs[nm])

# 3) per-person WTP ranges for each attribute
ranges_by_person = []

# compute design ranges from your actual data (you can replace with experimental design bounds)
design_ranges = {a: (long[a].min(), long[a].max()) for a in ['price']+cont_cols}

# iterate over people (ids)
for pid, g in long.groupby('id', sort=False):
    Li = g['_log_income_used_'].iloc[0]  # one income per person
    # CATEGORICAL: take max-min of level WTPs (include baseline=0)
    type_wtps = [wtp_from_beta(v, Li, beta_price) for v in type_pw.values()]
    fuel_wtps = [wtp_from_beta(v, Li, beta_price) for v in fuel_pw.values()]
    type_range = (max(type_wtps) - min(type_wtps)) if len(type_wtps) else 0.0
    fuel_range = (max(fuel_wtps) - min(fuel_wtps)) if len(fuel_wtps) else 0.0

    # CONTINUOUS: linear, so range = |WTP_per_unit| * (hi - lo)
    cont_ranges = {}
    for a in cont_cols:
        if a in coefs.index and pd.notna(coefs[a]):
            lo, hi = design_ranges[a]
            span = float(hi - lo)
            wtp_per_unit = wtp_from_beta(float(coefs[a]), Li, beta_price)
            cont_ranges[a] = abs(wtp_per_unit) * span
        else:
            cont_ranges[a] = 0.0

    pieces = {'type': type_range, 'fuel': fuel_range, **cont_ranges}
    total = sum(pieces.values())
    if total <= 0:
        rel_imp = {k: 0.0 for k in pieces}
    else:
        rel_imp = {k: 100.0 * v / total for k, v in pieces.items()}
    ranges_by_person.append(pd.Series(rel_imp, name=pid))

# 4) aggregate to a single RI vector
ri_df = pd.DataFrame(ranges_by_person).fillna(0.0)
ri_mean = ri_df.mean().sort_values(ascending=False).round(1)
print("\n=== RELATIVE IMPORTANCE from WTP (%, averaged across people) ===")
print(ri_mean)

# (Optional) also return the reference-income RI if you used a fixed reference:
if log_income_col is None:
    # This equals the same computation above (everyone shares the same log_income_ref),
    # but we print it explicitly for clarity.
    print("\n[info] These RIs reflect the chosen reference income; using another reference will rescale WTPs "
          "but RIs remain identical because all ranges scale proportionally.")


[info] No income/log_income column found; using reference income=$75,000 (log=11.225) for WTP & RI.

=== RELATIVE IMPORTANCE from WTP (%, averaged across people) ===
type         0.0
fuel         0.0
range        0.0
acc          0.0
speed        0.0
pollution    0.0
size         0.0
space        0.0
cost         0.0
station      0.0
dtype: float64

[info] These RIs reflect the chosen reference income; using another reference will rescale WTPs but RIs remain identical because all ranges scale proportionally.


In [16]:
type_long = wide.melt(
    id_vars=['id','choice'],
    value_vars=['type1','type2', 'type3','type4','type5','type6'],
    var_name='alt',
    value_name= 'type')

fuel_long = wide.melt(
    id_vars=['id','choice'],
    value_vars=['fuel1', 'fuel2', 'fuel3', 'fuel4', 'fuel5', 'fuel6'],
    var_name='alt',
    value_name= 'fuel')

price_long = wide.melt(
    id_vars=['id','choice'],
    value_vars=['price1', 'price2', 'price3', 'price4', 'price5', 'price6'],
    var_name='alt',
    value_name= 'price')

range_long = wide.melt(
    id_vars=['id','choice'],
    value_vars=['range1', 'range2', 'range3', 'range4', 'range5', 'range6'],
    var_name='alt',
    value_name= 'range')

acc_long = wide.melt(
    id_vars=['id','choice'],
    value_vars=['acc1', 'acc2', 'acc3', 'acc4', 'acc5', 'acc6'],
    var_name='alt',
    value_name= 'acc')

speed_long = wide.melt(
    id_vars=['id','choice'],
    value_vars=['speed1', 'speed2', 'speed3','speed4', 'speed5', 'speed6'],
    var_name='alt',
    value_name= 'speed')

pollution_long = wide.melt(
    id_vars=['id','choice'],
    value_vars=['pollution1', 'pollution2', 'pollution3','pollution4', 'pollution5', 'pollution6'],
    var_name='alt',
    value_name= 'pollution')

size_long = wide.melt(
    id_vars=['id','choice'],
    value_vars=['size1', 'size2', 'size3','size4', 'size5', 'size6'],
    var_name='alt',
    value_name= 'size')

space_long = wide.melt(
    id_vars=['id','choice'],
    value_vars=['space1', 'space2', 'space3', 'space4','space5', 'space6'],
    var_name='alt',
    value_name= 'space')

cost_long = wide.melt(
    id_vars=['id','choice'],
    value_vars=['cost1', 'cost2', 'cost3', 'cost4', 'cost5',
       'cost6'],
    var_name='alt',
    value_name= 'cost')

station_long = wide.melt(
    id_vars=['id','choice'],
    value_vars=['station1', 'station2', 'station3', 'station4', 'station5',
       'station6'],
    var_name='alt',
    value_name= 'station')

In [110]:
# Set up the initial table
type_long['alt'] = type_long['alt'].str.replace(r'\D+','',regex=True)
df = type_long.copy()

# create loop for joining all the other tables
tables = [fuel_long, price_long, range_long, acc_long, speed_long, pollution_long, size_long, space_long, cost_long, station_long]

for table in tables:
    table['alt'] = table['alt'].str.replace(r'\D+','',regex=True)
    df = df.merge(table, how='left', on=['id','choice','alt'])

# convert from string to float to match 'choice'
df['alt'] = df['alt'].astype(int) 

#  only returns 1 when the choice number matches the alteranative, 0 else where
df['choice'] = np.where(df['choice'] == df['alt'], 1, 0)