In [None]:
import os

# Repository root, expressed relative to the directory the notebook starts in.
root = "../../Foundation_of_Advanced_Quantitative_Marketing_Python"
# A relative os.chdir is not idempotent: re-running this cell would resolve
# `root` against the already-changed directory and either fail or end up in
# the wrong place. Only change directory when we are not already in the root.
if os.path.basename(os.getcwd()) != os.path.basename(root):
    os.chdir(root)

In [5]:
import os
import numpy as np
import pandas as pd
from src import logit as lgt
from scipy.optimize import minimize
import statsmodels.api as sm

# Logit Model Estimation

In [None]:
# Load the yogurt purchase data. The path is relative to the repository root
# (the working directory set by os.chdir in the first cell), which is where
# every other loader in this notebook looks for the Data folder.
df = pd.read_excel('./Data/Yogurt100N.xlsx')

In [2]:
p = 4  # number of alternatives

# Per-alternative covariates, laid out in the sheet as (Feature j, Price j) pairs.
cols = ['Feature 1', 'Price 1',
        'Feature 2', 'Price 2',
        'Feature 3', 'Price 3',
        'Feature 4', 'Price 4']
N = len(df)  # number of rows (observations)

# Alternative-specific dummies: identity rows for the first p-1 alternatives
# and an all-zero row for the last one, repeated per observation -> (N, p, p-1).
bdummy = np.tile(np.eye(p, p - 1), (N, 1, 1))

# Regroup the covariates per alternative -> (N, p, 2) and append them to the
# dummies along the last axis -> (N, p, 5).
features = np.dstack((bdummy, df[cols].values.reshape(N, p, 2)))
X = features

# Choice index: position of the largest entry among the brand columns.
brands = df[['Brand 1', 'Brand  2', 'Brand 3', 'Brand 4']].values
y = brands.argmax(axis=1)
X.shape, y.shape

((2430, 4, 5), (2430,))

In [3]:
# Fit the logit model on the design array X and the choice indices y.
# The fit call is the last expression, so its return value is displayed.
model = lgt.LogitModel()
model.fit(X, y)

Optimized parameters:
[  1.38771887   0.64348839  -3.08609171   0.48743159 -37.05719774]
Maximized LL: -2658.5566966130473


<logit.LogitModel at 0x1621774f0>

In [4]:
# Fit statistics: rho2 is called with the data, the information criteria
# with the fitted model only.
print(lgt.Metrics.rho2(model, X, y))
for criterion in ('AIC', 'BIC', 'HQC', 'CAIC'):
    print(getattr(lgt.Metrics, criterion)(model))

0.21080523390278283
5327.113393226095
5356.091625907768
5337.6490476412155
5327.138145701342


# Nested Logit Model Estimation

In [None]:
# Read the yogurt purchase sheet used for the nested logit section.
df = pd.read_excel(io='./Data/Yogurt100N.xlsx')

In [10]:
p = 4  # number of alternatives

# Per-alternative covariates, laid out in the sheet as (Feature j, Price j) pairs.
cols = ['Feature 1', 'Price 1',
        'Feature 2', 'Price 2',
        'Feature 3', 'Price 3',
        'Feature 4', 'Price 4']
N = len(df)  # number of rows (observations)

# Alternative-specific dummies (last alternative is the all-zero row),
# repeated per observation -> (N, p, p-1).
bdummy = np.tile(np.eye(p, p - 1), (N, 1, 1))

# Nest membership of each alternative: the first three share group 0 and the
# fourth sits alone in group 1; repeated per observation -> (N, p, 1).
group_ids = np.tile(np.array([0, 0, 0, 1]).reshape(p, 1), (N, 1, 1))

# Stack dummies, covariates and the group id along the last axis
# -> (N, p, 3 + 2 + 1 = 6).
features = np.dstack((bdummy, df[cols].values.reshape(N, p, 2), group_ids))
X = features

# Choice index: position of the largest entry among the brand columns.
brands = df[['Brand 1', 'Brand  2', 'Brand 3', 'Brand 4']].values
y = brands.argmax(axis=1)
X.shape, y.shape

# Two nests, one of which holds a single product.
model = lgt.NestedLogitModel()
model.fit(X, y, 2, nsingle_class=1)

Optimized parameters for features:
[  1.3816447    0.83942657  -1.65835188   0.3744476  -26.58028504]
Optimized parameters for within-group correlations (rho = 0.5 will be assigned to groups with only one product):
[0.64 0.5 ]
Maximized LL: -2653.764665903218


<logit.NestedLogitModel at 0x1620e5d80>

In [11]:
# Fit statistics for the nested logit model, printed one per line.
print(lgt.Metrics.rho2(model, X, y))
for criterion in ('AIC', 'BIC', 'HQC', 'CAIC'):
    print(getattr(lgt.Metrics, criterion)(model))

0.235958513035898
5319.529331806436
5354.303211024444
5332.172117104581
5319.5639995736665


# LCM Estimation
Global optimization is performed with basin-hopping.

In [None]:
# Read the yogurt sheet used for the latent class section.
df = pd.read_excel(io='./Data/24_Yogurt100_2SegSolution (1).xlsx')

In [5]:
p = 4  # number of alternatives

# Per-alternative covariates, laid out in the sheet as (Feature j, Price j) pairs.
cols = ['Feature 1', 'Price 1',
        'Feature 2', 'Price 2',
        'Feature 3', 'Price 3',
        'Feature 4', 'Price 4']
N = len(df)  # number of rows (observations)
indiv_id = df['Pan I.D.'].values  # panelist id of every row

# Alternative-specific dummies (last alternative is the all-zero row),
# repeated per observation -> (N, p, p-1).
bdummy = np.tile(np.eye(p, p - 1), (N, 1, 1))

# Regroup the covariates per alternative -> (N, p, 2) and append them to the
# dummies along the last axis -> (N, p, 5).
features = np.dstack((bdummy, df[cols].values.reshape(N, p, 2)))
X = features

# Choice index: position of the largest entry among the brand columns.
brands = df[['Brand 1', 'Brand  2', 'Brand 3', 'Brand 4']].values
y = brands.argmax(axis=1)
X.shape, y.shape

((2431, 4, 5), (2431,))

In [9]:
# Latent class model with two segments, followed by its fit statistics.
model = lgt.LatentClassModel()
model.fit(X, y, nclasses=2, indiv_id=indiv_id)
print(f'rho2: {lgt.Metrics.rho2(model, X, y)!s}')
for criterion in ('AIC', 'BIC', 'HQC', 'CAIC'):
    print(f'{criterion}: {getattr(lgt.Metrics, criterion)(model)!s}')

Current NLL: 1915.8010532041665
Current NLL: 1915.8013574514746
Current NLL: 1915.8019621214269
Current NLL: 1915.8015617107574
Current NLL: 1915.801580974075
Current NLL: 1915.8010657174586
Current NLL: 1915.8011467928704
Current NLL: 1915.8031630179566
Current NLL: 1915.8015869215685
Current NLL: 1915.8010535486228
Current NLL: 1915.8027657431355
Optimized parameters for features:
[  2.69780807   3.85194226  -0.67362732   1.42635078 -50.36285002
   1.33346181  -1.29435176  -4.28497991   0.3774249  -36.91499758]
Optimized parameters for class weights (pie):
[0.48 0.52]
Maximized LL: -1915.8010532041665
rho2: None
AIC: 3853.602106408333
BIC: 3917.3587441258105
HQC: 3876.7817072051216
CAIC: 3853.7112424149473




In [10]:
# Latent class model with three segments, followed by its fit statistics.
model = lgt.LatentClassModel()
model.fit(X, y, nclasses=3, indiv_id=indiv_id)
print(f'rho2: {lgt.Metrics.rho2(model, X, y)!s}')
for criterion in ('AIC', 'BIC', 'HQC', 'CAIC'):
    print(f'{criterion}: {getattr(lgt.Metrics, criterion)(model)!s}')

Current NLL: 2116.6379432343742
Current NLL: 1482.9701672487647
Current NLL: 1482.9702488571083
Current NLL: 1482.9702556794994
Current NLL: 1482.9708347154985
Current NLL: 1482.9702405593519
Current NLL: 1482.9706155486365
Current NLL: 1482.9702780190332
Current NLL: 1482.9713476911977
Current NLL: 1482.9704342727287
Current NLL: 1482.9705165406583
Optimized parameters for features:
[ -1.08632562  -2.64056985  -6.07426227   1.06046713 -20.2326853
   1.96800861   3.00786833  -1.64989002   1.31390257 -55.50993595
   5.14384351   1.40378513  -1.84316748  -0.18450514 -57.49132846]
Optimized parameters for class weights (pie):
[0.15 0.51 0.34]
Maximized LL: -1482.9701672487647
rho2: None
AIC: 2999.9403344975294
BIC: 3098.4733200609035
HQC: 3035.7633539107483
CAIC: 3000.1939606889923


In [3]:
# Latent class model with four segments, followed by its fit statistics.
model = lgt.LatentClassModel()
model.fit(X, y, nclasses=4, indiv_id=indiv_id)
print(f'rho2: {lgt.Metrics.rho2(model, X, y)!s}')
for criterion in ('AIC', 'BIC', 'HQC', 'CAIC'):
    print(f'{criterion}: {getattr(lgt.Metrics, criterion)(model)!s}')

Current NLL: 2116.6379420988937
Current NLL: 1389.7312196574803
Current NLL: 1389.7312990693235
Current NLL: 1389.7317044019478
Current NLL: 1389.731622565172
Current NLL: 1389.7311811991895
Current NLL: 1389.731179664323
Current NLL: 1389.7313192541126
Current NLL: 1389.731411138419
Current NLL: 1389.732563421723
Current NLL: 1389.7314990345628
Optimized parameters for features:
[ -1.46097287  -4.00197312 -34.32323569   1.09791698 -14.82351352
   1.30161244   0.06910057  -2.4794909    1.02645535 -48.03713536
   2.3319257    4.04250635  -0.90529474   0.93899491 -41.28363967
   6.10593626   2.42610433  -0.84917872  -0.19424931 -54.8874037 ]
Optimized parameters for class weights (pie):
[0.09 0.23 0.37 0.3 ]
Maximized LL: -1389.731179664323
rho2: None
AIC: 2825.462359328646
BIC: 2958.771692737917
HQC: 2873.928797358295
CAIC: 2825.9210215637936




In [9]:
# Assign a segment (0-3) to each of the 101 households, using the model
# fitted in the preceding cell (the four-segment latent class model).
model.segment_assign()

array([2, 0, 1, 1, 2, 0, 2, 2, 1, 2, 1, 2, 1, 3, 0, 0, 3, 0, 2, 3, 1, 3,
       2, 2, 1, 2, 2, 2, 2, 1, 3, 3, 2, 3, 2, 3, 2, 1, 1, 3, 2, 1, 3, 2,
       3, 3, 3, 3, 1, 2, 2, 2, 1, 2, 3, 1, 1, 2, 3, 2, 3, 2, 1, 0, 3, 2,
       3, 2, 2, 0, 3, 2, 3, 2, 1, 1, 2, 2, 3, 3, 0, 2, 2, 1, 3, 2, 1, 3,
       1, 3, 3, 2, 1, 3, 3, 2, 0, 3, 1, 1, 2])

# Random Coefficients Model

In [None]:
# Read the yogurt sheet for the random coefficients section and keep only
# the first 2430 rows.
df = pd.read_excel(io='./Data/24_Yogurt100N_MVN.xlsx').iloc[:2430]

In [2]:
p = 4  # number of alternatives

# Per-alternative covariates, laid out in the sheet as (Feature j, Price j) pairs.
cols = ['Feature 1', 'Price 1',
        'Feature 2', 'Price 2',
        'Feature 3', 'Price 3',
        'Feature 4', 'Price 4']
N = len(df)  # number of rows (observations)
indiv_id = df['Pan I.D.'].values  # panelist id of every row

# Alternative-specific dummies (last alternative is the all-zero row),
# repeated per observation -> (N, p, p-1).
bdummy = np.tile(np.eye(p, p - 1), (N, 1, 1))

# Regroup the covariates per alternative -> (N, p, 2) and append them to the
# dummies along the last axis -> (N, p, 5).
features = np.dstack((bdummy, df[cols].values.reshape(N, p, 2)))
X = features

# Choice index: position of the largest entry among the brand columns.
brands = df[['Brand 1', 'Brand  2', 'Brand 3', 'Brand 4']].values
y = brands.argmax(axis=1)
X.shape, y.shape

((2430, 4, 5), (2430,))

In [4]:
# Random coefficients model with the default number of draws (10),
# followed by its fit statistics.
model = lgt.RandomCoefficientsModel()
model.fit(X, y, indiv_id=indiv_id)
print(f'rho2: {lgt.Metrics.rho2(model, X, y)!s}')
for criterion in ('AIC', 'BIC', 'HQC', 'CAIC'):
    print(f'{criterion}: {getattr(lgt.Metrics, criterion)(model)!s}')

Current NLL: 1286.2648119579212
Current NLL: 1286.266024636101
Current NLL: 1286.2616555564186
Current NLL: 1286.259927725831
Current NLL: 1286.2600775498465
Current NLL: 1286.259548441499
Maximized LL: -1286.259548441499
The means of the coefficients are: [  2.91993964   1.59981367  -2.09072398   0.66947809 -40.9158348 ]
The standard deviations of the coefficients are: [ 3.98223563  6.04008649  5.09146379  3.86527885 81.26743473]
The covariance matrix of the coefficients is: [[  15.85820063   19.7439527    16.97446155  -10.79927602 -146.10224387]
 [  19.7439527    36.48264482   29.53227933  -14.65003081 -321.13528173]
 [  16.97446155   29.53227933   25.9230035   -13.24618147 -301.163749  ]
 [ -10.79927602  -14.65003081  -13.24618147   14.9403806     7.8348918 ]
 [-146.10224387 -321.13528173 -301.163749      7.8348918  6604.3959478 ]]
rho2: None
AIC: 2612.519096882998
BIC: 2728.4320276096896
HQC: 2654.661714543482
CAIC: 2612.867789286485


In [5]:
# Random coefficients model with 100 draws, followed by its fit statistics.
model = lgt.RandomCoefficientsModel()
model.fit(X, y, indiv_id=indiv_id, draws=100)
print(f'rho2: {lgt.Metrics.rho2(model, X, y)!s}')
for criterion in ('AIC', 'BIC', 'HQC', 'CAIC'):
    print(f'{criterion}: {getattr(lgt.Metrics, criterion)(model)!s}')

Current NLL: 1207.5147025233516
Current NLL: 1207.514686858835
Current NLL: 1207.5150378988876
Current NLL: 1207.5149823455954
Current NLL: 1207.5152481611997
Current NLL: 1207.5147953736275
Maximized LL: -1207.514686858835
The means of the coefficients are: [  4.06550969   2.26015314  -2.63906037   0.66829287 -65.92783571]
The standard deviations of the coefficients are: [ 4.11790371  4.11582394  4.23241922  0.97378671 37.78732705]
The covariance matrix of the coefficients is: [[ 1.69571310e+01  9.19533470e+00  5.97750741e+00  4.98309629e-01
  -5.36639224e+01]
 [ 9.19533470e+00  1.69400067e+01  1.23022898e+01 -1.33098334e+00
  -1.80887615e+01]
 [ 5.97750741e+00  1.23022898e+01  1.79133724e+01  9.97864961e-01
  -1.32379394e+01]
 [ 4.98309629e-01 -1.33098334e+00  9.97864961e-01  9.48260551e-01
   8.70561372e+00]
 [-5.36639224e+01 -1.80887615e+01 -1.32379394e+01  8.70561372e+00
   1.42788209e+03]]
rho2: None
AIC: 2455.02937371767
BIC: 2570.9423044443615
HQC: 2497.171991378154
CAIC: 2455.

## Apply factor analysis with the random coefficients model

In [None]:
# Read the orange juice purchase sheet and keep only the first 2066 rows.
df = pd.read_excel(io='./Data/OJ300_OnlyPurchases_Brand Map_Cts.xlsx').iloc[:2066]

In [2]:
p = 8  # number of alternatives

# One price column per alternative.
cols = ['price1', 'price2', 'price3', 'price4',
        'price5', 'price6', 'price7', 'price8']
N = len(df)  # number of rows (observations)
indiv_id = df['panid'].values  # panelist id of every row

# Alternative-specific dummies (last alternative is the all-zero row),
# repeated per observation -> (N, p, p-1) = (N, 8, 7).
bdummy = np.tile(np.eye(p, p - 1), (N, 1, 1))

# Price becomes a trailing single-column covariate -> (N, 8, 1); appended to
# the dummies along the last axis this gives (N, 8, 8).
features = np.dstack((bdummy, df[cols].values[:, :, np.newaxis]))
X = features

# Choice index: position of the largest entry among the brand columns.
brands = df[['br1', 'br2', 'br3', 'br4', 'br5', 'br6', 'br7', 'br8']].values
y = brands.argmax(axis=1)
X.shape, y.shape

((2066, 8, 8), (2066,))

In [None]:
# Factor-analytic random coefficients model: price (the last covariate) is
# flagged as homogeneous, the seven brand intercepts stay heterogeneous.
# takes 834m 35.9s
homo_covariates = np.array([0] * 7 + [1])
model = lgt.RandomCoefficientsModel()
model.fit(
    X,
    y,
    indiv_id=indiv_id,
    method='factor-analytic',
    homo_covariates=homo_covariates,
    niteration=3,
)
print(f'rho2: {lgt.Metrics.rho2(model, X, y)!s}')
for criterion in ('AIC', 'BIC', 'HQC', 'CAIC'):
    print(f'{criterion}: {getattr(lgt.Metrics, criterion)(model)!s}')

Current NLL: 2845.624967553122
Current NLL: 2845.625094537688
Current NLL: 2845.624972142691
Current NLL: 2845.6250173317876
current BIC:  5805.750479851438
Current NLL: 2342.528010561873
Current NLL: 2342.528038071153
Current NLL: 2340.6946058592225
Current NLL: 2340.69461521331
current BIC:  4841.689974361716
Current NLL: 2177.77234742876
Current NLL: 2177.7724032528376
Current NLL: 2177.7723333211907
Current NLL: 2177.7723773702237
current BIC:  4561.64564718373
Current NLL: 2013.3198097846519
Current NLL: 2013.3197989361674
Current NLL: 2013.31989114719
Current NLL: 2013.3198978209555
current BIC:  4278.540796311761
Current NLL: 2000.246519282262
Current NLL: 1999.8779774355382
Current NLL: 1999.8683455887226
Current NLL: 1966.1273520203974
current BIC:  4297.438107514949
Maximized LL: -2013.3197989361674
The optimal number of factors is: 4
The means of the coefficients are: [-2.05053354 -2.21197914 -2.93255981  0.30712152 -1.8431252  -2.51406295
 -2.44795635 -1.47187396]
The stand



In [3]:
# Factor-analytic random coefficients model with the number of factors fixed
# at 2: price (the last covariate) is flagged as homogeneous, the seven brand
# intercepts stay heterogeneous.
homo_covariates = np.zeros(8, dtype=int)
homo_covariates[-1] = 1
model = lgt.RandomCoefficientsModel()
model.fit(
    X,
    y,
    indiv_id=indiv_id,
    method='factor-analytic',
    homo_covariates=homo_covariates,
    niteration=4,
    optimizer='L-BFGS-B',
    nfactors=2,
)
print(f'rho2: {lgt.Metrics.rho2(model, X, y)!s}')
for criterion in ('AIC', 'BIC', 'HQC', 'CAIC'):
    print(f'{criterion}: {getattr(lgt.Metrics, criterion)(model)!s}')

Using factor-analytic method with 2 factors.
Current NLL: 2311.944489997234
Current NLL: 2311.486713630418
Current NLL: 2311.4867147741206
Current NLL: 2311.4867492245367
Current NLL: 2311.48669830225
BIC:  4783.2741592477705
Maximized LL: -2311.48669830225
The optimal number of factors is: 2
The means of the coefficients are: [-0.82183133 -1.20932759 -4.07324214  0.57607929 -1.79348008 -0.86328796
 -2.9583783  -1.38829553]
The standard deviations of the coefficients are: [16.87513366 21.69844611 18.49089515 25.55137793 23.28268597 16.92142258
  7.01111108  0.        ]
The covariance matrix of the coefficients is: [[ 284.7701361   358.75551619 -148.85162481  381.63842068  263.95894835
   240.30334562 -118.15855149    0.        ]
 [ 358.75551619  470.82256375 -116.94824994  532.43439462  407.43175518
   342.43245796 -147.29956056    0.        ]
 [-148.85162481 -116.94824994  341.91320351   -6.22712325  142.29323223
    22.94236501   67.59066131    0.        ]
 [ 381.63842068  532.434394



# BLP

In [6]:
df = pd.read_excel('./Data/CoffeeData.xlsx')

# Problem dimensions passed to the BLP estimator.
D = 50  # presumably the number of simulation draws (TODO confirm); 9m 1s if use packages
p = 4  # alternatives
t = 114  # time periods
nfeatures = 8  # number of product features (including price and intercept)

# Column groups: instruments, the price/brand block, and exogenous promotions.
col_instruments = ['Spot 1', 'Spot 2', 'Spot 3', 'Spot 4', 'Spot 5', 'Spot 6']
col_endo = ['Price', 'Brand 1', 'Brand 2', 'Brand 3', 'Brand 4']
col_exo = ['Feature', 'Display', 'F&D']
col_x = col_endo + col_exo

# Shares, covariates, and the instrument matrix (instruments plus exogenous
# columns, with a constant column added by statsmodels).
shares = df['Share'].values
outside = df['Outside'].values
X = df[col_x].values
Z = sm.add_constant(df[col_instruments + col_exo].values)

blp_model = lgt.BLP(X, Z, shares, outside, nfeatures, D, t, p)
gamma_hat, beta_hat = blp_model.fit()
blp_model.summary()

Optimization terminated successfully.
         Current function value: 0.000000
         Iterations: 4
         Function evaluations: 2148
Estimated gamma: [ 2.88293233e-01 -3.98075356e-01  8.45357319e-01  2.86070237e+00
 -1.69930624e+00  8.95296086e-02 -6.11204593e-03  2.42500145e-01
  1.27326863e-01  4.51045895e+00  1.20644148e+00  3.57738441e-01
  1.18260690e-01 -5.54506497e-02  8.74469604e-02  1.92720285e+00
  2.78657520e-03  2.65173174e-01  1.01116616e-01  1.42557126e-01
  9.70742717e-02  1.23176789e-01 -2.84524563e-01  1.02165360e-01
  9.77894159e-02  1.00743272e-01  8.66577133e-02  9.99040094e-02
  1.00357098e-01  9.88845455e-02  9.37484622e-02  9.39884218e-02
  1.02638199e-01  9.27541295e-02  9.99425137e-02  9.90863217e-02]
Estimated beta: [ -1.98593223  24.56196481  -2.48931653  -5.92840229 -20.16094116
  -0.65980159  -0.44915894  -0.75191807]
Final loss: 5.068581515161355e-23
