## Exercise 3

We'll first just copy relevant code from the last two exercises to start where we left off.

In [1]:
import pyblp
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

# Output formatting: quiet pyblp, 3 digits, and up to 50 displayed DataFrame columns
pyblp.options.verbose = False
pyblp.options.digits = 3
pd.set_option('display.precision', 3)
pd.set_option('display.max_columns', 50)

# Inject a CSS rule so <pre> blocks in the rendered notebook keep their whitespace
import IPython.display
no_wrap_css = IPython.display.HTML('<style>pre { white-space: pre !important; }</style>')
IPython.display.display(no_wrap_css)

# Relevant code from exercise 1.1
product_data = pd.read_csv('../Data/products.csv')

# Relevant code from exercise 1.2
# Market size is the city population scaled by a fixed multiplier; shares are
# servings sold relative to that market size.
# NOTE(review): the multiplier 90 presumably is potential servings per person per period -- confirm against exercise 1.2
market_size_multiplier = 90
product_data['market_size'] = market_size_multiplier * product_data['city_population']
product_data['market_share'] = product_data['servings_sold'].div(product_data['market_size'])

# Relevant code from exercise 1.4
# Rename columns to the names used by the pyblp formulations below
pyblp_column_names = {
    'market': 'market_ids',
    'product': 'product_ids',
    'market_share': 'shares',
    'price_per_serving': 'prices',
}
product_data = product_data.rename(columns=pyblp_column_names)

# Relevant code from exercise 1.6
# First stage: regress prices on the price instrument plus market and product dummies,
# with HC0 heteroskedasticity-robust standard errors.
first_stage_formula = 'prices ~ 0 + price_instrument + C(market_ids) + C(product_ids)'
first_stage = smf.ols(first_stage_formula, product_data)
first_stage_results = first_stage.fit(cov_type='HC0')

# The instrument is renamed only after the first stage above has been set up with its old name
product_data = product_data.rename(columns={'price_instrument': 'demand_instruments0'})

# Logit demand with prices as the only linear characteristic; market and product effects are absorbed
iv_formulation = pyblp.Formulation('0 + prices', absorb='C(market_ids) + C(product_ids)')
iv_problem = pyblp.Problem(iv_formulation, product_data)
iv_results = iv_problem.solve(method='1s')

# Relevant code from exercise 1.7
counterfactual_market = 'C01Q2'
counterfactual_columns = ['product_ids', 'mushy', 'prices', 'shares']
in_counterfactual_market = product_data['market_ids'] == counterfactual_market

# Take an explicit copy: columns are added to this frame below, and it must be an
# independent object rather than a selection tied to `product_data` (this also
# avoids pandas' SettingWithCopyWarning on the assignments that follow).
counterfactual_data = product_data.loc[in_counterfactual_market, counterfactual_columns].copy()

# Counterfactual prices: identical to observed prices except that product F1B04's price is halved
counterfactual_data['new_prices'] = counterfactual_data['prices']
counterfactual_data.loc[counterfactual_data['product_ids'] == 'F1B04', 'new_prices'] /= 2

# Shares implied by the IV logit results at the new prices, and the percent change relative to observed shares
counterfactual_data['new_shares'] = iv_results.compute_shares(market_id=counterfactual_market, prices=counterfactual_data['new_prices'])
counterfactual_data['iv_change'] = 100 * (counterfactual_data['new_shares'] - counterfactual_data['shares']) / counterfactual_data['shares']

# Relevant code from exercise 2.1
demographic_data = pd.read_csv('../Data/demographics.csv')
demographic_data = demographic_data.rename(columns={'market': 'market_ids'})
demographic_data['log_income'] = np.log(demographic_data['quarterly_income'])

# Per-market mean and standard deviation of log income (one row per market)
demographic_variation = demographic_data.groupby('market_ids', as_index=False).agg(
    log_income_mean=('log_income', 'mean'),
    log_income_std=('log_income', 'std'),
)

# Relevant code from exercise 2.2
# Draw 1,000 individuals per market (with replacement, fixed seed) from the demographic data
agent_data = demographic_data[['market_ids', 'log_income']].groupby('market_ids', as_index=False).sample(n=1000, replace=True, random_state=0)

# Attach three standard normal draws per individual (fixed seed) and equal weights within each market
agent_data[['nodes0', 'nodes1', 'nodes2']] = np.random.default_rng(seed=0).normal(size=(len(agent_data), 3))
agent_data['weights'] = 1 / agent_data.groupby('market_ids').transform('size')

# Attach mean log income to every product row. A left, many-to-one merge keeps every
# product row in its original order and cannot duplicate rows. This matters because
# the first-stage fitted values are later assigned back onto `product_data` by row
# label, and a silently dropped row would shift that alignment.
product_data = product_data.merge(
    demographic_variation[['market_ids', 'log_income_mean']],
    on='market_ids',
    how='left',
    validate='many_to_one',
)
assert product_data['log_income_mean'].notna().all(), "some markets in product_data have no demographic data"

# Instrument: mean log income interacted with the mushy indicator
product_data['demand_instruments1'] = product_data['log_income_mean'] * product_data['mushy']

# X1 has prices (market and product effects absorbed); X2 has only mushy; the single demographic is log income
product_formulations = (pyblp.Formulation('0 + prices', absorb='C(market_ids) + C(product_ids)'), pyblp.Formulation('0 + mushy'))
agent_formulation = pyblp.Formulation('0 + log_income')
mushy_problem = pyblp.Problem(product_formulations, product_data, agent_formulation, agent_data)

# Tight tolerances for the trust-constr optimizer; this object is reused by later solves
optimization = pyblp.Optimization('trust-constr', {'gtol': 1e-8, 'xtol': 1e-8})
mushy_results = mushy_problem.solve(sigma=0, pi=1, method='1s', optimization=optimization)

# Relevant code from exercise 2.4
# Re-evaluate the same price cut with the mushy-by-income results and store the percent change in shares
mushy_new_shares = mushy_results.compute_shares(
    market_id=counterfactual_market,
    prices=counterfactual_data['new_prices'],
)
counterfactual_data['new_shares'] = mushy_new_shares
mushy_share_gap = counterfactual_data['new_shares'] - counterfactual_data['shares']
counterfactual_data['mushy_change'] = (100 * mushy_share_gap) / counterfactual_data['shares']

# Relevant code from exercise 2.5
# Add the first-stage fitted prices (aligned on the row index) and their interaction
# with mean log income as an additional demand instrument.
product_data = product_data.assign(
    predicted_prices=first_stage_results.fittedvalues,
    demand_instruments2=lambda data: data['log_income_mean'] * data['predicted_prices'],
)
def compute_differentiation(x):
    """Return, for each entry of `x`, the sum of squared differences to all entries of `x`.

    `x` must expose its data as a NumPy array through `.values` (e.g. a pandas Series).
    The result is a NumPy array with one value per entry of `x`.
    """
    values = x.values
    as_column = values[:, np.newaxis]
    as_row = values[np.newaxis, :]
    squared_gaps = (as_column - as_row) ** 2
    return squared_gaps.sum(axis=1)
# Differentiation instrument: within each market, apply compute_differentiation to predicted prices
predicted_prices_by_market = product_data.groupby('market_ids')['predicted_prices']
product_data['demand_instruments3'] = predicted_prices_by_market.transform(compute_differentiation)

# X1 has prices (market and product effects absorbed); X2 now has both mushy and prices
rc_linear_formulation = pyblp.Formulation('0 + prices', absorb='C(market_ids) + C(product_ids)')
rc_nonlinear_formulation = pyblp.Formulation('0 + mushy + prices')
product_formulations = (rc_linear_formulation, rc_nonlinear_formulation)
agent_formulation = pyblp.Formulation('0 + log_income')
rc_problem = pyblp.Problem(product_formulations, product_data, agent_formulation, agent_data)

# Starting values; rows (and sigma columns) follow the X2 order: mushy, prices.
# NOTE(review): zeros in sigma are presumably held fixed at zero by pyblp -- confirm in the pyblp docs
rc_initial_sigma = [
    [0, 0],
    [0, 1],
]
rc_initial_pi = [
    [0.2],
    [1],
]
rc_results = rc_problem.solve(sigma=rc_initial_sigma, pi=rc_initial_pi, method='1s', optimization=optimization)

# Relevant code from exercise 2.6
# Re-evaluate the same price cut with the random-coefficients results and store the percent change in shares
rc_new_shares = rc_results.compute_shares(
    market_id=counterfactual_market,
    prices=counterfactual_data['new_prices'],
)
counterfactual_data['new_shares'] = rc_new_shares
rc_share_gap = counterfactual_data['new_shares'] - counterfactual_data['shares']
counterfactual_data['rc_change'] = (100 * rc_share_gap) / counterfactual_data['shares']

### 1. Use the income statistic to match a parameter on log income

First, we'll add a constant to our `X2` formulation.

In [2]:
# Same linear part as before; the X2 formulation now starts with '1', i.e. it includes a constant
micro_linear_formulation = pyblp.Formulation('0 + prices', absorb='C(market_ids) + C(product_ids)')
micro_nonlinear_formulation = pyblp.Formulation('1 + mushy + prices')
product_formulations = (micro_linear_formulation, micro_nonlinear_formulation)
agent_formulation = pyblp.Formulation('0 + log_income')
micro_problem = pyblp.Problem(product_formulations, product_data, agent_formulation, agent_data)
micro_problem

Dimensions:
 T    N      I     K1    K2    D    MD    ED 
---  ----  -----  ----  ----  ---  ----  ----
94   2256  94000   1     3     1    4     2  

Formulations:
       Column Indices:             0         1      2   
-----------------------------  ----------  -----  ------
 X1: Linear Characteristics      prices                 
X2: Nonlinear Characteristics      1       mushy  prices
       d: Demographics         log_income               

Next we'll define our micro dataset.

In [3]:
# Markets covered by the surveys used for the micro moments
survey_markets = ['C01Q1', 'C01Q2']


def compute_income_weights(t, p, a):
    """Return an array of ones with shape (a.size, p.size).

    `t` is accepted but not used; only the `.size` of `p` and of `a` are read.
    """
    return np.ones((a.size, p.size))
# Micro dataset named "Income Survey": 100 observations, restricted to the survey markets
income_dataset = pyblp.MicroDataset(
    "Income Survey",
    100,
    compute_income_weights,
    market_ids=survey_markets,
)
income_dataset

Income Survey: 100 Observations in 2 Markets

On it, we'll define our micro part.

In [5]:
def compute_income_values(t, p, a):
    """Return an (a.size, p.size) array whose row i repeats a.demographics[i, 0].

    `t` is accepted but not used. Column 0 of `a.demographics` is the only
    demographic column read here.
    """
    first_demographic = a.demographics[:, 0]
    return np.outer(first_demographic, np.ones(p.size))
# Micro part on the income survey, with values computed by compute_income_values
income_part = pyblp.MicroPart(
    "E[log_income_i | j > 0]",
    income_dataset,
    compute_income_values,
)
income_part

E[log_income_i | j > 0] on Income Survey: 100 Observations in 2 Markets

Using this, we can define our micro moment.

In [6]:
# Observed value of the statistic that this moment matches
observed_income_statistic = 7.9
income_moment = pyblp.MicroMoment("E[log_income_i | j > 0]", observed_income_statistic, income_part)
income_moment

E[log_income_i | j > 0]: +7.90E+00 (E[log_income_i | j > 0] on Income Survey: 100 Observations in 2 Markets)

We can use this in estimation to pin down our new parameter on income alone.

In [7]:
# Turn on pyblp's verbose output for this solve only. The try/finally guarantees the
# option is switched back off even if the solve raises or is interrupted, so that a
# failed run cannot leave later cells in verbose mode.
pyblp.options.verbose = True
try:
    # Starting values; rows (and sigma columns) follow the X2 order: 1, mushy, prices.
    # The only change from the earlier specification is the pi entry on the constant.
    micro_results = micro_problem.solve(
        sigma=[
            [0, 0, 0],
            [0, 0, 0],
            [0, 0, 6],
        ],
        pi=[
            [1],
            [0.1],
            [-6],
        ],
        method='1s',
        optimization=optimization,
        micro_moments=[income_moment],
    )
finally:
    pyblp.options.verbose = False

Solving the problem ...

Micro Moments:
Observed           Moment                    Part               Dataset     Observations  Markets
---------  -----------------------  -----------------------  -------------  ------------  -------
+7.90E+00  E[log_income_i | j > 0]  E[log_income_i | j > 0]  Income Survey      100          2   

Nonlinear Coefficient Initial Values:
Sigma:      1        mushy     prices    |   Pi:    log_income
------  ---------  ---------  ---------  |  ------  ----------
  1     +0.00E+00                        |    1     +1.00E+00 
mushy   +0.00E+00  +0.00E+00             |  mushy   +1.00E-01 
prices  +0.00E+00  +0.00E+00  +6.00E+00  |  prices  -6.00E+00 

Nonlinear Coefficient Lower Bounds:
Sigma:      1        mushy     prices    |   Pi:    log_income
------  ---------  ---------  ---------  |  ------  ----------
  1     +0.00E+00                        |    1       -INF    
mushy   +0.00E+00  +0.00E+00             |  mushy     -INF    
prices  +0.00E+00  +0.0

The new parameter estimate is not significantly different from zero, suggesting that our original assumption that it was zero was not too bad. This is not at all guaranteed; we may have just been lucky! (Or this imagined statistic may have been chosen by the instructor to make this happen.)

### 2. Use the diversion statistics to estimate unobserved preference heterogeneity for a constant and mushy

Let's first define our new micro dataset.

In [8]:
def compute_diversion_weights(t, p, a):
    """Return an array of ones with shape (a.size, p.size, 1 + p.size).

    `t` is accepted but not used; only the `.size` of `p` and of `a` are read.
    """
    return np.ones((a.size, p.size, 1 + p.size))
# Micro dataset named "Diversion Survey": 200 observations, restricted to the survey markets
diversion_dataset = pyblp.MicroDataset(
    "Diversion Survey",
    200,
    compute_diversion_weights,
    market_ids=survey_markets,
)
diversion_dataset

Diversion Survey: 200 Observations in 2 Markets

Now let's define our first moment for matching outside diversion.

In [9]:
def compute_outside_values(t, p, a):
    """Return an (a.size, p.size, 1 + p.size) array that is 1 at index 0 of the last axis and 0 elsewhere.

    `t` is accepted but not used. Index 0 of the last axis is the entry that the
    moment built on these values labels `k = 0`.
    """
    values = np.zeros((a.size, p.size, 1 + p.size))
    values[:, :, 0] = 1
    return values
# The part and the moment share one label; 0.28 is the observed value being matched
outside_label = "P(k = 0 | j > 0)"
outside_part = pyblp.MicroPart(outside_label, diversion_dataset, compute_outside_values)
outside_moment = pyblp.MicroMoment(outside_label, 0.28, outside_part)
outside_moment

P(k = 0 | j > 0): +2.80E-01 (P(k = 0 | j > 0) on Diversion Survey: 200 Observations in 2 Markets)

Let's also define our second moment for matching mushy diversion.

In [10]:
def compute_mushy_values(t, p, a):
    """Return an (a.size, p.size, 1 + p.size) array of products of X2 column 1 across the last two axes.

    Entry [i, j, k] equals p.X2[j, 1] times the k-th element of that same column
    with a 0 prepended. Column 1 of X2 is `mushy` in this notebook's formulation.
    `t` is accepted but not used.
    """
    mushy = p.X2[:, 1]
    along_second_axis = mushy[np.newaxis, :, np.newaxis]
    along_third_axis = np.r_[0, mushy][np.newaxis, np.newaxis, :]
    per_agent_ones = np.ones(a.size)[:, np.newaxis, np.newaxis]
    return per_agent_ones * along_second_axis * along_third_axis
# The part and the moment share one label; 0.31 is the observed value being matched
mushy_label = "P(mushy_j and mushy_k | j > 0)"
mushy_part = pyblp.MicroPart(mushy_label, diversion_dataset, compute_mushy_values)
mushy_moment = pyblp.MicroMoment(mushy_label, 0.31, mushy_part)
mushy_moment

P(mushy_j and mushy_k | j > 0): +3.10E-01 (P(mushy_j and mushy_k | j > 0) on Diversion Survey: 200 Observations in 2 Markets)

Then we can re-optimize with our new micro moments, choosing some initial values for our new parameters.

In [11]:
# Turn on pyblp's verbose output for this solve only. The try/finally guarantees the
# option is switched back off even if the solve raises or is interrupted, so that a
# failed run cannot leave later cells in verbose mode.
pyblp.options.verbose = True
try:
    # Starting values; rows (and sigma columns) follow the X2 order: 1, mushy, prices.
    # The sigma diagonal entries for the constant and mushy now start at 1 instead of 0.
    # Note that this rebinds `micro_results`, replacing the earlier income-only estimates.
    micro_results = micro_problem.solve(
        sigma=[
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 6],
        ],
        pi=[
            [-0.3],
            [0.1],
            [-6],
        ],
        method='1s',
        optimization=optimization,
        micro_moments=[income_moment, outside_moment, mushy_moment],
    )
finally:
    pyblp.options.verbose = False

Solving the problem ...

Micro Moments:
Observed               Moment                           Part                   Dataset       Observations  Markets
---------  ------------------------------  ------------------------------  ----------------  ------------  -------
+7.90E+00     E[log_income_i | j > 0]         E[log_income_i | j > 0]       Income Survey        100          2   
+2.80E-01         P(k = 0 | j > 0)                P(k = 0 | j > 0)         Diversion Survey      200          2   
+3.10E-01  P(mushy_j and mushy_k | j > 0)  P(mushy_j and mushy_k | j > 0)  Diversion Survey      200          2   

Nonlinear Coefficient Initial Values:
Sigma:      1        mushy     prices    |   Pi:    log_income
------  ---------  ---------  ---------  |  ------  ----------
  1     +1.00E+00                        |    1     -3.00E-01 
mushy   +0.00E+00  +1.00E+00             |  mushy   +1.00E-01 
prices  +0.00E+00  +0.00E+00  +6.00E+00  |  prices  -6.00E+00 

Nonlinear Coefficient Lower Bo

All the standard optimization checks look fine. The new estimates suggest that there is a good amount of unobserved preference heterogeneity for mushy, and some for the constant characteristic (i.e. the outside good).

### 3. Evaluate changes to the price cut counterfactual

Finally, let's re-run the price counterfactual with our more flexible model.

In [12]:
# Re-evaluate the same price cut with the micro-moment results and store the percent change in shares
micro_new_shares = micro_results.compute_shares(
    market_id=counterfactual_market,
    prices=counterfactual_data['new_prices'],
)
counterfactual_data['new_shares'] = micro_new_shares
micro_share_gap = counterfactual_data['new_shares'] - counterfactual_data['shares']
counterfactual_data['micro_change'] = (100 * micro_share_gap) / counterfactual_data['shares']
counterfactual_data

Unnamed: 0,product_ids,mushy,prices,shares,new_prices,new_shares,iv_change,mushy_change,rc_change,micro_change
24,F1B04,1,0.078,0.006443,0.039,0.02667,223.638,223.522,285.128,314.014
25,F1B06,1,0.141,0.1413,0.141,0.1343,-1.45,-1.478,-1.621,-4.934
26,F1B07,1,0.073,0.08789,0.073,0.08235,-1.45,-1.478,-1.85,-6.303
27,F1B09,0,0.077,0.006621,0.077,0.006572,-1.45,-1.438,-1.808,-0.75
28,F1B11,0,0.167,0.05427,0.167,0.05401,-1.45,-1.438,-1.496,-0.482
29,F1B13,0,0.092,0.02198,0.092,0.02182,-1.45,-1.438,-1.759,-0.7
30,F1B17,1,0.154,0.01055,0.154,0.01005,-1.45,-1.478,-1.575,-4.696
31,F1B30,0,0.15,0.00131,0.15,0.001303,-1.45,-1.438,-1.556,-0.526
32,F1B45,0,0.147,0.01052,0.147,0.01047,-1.45,-1.438,-1.568,-0.534
33,F2B05,0,0.099,0.05907,0.099,0.05867,-1.45,-1.438,-1.735,-0.677


Substitution and cannibalization now look much more reasonable. We see much more substitution within mushy cereals, which makes sense because if the price drops for a mushy cereal, we would expect mainly consumers of similar cereals to substitute to it.