In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.axes as ax
from scipy import stats
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
from statsmodels.base.model import LikelihoodModel
from sklearn import preprocessing

from numpy.random import randn
from numpy.random import seed
from scipy.stats import pearsonr
import unicodedata
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns; sns.set()
from linearmodels.iv import absorbing

from linearmodels.datasets import wage_panel
from linearmodels.panel import PanelOLS
from linearmodels.iv.model import (
    COVARIANCE_ESTIMATORS,
    ClusteredCovariance,
    HeteroskedasticCovariance,
    HomoskedasticCovariance,
    KernelCovariance,
)
import statsmodels.formula.api as smf

## references
https://stackoverflow.com/questions/70954911/results-from-python-linearmodels-panelols-and-stata-areg-differ

In [4]:
data = pd.read_stata('/Users/zyy219/Dropbox/Econometric/mksc.2021.1339/data/gdpr_website.dta')
data = data[(data['date']<np.datetime64('2018-10-02')) & 
         (data['date']>np.datetime64('2017-11-14'))]

### Table 4 Change in Number of Requested Third-Party Domains and Cookies- EU firm & requests

In [6]:
def table4_1(data):
    data['trend_EU-audience'] = data['trend']*data['eu_audience']
    data['trend_nonEU-audience'] = data['trend']*data['noneu_audience']
    data['post_EU-audience'] = data['after']*data['eu_audience']
    data['post_nonEU-audience'] = data['after']*data['noneu_audience']
    data['trend_post_EU-audience'] = data['trend_after']*data['after']*data['eu_audience']
    data['trend_post_nonEU-audience'] = data['trend_after']*data['after']*data['noneu_audience']
    D = data[data['eu_location'].isin([1])& data['exclude'].isin([0])]

    cats = pd.DataFrame({'h': pd.Categorical(D['h'])})
    exog_variables = ['trend_EU-audience','trend_nonEU-audience','trend_post_EU-audience',
              'trend_post_nonEU-audience','post_EU-audience','post_nonEU-audience']
    endog_variable = ['log_requests3']

    exog = sm.tools.tools.add_constant(D[exog_variables])
    endog = D[endog_variable]

    model = absorbing.AbsorbingLS(endog, exog, absorb= cats, drop_absorbed=False)
    model_res = model.fit(cov_type= 'clustered',clusters = D['h'],debiased=True)

    print('Mean dependent variable = ',np.mean(D['log_requests3']))
    print(model_res.summary)
table4_1(data)

  x = pd.concat(x[::order], 1)


Mean dependent variable =  2.3778364658355713
                         Absorbing LS Estimation Summary                          
Dep. Variable:          log_requests3   R-squared:                          0.9120
Estimator:               Absorbing LS   Adj. R-squared:                     0.9076
No. Observations:              611919   F-statistic:                        164.82
Date:                Sat, Dec 03 2022   P-value (F-stat):                   0.0000
Time:                        16:17:03   Distribution:                  F(6,582774)
Cov. Estimator:             clustered   R-squared (No Effects):             0.0078
                                        Varaibles Absorbed:              2.914e+04
                                     Parameter Estimates                                     
                           Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
---------------------------------------------------------------------------------------------
const   

### Table 4 Change in Number of Requested Third-Party Domains and Cookies-  non-EU firm & requests

In [7]:
def table4_2(data):
    data['trend_EU-audience'] = data['trend']*data['eu_audience']
    data['trend_nonEU-audience'] = data['trend']*data['noneu_audience']
    data['post_EU-audience'] = data['after']*data['eu_audience']
    data['post_nonEU-audience'] = data['after']*data['noneu_audience']
    data['trend_post_EU-audience'] = data['trend_after']*data['after']*data['eu_audience']
    data['trend_post_nonEU-audience'] = data['trend_after']*data['after']*data['noneu_audience']
    D = data[data['eu_location'].isin([0])& data['exclude'].isin([0])]
    cats = pd.DataFrame({'h': pd.Categorical(D['h'])})
    exog_variables = ['trend_EU-audience','trend_nonEU-audience','trend_post_EU-audience',
              'trend_post_nonEU-audience','post_EU-audience','post_nonEU-audience']
    endog_variable = ['log_requests3']

    exog = sm.tools.tools.add_constant(D[exog_variables])
    endog = D[endog_variable]

    model = absorbing.AbsorbingLS(endog, exog, absorb= cats, drop_absorbed=False)
    model_res = model.fit(cov_type= 'clustered',clusters = D['h'], debiased=True)
    print('Mean dependent variable = ',np.mean(D['log_requests3']))
    print(model_res.summary)
table4_2(data)

Mean dependent variable =  2.4772861003875732
                         Absorbing LS Estimation Summary                          
Dep. Variable:          log_requests3   R-squared:                          0.9314
Estimator:               Absorbing LS   Adj. R-squared:                     0.9280
No. Observations:             1712907   F-statistic:                        192.40
Date:                Sat, Dec 03 2022   P-value (F-stat):                   0.0000
Time:                        16:17:24   Distribution:                 F(6,1631334)
Cov. Estimator:             clustered   R-squared (No Effects):             0.0020
                                        Varaibles Absorbed:              8.157e+04
                                     Parameter Estimates                                     
                           Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
---------------------------------------------------------------------------------------------
const   

### Table 4 Change in Number of Requested Third-Party Domains and Cookies-  EU firm & cookies

In [8]:
def table4_3(data):
    data['trend_EU-audience'] = data['trend']*data['eu_audience']
    data['trend_nonEU-audience'] = data['trend']*data['noneu_audience']
    data['post_EU-audience'] = data['after']*data['eu_audience']
    data['post_nonEU-audience'] = data['after']*data['noneu_audience']
    data['trend_post_EU-audience'] = data['trend_after']*data['after']*data['eu_audience']
    data['trend_post_nonEU-audience'] = data['trend_after']*data['after']*data['noneu_audience']
    D = data[data['eu_location'].isin([1])& data['exclude'].isin([0])]
    cats = pd.DataFrame({'h': pd.Categorical(D['h'])})

    exog_variables = ['trend_EU-audience','trend_nonEU-audience','trend_post_EU-audience',
              'trend_post_nonEU-audience','post_EU-audience','post_nonEU-audience']
    endog_variable = ['log_cookies3']

    exog = sm.tools.tools.add_constant(D[exog_variables])
    endog = D[endog_variable]

    model = absorbing.AbsorbingLS(endog, exog, absorb= cats, drop_absorbed=False)
    model_res = model.fit(cov_type= 'clustered',clusters = D['h'], debiased=True)
    #print(np.mean(Y))
    print(model_res.summary)
table4_3(data)

                         Absorbing LS Estimation Summary                          
Dep. Variable:           log_cookies3   R-squared:                          0.8801
Estimator:               Absorbing LS   Adj. R-squared:                     0.8741
No. Observations:              611919   F-statistic:                        247.79
Date:                Sat, Dec 03 2022   P-value (F-stat):                   0.0000
Time:                        16:17:47   Distribution:                  F(6,582774)
Cov. Estimator:             clustered   R-squared (No Effects):             0.0135
                                        Varaibles Absorbed:              2.914e+04
                                     Parameter Estimates                                     
                           Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
---------------------------------------------------------------------------------------------
const                         1.3628     0.0059     23

### Table 4 Change in Number of Requested Third-Party Domains and Cookies-  non-EU firm & cookies

In [9]:
def table4_4(data):
    data['trend_EU-audience'] = data['trend']*data['eu_audience']
    data['trend_nonEU-audience'] = data['trend']*data['noneu_audience']
    data['post_EU-audience'] = data['after']*data['eu_audience']
    data['post_nonEU-audience'] = data['after']*data['noneu_audience']
    data['trend_post_EU-audience'] = data['trend_after']*data['after']*data['eu_audience']
    data['trend_post_nonEU-audience'] = data['trend_after']*data['after']*data['noneu_audience']
    D = data[data['eu_location'].isin([0])& data['exclude'].isin([0])]
    cats = pd.DataFrame({'h': pd.Categorical(D['h'])})

    exog_variables = ['trend_EU-audience','trend_nonEU-audience','trend_post_EU-audience',
              'trend_post_nonEU-audience','post_EU-audience','post_nonEU-audience']
    endog_variable = ['log_cookies3']

    exog = sm.tools.tools.add_constant(D[exog_variables])
    endog = D[endog_variable]

    model = absorbing.AbsorbingLS(endog, exog, absorb= cats, drop_absorbed=False)
    model_res = model.fit(cov_type= 'clustered',clusters = D['h'], debiased=True)
    print(model_res.summary)
table4_4(data)

                         Absorbing LS Estimation Summary                          
Dep. Variable:           log_cookies3   R-squared:                          0.9023
Estimator:               Absorbing LS   Adj. R-squared:                     0.8974
No. Observations:             1712907   F-statistic:                        313.42
Date:                Sat, Dec 03 2022   P-value (F-stat):                   0.0000
Time:                        16:18:04   Distribution:                 F(6,1631334)
Cov. Estimator:             clustered   R-squared (No Effects):             0.0024
                                        Varaibles Absorbed:              8.157e+04
                                     Parameter Estimates                                     
                           Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
---------------------------------------------------------------------------------------------
const                         1.3755     0.0035     39

### Table 5 Change in Number of Requested Third-Party Domains and Cookies: Website Popularity- EU firm & requests

In [10]:
def table5_1(data):
    data['post_EU-audience_top'] = data['after']*data['eu_audience']*data['top']
    data['post_nonEU-audience_top'] = data['after']*data['noneu_audience']*data['top']
    D = data[data['top'].isin([0,1]) & data['eu_location'].isin([1])& data['exclude'].isin([0])]
    cats = pd.DataFrame({'h': pd.Categorical(D['h'])})
    endog_variable = ['log_requests3']
    exog_variables = ['trend_EU-audience','trend_nonEU-audience','trend_post_EU-audience',
          'trend_post_nonEU-audience','post_EU-audience','post_nonEU-audience',
          'post_EU-audience_top','post_nonEU-audience_top']
    exog = sm.tools.tools.add_constant(D[exog_variables])
    endog = D[endog_variable]

    model = absorbing.AbsorbingLS(endog, exog, absorb= cats, drop_absorbed=False)
    model_res = model.fit(cov_type= 'clustered',clusters = D['h'], debiased=True)
    print(model_res.summary)
table5_1(data)

                         Absorbing LS Estimation Summary                          
Dep. Variable:          log_requests3   R-squared:                          0.9119
Estimator:               Absorbing LS   Adj. R-squared:                     0.9075
No. Observations:              611541   F-statistic:                        123.91
Date:                Sat, Dec 03 2022   P-value (F-stat):                   0.0000
Time:                        16:18:37   Distribution:                  F(8,582412)
Cov. Estimator:             clustered   R-squared (No Effects):             0.0079
                                        Varaibles Absorbed:              2.912e+04
                                     Parameter Estimates                                     
                           Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
---------------------------------------------------------------------------------------------
const                         2.3769     0.0041     57

### Table 5 Change in Number of Requested Third-Party Domains and Cookies: Website Popularity- non-EU firm & requests

In [11]:
def table5_2(data):
    data['post_EU-audience_top'] = data['after']*data['eu_audience']*data['top']
    data['post_nonEU-audience_top'] = data['after']*data['noneu_audience']*data['top']
    D = data[data['top'].isin([0,1]) & data['eu_location'].isin([0])& data['exclude'].isin([0])]
    cats = pd.DataFrame({'h': pd.Categorical(D['h'])})
    endog_variable = ['log_requests3']
    exog_variables = ['trend_EU-audience','trend_nonEU-audience','trend_post_EU-audience',
              'trend_post_nonEU-audience','post_EU-audience','post_nonEU-audience',
              'post_EU-audience_top','post_nonEU-audience_top']

    exog = sm.tools.tools.add_constant(D[exog_variables])
    endog = D[endog_variable]

    model = absorbing.AbsorbingLS(endog, exog, absorb= cats, drop_absorbed=False)
    model_res = model.fit(cov_type= 'clustered',clusters = D['h'], debiased=True)
    print(model_res.summary)
table5_2(data)

                         Absorbing LS Estimation Summary                          
Dep. Variable:          log_requests3   R-squared:                          0.9314
Estimator:               Absorbing LS   Adj. R-squared:                     0.9280
No. Observations:             1711374   F-statistic:                        144.67
Date:                Sat, Dec 03 2022   P-value (F-stat):                   0.0000
Time:                        16:18:44   Distribution:                 F(8,1629872)
Cov. Estimator:             clustered   R-squared (No Effects):             0.0021
                                        Varaibles Absorbed:              8.149e+04
                                     Parameter Estimates                                     
                           Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
---------------------------------------------------------------------------------------------
const                         2.4330     0.0023     10

### Table 5 Change in Number of Requested Third-Party Domains and Cookies: Website Popularity- EU firm & cookies

In [12]:
def table5_3(data):
    data['post_EU-audience_top'] = data['after']*data['eu_audience']*data['top']
    data['post_nonEU-audience_top'] = data['after']*data['noneu_audience']*data['top']
    D = data[data['top'].isin([0,1]) & data['eu_location'].isin([1])& data['exclude'].isin([0])]
    cats = pd.DataFrame({'h': pd.Categorical(D['h'])})
    endog_variable = ['log_cookies3']
    exog_variables = ['trend_EU-audience','trend_nonEU-audience','trend_post_EU-audience',
              'trend_post_nonEU-audience','post_EU-audience','post_nonEU-audience',
              'post_EU-audience_top','post_nonEU-audience_top']

    exog = sm.tools.tools.add_constant(D[exog_variables])
    endog = D[endog_variable]

    model = absorbing.AbsorbingLS(endog, exog, absorb= cats, drop_absorbed=False)
    model_res = model.fit(cov_type= 'clustered',clusters = D['h'], debiased=True)
    print(model_res.summary)
table5_3(data)

                         Absorbing LS Estimation Summary                          
Dep. Variable:           log_cookies3   R-squared:                          0.8801
Estimator:               Absorbing LS   Adj. R-squared:                     0.8741
No. Observations:              611541   F-statistic:                        186.06
Date:                Sat, Dec 03 2022   P-value (F-stat):                   0.0000
Time:                        16:18:53   Distribution:                  F(8,582412)
Cov. Estimator:             clustered   R-squared (No Effects):             0.0135
                                        Varaibles Absorbed:              2.912e+04
                                     Parameter Estimates                                     
                           Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
---------------------------------------------------------------------------------------------
const                         1.3629     0.0059     22

### Table 5 Change in Number of Requested Third-Party Domains and Cookies: Website Popularity- non-EU firm & cookies

In [13]:
def table5_4(data):
    data['post_EU-audience_top'] = data['after']*data['eu_audience']*data['top']
    data['post_nonEU-audience_top'] = data['after']*data['noneu_audience']*data['top']
    D = data[data['top'].isin([0,1]) & data['eu_location'].isin([0])& data['exclude'].isin([0])]
    cats = pd.DataFrame({'h': pd.Categorical(D['h'])})
    endog_variable = ['log_cookies3']
    exog_variables = ['trend_EU-audience','trend_nonEU-audience','trend_post_EU-audience',
              'trend_post_nonEU-audience','post_EU-audience','post_nonEU-audience',
              'post_EU-audience_top','post_nonEU-audience_top']

    exog = sm.tools.tools.add_constant(D[exog_variables])
    endog = D[endog_variable]

    model = absorbing.AbsorbingLS(endog, exog, absorb= cats, drop_absorbed=False)
    model_res = model.fit(cov_type= 'clustered',clusters = D['h'], debiased=True)
    print(model_res.summary)
table5_4(data)

                         Absorbing LS Estimation Summary                          
Dep. Variable:           log_cookies3   R-squared:                          0.9023
Estimator:               Absorbing LS   Adj. R-squared:                     0.8974
No. Observations:             1711374   F-statistic:                        235.51
Date:                Sat, Dec 03 2022   P-value (F-stat):                   0.0000
Time:                        16:19:04   Distribution:                 F(8,1629872)
Cov. Estimator:             clustered   R-squared (No Effects):             0.0024
                                        Varaibles Absorbed:              8.149e+04
                                     Parameter Estimates                                     
                           Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
---------------------------------------------------------------------------------------------
const                         1.3758     0.0035     39

### Table 6 Change in Websites Served by Data Type

In [14]:
def table6(data,data_policy):
    data = data[data['exclude']==0]
    D = data.merge(data_policy, on='tracker', how='left') #after drop if _merge ==2
    DD = data.merge(data_policy, how='inner', on='tracker') #find _merge = 3
    _merge3 = DD['tracker'].unique().tolist()
    m = []
    for i in range(len(D)):
        if D['tracker'][i] in _merge3:
             m.append(1)
        else:
             m.append(0)
    D['m'] = m
    #bys tracker_firm: egen temp=max(m) keep if temp==1
    D_agg = D.groupby(['tracker_firm'],as_index=False).agg({'m': ['max']}) #bys tracker_firm: egen temp=max(m)
    D_sorted = D[D['tracker_firm'].isin(D_agg[D_agg['m']['max']==1]['tracker_firm'].tolist())] #keep if temp = 1
    D_sorted1 = D_sorted.groupby(['tracker_firm','date'],as_index=False).agg(date=('date','first'),shr_undisclosed = ('shr_undisclosed','max'),
                        shr_aggregate = ('shr_aggregate','max'),shr_anonymous = ('shr_anonymous','max'),shr_pseudo = ('shr_pseudo','max'),
                        shr_pii = ('shr_pii','max'),shr_sensitive = ('shr_sensitive','max'),
                        col_undisclosed = ('col_undisclosed','max'), col_anonymous = ('col_anonymous','max'),col_pseudo = ('col_pseudo','max'), col_pii = ('col_pii','max'),
                        col_sensitive = ('col_sensitive','max'),
                        use_undisclosed = ('use_undisclosed','max'),
                        use_analytics = ('use_analytics','max'),
                        use_ad = ('use_ad','max'),
                        use_custom = ('use_custom','max'),
                        use_optimization = ('use_optimization','max'),
                        use_tracking = ('use_tracking','max'),
                                                                         
                        retention = ('retention','max'),
                        websites=('websites','sum'),
                        websites_cookie=('websites','sum'),
                        totalwebsites=('totalwebsites','mean'))
    D_sorted1['ms_websites_firm'] = D_sorted1['websites']/D_sorted1['totalwebsites']*100 #gen ms_websites_firm=(websites/totalwebsites)*100
    D_sorted1['ms_websites_cookie_firm'] = D_sorted1['websites_cookie']/D_sorted1['totalwebsites']*100 #gen ms_websites_cookie_firm=(websites_cookie/totalwebsites)*100

    D_sorted1['post']=np.where(D_sorted1['date']>np.datetime64('2018-05-25'), 1,0 )
    D_sorted1['trend']=(D_sorted1['date']- D_sorted1['date'].min()+np.timedelta64(1, 'D'))/np.timedelta64(100, 'D')
    D_sorted1['col_personal_post']=D_sorted1['post']*D_sorted1['col_pii']
    D_sorted1['shr_personal_post']=D_sorted1['post']*D_sorted1['shr_pii']
    D_sorted1['shr_col_personal_post']=D_sorted1['post']*D_sorted1['shr_pii']*D_sorted1['col_pii']
    D_sorted1['col_personal_trend']=D_sorted1['trend']*D_sorted1['col_pii']
    D_sorted1['shr_personal_trend']=D_sorted1['trend']*D_sorted1['shr_pii']
    D_sorted1['shr_col_personal_trend']=D_sorted1['trend']*D_sorted1['shr_pii']*D_sorted1['col_pii']
    D_sorted1['tf']=D_sorted1.groupby('tracker_firm').ngroup()
    D_sorted1['log_websites']=np.log(1+D_sorted1['websites'])
    D_sorted1['log_websites_cookie']=np.log(1+D_sorted1['websites_cookie'])

    cats = pd.DataFrame({'tf': pd.Categorical(D_sorted1['tf'])})
    endog = D_sorted1['log_websites']
    exog = sm.tools.tools.add_constant(D_sorted1[['post','col_personal_post','trend','col_personal_trend']])
    model = absorbing.AbsorbingLS(endog, exog, absorb= cats, drop_absorbed=False)
    model_res = model.fit(cov_type= 'clustered',clusters = D_sorted1['tf'], debiased=True)
    print(model_res.summary)
    
    exog = sm.tools.tools.add_constant(D_sorted1[['post','col_personal_post','shr_personal_post','trend','col_personal_trend','shr_personal_trend']])
    model = absorbing.AbsorbingLS(endog, exog, absorb= cats, drop_absorbed=False)
    model_res = model.fit(cov_type= 'clustered',clusters = D_sorted1['tf'], debiased=True)
    print(model_res.summary)
    
    exog = sm.tools.tools.add_constant(D_sorted1[['post','col_personal_post','shr_personal_post','shr_col_personal_post',
                                                  'trend','col_personal_trend','shr_personal_trend',
                                                  'shr_col_personal_trend']])
    model = absorbing.AbsorbingLS(endog, exog, absorb= cats, drop_absorbed=False)
    model_res = model.fit(cov_type= 'clustered',clusters = D_sorted1['tf'], debiased=True)
    print(model_res.summary)

In [15]:
data_policy = pd.read_stata('/Users/zyy219/Dropbox/Econometric/mksc.2021.1339/data/policies_before.dta') #each tracker only has one entry
data = pd.read_stata('/Users/zyy219/Dropbox/Econometric/mksc.2021.1339/data/gdpr_vendors.dta')
data = data[(data['date']<np.datetime64('2018-10-02')) & 
         (data['date']>np.datetime64('2017-11-14'))]
table6(data,data_policy)



                         Absorbing LS Estimation Summary                          
Dep. Variable:           log_websites   R-squared:                          0.9682
Estimator:               Absorbing LS   Adj. R-squared:                     0.9665
No. Observations:               42820   F-statistic:                        5.4050
Date:                Sat, Dec 03 2022   P-value (F-stat):                   0.0002
Time:                        16:21:24   Distribution:                   F(4,40675)
Cov. Estimator:             clustered   R-squared (No Effects):             0.0051
                                        Varaibles Absorbed:                 2140.0
                                 Parameter Estimates                                  
                    Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
--------------------------------------------------------------------------------------
const                  2.7458     0.0117     233.96     0.0000      2.7228 

In [5]:
def table7(dgpr_vendors):
    # ------------------------------------------------
    # This function will take dgpr_vendors dataset and output the table (7) to show the change in Market Structure 
    # of the Web Technology Industry
    # Inputs: 
    #    dgpr_vendors dataset: dataset
    # Output: Table 7
    # ---------------------------------------------------
    
    
    # bys tracker_firm date: keep if _n==1
    result_table={}
    result_table["row_name"]=["After","(std err)","Observations","Pre-GDPR mean"]
    
    
    ## ---------------------- HHI All--------------------------------
    # collapse (sum) ms_websites_firm=ms_websites, by (tracker_firm date) fast
    dgpr_vendors1 = dgpr_vendors.groupby(['tracker_firm','date'],as_index=False).agg(date=('date','first'),
                                                                tracker_firm=('tracker_firm','first'),
                                                                ms_websites_firm=('ms_websites','sum'))
    # gen ms_websites_firm2=ms_websites_firm^2
    dgpr_vendors1['ms_websites_firm2'] = dgpr_vendors1['ms_websites_firm']**2
    
    # collapse (sum) hhi=ms_websites_firm2, by (date) fast
    dgpr_vendors2 = dgpr_vendors1.groupby(['date'],as_index=False).agg(date=('date','first'),
                                                                    hhi=('ms_websites_firm2','sum'))

    # gen after=date>td(25may2018)
    dgpr_vendors2['after']=np.where(dgpr_vendors2['date']>np.datetime64('2018-05-25'), 1,0 )
    
    # su date
    # gen trend=(date-r(min)+1)/100
    dgpr_vendors2['trend']=(dgpr_vendors2['date']- dgpr_vendors2['date'].min()+np.timedelta64(1, 'D'))/np.timedelta64(100, 'D')
    
    # su date if after==1
    # gen trend_after=(date-r(min)+1)/100
    dgpr_vendors2['trend_after']=0
    dgpr_vendors2.loc[dgpr_vendors2['after']==1,'trend_after']=(dgpr_vendors2.loc[
        dgpr_vendors2['after']==1,'date']-dgpr_vendors2.loc[dgpr_vendors2['after']==1,'date'].min()+
                                                                np.timedelta64(1, 'D'))/np.timedelta64(100, 'D')
    
    # eststo m_requests_g: reg hhi trend after c.trend_after#c.after, robust
    m_requests_g_all = smf.ols("hhi ~ trend + after + trend_after*after", data=dgpr_vendors2).fit(cov_type='HC1')
    m_requests_g_all.summary().tables[1]
    result_for_all = pd.read_html(m_requests_g_all.summary().tables[1].as_html(), header=0, index_col=0)[0]
    result_table["HHI all"]=[result_for_all.loc["after","coef"].round(3),result_for_all.loc["after","std err"].round(3),
                             f"{len(dgpr_vendors2)}",round(dgpr_vendors2[dgpr_vendors2['after']==0]["hhi"].mean(),3) ]
    
    
    ## ---------------------- HHI without Google--------------------------------
    
    # drop if tracker_firm=="google"
    dgpr_vendors_nonGoogle=dgpr_vendors[dgpr_vendors["tracker_firm"]!="google"]
    # collapse (sum) ms_websites_firm=ms_websites, by (tracker_firm date) fast
    dgpr_vendors3 = dgpr_vendors_nonGoogle.groupby(['tracker_firm','date'],as_index=False).agg(date=('date','first'),
                                                                tracker_firm=('tracker_firm','first'),
                                                                ms_websites_firm=('ms_websites','sum'))
    # gen ms_websites_firm2=ms_websites_firm^2
    dgpr_vendors3['ms_websites_firm2'] = dgpr_vendors3['ms_websites_firm']**2
    
    # collapse (sum) hhi=ms_websites_firm2, by (date) fast
    dgpr_vendors4 = dgpr_vendors3.groupby(['date'],as_index=False).agg(date=('date','first'),
                                                                    hhi=('ms_websites_firm2','sum'))

    # gen after=date>td(25may2018)
    dgpr_vendors4['after']=np.where(dgpr_vendors4['date']>np.datetime64('2018-05-25'), 1,0 )
    
    # su date
    # gen trend=(date-r(min)+1)/100
    dgpr_vendors4['trend']=(dgpr_vendors4['date']- dgpr_vendors4['date'].min()+np.timedelta64(1, 'D'))/np.timedelta64(100, 'D')
    
    # su date if after==1
    # gen trend_after=(date-r(min)+1)/100
    dgpr_vendors4['trend_after']=0
    dgpr_vendors4.loc[dgpr_vendors4['after']==1,'trend_after']=(dgpr_vendors4.loc[
        dgpr_vendors4['after']==1,'date']-dgpr_vendors4.loc[dgpr_vendors4['after']==1,'date'].min()+
                                                                np.timedelta64(1, 'D'))/np.timedelta64(100, 'D')
    
    # eststo m_requests_g: reg hhi trend after c.trend_after#c.after, robust
    m_requests_g_nonGoogle = smf.ols("hhi ~ trend + after + trend_after*after", data=dgpr_vendors4).fit(cov_type='HC1')
    m_requests_g_nonGoogle.summary().tables[1]
    result_for_nonGoogle = pd.read_html(m_requests_g_nonGoogle.summary().tables[1].as_html(), header=0, index_col=0)[0]
    result_table["HHI without Google"]=[result_for_nonGoogle.loc["after","coef"].round(3),
                                        result_for_nonGoogle.loc["after","std err"].round(3),
                             f"{len(dgpr_vendors4)}",round(dgpr_vendors4[dgpr_vendors4['after']==0]["hhi"].mean(),3) ]
    
    result=pd.DataFrame(result_table).set_index("row_name")
    result.index.name = None
    return result

In [12]:
dgpr_vendors=pd.read_stata("/Users/zyy219/Dropbox/Econometric/mksc.2021.1339/data/gdpr_vendors.dta")

dgpr_vendors = dgpr_vendors[(dgpr_vendors['date']<np.datetime64('2018-10-02')) & 
         (dgpr_vendors['date']>np.datetime64('2017-11-14'))]
table7(dgpr_vendors)




Unnamed: 0,HHI all,HHI without Google
After,8.176,-1.576
(std err),9.221,1.129
Observations,21.0,21.0
Pre-GDPR mean,950.82,68.458


In [8]:
def table8(dgpr_vendors):
    # ------------------------------------------------
    # This function will take dgpr_vendors dataset and output the table (8) to show the change in Market Structure and Number of Websites
    # Inputs: 
    #    dgpr_vendors dataset: dataset
    # Output: Table 8
    # ---------------------------------------------------
    result_table={}
    result_table["row_name"]=["Post X Google","(std err)","Post X Non-Google","(std err)",
                              "Observations","R2","Pre-GDPR Google","Pre-GDPR non-Google"]
    # gen after=date>=td(25may2018)
    dgpr_vendors['after']=np.where(dgpr_vendors['date']>np.datetime64('2018-05-25'), 1,0 )
    # gen log_websites=log(1+tracker_firm_websites)
    dgpr_vendors['log_websites']=np.log(1+dgpr_vendors['tracker_firm_websites'])
    # gen google=tracker_firm=="google"
    dgpr_vendors['google']=[1 if i else 0 for i in (dgpr_vendors['tracker_firm']=='google')]
    # gen nongoogle=1-google
    dgpr_vendors['nongoogle']=1-dgpr_vendors['google']
    
    # bys tracker_firm date: keep if _n==1	
    dgpr_vendors1=dgpr_vendors.groupby(['tracker_firm','date'],as_index=False).agg(date=('date','first'),
                                                                    tracker_firm=('tracker_firm','first'),
                                                                    ms_websites_firm=('ms_websites_firm','first'),
                                                                    Google=('google','first'),
                                                                    NonGoogle=('nongoogle','first'),
                                                                    log_websites=('log_websites','first'),
                                                                    Post=('after','first')) 
    
    # gen trend=(date-`r(min)'+1)/100
    dgpr_vendors1['trend']=(dgpr_vendors1['date']- dgpr_vendors1['date'].min()+np.timedelta64(1, 'D'))/np.timedelta64(100, 'D')
    
    # su date if after==1
    # gen trend_after=(date-`r(min)'+1)/100
    dgpr_vendors1['trend_after']=0
    dgpr_vendors1.loc[dgpr_vendors1['Post']==1,'trend_after']=(dgpr_vendors1.loc[
        dgpr_vendors1['Post']==1,'date']-dgpr_vendors1.loc[dgpr_vendors1['Post']==1,'date'].min()+
                                                                np.timedelta64(1, 'D'))/np.timedelta64(100, 'D')
    
    # egen tf=group(tracker_firm)
    dgpr_vendors1['tf']=dgpr_vendors1.groupby('tracker_firm').ngroup()
    
    
    dgpr_vendors1['trend_google']=dgpr_vendors1['trend']*dgpr_vendors1['Google']
    dgpr_vendors1['after_google']=dgpr_vendors1['Post']*dgpr_vendors1['Google']
    dgpr_vendors1['trend_after_after_google']=dgpr_vendors1['trend_after']*dgpr_vendors1['Post']*dgpr_vendors1['Google']
    dgpr_vendors1['trend_nongoogle']=dgpr_vendors1['trend']*dgpr_vendors1['NonGoogle']
    dgpr_vendors1['after_nongoogle']=dgpr_vendors1['Post']*dgpr_vendors1['NonGoogle']
    dgpr_vendors1['trend_after_after_nongoogle']=dgpr_vendors1['trend_after']*dgpr_vendors1['Post']*dgpr_vendors1['NonGoogle']

    
    # websites
    exog_variables_websites = ['trend_google','after_google','trend_after_after_google','trend_nongoogle',
                                                     'after_nongoogle','trend_after_after_nongoogle']
    endog_variable_websites = ['log_websites']
    cats_websites = pd.DataFrame({'tf': pd.Categorical(dgpr_vendors1['tf'])})
    exog_websites = sm.tools.tools.add_constant(dgpr_vendors1[exog_variables_websites])
    endog_websites =dgpr_vendors1[endog_variable_websites]
    model_websites = absorbing.AbsorbingLS(endog_websites, exog_websites, absorb= cats_websites, drop_absorbed=False)
    model_res_websites = model_websites.fit(cov_type= 'clustered',clusters =dgpr_vendors1['tf'],debiased=True)
    
    websites_summary_estimate= pd.read_html(model_res_websites.summary.tables[0].as_html(), header=0, index_col=0)[0]
    websites_summary_table = pd.read_html(model_res_websites.summary.tables[1].as_html(), header=0, index_col=0)[0]
    website_p_value_afterGoogle = float(websites_summary_table.loc['after_google','P-value'])
    website_p_value_afternonGoogle = float(websites_summary_table.loc['after_nongoogle','P-value'])
    result_table["No.websites"]=[f"{websites_summary_table.loc['after_google','Parameter'].round(4)}{'***' if (website_p_value_afterGoogle <0.01) else ( '**' if website_p_value_afterGoogle <0.05 else ('*' if website_p_value_afterGoogle  <0.1 else ''))}",
                    websites_summary_table.loc["after_google","Std. Err."].round(4),
                    f"{websites_summary_table.loc['after_nongoogle','Parameter'].round(4)}{'***' if (website_p_value_afternonGoogle <0.01) else ( '**' if website_p_value_afternonGoogle<0.05 else ('*' if website_p_value_afternonGoogle  <0.1 else ''))}",
                    websites_summary_table.loc["after_nongoogle","Std. Err."].round(4),
                    dgpr_vendors1.shape[0],
                    websites_summary_estimate.iloc[0,2],
                    round(np.mean(dgpr_vendors1['log_websites'][(dgpr_vendors1['Post'] == 0) & (dgpr_vendors1['Google']==1)]),4),
                    round(np.mean(dgpr_vendors1['log_websites'][(dgpr_vendors1['Post'] == 0) & (dgpr_vendors1['NonGoogle']==0)]),4)]
    
    # Market Shares
    exog_variables_ms = ['trend_google','after_google','trend_after_after_google','trend_nongoogle',
                                                     'after_nongoogle','trend_after_after_nongoogle']
    endog_variable_ms = ['ms_websites_firm']
    cats_ms = pd.DataFrame({'tf': pd.Categorical(dgpr_vendors1['tf'])})
    exog_ms = sm.tools.tools.add_constant(dgpr_vendors1[exog_variables_ms])
    endog_ms =dgpr_vendors1[endog_variable_ms]
    model_ms = absorbing.AbsorbingLS(endog_ms, exog_ms, absorb= cats_ms, drop_absorbed=False)
    model_res_ms = model_ms.fit(cov_type= 'clustered',clusters =dgpr_vendors1['tf'],debiased=True)

    ms_summary_estimate= pd.read_html(model_res_ms.summary.tables[0].as_html(), header=0, index_col=0)[0]
    ms_summary_table = pd.read_html(model_res_ms.summary.tables[1].as_html(), header=0, index_col=0)[0]
    ms_p_value_afterGoogle = float(ms_summary_table.loc['after_google','P-value'])
    ms_p_value_afternonGoogle = float(ms_summary_table.loc['after_nongoogle','P-value'])
    result_table["Market Shares"]=[f"{ms_summary_table.loc['after_google','Parameter'].round(4)}{ '***' if (ms_p_value_afterGoogle <0.01) else ( '**' if ms_p_value_afterGoogle<0.05 else ('*' if ms_p_value_afterGoogle <0.1 else ''))}",
                ms_summary_table.loc["after_google","Std. Err."].round(4),
                f"{ms_summary_table.loc['after_nongoogle','Parameter'].round(4)}{ '***' if (ms_p_value_afternonGoogle <0.01) else ( '**' if ms_p_value_afternonGoogle<0.05 else ('*' if ms_p_value_afternonGoogle <0.1 else ''))}",
                ms_summary_table.loc["after_nongoogle","Std. Err."].round(4),
                dgpr_vendors1.shape[0],
                ms_summary_estimate.iloc[0,2],
                round(np.mean(dgpr_vendors1['ms_websites_firm'][(dgpr_vendors1['Post'] == 0) & (dgpr_vendors1['Google']==1)]),4),
                round(np.mean(dgpr_vendors1['ms_websites_firm'][(dgpr_vendors1['Post'] == 0) & (dgpr_vendors1['NonGoogle']==1)]),4)]
    
    result=pd.DataFrame(result_table).set_index("row_name")
    result.index.name = None
    return result     

In [13]:
table8(dgpr_vendors)

  x = pd.concat(x[::order], 1)


Unnamed: 0,No.websites,Market Shares
Post X Google,-0.0192***,0.161***
(std err),0.0,0.0
Post X Non-Google,0.0013,-0.0
(std err),0.0011,0.0
Observations,1329132,1329132
R2,0.9232,0.9996
Pre-GDPR Google,13.1587,29.7036
Pre-GDPR non-Google,13.1587,0.0011


In [10]:
def table9(dgpr_vendors_cat):
    # ------------------------------------------------
    # This function will take dgpr_vendors dataset and output the table (9) to show the change in Number of Websites based on different Submarket
    # Inputs: 
    #    dgpr_vendors dataset: dataset
    # Output: Table 9
    # ---------------------------------------------------
    categories=[("audio_video_player","Video"),("advertising","Advertising"),("site_analytics","Analytics"),
                ("cdn","CDN/API"),("unknown","Other")]
    dgpr_vendors_cat['trend_google']=dgpr_vendors_cat['trend']*dgpr_vendors_cat['google']
    dgpr_vendors_cat['trend_nongoogle']=dgpr_vendors_cat['trend']*dgpr_vendors_cat['nongoogle']
    dgpr_vendors_cat['trend_after_after_google']=dgpr_vendors_cat['trend_after']*dgpr_vendors_cat['after']*dgpr_vendors_cat['google']
    dgpr_vendors_cat['trend_after_after_nongoogle']=dgpr_vendors_cat['trend_after']*dgpr_vendors_cat['after']*dgpr_vendors_cat['nongoogle']
    dgpr_vendors_cat['after_google']=dgpr_vendors_cat['after']*dgpr_vendors_cat['google']
    dgpr_vendors_cat['after_nongoogle']=dgpr_vendors_cat['after']*dgpr_vendors_cat['nongoogle']
    result_table={}
    result_table["row_name"]=["Post X Google","(std err)","Post X Non-Google","(std err)",
                          "Observations","R2","Pre-GDPR Google","Pre-GDPR non-Google"]
    for (a,b) in categories:
        dgpr_vendors_cat_detail = dgpr_vendors_cat[dgpr_vendors_cat['category_id']==a]
        exog_variables = ['google','nongoogle','trend_google','trend_nongoogle',
                                                 'trend_after_after_google','trend_after_after_nongoogle',
                                                  'after_google','after_nongoogle']
        endog_variable = ['log_websites']
        cats= pd.DataFrame({'tf': pd.Categorical(dgpr_vendors_cat_detail['tf'])})
        exog = sm.tools.tools.add_constant(dgpr_vendors_cat_detail[exog_variables])
        endog =dgpr_vendors_cat_detail[endog_variable]
        model = absorbing.AbsorbingLS(endog, exog, absorb= cats, drop_absorbed=True)
        model_res = model.fit(cov_type= 'clustered',clusters =dgpr_vendors_cat_detail['tf'],debiased=True)
        
        summary_estimate= pd.read_html(model_res.summary.tables[0].as_html(), header=0, index_col=0)[0]
        summary_table = pd.read_html(model_res.summary.tables[1].as_html(), header=0, index_col=0)[0]
        p_value_afterGoogle = float(summary_table.loc['after_google','P-value'])
        p_value_afternonGoogle = float(summary_table.loc['after_nongoogle','P-value'])
        
        result_table[b]=[f"{summary_table.loc['after_google','Parameter'].round(6)}{ '***' if (p_value_afterGoogle <0.01) else ( '**' if p_value_afterGoogle<0.05 else ('*' if p_value_afterGoogle <0.1 else ''))}",
            summary_table.loc["after_google","Std. Err."].round(6),
            f"{summary_table.loc['after_nongoogle','Parameter'].round(6)}{ '***' if (p_value_afternonGoogle <0.01) else ( '**' if p_value_afternonGoogle<0.05 else ('*' if p_value_afternonGoogle <0.1 else ''))}",
            summary_table.loc["after_nongoogle","Std. Err."].round(6),
            dgpr_vendors_cat_detail.shape[0],
            round(float(summary_estimate.iloc[0,2]),6),
            round(np.mean(dgpr_vendors_cat_detail['log_websites'][(dgpr_vendors_cat_detail['after'] == 0) & (dgpr_vendors_cat_detail['google']==1)]),6),
            round(np.mean(dgpr_vendors_cat_detail['log_websites'][(dgpr_vendors_cat_detail['after'] == 0) & (dgpr_vendors_cat_detail['nongoogle']==1)]),6)]

    result=pd.DataFrame(result_table).set_index("row_name")
    result.index.name = None
    return result

In [15]:
dgpr_vendors_cat = pd.read_stata('/Users/zyy219/Dropbox/Econometric/mksc.2021.1339/data/gdpr_vendors_cat.dta')
dgpr_vendors_cat = dgpr_vendors_cat[(dgpr_vendors_cat['date']<np.datetime64('2018-10-02')) & 
         (dgpr_vendors_cat['date']>np.datetime64('2017-11-14'))]
table9(dgpr_vendors_cat)


  x = pd.concat(x[::order], 1)
Variables have been fully absorbed and have removed from the regression:

google, nongoogle

  model_res = model.fit(cov_type= 'clustered',clusters =dgpr_vendors_cat_detail['tf'],debiased=True)


Unnamed: 0,Video,Advertising,Analytics,CDN/API,Other
Post X Google,-0.0312***,-0.0265***,-0.0146***,-0.0105***,-0.0057***
(std err),0.0,0.0,0.0,0.0,0.0
Post X Non-Google,-0.0006,-0.0233**,-0.0099**,-0.0012,-0.006
(std err),0.0021,0.0111,0.0049,0.0008,0.0042
Observations,20840,20840,20840,20840,20840
R2,0.9889,0.984,0.9893,0.9987,0.9864
Pre-GDPR Google,10.197235,12.287216,11.480545,11.726396,6.707033
Pre-GDPR non-Google,0.101116,2.257271,0.928712,0.188791,0.405099


In [16]:
def table10(dgpr_vendors_cat):
    # ------------------------------------------------
    # This function will take dgpr_vendors dataset and output the table (10) to show the change in Market Shares based on different Submarket
    # Inputs: 
    #    dgpr_vendors dataset: dataset
    # Output: Table 10
    # ---------------------------------------------------
    
    categories=[("audio_video_player","Video"),("advertising","Advertising"),("site_analytics","Analytics"),
                ("cdn","CDN/API"),("unknown","Other")]
    dgpr_vendors_cat['trend_google']=dgpr_vendors_cat['trend']*dgpr_vendors_cat['google']
    dgpr_vendors_cat['trend_nongoogle']=dgpr_vendors_cat['trend']*dgpr_vendors_cat['nongoogle']
    dgpr_vendors_cat['trend_after_after_google']=dgpr_vendors_cat['trend_after']*dgpr_vendors_cat['after']*dgpr_vendors_cat['google']
    dgpr_vendors_cat['trend_after_after_nongoogle']=dgpr_vendors_cat['trend_after']*dgpr_vendors_cat['after']*dgpr_vendors_cat['nongoogle']
    dgpr_vendors_cat['after_google']=dgpr_vendors_cat['after']*dgpr_vendors_cat['google']
    dgpr_vendors_cat['after_nongoogle']=dgpr_vendors_cat['after']*dgpr_vendors_cat['nongoogle']
    result_table={}
    result_table["row_name"]=["Post X Google","(std err)","Post X Non-Google","(std err)",
                          "Observations","R2","Pre-GDPR Google","Pre-GDPR non-Google"]
    for (a,b) in categories:
        dgpr_vendors_cat_detail = dgpr_vendors_cat[dgpr_vendors_cat['category_id']==a]
        exog_variables = ['google','nongoogle','trend_google','trend_nongoogle',
                                                 'trend_after_after_google','trend_after_after_nongoogle',
                                                  'after_google','after_nongoogle']
        endog_variable = ['ms_websites_firm']
        cats= pd.DataFrame({'tf': pd.Categorical(dgpr_vendors_cat_detail['tf'])})
        exog = sm.tools.tools.add_constant(dgpr_vendors_cat_detail[exog_variables])
        endog =dgpr_vendors_cat_detail[endog_variable]
        model = absorbing.AbsorbingLS(endog, exog, absorb= cats, drop_absorbed=True)
        model_res = model.fit(cov_type= 'clustered',clusters =dgpr_vendors_cat_detail['tf'],debiased=True)
        
        summary_estimate= pd.read_html(model_res.summary.tables[0].as_html(), header=0, index_col=0)[0]
        summary_table = pd.read_html(model_res.summary.tables[1].as_html(), header=0, index_col=0)[0]
        p_value_afterGoogle = float(summary_table.loc['after_google','P-value'])
        p_value_afternonGoogle = float(summary_table.loc['after_nongoogle','P-value'])
        
        result_table[b]=[f"{summary_table.loc['after_google','Parameter'].round(6)}{ '***' if (p_value_afterGoogle <0.01) else ( '**' if p_value_afterGoogle<0.05 else ('*' if p_value_afterGoogle <0.1 else ''))}",
            summary_table.loc["after_google","Std. Err."].round(6),
            f"{summary_table.loc['after_nongoogle','Parameter'].round(6)}{ '***' if (p_value_afternonGoogle <0.01) else ( '**' if p_value_afternonGoogle<0.05 else ('*' if p_value_afternonGoogle <0.1 else ''))}",
            summary_table.loc["after_nongoogle","Std. Err."].round(6),
            dgpr_vendors_cat_detail.shape[0],
            round(float(summary_estimate.iloc[0,2]),6),
            round(np.mean(dgpr_vendors_cat_detail['ms_websites_firm'][(dgpr_vendors_cat_detail['after'] == 0) & (dgpr_vendors_cat_detail['google']==1)]),6),
            round(np.mean(dgpr_vendors_cat_detail['ms_websites_firm'][(dgpr_vendors_cat_detail['after'] == 0) & (dgpr_vendors_cat_detail['nongoogle']==1)]),6)]

    result=pd.DataFrame(result_table).set_index("row_name")
    result.index.name = None
    return result

In [17]:
table10(dgpr_vendors_cat)



  x = pd.concat(x[::order], 1)
Variables have been fully absorbed and have removed from the regression:

google, nongoogle

  model_res = model.fit(cov_type= 'clustered',clusters =dgpr_vendors_cat_detail['tf'],debiased=True)


Unnamed: 0,Video,Advertising,Analytics,CDN/API,Other
Post X Google,-1.6969***,0.4869***,0.4948***,-0.5692***,-0.029***
(std err),0.0,0.0,0.0,0.0,0.0
Post X Non-Google,0.0016,-0.0005,-0.0005,0.0005,2.8e-05
(std err),0.001,0.0007,0.0007,0.0006,0.0016
Observations,20840,20840,20840,20840,20840
R2,0.9997,0.9991,0.9986,0.9999,0.9954
Pre-GDPR Google,77.04818,27.228949,39.232204,70.080956,1.192459
Pre-GDPR non-Google,0.022048,0.069905,0.058374,0.028741,0.094916
