In [37]:
from src.models.linreg import LinReg
from src.models.panel import FixedEffects

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

from linearmodels import PanelOLS


In [38]:
def create_panel_data(seed=69):
    """Simulate a balanced entity-year panel with known coefficients.

    The panel has 99 entities (ids 1..99) observed in each of the 11 years
    2010..2020, i.e. 1089 rows. Every entity is assigned an industry and a
    region deterministically from its id (``id % number_of_labels``), so both
    labels are constant within an entity.

    The outcome is generated as::

        outcome = 1000 + 10 * independent + 10 * year + 30 * id
                  + 40 * region_effect + noise

    where ``independent`` is drawn from Uniform(0, 10) and ``noise`` from a
    normal distribution with mean 0 and standard deviation 2. The true slope
    on ``independent`` is therefore 10. ``industry`` is a label only: it does
    not enter the outcome equation.

    Parameters
    ----------
    seed : int, optional
        Seed passed to ``np.random.seed``. The default (69) reproduces the
        original data set exactly.

    Returns
    -------
    pandas.DataFrame
        One row per (id, year) with columns ``id``, ``year``, ``industry``,
        ``region``, ``outcome`` and ``independent``.

    Notes
    -----
    This function reseeds NumPy's *global* legacy random state as a side
    effect, so random draws made after calling it are affected as well.
    """
    np.random.seed(seed)
    ids = range(1, 100)
    years = range(2010, 2021)
    industries = ['Tech', 'Health', 'Finance', 'Education', 'Retail', 'Energy', 'Manufacturing', 'Transport', 'Services', 'Agriculture']
    regions = ['North', 'South', 'East', 'West', 'Central']

    # NOTE: an earlier draft also defined per-industry effects but never added
    # them to the outcome; they are left out so the generated data is unchanged.
    region_effects = {'North': 1, 'South': 2, 'East': 1, 'West': 3, 'Central': 2}

    records = []
    for unit_id in ids:
        # Labels and the region effect depend only on the entity, so resolve
        # them once per entity rather than once per row.
        industry = industries[unit_id % len(industries)]
        region = regions[unit_id % len(regions)]
        region_effect = region_effects[region]
        for year in years:
            # Keep the draws interleaved (uniform, then normal) per row: this
            # order determines the random stream and hence the exact values.
            x = np.random.uniform(0, 10)
            y = 1000 + 10 * x + 10 * year + 30 * unit_id + 40 * region_effect + np.random.normal(0, 2)
            records.append({'id': unit_id, 'year': year, 'industry': industry, 'region': region, 'outcome': y, 'independent': x})

    return pd.DataFrame(records)

# Build the simulated panel once; the model cells below read it through `panel_data`.
panel_data = create_panel_data()
# Bare last expression so the notebook renders the frame as the cell output.
panel_data

Unnamed: 0,id,year,industry,region,outcome,independent
0,1,2010,Health,South,21238.555995,2.962492
1,1,2011,Health,South,21301.147099,7.894093
2,1,2012,Health,South,21285.228472,5.613490
3,1,2013,Health,South,21245.280654,0.584607
4,1,2014,Health,South,21319.468677,6.732924
...,...,...,...,...,...,...
1084,99,2016,Agriculture,Central,24222.137447,1.025061
1085,99,2017,Agriculture,Central,24296.017354,7.574430
1086,99,2018,Agriculture,Central,24263.123866,3.284386
1087,99,2019,Agriculture,Central,24274.810459,4.047789


In [39]:

# Baseline specification: `outcome` on `independent` only, with no fixed effects.
# `base` holds whatever `summary(content_type='html')` returns.
base_model = LinReg(
    df=panel_data,
    outcome='outcome',
    independent=['independent'],
)
base = base_model.summary(content_type='html')

In [40]:
# Year fixed effects only. `fe1` is the FixedEffects object itself; `summary()` is not called on it.
fe1 = FixedEffects(
    df=panel_data,
    outcome='outcome',
    independent=['independent'],
    fixed=['year'],
    standard_error_type='clustered',
)

# Year and id fixed effects. `fe2` is bound to the return value of `summary(content_type='html')`.
fe2_model = FixedEffects(
    df=panel_data,
    outcome='outcome',
    independent=['independent'],
    fixed=['year', 'id'],
    standard_error_type='clustered',
)
fe2 = fe2_model.summary(content_type='html')

In [35]:
# Inspect the standard errors of the year + id fixed-effects fit.
# NOTE(review): `fe2` is bound to the return value of `.summary(content_type='html')`, not to the
# FixedEffects constructor result, so this relies on that return value exposing `standard_errors`.
fe2.standard_errors

array([0.6029342 , 0.02087762, 0.04463666, 0.04463666, 0.04465779,
       0.04465779, 0.04466524, 0.04466524, 0.04470172, 0.04470172,
       0.04463673, 0.04463673, 0.04463588, 0.04463588, 0.0446733 ,
       0.0446733 , 0.04463558, 0.04463558, 0.04469785, 0.04469785,
       0.04469504, 0.04469504, 0.1339096 , 0.1339096 , 0.13394894,
       0.13394894, 0.13392631, 0.13392631, 0.13391008, 0.13391008,
       0.13395727, 0.13395727, 0.13392573, 0.13392573, 0.13403012,
       0.13403012, 0.13393567, 0.13393567, 0.13397794, 0.13397794,
       0.13390977, 0.13390977, 0.1339068 , 0.1339068 , 0.13392421,
       0.13392421, 0.13393196, 0.13393196, 0.13397348, 0.13397348,
       0.13390766, 0.13390766, 0.13395652, 0.13395652, 0.13405153,
       0.13405153, 0.13397692, 0.13397692, 0.13395606, 0.13395606,
       0.13405578, 0.13405578, 0.13411572, 0.13411572, 0.13392616,
       0.13392616, 0.133915  , 0.133915  , 0.13395601, 0.13395601,
       0.13391068, 0.13391068, 0.13395498, 0.13395498, 0.13390

In [25]:
# PanelOLS needs an (entity, time) MultiIndex. Build it under a separate name, and only
# when it is not already in place, so that:
#   * re-running this cell does not raise KeyError on the already-consumed columns, and
#   * `panel_data` keeps `id` / `year` as ordinary columns for the FixedEffects cells.
panel_indexed = (
    panel_data
    if list(panel_data.index.names) == ['id', 'year']
    else panel_data.set_index(['id', 'year'])
)

# Define the model with year-level fixed effects
# The 'TimeEffects' term accounts for year-specific fixed effects
model = PanelOLS.from_formula('outcome ~ independent + TimeEffects', data=panel_indexed)

# Fit the model with standard errors clustered by time period (year)
results = model.fit(cov_type='clustered', cluster_time=True)

# Print the results
print(results)

                          PanelOLS Estimation Summary                           
Dep. Variable:                outcome   R-squared:                        0.0004
Estimator:                   PanelOLS   R-squared (Between):              0.0025
No. Observations:                1089   R-squared (Within):               0.3623
Date:                Tue, Jan 02 2024   R-squared (Overall):              0.0025
Time:                        13:35:18   Log-likelihood                   -8901.0
Cov. Estimator:             Clustered                                           
                                        F-statistic:                      0.3867
Entities:                          99   P-value                           0.5341
Avg Obs:                       11.000   Distribution:                  F(1,1077)
Min Obs:                       11.000                                           
Max Obs:                       11.000   F-statistic (robust):             0.3390
                            

In [27]:
# Positional lookup of the first standard error in the most recently fitted `results`
# (presumably the `independent` coefficient, the only regressor named in the formula — confirm).
results.std_errors.iloc[0]

9.69880848581809

In [41]:
# PanelOLS needs an (entity, time) MultiIndex. Build it under a separate name, and only
# when it is not already in place, so that:
#   * this cell still runs if `panel_data` was already indexed by ['id', 'year'], and
#   * `panel_data` keeps `id` / `year` as ordinary columns for the FixedEffects cells.
panel_indexed = (
    panel_data
    if list(panel_data.index.names) == ['id', 'year']
    else panel_data.set_index(['id', 'year'])
)

# Define the model with both time and id level fixed effects
# The 'EntityEffects' term accounts for id-specific fixed effects
# The 'TimeEffects' term accounts for year-specific fixed effects
model = PanelOLS.from_formula('outcome ~ independent + EntityEffects + TimeEffects', data=panel_indexed)

# Fit the model with standard errors clustered by both entity (id) and time (year)
results = model.fit(cov_type='clustered', cluster_entity=True, cluster_time=True)

# Print the results
print(results)

                          PanelOLS Estimation Summary                           
Dep. Variable:                outcome   R-squared:                        0.9953
Estimator:                   PanelOLS   R-squared (Between):              0.0044
No. Observations:                1089   R-squared (Within):               0.4640
Date:                Tue, Jan 02 2024   R-squared (Overall):              0.0044
Time:                        13:40:35   Log-likelihood                   -2234.1
Cov. Estimator:             Clustered                                           
                                        F-statistic:                   2.069e+05
Entities:                          99   P-value                           0.0000
Avg Obs:                       11.000   Distribution:                   F(1,979)
Min Obs:                       11.000                                           
Max Obs:                       11.000   F-statistic (robust):          3.113e+05
                            

In [8]:
# Same specification as `fe2` (year and id fixed effects, clustered standard errors), bound to a new name.
# NOTE(review): `fixed=['year', 'id']` names columns of `panel_data`. If an earlier-run cell has rebound
# `panel_data` to the frame indexed by ['id', 'year'], those are index levels rather than columns —
# confirm FixedEffects handles that, or run this cell before any re-indexing.
fe22 = FixedEffects(df = panel_data,
                   outcome='outcome',
                   independent=['independent'],
                   fixed=['year', 'id'],
                    standard_error_type='clustered').summary(content_type='html')


In [27]:
"""Year, id and region level fixed effects"""

# NOTE(review): `create_panel_data` assigns `region` as a deterministic function of `id`, so region
# effects are collinear with the id effects — confirm how FixedEffects treats the redundant level.
# Unlike fe1/fe2, no `standard_error_type` is passed here, so FixedEffects' default applies.
fe3 = FixedEffects(df = panel_data,
                   outcome='outcome',
                   independent=['independent'],
                   fixed=['year', 'id', 'region']).summary(content_type='html')
