In [1]:
import pandas as pd

pd.set_option("mode.copy_on_write", True)

df_e_price = pd.read_csv("../src/cleaned_e_price.csv")
df_emission = pd.read_csv("../src/cleaned_emission_annual.csv")
df_fuel_ratio = pd.read_csv("../src/cleaned_fuel_ratio.csv")
df_climate = pd.read_csv("../src/cleaned_cdd_hdd.csv")

In [2]:
merged = pd.merge(df_e_price, df_emission, on=["Year", "State"], how="inner")
merged = pd.merge(merged, df_fuel_ratio, on=["Year", "State"], how="inner")
merged = pd.merge(merged, df_climate, on=["Year", "State"], how="inner")

In [3]:
# log transformation of CO2 emission
import numpy as np

merged["CO2_log"] = np.log(merged["CO2_annual"])

In [4]:
merged.head()

Unnamed: 0,State,Year,e_price,CO2_annual,Fuels_ratio,state_code,cdd,hdd,Description,cdd_hdd_sum,CO2_log
0,CT,2001,9.62,45649924,0.0,6,47.5,472.666667,Connecticut,520.166667,17.636513
1,ME,2001,10.55,35440016,0.0,17,22.0,629.0,Maine,651.0,17.383352
2,MA,2001,11.55,99152464,0.0,19,40.833333,496.75,Massachusetts,537.583333,18.412169
3,NH,2001,10.95,18182220,0.0,27,25.0,606.083333,New Hampshire,631.083333,16.715955
4,RI,2001,11.45,13784508,0.0,37,44.583333,464.166667,Rhode Island,508.75,16.439056


In [5]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 960 entries, 0 to 959
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   State        960 non-null    object 
 1   Year         960 non-null    int64  
 2   e_price      960 non-null    float64
 3   CO2_annual   960 non-null    int64  
 4   Fuels_ratio  960 non-null    float64
 5   state_code   960 non-null    int64  
 6   cdd          960 non-null    float64
 7   hdd          960 non-null    float64
 8   Description  960 non-null    object 
 9   cdd_hdd_sum  960 non-null    float64
 10  CO2_log      960 non-null    float64
dtypes: float64(6), int64(3), object(2)
memory usage: 90.0+ KB


In [6]:
merged.describe()

Unnamed: 0,Year,e_price,CO2_annual,Fuels_ratio,state_code,cdd,hdd,cdd_hdd_sum,CO2_log
count,960.0,960.0,960.0,960.0,960.0,960.0,960.0,960.0,960.0
mean,2010.5,9.268604,183316400.0,0.379871,24.5,94.841493,430.178819,525.020313,18.429543
std,5.769287,2.87592,173446700.0,0.40402,13.86062,69.012197,173.464706,118.822882,1.558522
min,2001.0,4.24,26332.0,0.0,1.0,10.0,37.583333,272.333333,10.17854
25%,2005.75,7.205,69673020.0,0.0,12.75,44.208333,295.395833,434.625,18.059239
50%,2010.5,8.81,143436500.0,0.148316,24.5,75.708333,441.541667,507.875,18.781402
75%,2015.25,10.33,240210900.0,0.853962,36.25,129.4375,561.083333,612.9375,19.297028
max,2020.0,19.13,1069856000.0,0.998595,48.0,345.833333,850.083333,882.75,20.79079


In [7]:
merged["Year"].unique(), merged["State"].unique()

(array([2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
        2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]),
 array(['CT', 'ME', 'MA', 'NH', 'RI', 'VT', 'NJ', 'NY', 'PA', 'IL', 'IN',
        'MI', 'OH', 'WI', 'IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD', 'DE',
        'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'WV', 'AL', 'KY', 'MS', 'TN',
        'AR', 'LA', 'OK', 'TX', 'AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT',
        'WY', 'CA', 'OR', 'WA'], dtype=object))

In [8]:
merged.to_csv("../src/merged_data.csv", index=False)

In [9]:
joined_rggi = ["CT", "DE", "ME", "MD", "MA", "NH", "NY", "RI", "VT"]

merged["treated"] = merged["State"].isin(joined_rggi)

In [10]:
# select years before and after the treatment
Pre = list(range(2001, 2009))
Post = list(range(2010, 2018))
merged["post"] = merged["Year"].isin(Post)
merged["treated:post"] = merged["treated"] * merged["post"]
merged_subset = merged[merged["Year"].isin(Post + Pre)]

# drop NJ as it initially withdrew in 2012 but rejoined in 2020.
merged_subset = merged_subset[merged_subset["State"] != "NJ"].reset_index(drop=True)

In [11]:
merged_subset

Unnamed: 0,State,Year,e_price,CO2_annual,Fuels_ratio,state_code,cdd,hdd,Description,cdd_hdd_sum,CO2_log,treated,post,treated:post
0,CT,2001,9.62,45649924,0.000000,6,47.500000,472.666667,Connecticut,520.166667,17.636513,True,False,False
1,ME,2001,10.55,35440016,0.000000,17,22.000000,629.000000,Maine,651.000000,17.383352,True,False,False
2,MA,2001,11.55,99152464,0.000000,19,40.833333,496.750000,Massachusetts,537.583333,18.412169,True,False,False
3,NH,2001,10.95,18182220,0.000000,27,25.000000,606.083333,New Hampshire,631.083333,16.715955,True,False,False
4,RI,2001,11.45,13784508,0.000000,37,44.583333,464.166667,Rhode Island,508.750000,16.439056,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
747,UT,2017,8.60,110790548,0.941796,42,52.000000,553.416667,Utah,605.416667,18.523152,False,True,False
748,WY,2017,8.28,177090016,0.992830,48,24.333333,657.916667,Wyoming,682.250000,18.992169,False,True,False
749,CA,2017,16.06,177732004,0.483931,4,97.250000,204.083333,California,301.333333,18.995787,False,True,False
750,OR,2017,8.81,31963616,0.001395,35,29.166667,447.500000,Oregon,476.666667,17.280109,False,True,False


In [12]:
merged_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 752 entries, 0 to 751
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   State         752 non-null    object 
 1   Year          752 non-null    int64  
 2   e_price       752 non-null    float64
 3   CO2_annual    752 non-null    int64  
 4   Fuels_ratio   752 non-null    float64
 5   state_code    752 non-null    int64  
 6   cdd           752 non-null    float64
 7   hdd           752 non-null    float64
 8   Description   752 non-null    object 
 9   cdd_hdd_sum   752 non-null    float64
 10  CO2_log       752 non-null    float64
 11  treated       752 non-null    bool   
 12  post          752 non-null    bool   
 13  treated:post  752 non-null    bool   
dtypes: bool(3), float64(6), int64(3), object(2)
memory usage: 67.0+ KB


In [13]:
import statsmodels.formula.api as smf

model = smf.ols(
    "CO2_log ~ Fuels_ratio + cdd + hdd + treated + post + treated:post",
    data=merged_subset,
).fit(
    cov_type="cluster",
    cov_kwds={"groups": merged_subset["State"]},
)
model.summary()

0,1,2,3
Dep. Variable:,CO2_log,R-squared:,0.43
Model:,OLS,Adj. R-squared:,0.426
Method:,Least Squares,F-statistic:,20.25
Date:,"Mon, 29 Apr 2024",Prob (F-statistic):,2.01e-11
Time:,13:56:30,Log-Likelihood:,-1192.1
No. Observations:,752,AIC:,2398.0
Df Residuals:,745,BIC:,2431.0
Df Model:,6,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,19.3861,0.929,20.871,0.000,17.566,21.207
treated[T.True],-1.2732,0.735,-1.732,0.083,-2.714,0.167
post[T.True],-0.1302,0.025,-5.303,0.000,-0.178,-0.082
treated[T.True]:post[T.True],-0.3057,0.084,-3.634,0.000,-0.471,-0.141
Fuels_ratio,0.9671,0.353,2.738,0.006,0.275,1.659
cdd,0.0010,0.003,0.310,0.757,-0.005,0.007
hdd,-0.0024,0.002,-1.477,0.140,-0.006,0.001

0,1,2,3
Omnibus:,353.948,Durbin-Watson:,1.85
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2283.509
Skew:,-2.037,Prob(JB):,0.0
Kurtosis:,10.502,Cond. No.,3720.0


In [14]:
merged_subset = merged_subset.set_index(["State", "Year"])

In [15]:
print(merged_subset.index)

MultiIndex([('CT', 2001),
            ('ME', 2001),
            ('MA', 2001),
            ('NH', 2001),
            ('RI', 2001),
            ('VT', 2001),
            ('NY', 2001),
            ('PA', 2001),
            ('IL', 2001),
            ('IN', 2001),
            ...
            ('CO', 2017),
            ('ID', 2017),
            ('MT', 2017),
            ('NV', 2017),
            ('NM', 2017),
            ('UT', 2017),
            ('WY', 2017),
            ('CA', 2017),
            ('OR', 2017),
            ('WA', 2017)],
           names=['State', 'Year'], length=752)


In [16]:
from linearmodels.panel import PanelOLS

formula = "CO2_log ~ Fuels_ratio + cdd + hdd + treated + post + treated:post + EntityEffects + TimeEffects"
mod = PanelOLS.from_formula(
    formula,
    data=merged_subset,
    drop_absorbed=True,
).fit(cov_type="clustered", cluster_entity=True)

mod.summary

Variables have been fully absorbed and have removed from the regression:

treated, post

  ).fit(cov_type="clustered", cluster_entity=True)


0,1,2,3
Dep. Variable:,CO2_log,R-squared:,0.1647
Estimator:,PanelOLS,R-squared (Between):,0.0304
No. Observations:,752,R-squared (Within):,0.2220
Date:,"Mon, Apr 29 2024",R-squared (Overall):,0.0304
Time:,13:56:30,Log-likelihood,431.44
Cov. Estimator:,Clustered,,
,,F-statistic:,33.823
Entities:,47,P-value,0.0000
Avg Obs:,16.000,Distribution:,"F(4,686)"
Min Obs:,16.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Fuels_ratio,0.5596,0.2344,2.3869,0.0173,0.0993,1.0199
cdd,0.0018,0.0004,4.2369,0.0000,0.0010,0.0026
hdd,-0.0002,0.0003,-0.7141,0.4754,-0.0008,0.0004
treated:post,-0.2723,0.0883,-3.0848,0.0021,-0.4455,-0.0990
