### Import Libraries

In [9]:
import pandas as pd
from pathlib import Path
import numpy as np

### Import Data

In [10]:
# ------- define paths -------
relative_path = Path('../data/intermediate_data/') # define relative path
absolute_path = relative_path.resolve() # get absolute path


# ------- import data -------
assess_df = pd.read_csv(absolute_path/'iac_assess_tidy.csv') # import IAC assess dataset
recc_df = pd.read_csv(absolute_path/'recc_ppi.csv') # import IAC recc dataset
emissions_tidy_df = pd.read_csv(absolute_path/'emissions_tidy.csv') # import emissions data
generation_df = pd.read_csv(absolute_path/'generation.csv') # import generation data

### Integrate Generation data into the Emissions dataframe

In [11]:
print(list(emissions_tidy_df.columns))

['state', 'year', 'producer_type', 'energy_source', 'emission_type', 'amount']


In [12]:
print(list(generation_df.columns))

['year', 'state', 'type_of_producer', 'energy_source', 'generation_megawatthours', 'units']


In [13]:
# Merge the generation dataframe with the emissions dataframe on the "state" column
# Add new column for emissions by Tonnes/MWh =  kg/kWh
# Only include producer type Total Electric Power Industry and energy source All Sources/Total
# Merge generation and emissions dataframes

# Rename All Sources
emissions_tidy_df['energy_source'] = emissions_tidy_df['energy_source'].replace('All Sources', 'Total')

# Filter
emissions_filtered = emissions_tidy_df[
    (emissions_tidy_df['energy_source'] == 'Total') &
    (emissions_tidy_df['producer_type'] == 'Total Electric Power Industry')]
generation_filtered = generation_df[
    (generation_df['energy_source'] == 'Total') &
    (generation_df['type_of_producer'] == 'Total Electric Power Industry')]

# Merge
emissions_generation_df = generation_filtered.merge(
    emissions_filtered[['state', 'year', 'producer_type', 'energy_source', 'emission_type', 
                        'amount']].rename(columns={
        'producer_type': 'type_of_producer'  # Rename to match the generation dataframe
    }),
    on=['state', 'year', 'type_of_producer', 'energy_source'],
    how='left'  # Left join to keep all rows from generation data
)

# Calculate emissions per generation and add the 'kg/kWh' unit
emissions_generation_df['emission_factor'] = emissions_generation_df['amount'] / emissions_generation_df['generation_megawatthours']
emissions_generation_df['unit'] = 'kg/kWh'

In [14]:
# reoder columns in emissions_generation_df
emissions_generation_columns_order = ['state', 'year', 'type_of_producer', 'energy_source', 'generation_megawatthours', 'units', 'emission_type', 'amount', 'emission_factor', 'unit' 
                          ]
emissions_generation_df = emissions_generation_df[emissions_generation_columns_order]
emissions_generation_df

Unnamed: 0,state,year,type_of_producer,energy_source,generation_megawatthours,units,emission_type,amount,emission_factor,unit
0,AK,1990,Total Electric Power Industry,Total,5599506.0,MWh,CO2,4208809.0,0.751639,kg/kWh
1,AK,1990,Total Electric Power Industry,Total,5599506.0,MWh,SO2,18741.0,0.003347,kg/kWh
2,AK,1990,Total Electric Power Industry,Total,5599506.0,MWh,NOx,12562.0,0.002243,kg/kWh
3,AL,1990,Total Electric Power Industry,Total,79652133.0,MWh,CO2,52936063.0,0.664591,kg/kWh
4,AL,1990,Total Electric Power Industry,Total,79652133.0,MWh,SO2,505530.0,0.006347,kg/kWh
...,...,...,...,...,...,...,...,...,...,...
5275,WV,2023,Total Electric Power Industry,Total,52286784.0,MWh,SO2,35530.0,0.000680,kg/kWh
5276,WV,2023,Total Electric Power Industry,Total,52286784.0,MWh,NOx,25505.0,0.000488,kg/kWh
5277,WY,2023,Total Electric Power Industry,Total,43181420.0,MWh,CO2,36580219.0,0.847129,kg/kWh
5278,WY,2023,Total Electric Power Industry,Total,43181420.0,MWh,SO2,21963.0,0.000509,kg/kWh


### Integrate Emission factors into the Assess data frame

In [15]:
print(list(assess_df.columns))

['center', 'fy', 'sic', 'naics', 'state', 'sales', 'employees', 'plant_area', 'products', 'produnits', 'prodlevel', 'prodhours', 'numars', 'source_code', 'plant_cost', 'plant_usage']


In [16]:
print(list(emissions_generation_df.columns))

['state', 'year', 'type_of_producer', 'energy_source', 'generation_megawatthours', 'units', 'emission_type', 'amount', 'emission_factor', 'unit']


In [21]:
# Rename year column and filter years 1990 - 2023
assess_df = assess_df.rename(columns={"fy": "year"})
assess_df = assess_df.loc[(assess_df['year'] >= 1990) & (assess_df['year'] <= 2023)]

# Merge
assess_emissions_df = assess_df.merge(
    emissions_generation_df[['state', 'year', 'type_of_producer', 'energy_source', 
                             'generation_megawatthours', 'units', 'emission_type', 
                             'amount', 'emission_factor', 'unit']], 
    on=['state', 'year'], 
    how='left')

# Calculate emission per product yearly electricity consumption
assess_emissions_df['emissions'] = assess_emissions_df.apply(
    lambda row: row['plant_usage'] * row['emission_factor'] if row['energy_source'] == 'EC' else 0,
    axis=1
)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
assess_emissions_df['u'] = 'kg'

In [22]:
# reoder columns in assess_emissions_df
assess_emissions_columns_order = ['state', 'year', 'type_of_producer', 'energy_source', 
                                  'generation_megawatthours', 'units', 'emission_type', 
                                  'amount', 'emission_factor', 'unit', 'center', 
                                  'sic', 'naics', 'sales', 'employees', 'plant_area', 
                                  'products', 'produnits', 'prodlevel', 'prodhours', 'numars',
                                  'plant_cost', 'plant_usage', 'emissions', 'u', 
                                  ]
assess_emissions_df = assess_emissions_df[assess_emissions_columns_order]
assess_emissions_df

Unnamed: 0,state,year,type_of_producer,energy_source,generation_megawatthours,units,emission_type,amount,emission_factor,unit,...,plant_area,products,produnits,prodlevel,prodhours,numars,plant_cost,plant_usage,emissions,u
0,TX,1990,Total Electric Power Industry,Total,281559635.0000,MWh,CO2,213056168.0000,0.7567,kg/kWh,...,,PIPECOATING,6.0000,9038.0000,2400.0000,5,269266.0000,3456917.0000,0,kg
1,TX,1990,Total Electric Power Industry,Total,281559635.0000,MWh,SO2,568924.0000,0.0020,kg/kWh,...,,PIPECOATING,6.0000,9038.0000,2400.0000,5,269266.0000,3456917.0000,0,kg
2,TX,1990,Total Electric Power Industry,Total,281559635.0000,MWh,NOx,662759.0000,0.0024,kg/kWh,...,,PIPECOATING,6.0000,9038.0000,2400.0000,5,269266.0000,3456917.0000,0,kg
3,TX,1990,Total Electric Power Industry,Total,281559635.0000,MWh,CO2,213056168.0000,0.7567,kg/kWh,...,,PIPECOATING,6.0000,9038.0000,2400.0000,5,29923.0000,8238.0000,0,kg
4,TX,1990,Total Electric Power Industry,Total,281559635.0000,MWh,SO2,568924.0000,0.0020,kg/kWh,...,,PIPECOATING,6.0000,9038.0000,2400.0000,5,29923.0000,8238.0000,0,kg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206594,WV,2023,Total Electric Power Industry,Total,52286784.0000,MWh,SO2,35530.0000,0.0007,kg/kWh,...,95000.0000,Lumber,6.0000,5000.0000,2400.0000,14,166292.0000,23139.0000,0,kg
206595,WV,2023,Total Electric Power Industry,Total,52286784.0000,MWh,NOx,25505.0000,0.0005,kg/kWh,...,95000.0000,Lumber,6.0000,5000.0000,2400.0000,14,166292.0000,23139.0000,0,kg
206596,WV,2023,Total Electric Power Industry,Total,52286784.0000,MWh,CO2,45743064.0000,0.8748,kg/kWh,...,95000.0000,Lumber,6.0000,5000.0000,2400.0000,14,68017.0000,5809.8960,0,kg
206597,WV,2023,Total Electric Power Industry,Total,52286784.0000,MWh,SO2,35530.0000,0.0007,kg/kWh,...,95000.0000,Lumber,6.0000,5000.0000,2400.0000,14,68017.0000,5809.8960,0,kg


In [23]:
assess_emissions_df.to_csv("../data/intermediate_data/assess_emissions.csv", index=False)

### Integrate Assess w/ Emissions into Recc w/ PPI dataframe

In [24]:
print(list(recc_df.columns))

['superid', 'id', 'ar_number', 'appcode', 'arc2', 'impstatus', 'impcost', 'ref_year_impcost', 'source_rank', 'sourccode', 'conserved', 'sourconsv', 'saved', 'rebate', 'incremntal', 'fy', 'ic_capital', 'ic_other', 'payback', 'bptool']


In [25]:
print(list(assess_emissions_df.columns))

['state', 'year', 'type_of_producer', 'energy_source', 'generation_megawatthours', 'units', 'emission_type', 'amount', 'emission_factor', 'unit', 'center', 'sic', 'naics', 'sales', 'employees', 'plant_area', 'products', 'produnits', 'prodlevel', 'prodhours', 'numars', 'plant_cost', 'plant_usage', 'emissions', 'u']


In [27]:
# Rename year column and filter years 1990 - 2023
recc_df = recc_df.rename(columns={"fy": "year"})
recc_df = recc_df.loc[(recc_df['year'] >= 1990) & (recc_df['year'] <= 2023)]

# Merge
assess_recc_emissions_df = recc_df.merge(
    assess_emissions_df[['state', 'year', 'type_of_producer', 'energy_source', 
                         'generation_megawatthours', 'units', 'emission_type', 
                         'amount', 'emission_factor', 'unit', 'center', 'sic', 
                         'naics', 'sales', 'employees', 'plant_area', 'products', 
                         'produnits', 'prodlevel', 'prodhours', 'numars', 'plant_cost', 
                         'plant_usage', 'emissions', 'u']], 
    on=['id','year'],
    how='left')

# Calculate emission per product yearly electricity consumption
assess_recc_emissions_df.loc[assess_recc_emissions_df['sourccode'].isin(['EC']), 'conserved_emissions'] = assess_recc_emissions_df['conserved'] * assess_recc_emissions_df['emission_factor']
pd.set_option('display.float_format', lambda x: '%.4f' % x)
assess_recc_emissions_df['u'] = 'kg'

KeyError: 'id'

In [19]:
# reoder columns in assess_emissions_df
assess_recc_emissions_columns_order = ['state', 'year', 'type_of_producer', 'energy_source', 
                                  'generation_megawatthours', 'units', 'emission_type', 
                                  'amount', 'emission_factor', 'unit', 'id', 'superid', 
                                  'center', 'ar_number', 'appcode', 'arc2', 'impstatus', 
                                  'impcost', 'ref_year_impcost', 'source_rank', 'sourccode', 
                                  'conserved', 'conserved_emissions', 'u', 'sourconsv', 
                                  'saved', 'rebate', 'incremntal', 'ic_capital', 'ic_other', 
                                  'payback', 'bptool', 'sic', 'naics', 'sales', 'employees', 
                                  'plant_area', 'products', 'produnits', 'prodlevel', 
                                  'prodhours', 'numars','ec_plant_cost', 'ec_plant_usage', 
                                  'ec_emissions', 'u', 
                                  'ed_plant_cost', 'ed_plant_usage', 'ef_plant_cost', 
                                  'e2_plant_cost', 'e2_plant_usage', 'e3_plant_cost', 
                                  'e3_plant_usage', 'e4_plant_cost', 'e4_plant_usage', 
                                  'e5_plant_cost', 'e5_plant_usage', 'e6_plant_cost', 
                                  'e6_plant_usage', 'e7_plant_cost', 'e7_plant_usage', 
                                  'e8_plant_cost', 'e8_plant_usage', 'e9_plant_cost', 
                                  'e9_plant_usage', 'e10_plant_cost', 'e10_plant_usage', 
                                  'e11_plant_cost', 'e11_plant_usage', 'e12_plant_cost', 
                                  'e12_plant_usage', 'w0_plant_cost', 'w0_plant_usage', 
                                  'w1_plant_cost', 'w1_plant_usage', 'w2_plant_cost', 
                                  'w2_plant_usage', 'w3_plant_cost', 'w3_plant_usage', 
                                  'w4_plant_cost', 'w4_plant_usage', 'w5_plant_cost', 
                                  'w5_plant_usage', 'w6_plant_cost', 'w6_plant_usage']
assess_recc_emissions_df = assess_recc_emissions_df[assess_recc_emissions_columns_order]
assess_recc_emissions_df

Unnamed: 0,state,year,type_of_producer,energy_source,generation_megawatthours,units,emission_type,amount,emission_factor,unit,...,w2_plant_cost,w2_plant_usage,w3_plant_cost,w3_plant_usage,w4_plant_cost,w4_plant_usage,w5_plant_cost,w5_plant_usage,w6_plant_cost,w6_plant_usage
0,TX,1990,Total Electric Power Industry,Total,281559635.0000,MWh,CO2,213056168.0000,0.7567,kg/kWh,...,,,,,,,,,,
1,TX,1990,Total Electric Power Industry,Total,281559635.0000,MWh,SO2,568924.0000,0.0020,kg/kWh,...,,,,,,,,,,
2,TX,1990,Total Electric Power Industry,Total,281559635.0000,MWh,NOx,662759.0000,0.0024,kg/kWh,...,,,,,,,,,,
3,TX,1990,Total Electric Power Industry,Total,281559635.0000,MWh,CO2,213056168.0000,0.7567,kg/kWh,...,,,,,,,,,,
4,TX,1990,Total Electric Power Industry,Total,281559635.0000,MWh,SO2,568924.0000,0.0020,kg/kWh,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1696383,WV,2023,Total Electric Power Industry,Total,52286784.0000,MWh,SO2,35530.0000,0.0007,kg/kWh,...,,,,,,,,,,
1696384,WV,2023,Total Electric Power Industry,Total,52286784.0000,MWh,NOx,25505.0000,0.0005,kg/kWh,...,,,,,,,,,,
1696385,WV,2023,Total Electric Power Industry,Total,52286784.0000,MWh,CO2,45743064.0000,0.8748,kg/kWh,...,,,,,,,,,,
1696386,WV,2023,Total Electric Power Industry,Total,52286784.0000,MWh,SO2,35530.0000,0.0007,kg/kWh,...,,,,,,,,,,


In [15]:
print(list(assess_recc_emissions_df.columns))

['state', 'year', 'type_of_producer', 'energy_source', 'generation_megawatthours', 'units', 'emission_type', 'amount', 'emission_factor', 'unit', 'id', 'superid', 'center', 'ar_number', 'appcode', 'arc2', 'impstatus', 'impcost', 'ref_year_impcost', 'source_rank', 'sourccode', 'conserved', 'conserved_emissions', 'u', 'sourconsv', 'saved', 'rebate', 'incremntal', 'ic_capital', 'ic_other', 'payback', 'bptool', 'sic', 'naics', 'sales', 'employees', 'plant_area', 'products', 'produnits', 'prodlevel', 'prodhours', 'numars', 'ec_plant_cost', 'ec_plant_usage', 'ec_emissions', 'u', 'ed_plant_cost', 'ed_plant_usage', 'ef_plant_cost', 'e2_plant_cost', 'e2_plant_usage', 'e3_plant_cost', 'e3_plant_usage', 'e4_plant_cost', 'e4_plant_usage', 'e5_plant_cost', 'e5_plant_usage', 'e6_plant_cost', 'e6_plant_usage', 'e7_plant_cost', 'e7_plant_usage', 'e8_plant_cost', 'e8_plant_usage', 'e9_plant_cost', 'e9_plant_usage', 'e10_plant_cost', 'e10_plant_usage', 'e11_plant_cost', 'e11_plant_usage', 'e12_plant_cos

In [32]:
assess_recc_emissions_df.to_csv("../data/intermediate_data/assess_recc_ppi_emissions.csv", index=False)

In [17]:
missing_values_by_year = assess_recc_emissions_df.groupby('year')['naics'].apply(lambda x: x.isna().sum())

print(missing_values_by_year)

year
1990    29004
1991    37092
1992    45324
1993    49560
1994    65688
1995    72660
1996    76836
1997    71952
1998    70464
1999    66240
2000    63996
2001    55680
2002    55356
2003     1380
2004        0
2005        0
2006      288
2007        0
2008        0
2009        0
2010        0
2011        0
2012        0
2013        0
2014        0
2015        0
2016        0
2017        0
2018        0
2019        0
2020        0
2021        0
2022        0
2023        4
Name: naics, dtype: int64
