In [1]:
# Import libraries
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import janitor
from janitor import clean_names


In [2]:
# Import datasets

# ------- define paths -------
relative_path = Path('../../data/intermediate_data/') # define relative path
absolute_path = relative_path.resolve() # get absolute path


# ------- import data -------
assess_df = pd.read_csv(absolute_path/'iac_assess_tidy.csv') # import IAC assess dataset
ec_emissions_df = pd.read_csv(absolute_path/'emissions_tidy.csv') # import emissions dataset
ec_generation_df = pd.read_csv(absolute_path/'generation.csv') # import electricity generation dataset
recc_integrated_ppi_df = pd.read_csv(absolute_path/'recc_integrated_ppi.csv') # import an integrated recc dataset with adjusted impcost
fuel_emission_factors_df = pd.read_excel(absolute_path/'emission_factors_tidy.xlsx', sheet_name='Sheet1') # import fuel emission factors

In [3]:
recc_integrated_ppi_df

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,...,saved,rebate,incremntal,fy,ic_capital,ic_other,payback,bptool,reference_year,reference_ppi
0,AM000101,AM0001,,1,,2.8114,N,15000.0,,PSOURCCODE,...,1828.0,N,N,1987,,,8.205689,,,
1,AM000101,AM0001,,1,,2.8114,N,15000.0,,SSOURCCODE,...,,N,N,1987,,,8.205689,,,
2,AM000101,AM0001,,1,,2.8114,N,15000.0,,TSOURCCODE,...,,N,N,1987,,,8.205689,,,
3,AM000101,AM0001,,1,,2.8114,N,15000.0,,QSOURCCODE,...,,N,N,1987,,,8.205689,,,
4,AM000102,AM0001,,2,,2.7142,N,189.0,,PSOURCCODE,...,663.0,N,N,1987,,,0.285068,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652595,WV067708,WV0677,,8,,2.4239,,540.0,,QSOURCCODE,...,,N,N,2025,400.0,140.0,,AM+,,
652596,WV067709,WV0677,INSULATE BARE EQUIPMENT,9,,2.2511,,446.0,446.0,PSOURCCODE,...,290.0,N,N,2025,330.0,116.0,,SSTS,2024.0,189.87
652597,WV067709,WV0677,INSULATE BARE EQUIPMENT,9,,2.2511,,446.0,446.0,SSOURCCODE,...,,N,N,2025,330.0,116.0,,SSTS,2024.0,189.87
652598,WV067709,WV0677,INSULATE BARE EQUIPMENT,9,,2.2511,,446.0,446.0,TSOURCCODE,...,,N,N,2025,330.0,116.0,,SSTS,2024.0,189.87


In [4]:
ec_emissions_df

Unnamed: 0,state,year,producer_type,energy_source,emission_type,amount
0,AK,1990,Commercial Cogen,All Sources,CO2,824004
1,AK,1990,Commercial Cogen,Coal,CO2,821929
2,AK,1990,Commercial Cogen,Petroleum,CO2,2075
3,AK,1990,Commercial Non-Cogen,All Sources,CO2,0
4,AK,1990,Commercial Non-Cogen,Petroleum,CO2,0
...,...,...,...,...,...,...
147313,WY,2023,Total Electric Power Industry,Coal,NOx,22890
147314,WY,2023,Total Electric Power Industry,Natural Gas,NOx,2258
147315,WY,2023,Total Electric Power Industry,Other Gases,NOx,3075
147316,WY,2023,Total Electric Power Industry,Other,NOx,42


In [5]:
fuel_emission_factors_df

Unnamed: 0,energy_source,sourccode,units,emission_factor_units,fuel_description,emission_type,emission_factor
0,Natural Gas,E2,MMBtu,kg/MMBtu,Natural gas is a gas consisting primarily of m...,CO2,53.06
1,Natural Gas,E2,MMBtu,kg/MMBtu,Natural gas is a gas consisting primarily of m...,SO2,0.000267
2,Natural Gas,E2,MMBtu,kg/MMBtu,Natural gas is a gas consisting primarily of m...,NOx,0.078934
3,L.P.G,E3,MMBtu,kg/MMBtu,LPG is a mixture of hydrocarbon gases used as ...,CO2,61.71
4,L.P.G,E3,MMBtu,kg/MMBtu,LPG is a mixture of hydrocarbon gases used as ...,SO2,0.00672
5,L.P.G,E3,MMBtu,kg/MMBtu,LPG is a mixture of hydrocarbon gases used as ...,NOx,0.066085
6,#1 Fuel Oil,E4,MMBtu,kg/MMBtu,Fuel Oil No. 1 is similar to kerosene and is t...,CO2,75.2
7,#1 Fuel Oil,E4,MMBtu,kg/MMBtu,Fuel Oil No. 1 is similar to kerosene and is t...,SO2,0.078212
8,#1 Fuel Oil,E4,MMBtu,kg/MMBtu,Fuel Oil No. 1 is similar to kerosene and is t...,NOx,0.055079
9,#2 Fuel Oil,E5,MMBtu,kg/MMBtu,"Fuel Oil No. 2 is diesel fuel, which is common...",CO2,73.96


In [6]:
# remove records that don't have any values for power sources other than primary
recc_integrated_ppi_df = recc_integrated_ppi_df.dropna(subset=['sourccode', 'conserved','sourconsv','saved'], how='all')

In [7]:
# verify the rows with blank values are not in the dataframe
recc_integrated_ppi_df[recc_integrated_ppi_df['superid']=='AM043901']

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,...,saved,rebate,incremntal,fy,ic_capital,ic_other,payback,bptool,reference_year,reference_ppi
13300,AM043901,AM0439,,1,1.0,4.132,I,435000.0,,PSOURCCODE,...,910000.0,N,N,2003,395000.0,40000.0,0.478022,,,


In [8]:
assess_df.head()

Unnamed: 0,id,center,fy,sic,naics,state,sales,employees,plant_area,products,produnits,prodlevel,prodhours,numars,source_code,plant_cost,plant_usage
0,AM0001,AM,1987,3671.0,,TX,33900000.0,206.0,,MICRO & MINI COMPUTERS,1.0,310.0,2250.0,7,EC,184985.0,2560082.0
1,AM0002,AM,1987,2761.0,,TX,25000000.0,156.0,,BUSINESS FORMS,,,2250.0,9,EC,267702.0,4867233.0
2,AM0002,AM,1987,2761.0,,TX,25000000.0,156.0,,BUSINESS FORMS,,,2250.0,9,E2,70657.0,19338.0
3,AM0003,AM,1987,3494.0,,TX,15000000.0,200.0,,SAFETY JOINTS & VALVES,,,2250.0,8,EC,129334.0,1723036.0
4,AM0003,AM,1987,3494.0,,TX,15000000.0,200.0,,SAFETY JOINTS & VALVES,,,2250.0,8,E2,10146.0,2074.0


In [9]:
# add Sector and state attributes to recc_integrated_ppi_df from assess_df
# integrate assess_df into 
integrated_ppi_df = pd.merge(recc_integrated_ppi_df, assess_df,
                             left_on=['sourccode', 'id', 'fy'], 
                             right_on=['source_code', 'id','fy'], 
                             how='left')

integrated_ppi_df.drop_duplicates(inplace = True)

In [10]:
integrated_ppi_df.shape #(247485, 39) (247485, 38)

(247485, 38)

In [11]:
integrated_ppi_df.columns

Index(['superid', 'id', 'description', 'ar_number', 'appcode', 'arc2',
       'impstatus', 'impcost', 'impcost_adj', 'source_rank', 'sourccode',
       'conserved', 'sourconsv', 'saved', 'rebate', 'incremntal', 'fy',
       'ic_capital', 'ic_other', 'payback', 'bptool', 'reference_year',
       'reference_ppi', 'center', 'sic', 'naics', 'state', 'sales',
       'employees', 'plant_area', 'products', 'produnits', 'prodlevel',
       'prodhours', 'numars', 'source_code', 'plant_cost', 'plant_usage'],
      dtype='object')

In [12]:
# check unique power source codes 
integrated_ppi_df['sourccode'].unique()


array(['EC', 'E2', 'R2', 'E3', 'E9', 'E12', 'E4', 'E11', 'W6', 'W1', 'R5',
       'W5', 'R3', 'W3', 'W4', 'E5', 'W2', 'R1', 'R4', 'ED', 'EF', 'P1',
       'P2', 'R6', 'P3', 'W0', nan, 'E7', 'E8', 'E6', 'E10'], dtype=object)

In [13]:
integrated_ppi_df[integrated_ppi_df['superid']=='WV061012']

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,...,employees,plant_area,products,produnits,prodlevel,prodhours,numars,source_code,plant_cost,plant_usage
246067,WV061012,WV0610,INSULATE BARE EQUIPMENT,12,,2.2511,I,488.0,497.76,PSOURCCODE,...,23.0,116000.0,Processed Coffee Beans,3.0,7690.0,4160.0,12.0,E2,134406.0,40605.0


In [14]:
integrated_ppi_df[integrated_ppi_df['superid']=='AM057403']

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,...,employees,plant_area,products,produnits,prodlevel,prodhours,numars,source_code,plant_cost,plant_usage
5837,AM057403,AM0574,UTILIZE HIGHER EFFICIENCY LAMPS AND/OR BALLASTS,3,3.0,2.7142,N,52875.0,72967.5,PSOURCCODE,...,150.0,320000.0,Carbonated soft drinks,1.0,22685034.0,5773.0,10.0,EC,545600.0,11608476.0
5838,AM057403,AM0574,UTILIZE HIGHER EFFICIENCY LAMPS AND/OR BALLASTS,3,3.0,2.7142,N,52875.0,72967.5,SSOURCCODE,...,150.0,320000.0,Carbonated soft drinks,1.0,22685034.0,5773.0,10.0,ED,212700.0,28944.0
5839,AM057403,AM0574,UTILIZE HIGHER EFFICIENCY LAMPS AND/OR BALLASTS,3,3.0,2.7142,N,52875.0,72967.5,TSOURCCODE,...,,,,,,,,,,


#### Merge Fuel Emission Factors into the integrated recc table

In [15]:
# add fuel emission factors to the integrated recc df
integrated_df = pd.merge(integrated_ppi_df, fuel_emission_factors_df[['sourccode','emission_type','emission_factor','emission_factor_units']],
                                  on='sourccode',
                                  how='left')

In [16]:
integrated_df.head()

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,...,produnits,prodlevel,prodhours,numars,source_code,plant_cost,plant_usage,emission_type,emission_factor,emission_factor_units
0,AM000101,AM0001,,1,,2.8114,N,15000.0,,PSOURCCODE,...,1.0,310.0,2250.0,7.0,EC,184985.0,2560082.0,,,
1,AM000102,AM0001,,2,,2.7142,N,189.0,,PSOURCCODE,...,1.0,310.0,2250.0,7.0,EC,184985.0,2560082.0,,,
2,AM000103,AM0001,,3,,2.7111,N,398.0,,PSOURCCODE,...,1.0,310.0,2250.0,7.0,EC,184985.0,2560082.0,,,
3,AM000104,AM0001,,4,,2.7447,I,354.0,,PSOURCCODE,...,1.0,310.0,2250.0,7.0,EC,184985.0,2560082.0,,,
4,AM000105,AM0001,,5,,2.7233,N,15.0,,PSOURCCODE,...,1.0,310.0,2250.0,7.0,EC,184985.0,2560082.0,,,


In [17]:
integrated_df[integrated_df['superid'].isin(['WV061012', 'AM057403'])]

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,...,produnits,prodlevel,prodhours,numars,source_code,plant_cost,plant_usage,emission_type,emission_factor,emission_factor_units
7081,AM057403,AM0574,UTILIZE HIGHER EFFICIENCY LAMPS AND/OR BALLASTS,3,3.0,2.7142,N,52875.0,72967.5,PSOURCCODE,...,1.0,22685034.0,5773.0,10.0,EC,545600.0,11608476.0,,,
7082,AM057403,AM0574,UTILIZE HIGHER EFFICIENCY LAMPS AND/OR BALLASTS,3,3.0,2.7142,N,52875.0,72967.5,SSOURCCODE,...,1.0,22685034.0,5773.0,10.0,ED,212700.0,28944.0,,,
7083,AM057403,AM0574,UTILIZE HIGHER EFFICIENCY LAMPS AND/OR BALLASTS,3,3.0,2.7142,N,52875.0,72967.5,TSOURCCODE,...,,,,,,,,,,
318739,WV061012,WV0610,INSULATE BARE EQUIPMENT,12,,2.2511,I,488.0,497.76,PSOURCCODE,...,3.0,7690.0,4160.0,12.0,E2,134406.0,40605.0,CO2,53.06,kg/MMBtu
318740,WV061012,WV0610,INSULATE BARE EQUIPMENT,12,,2.2511,I,488.0,497.76,PSOURCCODE,...,3.0,7690.0,4160.0,12.0,E2,134406.0,40605.0,SO2,0.000267,kg/MMBtu
318741,WV061012,WV0610,INSULATE BARE EQUIPMENT,12,,2.2511,I,488.0,497.76,PSOURCCODE,...,3.0,7690.0,4160.0,12.0,E2,134406.0,40605.0,NOx,0.078934,kg/MMBtu


#### Calculate fuel emission factors

In [18]:
# Calculate fuel emissions avoided
integrated_df['emissions_avoided'] = integrated_df['emission_factor'] * integrated_df['conserved']

In [19]:
integrated_df.columns

Index(['superid', 'id', 'description', 'ar_number', 'appcode', 'arc2',
       'impstatus', 'impcost', 'impcost_adj', 'source_rank', 'sourccode',
       'conserved', 'sourconsv', 'saved', 'rebate', 'incremntal', 'fy',
       'ic_capital', 'ic_other', 'payback', 'bptool', 'reference_year',
       'reference_ppi', 'center', 'sic', 'naics', 'state', 'sales',
       'employees', 'plant_area', 'products', 'produnits', 'prodlevel',
       'prodhours', 'numars', 'source_code', 'plant_cost', 'plant_usage',
       'emission_type', 'emission_factor', 'emission_factor_units',
       'emissions_avoided'],
      dtype='object')

In [20]:
integrated_df[integrated_df['superid'].isin(['WV061012', 'AM057403'])]

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,...,prodlevel,prodhours,numars,source_code,plant_cost,plant_usage,emission_type,emission_factor,emission_factor_units,emissions_avoided
7081,AM057403,AM0574,UTILIZE HIGHER EFFICIENCY LAMPS AND/OR BALLASTS,3,3.0,2.7142,N,52875.0,72967.5,PSOURCCODE,...,22685034.0,5773.0,10.0,EC,545600.0,11608476.0,,,,
7082,AM057403,AM0574,UTILIZE HIGHER EFFICIENCY LAMPS AND/OR BALLASTS,3,3.0,2.7142,N,52875.0,72967.5,SSOURCCODE,...,22685034.0,5773.0,10.0,ED,212700.0,28944.0,,,,
7083,AM057403,AM0574,UTILIZE HIGHER EFFICIENCY LAMPS AND/OR BALLASTS,3,3.0,2.7142,N,52875.0,72967.5,TSOURCCODE,...,,,,,,,,,,
318739,WV061012,WV0610,INSULATE BARE EQUIPMENT,12,,2.2511,I,488.0,497.76,PSOURCCODE,...,7690.0,4160.0,12.0,E2,134406.0,40605.0,CO2,53.06,kg/MMBtu,6579.44
318740,WV061012,WV0610,INSULATE BARE EQUIPMENT,12,,2.2511,I,488.0,497.76,PSOURCCODE,...,7690.0,4160.0,12.0,E2,134406.0,40605.0,SO2,0.000267,kg/MMBtu,0.033086
318741,WV061012,WV0610,INSULATE BARE EQUIPMENT,12,,2.2511,I,488.0,497.76,PSOURCCODE,...,7690.0,4160.0,12.0,E2,134406.0,40605.0,NOx,0.078934,kg/MMBtu,9.787804


# Merge electricity emissions into an integrated recc table

In [21]:
ec_emissions_df = ec_emissions_df[(ec_emissions_df['producer_type']=='Total Electric Power Industry')& # units = metric ton
                                  (ec_emissions_df['energy_source']=='All Sources')]

ec_generation_df = ec_generation_df[(ec_generation_df['type_of_producer']=='Total Electric Power Industry')&
                                  (ec_generation_df['energy_source']=='Total')]

In [22]:
ec_emissions_df.columns

Index(['state', 'year', 'producer_type', 'energy_source', 'emission_type',
       'amount'],
      dtype='object')

In [23]:
# calculate emission factors
# Total Emissions/Total Electricity Generated
ec_emission_factors_df = pd.merge(ec_generation_df,ec_emissions_df[['year','state','emission_type','amount']])
ec_emission_factors_df['emission_factor'] = ec_emission_factors_df['amount'] / ec_emission_factors_df['generation_megawatthours_']

# add column emission_factor_units
ec_emission_factors_df['emission_factor_units'] = 'kg/kWh'
ec_emission_factors_df['sourccode'] = 'EC'

In [24]:
ec_emission_factors_df

Unnamed: 0,year,state,type_of_producer,energy_source,generation_megawatthours_,units,emission_type,amount,emission_factor,emission_factor_units,sourccode
0,1990,AK,Total Electric Power Industry,Total,5599506.0,MWh,CO2,4208809,0.751639,kg/kWh,EC
1,1990,AK,Total Electric Power Industry,Total,5599506.0,MWh,SO2,18741,0.003347,kg/kWh,EC
2,1990,AK,Total Electric Power Industry,Total,5599506.0,MWh,NOx,12562,0.002243,kg/kWh,EC
3,1990,AL,Total Electric Power Industry,Total,79652133.0,MWh,CO2,52936063,0.664591,kg/kWh,EC
4,1990,AL,Total Electric Power Industry,Total,79652133.0,MWh,SO2,505530,0.006347,kg/kWh,EC
...,...,...,...,...,...,...,...,...,...,...,...
5263,2023,WV,Total Electric Power Industry,Total,52286784.0,MWh,SO2,35530,0.000680,kg/kWh,EC
5264,2023,WV,Total Electric Power Industry,Total,52286784.0,MWh,NOx,25505,0.000488,kg/kWh,EC
5265,2023,WY,Total Electric Power Industry,Total,43181420.0,MWh,CO2,36580219,0.847129,kg/kWh,EC
5266,2023,WY,Total Electric Power Industry,Total,43181420.0,MWh,SO2,21963,0.000509,kg/kWh,EC


In [25]:
integrated_df.columns

Index(['superid', 'id', 'description', 'ar_number', 'appcode', 'arc2',
       'impstatus', 'impcost', 'impcost_adj', 'source_rank', 'sourccode',
       'conserved', 'sourconsv', 'saved', 'rebate', 'incremntal', 'fy',
       'ic_capital', 'ic_other', 'payback', 'bptool', 'reference_year',
       'reference_ppi', 'center', 'sic', 'naics', 'state', 'sales',
       'employees', 'plant_area', 'products', 'produnits', 'prodlevel',
       'prodhours', 'numars', 'source_code', 'plant_cost', 'plant_usage',
       'emission_type', 'emission_factor', 'emission_factor_units',
       'emissions_avoided'],
      dtype='object')

In [26]:
ec_emission_factors_df.columns

Index(['year', 'state', 'type_of_producer', 'energy_source',
       'generation_megawatthours_', 'units', 'emission_type', 'amount',
       'emission_factor', 'emission_factor_units', 'sourccode'],
      dtype='object')

In [27]:
integrated_df.columns

Index(['superid', 'id', 'description', 'ar_number', 'appcode', 'arc2',
       'impstatus', 'impcost', 'impcost_adj', 'source_rank', 'sourccode',
       'conserved', 'sourconsv', 'saved', 'rebate', 'incremntal', 'fy',
       'ic_capital', 'ic_other', 'payback', 'bptool', 'reference_year',
       'reference_ppi', 'center', 'sic', 'naics', 'state', 'sales',
       'employees', 'plant_area', 'products', 'produnits', 'prodlevel',
       'prodhours', 'numars', 'source_code', 'plant_cost', 'plant_usage',
       'emission_type', 'emission_factor', 'emission_factor_units',
       'emissions_avoided'],
      dtype='object')

In [30]:
ec_emission_factors_df.columns

Index(['year', 'state', 'type_of_producer', 'energy_source',
       'generation_megawatthours_', 'units', 'emission_type', 'amount',
       'emission_factor', 'emission_factor_units', 'sourccode'],
      dtype='object')

In [None]:
# combine ec_emission_factors_df with the integrated recc table
integrated_df = pd.merge(integrated_df, ec_emission_factors_df[['state','year','emission_type','emission_factor','emission_factor_units','sourccode']],
                         left_on=['fy','state','sourccode'],
                         right_on=['year','state','sourccode'],
                         how='left')

# merge overlapping columns
integrated_df['emission_type'] = integrated_df['emission_type_x'].combine_first(integrated_df['emission_type_y'])
integrated_df['emission_factor_units'] = integrated_df['emission_factor_units_x'].combine_first(integrated_df['emission_factor_units_y'])
integrated_df['emission_factor'] = integrated_df['emission_factor_x'].combine_first(integrated_df['emission_factor_y'])

# drop the old duplicate columns
integrated_df.drop(columns=['emission_type_x', 'emission_type_y', 'emission_factor_units_x', 'emission_factor_units_y', 'year','emission_factor_x','emission_factor_y'], 
                   inplace=True)


KeyError: "['fy'] not in index"

In [None]:
integrated_df[integrated_df['superid'].isin(['WV061012', 'AM057403'])]

In [None]:
integrated_df.loc[integrated_df['sourccode'] == 'EC', 'emissions_avoided'] = (
    integrated_df['emission_factor'] * integrated_df['conserved']
)
integrated_df.loc[integrated_df['sourccode'] == 'EC', 'emissions_avoided'] = (
    integrated_df['emission_factor'] * integrated_df['conserved']
)


In [None]:
integrated_df[integrated_df['id'].isin(['SF0532', 'OR0712', 'MI0415','IC0115'])]

In [None]:
integrated_df[integrated_df['superid'].isin(['WV061012', 'AM057403','MI041503','SF053206','SF053207','MI041503','IC011501'])]

## Generate a final integrated dataset

In [None]:
# save integrated dataframe
integrated_df.to_csv("../../data/intermediate_data/iac_integrated.csv", index=False)