# Further Data Integration

Further data integration of `iac_integrated.csv` for the Industrial Energy dashboard of the Industrial Sustainability Analysis Lab, headed by Dr. Eric Masanet.

In this notebook we will:
- Select relevant years
- Integrate NAICS/SIC codes and descriptions
- Select for relevant ARC codes
- Integrate ARC descriptions

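The data required for these processes are in the raw_data and final_data Google Drive folders. The notebook itself reads its inputs from `../../data/raw_data` and `../../data/intermediate_data`, and writes its outputs to `../../data/processed_data`.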

In [1]:
# Import relevant libraries
import numpy as np
import pandas as pd
from pathlib import Path

In [2]:
# Load the input tables used in this notebook

# ------- locate the data directory -------
relative_path = Path('../../data/')      # data root, relative to the working directory
absolute_path = relative_path.resolve()  # same location as an absolute path


# ------- read the CSV inputs -------
# NAICS codes table
naics = pd.read_csv(absolute_path / 'raw_data' / 'NAICS_Codes.csv')
# NAICS <-> SIC crosswalk
naic_sics_xwalk = pd.read_csv(absolute_path / 'raw_data' / 'NAICS_SIC_Xwalk.csv')
# ARC codes table
arc2 = pd.read_csv(absolute_path / 'raw_data' / 'ARC2.csv')
# integrated IAC dataset
iac = pd.read_csv(absolute_path / 'intermediate_data' / 'iac_integrated.csv')


In [3]:
# Inspect every row recorded for superid 'AM007601'
iac.loc[iac['superid'] == 'AM007601']

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,...,prodlevel,prodhours,numars,source_code,plant_cost,plant_usage,emissions_avoided,emission_type,emission_factor_units,emission_factor
843,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,...,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,CO2,kg/kWh,0.7567
844,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,...,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,SO2,kg/kWh,0.002021
845,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,...,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,NOx,kg/kWh,0.002354


In [4]:
# Crosswalk rows for SIC code 3479 (the SIC code is compared as a string here)
naic_sics_xwalk.loc[naic_sics_xwalk['Related SIC Code'] == '3479']

Unnamed: 0,Input Seq 1,2022 NAICS Code,2022 NAICS Title,Related SIC Code,Related SIC Code Description,Change to 2017 Code,2017 NAICS Code,2017 NAICS Code.1
849,850,332812,"Metal Coating, Engraving (except Jewelry and S...",3479,Metal Coating and Allied Services,No Change,332812,"Metal Coating, Engraving (except Jewelry and S..."
1148,1149,339910,Jewelry and Silverware Manufacturing,3479,Metal Coating and Allied Services,No Change,339910,Jewelry and Silverware Manufacturing
1149,1150,339910,Jewelry and Silverware Manufacturing,3479,Metal Coating and Allied Services,No Change,339910,Jewelry and Silverware Manufacturing
1150,1151,339910,Jewelry and Silverware Manufacturing,3479,Metal Coating and Allied Services,No Change,339910,Jewelry and Silverware Manufacturing


In [5]:
# Show every column when a dataframe is displayed (no column truncation)
pd.options.display.max_columns = None

# Look at the first row to see which columns the csv provides
iac.head(n=1)

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,sourccode,conserved,sourconsv,saved,rebate,incremntal,fy,ic_capital,ic_other,payback,bptool,reference_year,reference_ppi,center,sic,naics,state,sales,employees,plant_area,products,produnits,prodlevel,prodhours,numars,source_code,plant_cost,plant_usage,emissions_avoided,emission_type,emission_factor_units,emission_factor
0,AM000101,AM0001,,1,,2.8114,N,15000.0,,PSOURCCODE,EC,,,1828.0,N,N,1987,,,8.205689,,,,AM,3671.0,,TX,33900000.0,206.0,,MICRO & MINI COMPUTERS,1.0,310.0,2250.0,7.0,EC,184985.0,2560082.0,,,,


Looking at the codes within the `naics` column of the `iac` df, we notice that the codes are six digits followed by a `.` (a trailing decimal part). To join with the `naics` df, we must first clean this column.

In [6]:
#iac.naics.unique() # Uncomment to view

In [7]:
def clean_naics(value):
    """Return a NAICS value as text with everything from the first '.' removed.

    Missing values (anything `pd.isna` reports as missing) are returned
    unchanged. Every other value is converted with `str` and cut at the
    first decimal point, so `922110.0` becomes `'922110'`.
    """
    # Guard clause: leave missing values untouched
    if pd.isna(value):
        return value
    # partition() hands back the whole string when it contains no '.'
    code_text, _, _ = str(value).partition('.')
    return code_text

In [8]:
# Strip the decimal part from every NAICS value; missing values are passed through unchanged
iac['naics'] = iac['naics'].apply(clean_naics)

In [9]:
# Re-inspect superid 'AM007601' after the NAICS column was cleaned
iac.loc[iac['superid'] == 'AM007601']

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,sourccode,conserved,sourconsv,saved,rebate,incremntal,fy,ic_capital,ic_other,payback,bptool,reference_year,reference_ppi,center,sic,naics,state,sales,employees,plant_area,products,produnits,prodlevel,prodhours,numars,source_code,plant_cost,plant_usage,emissions_avoided,emission_type,emission_factor_units,emission_factor
843,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,EC,,,23000.0,N,N,1990,,,2.782609,,2024.0,278.19,AM,3479.0,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,CO2,kg/kWh,0.7567
844,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,EC,,,23000.0,N,N,1990,,,2.782609,,2024.0,278.19,AM,3479.0,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,SO2,kg/kWh,0.002021
845,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,EC,,,23000.0,N,N,1990,,,2.782609,,2024.0,278.19,AM,3479.0,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,NOx,kg/kWh,0.002354


In [10]:
# Double-check

#iac.tail() # Uncomment to view

In [11]:
# Preview the first rows of the raw NAICS table
naics.head()

Unnamed: 0,Seq. No.,2022 NAICS US Code,2022 NAICS US Title,Description,Unnamed: 4
0,1,11,"Agriculture, Forestry, Fishing and HuntingT","The Sector as a Whole\n\nThe Agriculture, Fore...",
1,2,111,Crop ProductionT,Industries in the Crop Production subsector gr...,
2,3,1111,Oilseed and Grain FarmingT,This industry group comprises establishments p...,
3,4,11111,Soybean FarmingT,See industry description for 111110.,
4,5,111110,Soybean Farming,This industry comprises establishments primari...,


In [12]:
# List the exact column names; the code column is spelled '2022 NAICS US   Code'
# (three spaces before 'Code'), and later cells must use that exact spelling
naics.columns

Index(['Seq. No.', '2022 NAICS US   Code', '2022 NAICS US Title',
       'Description', 'Unnamed: 4'],
      dtype='object')

In [13]:
# Discard the sequence-number column and the 'Unnamed: 4' column
naics_clean = naics.drop(columns=['Seq. No.', 'Unnamed: 4'])

In [14]:
# Preview the NAICS table after the unused columns were dropped
naics_clean.head()

Unnamed: 0,2022 NAICS US Code,2022 NAICS US Title,Description
0,11,"Agriculture, Forestry, Fishing and HuntingT","The Sector as a Whole\n\nThe Agriculture, Fore..."
1,111,Crop ProductionT,Industries in the Crop Production subsector gr...
2,1111,Oilseed and Grain FarmingT,This industry group comprises establishments p...
3,11111,Soybean FarmingT,See industry description for 111110.
4,111110,Soybean Farming,This industry comprises establishments primari...


In [15]:
# Build a code -> title lookup from naics_clean.
# The title is used instead of 'Description' because the description is too verbose.
# If a code occurs on more than one row, dict() keeps the title from the last row.
naics_lookup = dict(zip(
    naics_clean['2022 NAICS US   Code'].astype(str),
    naics_clean['2022 NAICS US Title']
))

# Apply the lookup to the iac dataframe.
# A plain astype(str) stores a missing NAICS as the literal text 'nan', which
# missing-value checks such as notna() cannot recognise; the mask below puts the
# missing values back after the conversion.
naics_missing = iac['naics'].isna()
iac['naics'] = iac['naics'].astype(str).mask(naics_missing)
iac['naics_description'] = iac['naics'].map(naics_lookup)

In [16]:
# Show the NAICS table rows for code 922110 (the code is compared as a string here)
naics_clean.loc[naics_clean['2022 NAICS US   Code'] == '922110']

Unnamed: 0,2022 NAICS US Code,2022 NAICS US Title,Description
2068,922110,Courts,This industry comprises civilian courts of law...
4232,922110,Courts,This industry comprises civilian courts of law...


In [17]:
# Inspect superid 'ORC00702' to see the naics_description that was attached
iac.loc[iac['superid'] == 'ORC00702']

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,sourccode,conserved,sourconsv,saved,rebate,incremntal,fy,ic_capital,ic_other,payback,bptool,reference_year,reference_ppi,center,sic,naics,state,sales,employees,plant_area,products,produnits,prodlevel,prodhours,numars,source_code,plant_cost,plant_usage,emissions_avoided,emission_type,emission_factor_units,emission_factor,naics_description
324220,ORC00702,ORC007,UTILIZE HIGHER EFFICIENCY LAMPS AND/OR BALLASTS,2,,2.7142,N,26400.0,26928.0,PSOURCCODE,EC,96830.0,,5644.0,N,N,2023,26400.0,,,none,2024.0,243.25,OR,8111.0,922110,OR,80000.0,400.0,98000.0,Support services,1.0,100.0,3120.0,7.0,EC,46747.0,797000.0,15125.163152,CO2,kg/kWh,0.156203,Courts
324221,ORC00702,ORC007,UTILIZE HIGHER EFFICIENCY LAMPS AND/OR BALLASTS,2,,2.7142,N,26400.0,26928.0,PSOURCCODE,EC,96830.0,,5644.0,N,N,2023,26400.0,,,none,2024.0,243.25,OR,8111.0,922110,OR,80000.0,400.0,98000.0,Support services,1.0,100.0,3120.0,7.0,EC,46747.0,797000.0,5.708543,SO2,kg/kWh,5.9e-05,Courts
324222,ORC00702,ORC007,UTILIZE HIGHER EFFICIENCY LAMPS AND/OR BALLASTS,2,,2.7142,N,26400.0,26928.0,PSOURCCODE,EC,96830.0,,5644.0,N,N,2023,26400.0,,,none,2024.0,243.25,OR,8111.0,922110,OR,80000.0,400.0,98000.0,Support services,1.0,100.0,3120.0,7.0,EC,46747.0,797000.0,34.274804,NOx,kg/kWh,0.000354,Courts
324223,ORC00702,ORC007,UTILIZE HIGHER EFFICIENCY LAMPS AND/OR BALLASTS,2,,2.7142,N,26400.0,26928.0,SSOURCCODE,ED,29.0,,209.0,N,N,2023,26400.0,,,none,2024.0,243.25,OR,8111.0,922110,OR,80000.0,400.0,98000.0,Support services,1.0,100.0,3120.0,7.0,ED,24025.0,3585.0,,,,,Courts


In [18]:
# Store SIC codes as text without a trailing ".0" (e.g. 3479.0 -> '3479').
# Missing SIC values are kept missing: a plain astype(str) would store them as the
# text 'nan', and a later `iac['sic'].isna()` check would then never match them.
sic_missing = iac['sic'].isna()
iac['sic'] = (
    iac['sic']
    .astype(str)
    .str.replace(r'\.0+$', '', regex=True)
    .mask(sic_missing)
)

# convert reference_year to a nullable integer; values that cannot be parsed become <NA>
iac['reference_year'] = pd.to_numeric(iac['reference_year'], errors='coerce').astype('Int64')

In [19]:
# Re-inspect superid 'AM007601' after the sic and reference_year columns were cleaned
iac.loc[iac['superid'] == 'AM007601']

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,sourccode,conserved,sourconsv,saved,rebate,incremntal,fy,ic_capital,ic_other,payback,bptool,reference_year,reference_ppi,center,sic,naics,state,sales,employees,plant_area,products,produnits,prodlevel,prodhours,numars,source_code,plant_cost,plant_usage,emissions_avoided,emission_type,emission_factor_units,emission_factor,naics_description
843,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,EC,,,23000.0,N,N,1990,,,2.782609,,2024,278.19,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,CO2,kg/kWh,0.7567,
844,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,EC,,,23000.0,N,N,1990,,,2.782609,,2024,278.19,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,SO2,kg/kWh,0.002021,
845,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,EC,,,23000.0,N,N,1990,,,2.782609,,2024,278.19,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,NOx,kg/kWh,0.002354,


In [20]:
# Keep only audits with fiscal year 1990 or later.
# Reason: we don't have emissions and PPI data prior to 1990.
# Also keep only recommendations whose ARC code is below 3
# (presumably the energy-related 2.x codes - TODO confirm).
# .copy() makes the result an independent dataframe, so the column assignments in
# later cells do not act on a slice of the unfiltered data (SettingWithCopyWarning).
iac = iac[(iac['fy'] > 1989) & (iac['arc2'] < 3)].copy()

In [21]:
# Count the rows with SIC code '3479' before the description updates;
# the count is compared again after the updates
before_sic_update = iac[iac['sic']=='3479'].shape[0]

In [22]:
# Preview the first rows that remain after the year / ARC filter
iac.head(3)

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,sourccode,conserved,sourconsv,saved,rebate,incremntal,fy,ic_capital,ic_other,payback,bptool,reference_year,reference_ppi,center,sic,naics,state,sales,employees,plant_area,products,produnits,prodlevel,prodhours,numars,source_code,plant_cost,plant_usage,emissions_avoided,emission_type,emission_factor_units,emission_factor,naics_description
843,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,EC,,,23000.0,N,N,1990,,,2.782609,,2024,278.19,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,CO2,kg/kWh,0.7567,
844,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,EC,,,23000.0,N,N,1990,,,2.782609,,2024,278.19,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,SO2,kg/kWh,0.002021,
845,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,EC,,,23000.0,N,N,1990,,,2.782609,,2024,278.19,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,NOx,kg/kWh,0.002354,


In [23]:
# Preview the raw NAICS/SIC crosswalk
naic_sics_xwalk.head(3)

Unnamed: 0,Input Seq 1,2022 NAICS Code,2022 NAICS Title,Related SIC Code,Related SIC Code Description,Change to 2017 Code,2017 NAICS Code,2017 NAICS Code.1
0,1,111110,Soybean Farming,116,Soybeans,No Change,111110,Soybean Farming
1,2,111120,Oilseed (except Soybean) Farming,119,"Cash Grains, Nec",No Change,111120,Oilseed (except Soybean) Farming
2,3,111130,Dry Pea and Bean Farming,119,"Cash Grains, Nec",No Change,111130,Dry Pea and Bean Farming


In [24]:
# List the crosswalk column names to decide which ones to keep
naic_sics_xwalk.columns

Index(['Input Seq 1', '2022 NAICS Code', '2022 NAICS Title',
       'Related SIC Code', 'Related SIC Code Description',
       'Change to 2017 Code', '2017 NAICS Code', '2017 NAICS Code.1'],
      dtype='object')

In [25]:
# Keep only the 2022 NAICS code/title and the related SIC code/description
naic_sics_xwalk_clean = naic_sics_xwalk.drop(
    columns=[
        'Input Seq 1',
        'Change to 2017 Code',
        '2017 NAICS Code',
        '2017 NAICS Code.1',
    ]
)

In [26]:
# Crosswalk rows for NAICS code 922110 (the NAICS column is compared as an integer);
# swap in the commented line below to inspect SIC code 3479 instead
# naic_sics_xwalk_clean[naic_sics_xwalk_clean['Related SIC Code']=='3479']
naic_sics_xwalk_clean.loc[naic_sics_xwalk_clean['2022 NAICS Code'] == 922110]

Unnamed: 0,2022 NAICS Code,2022 NAICS Title,Related SIC Code,Related SIC Code Description
2327,922110,Courts,9211,Courts


In [27]:
# Build lookup dictionaries from naic_sics_xwalk_clean.
# Codes are used as text keys so that they match the string codes stored in iac.
# A key can occur on several crosswalk rows; dict() keeps the value from the last one.
xwalk_naics_codes = naic_sics_xwalk_clean['2022 NAICS Code'].astype(str)
xwalk_sic_codes = naic_sics_xwalk_clean['Related SIC Code'].astype(str)

# NAICS code -> NAICS title
naics_title_lookup = dict(zip(
    xwalk_naics_codes,
    naic_sics_xwalk_clean['2022 NAICS Title']
))

# NAICS code -> related SIC code
sic_code_lookup = dict(zip(
    xwalk_naics_codes,
    naic_sics_xwalk_clean['Related SIC Code']
))

# NAICS code -> related SIC description
naics_to_sic_desc_lookup = dict(zip(
    xwalk_naics_codes,
    naic_sics_xwalk_clean['Related SIC Code Description']
))

# SIC code -> NAICS code; to be applied where NAICS values are missing.
# Keys are cast to str, the same way as the keys of sic_desc_lookup.
naics_code_lookup = dict(zip(
    xwalk_sic_codes,
    xwalk_naics_codes
))

# SIC code -> SIC description; to be applied where NAICS values are missing
sic_desc_lookup = dict(zip(
    xwalk_sic_codes,
    naic_sics_xwalk_clean['Related SIC Code Description']
))

**Issue**: We cannot look up NAICS by SIC value directly, because SIC to NAICS is a many-to-many relationship.

`naic_sics_xwalk_clean[naic_sics_xwalk_clean['Related SIC Code']=='3479']`

| 2022 NAICS Code | 2022 NAICS Title                                           | Related SIC Code | Related SIC Code Description          |
| --------------- | ---------------------------------------------------------- | ---------------- | ------------------------------------- |
| 332812          | Metal Coating, Engraving (except Jewelry and S...)         | 3479             | Metal Coating and Allied Services     |
| 339910          | Jewelry and Silverware Manufacturing                       | 3479             | Metal Coating and Allied Services     |
| 339910          | Jewelry and Silverware Manufacturing                       | 3479             | Metal Coating and Allied Services     |
| 339910          | Jewelry and Silverware Manufacturing                       | 3479             | Metal Coating and Allied Services     |


`naic_sics_xwalk_clean[naic_sics_xwalk_clean['2022 NAICS Code'] == 332812]`

| 2022 NAICS Code | 2022 NAICS Title                                   | Related SIC Code | Related SIC Code Description     |
|-----------------|----------------------------------------------------|------------------|----------------------------------|
| 332812          | Metal Coating, Engraving (except Jewelry and S...) | 3479             | Metal Coating and Allied Services|
| 332812          | Metal Coating, Engraving (except Jewelry and S...) | 3999             | Manufacturing Industries, Nec    |

**Decision**: To avoid assigning an invalid NAICS category, derived from the SIC code, to IAC records where NAICS is missing, we will set Sector to the SIC description for those records.

In [28]:
# Map each specific ARC code (as text) to its description; the keys are strings so
# that they can be matched against iac['arc2'] once that column is cast to str
arc2_lookup = dict(zip(
    arc2['specific_code'].astype(str),
    arc2['specific_description']
))

In [29]:
# # Apply the lookup to iac dd
# iac['naics'] = iac['naics'].astype(str)
# iac['naics_description'] = iac['naics'].map(naics_lookup)
# # iac['sic'] = iac['naics'].map(sic_code_lookup) - we don't want to override existing SIC values with NA for the records where NAICS are missing
# iac['sic_description'] = iac['naics'].map(sic_desc_lookup)

In [30]:
# Rows without a usable SIC code: real missing values, empty strings, and the text
# that a missing value turns into when it is pushed through astype(str).
# isna() alone is not enough, because a stringified missing value reads 'nan'.
sic_text = iac['sic'].astype(str).str.strip()
sic_na = iac['sic'].isna() | sic_text.isin(['', 'nan', 'None', '<NA>'])
sic_exist = ~sic_na

# update sic_description only for rows that have a non-missing sic
iac.loc[sic_exist, 'sic_description'] = iac.loc[sic_exist, 'sic'].map(sic_desc_lookup)

# update sic_description based on the naics code if the sic value is missing
iac.loc[sic_na, 'sic_description'] = iac.loc[sic_na, 'naics'].map(naics_to_sic_desc_lookup)

In [31]:
# update naics description only for rows that have a non-missing naics
# The mask is built from the values themselves: once a missing NAICS has been
# through astype(str) it reads 'nan', so notna() alone would accept every row.
naics_text = iac['naics'].astype(str).str.strip()
naics_exist = iac['naics'].notna() & ~naics_text.isin(['', 'nan', 'None', '<NA>'])
# store the codes as text, leaving rows without a NAICS code as missing values
iac['naics'] = iac['naics'].astype(str).where(naics_exist)
iac.loc[naics_exist, 'naics_description'] = iac.loc[naics_exist, 'naics'].map(naics_lookup)


In [32]:
# Sanity check: the number of rows with SIC code '3479' must be the same as before
# the description updates, i.e. no sic value was removed or overridden
after_sic_update = len(iac[iac['sic'] == '3479'])
print(f"{before_sic_update == after_sic_update}")

True


In [33]:
# add sector column to the IAC dataset:
# use the NAICS description where one is present, otherwise the SIC description
has_naics_description = iac['naics_description'].notna()
iac['sector'] = iac['naics_description'].where(has_naics_description, iac['sic_description'])

In [34]:
# Cast ARC codes to text so they can be matched against the string keys of arc2_lookup
iac['arc2'] = iac['arc2'].astype(str)
# Attach the ARC description; codes with no entry in arc2_lookup get NaN
iac['arc_description'] = iac['arc2'].map(arc2_lookup)

In [35]:
# Preview the result with the naics_description, sic_description, sector and arc_description columns
iac.head(10)

Unnamed: 0,superid,id,description,ar_number,appcode,arc2,impstatus,impcost,impcost_adj,source_rank,sourccode,conserved,sourconsv,saved,rebate,incremntal,fy,ic_capital,ic_other,payback,bptool,reference_year,reference_ppi,center,sic,naics,state,sales,employees,plant_area,products,produnits,prodlevel,prodhours,numars,source_code,plant_cost,plant_usage,emissions_avoided,emission_type,emission_factor_units,emission_factor,naics_description,sic_description,sector,arc_description
843,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,EC,,,23000.0,N,N,1990,,,2.782609,,2024,278.19,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,CO2,kg/kWh,0.7567,,Metal Coating and Allied Services,Metal Coating and Allied Services,OPTIMIZE PLANT POWER FACTOR
844,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,EC,,,23000.0,N,N,1990,,,2.782609,,2024,278.19,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,SO2,kg/kWh,0.002021,,Metal Coating and Allied Services,Metal Coating and Allied Services,OPTIMIZE PLANT POWER FACTOR
845,AM007601,AM0076,OPTIMIZE PLANT POWER FACTOR,1,,2.3212,I,64000.0,177920.0,PSOURCCODE,EC,,,23000.0,N,N,1990,,,2.782609,,2024,278.19,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,,NOx,kg/kWh,0.002354,,Metal Coating and Allied Services,Metal Coating and Allied Services,OPTIMIZE PLANT POWER FACTOR
846,AM007602,AM0076,ELIMINATE LEAKS IN INERT GAS AND COMPRESSED AI...,2,,2.4236,I,2000.0,5560.0,PSOURCCODE,EC,121043.0,1241.0,5100.0,N,N,1990,,,0.392157,,2024,278.19,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,91593.234745,CO2,kg/kWh,0.7567,,Metal Coating and Allied Services,Metal Coating and Allied Services,ELIMINATE LEAKS IN INERT GAS AND COMPRESSED AI...
847,AM007602,AM0076,ELIMINATE LEAKS IN INERT GAS AND COMPRESSED AI...,2,,2.4236,I,2000.0,5560.0,PSOURCCODE,EC,121043.0,1241.0,5100.0,N,N,1990,,,0.392157,,2024,278.19,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,244.581464,SO2,kg/kWh,0.002021,,Metal Coating and Allied Services,Metal Coating and Allied Services,ELIMINATE LEAKS IN INERT GAS AND COMPRESSED AI...
848,AM007602,AM0076,ELIMINATE LEAKS IN INERT GAS AND COMPRESSED AI...,2,,2.4236,I,2000.0,5560.0,PSOURCCODE,EC,121043.0,1241.0,5100.0,N,N,1990,,,0.392157,,2024,278.19,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,284.921301,NOx,kg/kWh,0.002354,,Metal Coating and Allied Services,Metal Coating and Allied Services,ELIMINATE LEAKS IN INERT GAS AND COMPRESSED AI...
849,AM007603,AM0076,UTILIZE DAYLIGHT WHENEVER POSSIBLE IN LIEU OF ...,3,,2.7121,I,3614.0,6866.6,PSOURCCODE,EC,40739.0,418.0,1940.0,N,N,1990,,,1.862887,,2024,239.54,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,30827.200171,CO2,kg/kWh,0.7567,,Metal Coating and Allied Services,Metal Coating and Allied Services,UTILIZE DAYLIGHT WHENEVER POSSIBLE IN LIEU OF ...
850,AM007603,AM0076,UTILIZE DAYLIGHT WHENEVER POSSIBLE IN LIEU OF ...,3,,2.7121,I,3614.0,6866.6,PSOURCCODE,EC,40739.0,418.0,1940.0,N,N,1990,,,1.862887,,2024,239.54,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,82.317889,SO2,kg/kWh,0.002021,,Metal Coating and Allied Services,Metal Coating and Allied Services,UTILIZE DAYLIGHT WHENEVER POSSIBLE IN LIEU OF ...
851,AM007603,AM0076,UTILIZE DAYLIGHT WHENEVER POSSIBLE IN LIEU OF ...,3,,2.7121,I,3614.0,6866.6,PSOURCCODE,EC,40739.0,418.0,1940.0,N,N,1990,,,1.862887,,2024,239.54,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,EC,269266.0,3456917.0,95.894921,NOx,kg/kWh,0.002354,,Metal Coating and Allied Services,Metal Coating and Allied Services,UTILIZE DAYLIGHT WHENEVER POSSIBLE IN LIEU OF ...
852,AM007604,AM0076,INSULATE BARE EQUIPMENT,4,,2.2511,N,180.0,259.2,PSOURCCODE,E2,65.0,,240.0,N,N,1990,,,0.75,,2024,189.87,AM,3479,,TX,10000000.0,80.0,,PIPECOATING,6.0,9038.0,2400.0,5.0,E2,29923.0,8238.0,3448.9,CO2,kg/MMBtu,53.06,,Metal Coating and Allied Services,Metal Coating and Allied Services,INSULATE BARE EQUIPMENT


In [36]:
# (rows, columns) of the integrated dataset
iac.shape

(453647, 46)

In [None]:
# Columns that are not used in the dashboard viz
unused_dashboard_columns = [
    'description', 'source_rank', 'source_code',
    'naics_description', 'sic_description',
    'reference_year', 'reference_ppi',
    'emission_factor_units', 'emission_factor',
    'rebate', 'incremntal', 'sales', 'employees',
    'produnits', 'prodlevel', 'prodhours', 'numars',
    'bptool', 'ar_number', 'appcode',
    'ic_capital', 'ic_other', 'center',
]

# Build the dashboard table in one chain:
#   1. keep only the primary energy source rows (source_rank == 'PSOURCCODE')
#   2. drop the unused columns
#   3. convert emissions_avoided to floats rounded to 2 decimals
dashboard_df = (
    iac.loc[iac['source_rank'] == 'PSOURCCODE']
    .drop(columns=unused_dashboard_columns)
    .assign(emissions_avoided=lambda frame: frame['emissions_avoided'].astype(float).round(2))
)

In [74]:
# Make sure the output folder exists; to_csv does not create missing directories
processed_dir = absolute_path / 'processed_data'
processed_dir.mkdir(parents=True, exist_ok=True)

# Save new 'iac' df to csv for PPI integration (the row index is written as the first column)
iac.to_csv(processed_dir / 'iac_integrated.csv')
# Save the dashboard subset without the row index
dashboard_df.to_csv(processed_dir / 'iac_integrated_dash.csv', index=False)