# Further Data Integration

Further data integration of `iac_integrated.csv` for the Industrial Energy dashboard of the Industrial Sustainability Analysis Lab, headed by Dr. Eric Masanet.

In this notebook we will:
- Select relevant years
- Integrate NAICS/SIC codes and descriptions
- Select for relevant ARC codes
- Integrate ARC descriptions

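The data required for these processes is stored in the `raw_data` and `final_data` Google Drive folders. The code below reads its inputs from `../../data/raw_data` and `../../data/intermediate_data` and writes its outputs to `../../data/processed_data`.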

In [None]:
# Import relevant libraries
import numpy as np
import pandas as pd
from pathlib import Path

In [None]:
# Load the reference tables and the integrated IAC dataset

# ------- locate the data directory -------
relative_path = Path('../../data/')      # data folder, relative to the working directory
absolute_path = relative_path.resolve()  # the same folder as an absolute path


# ------- read the CSV files -------
# Reference tables from raw_data: NAICS codes, NAICS<->SIC crosswalk, ARC codes
naics = pd.read_csv(absolute_path / 'raw_data' / 'NAICS_Codes.csv')
naic_sics_xwalk = pd.read_csv(absolute_path / 'raw_data' / 'NAICS_SIC_Xwalk.csv')
arc2 = pd.read_csv(absolute_path / 'raw_data' / 'ARC2.csv')
# Integrated IAC dataset from intermediate_data
iac = pd.read_csv(absolute_path / 'intermediate_data' / 'iac_integrated.csv')


In [None]:
# Spot-check the rows with superid 'AM007601' as loaded, before any cleaning
iac.loc[iac['superid'].eq('AM007601')]

In [None]:
# Crosswalk rows whose related SIC code is '3479' (compared as text)
naic_sics_xwalk.loc[naic_sics_xwalk['Related SIC Code'].eq('3479')]

In [None]:
# Show every column when a dataframe is displayed
pd.options.display.max_columns = None

# Peek at the first row to see which columns the csv provides
iac.head(n=1)

Looking at the codes in the `naics` column of the `iac` dataframe, we notice that they are six digits followed by a trailing '.'. To join against the `naics` dataframe, we must first clean this column.

In [None]:
#iac.naics.unique() # Uncomment to view

In [None]:
def clean_naics(value):
    """Return a NAICS value as text with everything from the first '.' removed.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Any other value is converted with ``str`` and truncated at its
    first '.', so ``311111.0`` and ``'311111.'`` both become ``'311111'``.
    A value without a '.' is returned as its plain string form.
    """
    # Guard clause: leave missing values exactly as they are
    if pd.isna(value):
        return value
    # partition yields the text before the first '.', or the whole string
    # when there is no '.' at all
    head, _, _ = str(value).partition('.')
    return head

In [None]:
# Clean every NAICS value element-wise; missing values pass through unchanged
iac['naics'] = iac['naics'].map(clean_naics)

In [None]:
# Re-check the rows with superid 'AM007601' now that the naics column is cleaned
iac.loc[iac['superid'].eq('AM007601')]

In [None]:
# Double-check

#iac.tail() # Uncomment to view

In [None]:
# First five rows of the raw NAICS reference table
naics.head(n=5)

In [None]:
# List the NAICS table's column names (used to decide which columns to drop)
naics.columns

In [None]:
# Drop the sequence-number column and the unnamed fifth column from the NAICS table
naics_clean = naics.drop(columns=['Seq. No.', 'Unnamed: 4'])

In [None]:
# First five rows of the trimmed NAICS table
naics_clean.head(n=5)

In [None]:
# Build a NAICS code (as text) -> NAICS title mapping from naics_clean.
# The title is used instead of the description because the description is too verbose.
naics_lookup = {
    code: title
    for code, title in zip(
        naics_clean['2022 NAICS US   Code'].astype(str),
        naics_clean['2022 NAICS US Title'],
    )
}

# Attach the title to every iac row via its (text) NAICS code;
# codes absent from the mapping produce a missing naics_description
iac['naics'] = iac['naics'].astype(str)
iac['naics_description'] = iac['naics'].map(naics_lookup)

In [None]:
# NAICS reference rows for code '922110' (compared as text)
naics_clean.loc[naics_clean['2022 NAICS US   Code'].eq('922110')]

In [None]:
# Inspect the rows with superid 'ORC00702' after the NAICS title was attached
iac.loc[iac['superid'].eq('ORC00702')]

In [None]:
# Normalise SIC codes to text without a trailing ".0" (or ".00", ...).
# Missing SIC values are recorded first and restored afterwards: a plain
# astype(str) would turn them into the text 'nan', and the later
# notna()/isna() checks on this column could then never detect them.
sic_missing = iac['sic'].isna()
sic_text = iac['sic'].astype(str).str.replace(r'\.0+$', '', regex=True)
iac['sic'] = sic_text.mask(sic_missing)

# Convert reference_year to a nullable integer; values that cannot be parsed
# as numbers become missing (<NA>) instead of raising
iac['reference_year'] = pd.to_numeric(iac['reference_year'], errors='coerce').astype('Int64')

In [None]:
# Re-check the rows with superid 'AM007601' after the sic / reference_year cleanup
iac.loc[iac['superid'].eq('AM007601')]

In [None]:
# Keep only:
#   - audits with fiscal year 1990 or later
#     (reason: we don't have emissions and PPI data prior to 1990), and
#   - recommendations whose ARC code is below 3
#     NOTE(review): presumably this selects the energy-related ARC 2.x codes - confirm.
# arc2 must still be numeric for the `< 3` comparison, so this cell has to run
# before arc2 is converted to text further down.
# .copy() makes the result an independent dataframe, so the column assignments
# in later cells do not act on a slice of the unfiltered frame
# (avoids SettingWithCopyWarning).
iac = iac[(iac['fy'] > 1989) & (iac['arc2'] < 3)].copy()

In [None]:
# Number of rows with SIC '3479' before the description updates (compared again later)
before_sic_update = len(iac.loc[iac['sic'] == '3479'])

In [None]:
# First three rows of the filtered iac frame
iac.head(n=3)

In [None]:
# First three rows of the raw NAICS<->SIC crosswalk
naic_sics_xwalk.head(n=3)

In [None]:
# List the crosswalk's column names (used to decide which columns to drop)
naic_sics_xwalk.columns

In [None]:
# Drop the sequence column and the 2017-code columns from the crosswalk
xwalk_unused_columns = ['Input Seq 1', 'Change to 2017 Code', '2017 NAICS Code', '2017 NAICS Code.1']
naic_sics_xwalk_clean = naic_sics_xwalk.drop(columns=xwalk_unused_columns)

In [None]:
# Crosswalk rows for NAICS code 922110 (compared as an integer here).
# Alternative check by SIC code:
# naic_sics_xwalk_clean[naic_sics_xwalk_clean['Related SIC Code']=='3479']
naic_sics_xwalk_clean.loc[naic_sics_xwalk_clean['2022 NAICS Code'].eq(922110)]

In [None]:
# Build lookup dictionaries from naic_sics_xwalk_clean.
#
# All dictionary keys are text so they can be matched against the text
# 'naics' / 'sic' columns of iac. When a key occurs on several crosswalk rows,
# dict() keeps the value of the LAST such row, so each lookup returns a single
# match even where the crosswalk lists several.
xwalk_naics_code = naic_sics_xwalk_clean['2022 NAICS Code'].astype(str)
xwalk_sic_code = naic_sics_xwalk_clean['Related SIC Code'].astype(str)

# NAICS code -> NAICS title
naics_title_lookup = dict(zip(
    xwalk_naics_code,
    naic_sics_xwalk_clean['2022 NAICS Title']
))

# NAICS code -> related SIC code (values kept exactly as stored in the crosswalk)
sic_code_lookup = dict(zip(
    xwalk_naics_code,
    naic_sics_xwalk_clean['Related SIC Code']
))

# NAICS code -> description of the related SIC code
naics_to_sic_desc_lookup = dict(zip(
    xwalk_naics_code,
    naic_sics_xwalk_clean['Related SIC Code Description']
))

# to be applied where NAICS values are missing
# SIC code -> NAICS code (keys cast to text, consistent with sic_desc_lookup)
naics_code_lookup = dict(zip(
    xwalk_sic_code,
    xwalk_naics_code
))

# to be applied where NAICS values are missing
# SIC code -> SIC description
sic_desc_lookup = dict(zip(
    xwalk_sic_code,
    naic_sics_xwalk_clean['Related SIC Code Description']
))

**Issue**: We cannot look up NAICS by SIC value directly because SIC to NAICS is a many-to-many relationship.

`naic_sics_xwalk_clean[naic_sics_xwalk_clean['Related SIC Code']=='3479']`

| 2022 NAICS Code | 2022 NAICS Title                                           | Related SIC Code | Related SIC Code Description          |
| --------------- | ---------------------------------------------------------- | ---------------- | ------------------------------------- |
| 332812          | Metal Coating, Engraving (except Jewelry and S...)         | 3479             | Metal Coating and Allied Services     |
| 339910          | Jewelry and Silverware Manufacturing                       | 3479             | Metal Coating and Allied Services     |
| 339910          | Jewelry and Silverware Manufacturing                       | 3479             | Metal Coating and Allied Services     |
| 339910          | Jewelry and Silverware Manufacturing                       | 3479             | Metal Coating and Allied Services     |


`naic_sics_xwalk_clean[naic_sics_xwalk_clean['2022 NAICS Code'] == 332812]`

| 2022 NAICS Code | 2022 NAICS Title                                   | Related SIC Code | Related SIC Code Description     |
|-----------------|----------------------------------------------------|------------------|----------------------------------|
| 332812          | Metal Coating, Engraving (except Jewelry and S...) | 3479             | Metal Coating and Allied Services|
| 332812          | Metal Coating, Engraving (except Jewelry and S...) | 3999             | Manufacturing Industries, Nec    |

**Decision**: To avoid assigning a potentially invalid NAICS category (inferred from the SIC code) to IAC records whose NAICS code is missing, we set Sector to the SIC description for those records.

In [None]:
# ARC specific code (as text) -> ARC specific description
arc2_lookup = {
    code: description
    for code, description in zip(
        arc2['specific_code'].astype(str),
        arc2['specific_description'],
    )
}

In [None]:
# # Apply the lookup to iac dd
# iac['naics'] = iac['naics'].astype(str)
# iac['naics_description'] = iac['naics'].map(naics_lookup)
# # iac['sic'] = iac['naics'].map(sic_code_lookup) - we don't want to override existing SIC values with NA for the records where NAICS are missing
# iac['sic_description'] = iac['naics'].map(sic_desc_lookup)

In [None]:
# A SIC value counts as present only when it is non-null, non-blank and not the
# text 'nan'. The 'nan' check matters because casting a column with astype(str)
# turns missing values into the literal string 'nan', which notna() reports as
# present.
sic_text = iac['sic'].astype(str).str.strip()
sic_exist = iac['sic'].notna() & ~sic_text.isin(['', 'nan'])

# update sic_description from the SIC code for rows that have a SIC value
iac.loc[sic_exist, 'sic_description'] = iac.loc[sic_exist, 'sic'].map(sic_desc_lookup)

# for every remaining row (SIC missing), fall back to the SIC description
# associated with the row's NAICS code; rows whose NAICS code is not in the
# lookup get a missing description
sic_na = ~sic_exist
iac.loc[sic_na, 'sic_description'] = iac.loc[sic_na, 'naics'].map(naics_to_sic_desc_lookup)

In [None]:
# update naics description only for rows that have a real NAICS code
iac['naics'] = iac['naics'].astype(str)
# astype(str) turns missing values into the text 'nan', so notna() alone can no
# longer detect them; treat blank strings and the literal 'nan' as missing too
naics_text = iac['naics'].str.strip()
naics_exist = iac['naics'].notna() & ~naics_text.isin(['', 'nan'])
# rows without a NAICS code keep whatever naics_description they already have
iac.loc[naics_exist, 'naics_description'] = iac.loc[naics_exist, 'naics'].map(naics_lookup)


In [None]:
# Sanity check: the description updates must not have removed or overwritten
# SIC values, so the number of '3479' rows should match the earlier count.
# Prints True when the counts agree, False otherwise.
after_sic_update = len(iac.loc[iac['sic'] == '3479'])
print(before_sic_update == after_sic_update)

In [None]:
# Sector = NAICS description where one exists, otherwise the SIC description
# (fillna aligns the two columns on the row index)
iac['sector'] = iac['naics_description'].fillna(iac['sic_description'])

In [None]:
# Store the ARC code as text, then look up its description;
# codes absent from arc2_lookup get a missing arc_description
arc_code_text = iac['arc2'].astype(str)
iac['arc2'] = arc_code_text
iac['arc_description'] = arc_code_text.map(arc2_lookup)

In [None]:
# First ten rows of the fully integrated iac frame
iac.head(n=10)

In [None]:
# (rows, columns) of the integrated iac frame
iac.shape

In [None]:
# Columns that the dashboard visualisations do not use
dashboard_unused_columns = [
    'description', 'source_rank', 'source_code', 'naics_description',
    'sic_description', 'reference_year', 'reference_ppi',
    'emission_factor_units', 'emission_factor', 'rebate', 'incremntal',
    'sales', 'employees', 'produnits', 'prodlevel', 'prodhours', 'numars',
    'bptool', 'ar_number', 'appcode', 'ic_capital', 'ic_other', 'center',
]

# Keep only the primary energy source rows (source_rank == 'PSOURCCODE'),
# then remove the unused columns; drop() returns a new dataframe
is_primary_source = iac['source_rank'] == 'PSOURCCODE'
dashboard_df = iac[is_primary_source].drop(columns=dashboard_unused_columns)

# Store emissions_avoided as floats rounded to 2 decimals
dashboard_df['emissions_avoided'] = dashboard_df['emissions_avoided'].astype(float).round(2)

In [None]:
# Write both outputs to the processed_data folder
processed_dir = absolute_path / 'processed_data'

# Full iac frame for the PPI integration step. The row index is written as an
# extra leading column (pandas' default).
# NOTE(review): the dashboard file below omits the index - confirm the
# downstream step expects the index column here.
iac.to_csv(processed_dir / 'iac_integrated.csv', index=True)

# Dashboard subset, written without the row index
dashboard_df.to_csv(processed_dir / 'iac_integrated_dash.csv', index=False)