# Further Data Integration

Further data integration of `iac_integrated.csv` for the Industrial Energy dashboard of the Industrial Sustainability Analysis Lab, headed by Dr. Eric Masanet.

In this notebook we will:
- Select relevant years
- Integrate NAICS/SIC codes and descriptions
- Select for relevant ARC codes
- Integrate ARC descriptions

The data required for these steps are in the `raw_data` and `final_data` Google Drive folders.

In [None]:
# Import relevant libraries
import numpy as np
import pandas as pd

In [None]:
# Load the first two input tables. The file names indicate the integrated IAC
# data and a NAICS code list.
# NOTE(review): these are absolute, machine-specific paths; they must be
# adjusted before the notebook can run on another machine.
iac_csv_path = "/Users/nmoraes/Capstone/cleaning/data/iac_integrated.csv"
naics_csv_path = "/Users/nmoraes/Capstone/cleaning/data/NAICS_Codes.csv"

iac = pd.read_csv(iac_csv_path)
naics = pd.read_csv(naics_csv_path)

In [None]:
# Read the NAICS/SIC crosswalk table; its '2022 NAICS Code' and 'Related SIC Code'
# columns are used further down to build the SIC lookups.
naic_sics_xwalk = pd.read_csv("/Users/nmoraes/Capstone/cleaning/data/NAICS_SIC_Xwalk.csv")

In [None]:
# Read the ARC code table; its 'specific_code' and 'specific_description' columns
# are used further down to build the ARC description lookup.
arc2 = pd.read_csv("/Users/nmoraes/Capstone/cleaning/data/ARC2.csv")

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

# Peek at the first row of iac to see which columns are available
iac.head(n=1)

Looking at the codes in the `naics` column of the `iac` DataFrame, we notice that they are six digits followed by a trailing '.'. To join against the `naics` DataFrame, we must first clean this column.

In [None]:
#iac.naics.unique() # Uncomment to view

In [None]:
def clean_naics(value):
    """Normalise a raw NAICS value to a plain code string.

    Missing values (anything ``pd.isna`` reports as NA) are returned
    unchanged. Every other value is converted with ``str`` and cut at the
    first '.', so ``311111.0`` and ``"311111."`` both become ``"311111"``.
    Values that contain no '.' come back as their string form.
    """
    # Guard clause: leave missing values untouched
    if pd.isna(value):
        return value

    # partition() yields the text before the first '.', or the whole string
    # when there is no '.' at all
    code_text, _, _ = str(value).partition('.')
    return code_text

In [None]:
# Cut every NAICS code at its first '.' so it becomes a plain code string;
# missing values are passed through unchanged by clean_naics.
iac['naics'] = iac['naics'].apply(clean_naics)

In [None]:
# Double-check

# iac.tail() # Uncomment to view

In [None]:
# Preview the first rows of the NAICS reference table
naics.head()

In [None]:
# List the NAICS table's column names
naics.columns

In [None]:
# Drop the 'Seq. No.' and 'Unnamed: 4' columns, which are not used below
unused_naics_columns = ['Seq. No.', 'Unnamed: 4']
naics_clean = naics.drop(columns=unused_naics_columns)

In [None]:
# Confirm the unwanted columns are gone
naics_clean.head()

In [None]:
# Build a NAICS code -> description lookup from naics_clean.
# Keys are normalised with clean_naics (the same helper applied to
# iac['naics']) instead of a bare astype(str): if pandas parsed the code column
# as float (which happens when an integer column contains blank cells),
# astype(str) would yield keys like '311111.0' that can never match the cleaned
# IAC codes. Rows without a code are skipped so they cannot create a bogus key.
naics_code_col = '2022 NAICS US   Code'
naics_rows = naics_clean.dropna(subset=[naics_code_col])
naics_lookup = dict(zip(
    naics_rows[naics_code_col].map(clean_naics),
    naics_rows['Description']
))

# Apply the lookup to the iac df
# NOTE: astype(str) also turns missing codes into the literal string 'nan';
# such rows find no key in the lookup and get a missing description.
iac['naics'] = iac['naics'].astype(str)
iac['naics_description'] = iac['naics'].map(naics_lookup)

In [None]:
# Keep only rows with fiscal year after 1989 and an ARC code below 3.
# .copy() makes the filtered result an independent DataFrame, so the column
# assignments in later cells modify it directly instead of triggering pandas'
# SettingWithCopyWarning on a filtered slice.
# NOTE: the `< 3` comparison needs numeric arc2 values, so this cell must run
# before the later cell that casts iac['arc2'] to str.
iac = iac[(iac['fy'] > 1989) & (iac['arc2'] < 3)].copy()

In [None]:
# Preview the filtered iac df
iac.head(3)

In [None]:
# Preview the first rows of the NAICS/SIC crosswalk
naic_sics_xwalk.head(3)

In [None]:
# List the crosswalk's column names
naic_sics_xwalk.columns

In [None]:
# Drop the sequence/2017-code columns, which are not used by the lookups below
unused_xwalk_columns = [
    'Input Seq 1',
    'Change to 2017 Code',
    '2017 NAICS Code',
    '2017 NAICS Code.1',
]
naic_sics_xwalk_clean = naic_sics_xwalk.drop(columns=unused_xwalk_columns)

In [None]:
# Build NAICS-code-keyed lookups from the cleaned crosswalk.
# Keys are normalised once with clean_naics (the same helper applied to
# iac['naics']) instead of a bare astype(str): if pandas parsed the code column
# as float (which happens when an integer column contains blank cells),
# astype(str) would yield keys like '311111.0' that can never match the cleaned
# IAC codes. Rows without a NAICS code are skipped.
xwalk_rows = naic_sics_xwalk_clean.dropna(subset=['2022 NAICS Code'])
xwalk_codes = xwalk_rows['2022 NAICS Code'].map(clean_naics)

# NOTE: dict(zip(...)) keeps only the LAST row for a repeated key, so if the
# crosswalk lists several SIC codes for one NAICS code, only one of them
# survives in these lookups.
naics_title_lookup = dict(zip(xwalk_codes, xwalk_rows['2022 NAICS Title']))

sic_code_lookup = dict(zip(xwalk_codes, xwalk_rows['Related SIC Code']))

sic_desc_lookup = dict(zip(xwalk_codes, xwalk_rows['Related SIC Code Description']))

In [None]:
# Build an ARC code (as string) -> description lookup from the ARC table
arc_codes = arc2['specific_code'].astype(str)
arc_descriptions = arc2['specific_description']
arc2_lookup = dict(zip(arc_codes, arc_descriptions))

In [None]:
# Attach NAICS and SIC information to the iac df by mapping its NAICS codes
# (as strings) through the lookup dictionaries
naics_codes = iac['naics'].astype(str)
iac['naics'] = naics_codes
iac['naics_description'] = naics_codes.map(naics_lookup)
iac['sic'] = naics_codes.map(sic_code_lookup)
iac['sic_description'] = naics_codes.map(sic_desc_lookup)

In [None]:
# Convert the ARC codes to strings so they match the lookup keys, then attach
# the matching ARC description to each row
arc_code_strings = iac['arc2'].astype(str)
iac['arc2'] = arc_code_strings
iac['arc_description'] = arc_code_strings.map(arc2_lookup)

In [None]:
# Preview the iac df with the new description columns attached
iac.head(3)

In [None]:
# Save new 'iac' df to csv for PPI integration
# The relative path means the file is written to the current working directory.
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed first
# column by default; pass index=False if the downstream notebook does not need
# it — confirm before changing.
iac.to_csv('iac_integrated_2.csv')