### Set-up: Import Required Libraries

In [4]:
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt

### Import Data

#### Import RECC Table

In [5]:
# Locate the IAC database workbook (this cell only builds the path; the sheets are read in a later cell)
# The path is relative, so it is interpreted against the kernel's current working directory
relative_path = Path('../data/raw_data/IAC_Database_20250208.xls')

# Resolve to an absolute path and print it so the reader can confirm which file will be loaded
absolute_path = relative_path.resolve()
print(absolute_path)

/Users/oksi/workspace/industrialenergy_datainterface/data/raw_data/IAC_Database_20250208.xls


In [6]:
# Read every sheet of the workbook in one pass; sheet_name=None returns a {sheet name: DataFrame} dict
all_sheets = pd.read_excel(absolute_path, sheet_name=None)

# Keep only the sheets whose name begins with 'RECC'
selected_sheets = dict(
    (sheet_name, sheet_df)
    for sheet_name, sheet_df in all_sheets.items()
    if sheet_name.startswith('RECC')
)

# Tag each kept sheet with its sheet name in a new RECC column, then stack them
# into one DataFrame with a fresh 0..n-1 index
iac_recc_df = pd.concat(
    [
        sheet_df.assign(RECC=sheet_name)
        for sheet_name, sheet_df in selected_sheets.items()
    ],
    ignore_index=True,
)

In [7]:
# List the column labels of the combined RECC table
iac_recc_df.columns

Index(['SUPERID', 'ID', 'AR_NUMBER', 'APPCODE', 'ARC2', 'IMPSTATUS', 'IMPCOST',
       'PSOURCCODE', 'PCONSERVED', 'PSOURCONSV', 'PSAVED', 'SSOURCCODE',
       'SCONSERVED', 'SSOURCONSV', 'SSAVED', 'TSOURCCODE', 'TCONSERVED',
       'TSOURCONSV', 'TSAVED', 'QSOURCCODE', 'QCONSERVED', 'QSOURCONSV',
       'QSAVED', 'REBATE', 'INCREMNTAL', 'FY', 'IC_CAPITAL', 'IC_OTHER',
       'PAYBACK', 'BPTOOL', 'RECC'],
      dtype='object')

In [8]:
# Preview two sample recommendations in the raw (wide) RECC table; the same SUPERIDs are
# inspected again after the wide-to-long transformation so the two views can be compared
filtered_raw_df = iac_recc_df.query('SUPERID in ["AM000202","AM000504"]')

# Show only the identifying columns plus the primary (P*) and secondary (S*) source columns
selected_columns = filtered_raw_df[['SUPERID', 'ID', 'AR_NUMBER','IMPSTATUS', 'IMPCOST',
       'PSOURCCODE', 'PCONSERVED', 'PSOURCONSV', 'PSAVED', 'SSOURCCODE', 'SCONSERVED', 'SSOURCONSV',
       'SSAVED']]
selected_columns

Unnamed: 0,SUPERID,ID,AR_NUMBER,IMPSTATUS,IMPCOST,PSOURCCODE,PCONSERVED,PSOURCONSV,PSAVED,SSOURCCODE,SCONSERVED,SSOURCONSV,SSAVED
8,AM000202,AM0002,2,I,60000.0,E1,1077960.0,11049.0,52212.0,E2,10208.0,,42872.0
38,AM000504,AM0005,4,N,960.0,E2,83.0,,273.0,,,,


#### Import ASSESS table


In [29]:
# Take the "ASSESS" sheet from the dict of sheets already loaded into all_sheets
# (no second read of the workbook is needed)
iac_assess_df = all_sheets['ASSESS']


In [30]:
# List the column labels of the ASSESS table
iac_assess_df.columns

Index(['ID', 'CENTER', 'FY', 'SIC', 'NAICS', 'STATE', 'SALES', 'EMPLOYEES',
       'PLANT_AREA', 'PRODUCTS', 'PRODUNITS', 'PRODLEVEL', 'PRODHOURS',
       'NUMARS', 'EC_plant_cost', 'EC_plant_usage', 'ED_plant_cost',
       'ED_plant_usage', 'EF_plant_cost', 'E2_plant_cost', 'E2_plant_usage',
       'E3_plant_cost', 'E3_plant_usage', 'E4_plant_cost', 'E4_plant_usage',
       'E5_plant_cost', 'E5_plant_usage', 'E6_plant_cost', 'E6_plant_usage',
       'E7_plant_cost', 'E7_plant_usage', 'E8_plant_cost', 'E8_plant_usage',
       'E9_plant_cost', 'E9_plant_usage', 'E10_plant_cost', 'E10_plant_usage',
       'E11_plant_cost', 'E11_plant_usage', 'E12_plant_cost',
       'E12_plant_usage', 'W0_plant_cost', 'W0_plant_usage', 'W1_plant_cost',
       'W1_plant_usage', 'W2_plant_cost', 'W2_plant_usage', 'W3_plant_cost',
       'W3_plant_usage', 'W4_plant_cost', 'W4_plant_usage', 'W5_plant_cost',
       'W5_plant_usage', 'W6_plant_cost', 'W6_plant_usage'],
      dtype='object')

#### Import ARC-PPI dataset

In [58]:
# define relative path
# NOTE: relative_path and absolute_path are rebound here, so from this cell on they
# point to the ARC-PPI workbook rather than the IAC database
relative_path = Path('../data/raw_data/ARC_PPI_Draft.xlsx')

# get absolute path
absolute_path = relative_path.resolve()
print(absolute_path)

ppi_df = pd.read_excel(absolute_path, sheet_name="PPI")  # load only the "PPI" sheet, as a single DataFrame

/Users/oksi/workspace/industrialenergy_datainterface/data/raw_data/ARC_PPI_Draft.xlsx


In [59]:
# Display the wide PPI table as loaded
ppi_df

Unnamed: 0,ARC,Description,1987,1988,1989,1990,1991,1992,1993,1994,...,2012,2013,2014,2015,2016,2017,2018,Series ID,Industry,Product
0,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,114.4,117.9,122.5,126.4,130.2,133.2,136.3,137.7,...,204.4,206.4,210.5,213.2,216.4,218.4,221.7,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments
1,2.1112,HEAT OIL TO PROPER TEMPERATURE FOR GOOD ATOMIZ...,,,,,,,,,...,,,,,,,,,,
2,2.1113,REDUCE COMBUSTION AIR FLOW TO OPTIMUM,114.4,117.9,122.5,126.4,130.2,133.2,136.3,137.7,...,204.4,206.4,210.5,213.2,216.4,218.4,221.7,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments
3,2.1114,LIMIT AND CONTROL SECONDARY COMBUSTION AIR IN ...,114.4,117.9,122.5,126.4,130.2,133.2,136.3,137.7,...,204.4,206.4,210.5,213.2,216.4,218.4,221.7,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments
4,2.1115,ELIMINATE COMBUSTIBLE GAS IN FLUE GAS,114.4,117.9,122.5,126.4,130.2,133.2,136.3,137.7,...,204.4,206.4,210.5,213.2,216.4,218.4,221.7,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,2.9112,USE SOLAR HEAT TO HEAT WATER,135.4,140.1,146.2,152.5,156.5,160.5,163,165.4,...,271.9,276,280.2,287.7,293.6,298.4,306.5,PCU333414333414,Heating equipment (except warm air furnaces) mfg,Heating equipment (except warm air furnaces) mfg
346,2.9113,USE SOLAR HEAT FOR HEAR,135.4,140.1,146.2,152.5,156.5,160.5,163,165.4,...,271.9,276,280.2,287.7,293.6,298.4,306.5,PCU333414333414,Heating equipment (except warm air furnaces) mfg,Heating equipment (except warm air furnaces) mfg
347,2.9121,INSTALL WIND POWERED ELECTRIC GENERATOR,-,-,-,-,-,-,-,-,...,130.2,129.7,133,-,-,-,-,PCU33361133361101,Turbine and turbine generator set units mfg,Turbine generator sets
348,2.9122,USE SYNTHETIC LUBRICANT,112.1,122.3,128.2,138.8,153,152.3,153.8,155.3,...,482.7,463.4,462.9,447.4,434.6,444.4,478.4,PCU324191324191011,Petroleum lubricating oil and grease mfg.,"Lubricating oils (incl. hydraulic fluids, etc...."


In [60]:
# List the column labels of the PPI table
ppi_df.columns

Index([        'ARC', 'Description',          1987,          1988,
                1989,          1990,          1991,          1992,
                1993,          1994,          1995,          1996,
                1997,          1998,          1999,          2000,
                2001,          2002,          2003,          2004,
                2005,          2006,          2007,          2008,
                2009,          2010,          2011,          2012,
                2013,          2014,          2015,          2016,
                2017,          2018,   'Series ID',    'Industry',
           'Product'],
      dtype='object')

### Normalize the data
#### Transform the iac_recc table from wide to long format

Requirements
1. Keep all common columns
2. Create four rows for each input row (one for each energy source usage ranking: Primary, Secondary, Tertiary, Quaternary)
3. Maintain the relationship between energy source codes and their associated values: SOURCCODE, CONSERVED, SOURCONSV, SAVED
4. Order the columns to maintain the original dataframe structure

In [23]:
# Create a function to transform the recc table from wide to long format
def transform_recc_data(df):
    """
    Transform wide format usage data to long format by unpivoting usage-related columns.

    Every input row yields four consecutive output rows, one per usage prefix
    (P, S, T, Q, in that order). For each prefix the columns <prefix>SOURCCODE,
    <prefix>CONSERVED, <prefix>SOURCONSV and <prefix>SAVED are copied into the
    shared columns SOURCCODE, CONSERVED, SOURCONSV and SOURCSAVED_USD, and
    USAGE_RANK records which prefix the row came from (e.g. 'PSOURCCODE').

    The work is done column-wise (one slice per prefix, then a single concat)
    instead of iterating over rows, and an empty input returns an empty frame
    with the expected columns rather than failing.

    Parameters:
    df (pandas.DataFrame): Input DataFrame in wide format. It must contain every
        common column listed below; a missing usage-specific column is filled
        with '' in the output.

    Returns:
    pandas.DataFrame: Transformed DataFrame in long format with a fresh
        0..(4 * len(df) - 1) index.

    Raises:
    KeyError: If any of the common columns is missing from df.
    """

    # Common columns that will be repeated for each usage record
    common_cols = ['SUPERID', 'ID', 'AR_NUMBER', 'APPCODE', 'ARC2',
                   'IMPSTATUS', 'IMPCOST', 'REBATE', 'INCREMNTAL',
                   'FY', 'IC_CAPITAL', 'IC_OTHER', 'PAYBACK', 'BPTOOL']

    # Usage prefixes, in the order their rows appear for each input row
    usage_types = ['P', 'S', 'T', 'Q']

    # Wide-column suffix -> long-format column name
    value_cols = {
        'SOURCCODE': 'SOURCCODE',
        'CONSERVED': 'CONSERVED',
        'SOURCONSV': 'SOURCONSV',
        'SAVED': 'SOURCSAVED_USD',
    }

    # Usage columns are placed after the first seven common columns
    column_order = common_cols[:7] + ['USAGE_RANK'] + list(value_cols.values()) + common_cols[7:]

    n_rows = len(df)
    common_part = df[common_cols]

    # Build one block per usage prefix: the common columns plus that prefix's values
    blocks = []
    for usage in usage_types:
        block = common_part.copy()
        block['USAGE_RANK'] = f'{usage}SOURCCODE'
        for suffix, target in value_cols.items():
            source_col = f'{usage}{suffix}'
            # block shares df's index, so the column is copied positionally
            block[target] = df[source_col] if source_col in df.columns else ''
        blocks.append(block)

    # After stacking, row i of prefix k sits at position k * n_rows + i
    stacked = pd.concat(blocks, ignore_index=True)

    # Reorder so that the four usage rows of each input row are adjacent (P, S, T, Q)
    interleaved = [k * n_rows + i for i in range(n_rows) for k in range(len(usage_types))]
    result_df = stacked.iloc[interleaved].reset_index(drop=True)

    return result_df[column_order]



In [24]:
# Apply the wide-to-long transform to the combined RECC table
# (four output rows per input row, one for each of the P/S/T/Q source slots)
iac_recc_tidy_df = transform_recc_data(iac_recc_df)

In [25]:
# Verify transformed data: the same two sample SUPERIDs previewed in the raw table
# should now appear as four rows each, one per usage rank
filtered_df = iac_recc_tidy_df.query('SUPERID in ["AM000202","AM000504"]')

# Show only the identifying columns plus the unpivoted usage columns
selected_columns = filtered_df[['SUPERID', 'ID', 'AR_NUMBER','IMPSTATUS', 'IMPCOST',
       'USAGE_RANK', 'SOURCCODE', 'CONSERVED', 
                                    'SOURCONSV', 'SOURCSAVED_USD']]
selected_columns

Unnamed: 0,SUPERID,ID,AR_NUMBER,IMPSTATUS,IMPCOST,USAGE_RANK,SOURCCODE,CONSERVED,SOURCONSV,SOURCSAVED_USD
32,AM000202,AM0002,2,I,60000.0,PSOURCCODE,E1,1077960.0,11049.0,52212.0
33,AM000202,AM0002,2,I,60000.0,SSOURCCODE,E2,10208.0,,42872.0
34,AM000202,AM0002,2,I,60000.0,TSOURCCODE,,,,
35,AM000202,AM0002,2,I,60000.0,QSOURCCODE,,,,
152,AM000504,AM0005,4,N,960.0,PSOURCCODE,E2,83.0,,273.0
153,AM000504,AM0005,4,N,960.0,SSOURCCODE,,,,
154,AM000504,AM0005,4,N,960.0,TSOURCCODE,R2,,,157.0
155,AM000504,AM0005,4,N,960.0,QSOURCCODE,,,,


#### Transform the iac_assess table from wide to long format

Requirements
1. Keep all common columns
2. Convert *_plant_usage and *_plant_cost columns into rows under the plant_usage and plant_cost columns, and add a separate column for the source code.
3. Order the columns to maintain the original dataframe structure

In [50]:
def transform_assess_data(df):
    """
    Reshape the wide ASSESS table so that each (assessment, source code) pair is one row.

    The *_plant_cost and *_plant_usage columns are unpivoted separately, the
    column-name suffix is stripped to leave the bare source code, and the two
    long tables are outer-joined so a source with only a cost or only a usage
    value is still kept.

    Parameters:
    df (pandas.DataFrame): Input DataFrame in wide format

    Returns:
    pandas.DataFrame: Long-format DataFrame indexed by ID, sorted by ID and then
        by source_code (an ordered categorical; codes outside the fixed order
        list below become NaN). Rows where both plant_cost and plant_usage are
        missing are removed.
    """
    # Descriptive columns carried unchanged onto every output row
    id_vars = ['CENTER', 'FY', 'SIC', 'NAICS', 'STATE', 'SALES',
               'EMPLOYEES', 'PLANT_AREA', 'PRODUCTS', 'PRODUNITS',
               'PRODLEVEL', 'PRODHOURS', 'NUMARS']
    key_cols = ['ID'] + id_vars

    def _unpivot(suffix, value_name):
        # Melt every column ending in `suffix`, then drop the suffix from the source code
        melted = pd.melt(
            df,
            id_vars=key_cols,
            value_vars=[name for name in df.columns if name.endswith(suffix)],
            var_name='source_code',
            value_name=value_name
        )
        melted['source_code'] = melted['source_code'].str.replace(suffix, '')
        return melted

    cost_df = _unpivot('_plant_cost', 'plant_cost')
    usage_df = _unpivot('_plant_usage', 'plant_usage')

    # Outer join keeps sources that have only one of the two measurements
    result_df = cost_df.merge(usage_df, on=key_cols + ['source_code'], how='outer')

    # Fixed display/sort order for the source codes: EC, ED, EF, E2..E12, W0..W6
    source_order = ['EC', 'ED', 'EF']
    source_order += [f'E{i}' for i in range(2, 13)]
    source_order += [f'W{i}' for i in range(7)]
    result_df['source_code'] = pd.Categorical(
        result_df.source_code, categories=source_order, ordered=True
    )

    # Discard (ID, source) pairs that carry no information at all
    result_df = result_df.dropna(subset=['plant_cost', 'plant_usage'], how='all')

    # Order by assessment and source code, then expose ID as the index
    return result_df.sort_values(by=['ID', 'source_code']).set_index('ID')

In [51]:
# Apply the wide-to-long transform to the ASSESS table
iac_assess_tidy_df = transform_assess_data(iac_assess_df)

In [53]:
# Verify transformed data: look up three sample assessments by ID (the index of the tidy table)
filtered_assess_df = iac_assess_tidy_df.loc[['AM0002', 'AM0005',"AM0324"]]
filtered_assess_df

Unnamed: 0_level_0,CENTER,FY,SIC,NAICS,STATE,SALES,EMPLOYEES,PLANT_AREA,PRODUCTS,PRODUNITS,PRODLEVEL,PRODHOURS,NUMARS,source_code,plant_cost,plant_usage
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AM0002,AM,1987,2761.0,,TX,25000000.0,156.0,,BUSINESS FORMS,,,2250.0,9,EC,267702.0,4867233.0
AM0002,AM,1987,2761.0,,TX,25000000.0,156.0,,BUSINESS FORMS,,,2250.0,9,E2,70657.0,19338.0
AM0005,AM,1987,2024.0,,TX,12000000.0,39.0,,ICE CREAM,5.0,2000.0,2080.0,7,EC,174617.0,2868113.0
AM0005,AM,1987,2024.0,,TX,12000000.0,39.0,,ICE CREAM,5.0,2000.0,2080.0,7,E2,10448.0,2903.0
AM0324,AM,1998,3446.0,,TX,24000000.0,140.0,99000.0,"Architectural metal work, mostly gratings and ...",,,5952.0,10,EC,141913.0,3808910.0
AM0324,AM,1998,3446.0,,TX,24000000.0,140.0,99000.0,"Architectural metal work, mostly gratings and ...",,,5952.0,10,ED,99270.0,16490.0
AM0324,AM,1998,3446.0,,TX,24000000.0,140.0,99000.0,"Architectural metal work, mostly gratings and ...",,,5952.0,10,E2,11950.0,2077.0
AM0324,AM,1998,3446.0,,TX,24000000.0,140.0,99000.0,"Architectural metal work, mostly gratings and ...",,,5952.0,10,W3,1840.0,1200.0
AM0324,AM,1998,3446.0,,TX,24000000.0,140.0,99000.0,"Architectural metal work, mostly gratings and ...",,,5952.0,10,W4,815045.0,5100000.0


#### Transform the ppi table from wide to long format

Requirements
1. Keep all common columns
2. Convert year columns into rows under the year and ppi columns
3. Order the columns to maintain the original dataframe structure

In [70]:
def transform_ppi_data(df):
    """
    Transform the wide format PPI table to long format by converting year columns into rows.

    Year columns are detected from the frame itself: every column that is not one
    of the descriptive columns below and whose label consists only of digits
    (e.g. 1987 or '1987') is unpivoted. This way a table that gains further year
    columns is handled without editing a hard-coded list.

    Parameters:
    df (pandas.DataFrame): Input DataFrame in wide format

    Returns:
    pandas.DataFrame: Transformed DataFrame in long format, indexed by ARC, with
        one row per (ARC row, year) and the value in the 'ppi' column. Values are
        carried over as-is; no type conversion is applied to them.

    Raises:
    KeyError: If a descriptive column is missing or no year column is found.
    """

    # Common columns that will be preserved
    id_vars = ['ARC', 'Description', 'Series ID', 'Industry', 'Product']

    # Year columns: digit-only labels, kept in the order they appear in df
    year_cols = [col for col in df.columns
                 if col not in id_vars and str(col).isdigit()]
    if not year_cols:
        raise KeyError("No year columns found in the PPI table")

    # Melt year columns
    result_df = pd.melt(
        df,
        id_vars=id_vars,
        value_vars=year_cols,
        var_name='year',
        value_name='ppi'
    )

    # Sort by year and ARC and set ARC as index
    result_df = result_df.sort_values(by=['year', 'ARC']).set_index('ARC')

    return result_df

In [71]:
# Apply the wide-to-long transform to the PPI table
# NOTE(review): the saved output of this cell is a KeyError mentioning 'ID', but
# transform_ppi_data as defined in this notebook indexes on 'ARC' — the output looks stale;
# re-run the function definition cell and then this cell to confirm
ppi_tidy_df = transform_ppi_data(ppi_df)

KeyError: "None of ['ID'] are in the columns"

### Clean Data

#### Replace old source code for electricity values "E1" with "EC"
Reason: E1 was replaced with EC, ED, and EF as of FY 95 (9/30/95)
<br/>Refer to: https://iac.university/technicalDocs/IAC_DatabaseManualv10.2.pdf

In [55]:
# Recode the old electricity source code 'E1' as 'EC' in the SOURCCODE column only.
# inplace=True modifies iac_recc_tidy_df itself, so outputs displayed by earlier cells
# no longer reflect the frame's current contents
iac_recc_tidy_df.replace({'SOURCCODE':{'E1':'EC'}}, inplace=True)

In [105]:
# Verify that "E1" values in the column "SOURCCODE" were replaced with "EC"
# (same two sample SUPERIDs as in the earlier previews, this time with all columns shown)
iac_recc_tidy_df.query('SUPERID in ["AM000202","AM000504"]')

Unnamed: 0,SUPERID,ID,AR_NUMBER,APPCODE,ARC2,IMPSTATUS,IMPCOST,USAGE_RANK,SOURCCODE,CONSERVED,SOURCONSV,SOURCSAVED_USD,REBATE,INCREMNTAL,FY,IC_CAPITAL,IC_OTHER,PAYBACK,BPTOOL
32,AM000202,AM0002,2,,2.7226,I,60000.0,PSOURCCODE,EC,1077960.0,11049.0,52212.0,N,N,1987,,,0.631021,
33,AM000202,AM0002,2,,2.7226,I,60000.0,SSOURCCODE,E2,10208.0,,42872.0,N,N,1987,,,0.631021,
34,AM000202,AM0002,2,,2.7226,I,60000.0,TSOURCCODE,,,,,N,N,1987,,,0.631021,
35,AM000202,AM0002,2,,2.7226,I,60000.0,QSOURCCODE,,,,,N,N,1987,,,0.631021,
152,AM000504,AM0005,4,,2.2449,N,960.0,PSOURCCODE,E2,83.0,,273.0,N,N,1987,,,2.232558,
153,AM000504,AM0005,4,,2.2449,N,960.0,SSOURCCODE,,,,,N,N,1987,,,2.232558,
154,AM000504,AM0005,4,,2.2449,N,960.0,TSOURCCODE,R2,,,157.0,N,N,1987,,,2.232558,
155,AM000504,AM0005,4,,2.2449,N,960.0,QSOURCCODE,,,,,N,N,1987,,,2.232558,
