### Set-up: Import Required Libraries

In [11]:
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt

### Import Data

#### Import RECC Table

In [39]:
# Locate the IAC database workbook (a single .xls file; its sheets are read in the next cell)
# The path is relative to the notebook's current working directory
relative_path = Path('../data/raw_data/IAC_Database_20250208.xls')

# Resolve to an absolute path and print it so the data source is visible in the output
absolute_path = relative_path.resolve()
print(absolute_path)

/Users/oksi/workspace/industrialenergy_datainterface/data/raw_data/IAC_Database_20250208.xls


In [40]:
# Read every worksheet of the workbook in one pass;
# sheet_name=None returns a dictionary of {sheet name: DataFrame}
all_sheets = pd.read_excel(absolute_path, sheet_name=None)

# Keep only the recommendation sheets, i.e. those whose name starts with "RECC"
selected_sheets = {
    sheet_name: sheet_df
    for sheet_name, sheet_df in all_sheets.items()
    if sheet_name.startswith('RECC')
}

# Tag each sheet's rows with the name of the sheet they came from (column "RECC"),
# then stack all tagged sheets into a single DataFrame with a fresh 0..n-1 index
iac_recc_df = pd.concat(
    (
        sheet_df.assign(RECC=sheet_name)
        for sheet_name, sheet_df in selected_sheets.items()
    ),
    ignore_index=True,
)

In [44]:
# List the columns of the combined RECC table (includes the added "RECC" sheet tag)
iac_recc_df.columns

Index(['SUPERID', 'ID', 'AR_NUMBER', 'APPCODE', 'ARC2', 'IMPSTATUS', 'IMPCOST',
       'PSOURCCODE', 'PCONSERVED', 'PSOURCONSV', 'PSAVED', 'SSOURCCODE',
       'SCONSERVED', 'SSOURCONSV', 'SSAVED', 'TSOURCCODE', 'TCONSERVED',
       'TSOURCONSV', 'TSAVED', 'QSOURCCODE', 'QCONSERVED', 'QSOURCONSV',
       'QSAVED', 'REBATE', 'INCREMNTAL', 'FY', 'IC_CAPITAL', 'IC_OTHER',
       'PAYBACK', 'BPTOOL', 'RECC'],
      dtype='object')

#### Import ASSESS table


In [114]:
# Create the assessment dataframe from the "ASSESS" sheet of the already-loaded
# workbook dictionary (no second read of the Excel file is needed)
assess_df = all_sheets['ASSESS']


In [117]:
# List the columns of the ASSESS table
assess_df.columns

Index(['ID', 'CENTER', 'FY', 'SIC', 'NAICS', 'STATE', 'SALES', 'EMPLOYEES',
       'PLANT_AREA', 'PRODUCTS', 'PRODUNITS', 'PRODLEVEL', 'PRODHOURS',
       'NUMARS', 'EC_plant_cost', 'EC_plant_usage', 'ED_plant_cost',
       'ED_plant_usage', 'EF_plant_cost', 'E2_plant_cost', 'E2_plant_usage',
       'E3_plant_cost', 'E3_plant_usage', 'E4_plant_cost', 'E4_plant_usage',
       'E5_plant_cost', 'E5_plant_usage', 'E6_plant_cost', 'E6_plant_usage',
       'E7_plant_cost', 'E7_plant_usage', 'E8_plant_cost', 'E8_plant_usage',
       'E9_plant_cost', 'E9_plant_usage', 'E10_plant_cost', 'E10_plant_usage',
       'E11_plant_cost', 'E11_plant_usage', 'E12_plant_cost',
       'E12_plant_usage', 'W0_plant_cost', 'W0_plant_usage', 'W1_plant_cost',
       'W1_plant_usage', 'W2_plant_cost', 'W2_plant_usage', 'W3_plant_cost',
       'W3_plant_usage', 'W4_plant_cost', 'W4_plant_usage', 'W5_plant_cost',
       'W5_plant_usage', 'W6_plant_cost', 'W6_plant_usage'],
      dtype='object')

### Normalize the data
#### Transform the iac_recc table from wide to long format

Requirements
1. Keep all common columns
2. Create four rows for each input row (one for each energy source usage ranking: Primary, Secondary, Tertiary, Quaternary)
3. Maintain the relationship between energy source codes and their associated values: SOURCCODE, CONSERVED, SOURCONSV, SAVED
4. Order the columns to maintain the original dataframe structure

In [100]:
# Create a function to transform the RECC table from wide to long format
def transform_usage_data(df):
    """
    Transform wide format usage data to long format by unpivoting usage-related columns.

    Every input row produces four consecutive output rows, one per usage prefix
    in the order P, S, T, Q. The prefixed columns <prefix>SOURCCODE,
    <prefix>CONSERVED, <prefix>SOURCONSV and <prefix>SAVED are collapsed into
    the unprefixed columns SOURCCODE, CONSERVED, SOURCONSV and SOURCSAVED_USD,
    and USAGE_RANK records which prefix the row came from (e.g. 'PSOURCCODE').

    Only the common columns listed below and the usage columns are carried
    over; any other column of df is not part of the result.

    Parameters:
    df (pandas.DataFrame): Input DataFrame in wide format. It must contain
        every common column. A usage column that is absent from df is filled
        with an empty string in the output.

    Returns:
    pandas.DataFrame: Transformed DataFrame in long format with a fresh
        0..(4 * len(df) - 1) index. An empty input yields an empty DataFrame
        that still has the full set of output columns.

    Raises:
    KeyError: If df lacks one or more of the common columns.
    """

    # Common columns that will be repeated for each usage record
    common_cols = ['SUPERID', 'ID', 'AR_NUMBER', 'APPCODE', 'ARC2',
                   'IMPSTATUS', 'IMPCOST', 'REBATE', 'INCREMNTAL',
                   'FY', 'IC_CAPITAL', 'IC_OTHER', 'PAYBACK', 'BPTOOL']

    # Usage prefixes, in the order their rows appear for each input row
    usage_types = ['P', 'S', 'T', 'Q']

    # Output column name -> suffix of the prefixed wide column it is read from
    value_cols = {
        'SOURCCODE': 'SOURCCODE',
        'CONSERVED': 'CONSERVED',
        'SOURCONSV': 'SOURCONSV',
        'SOURCSAVED_USD': 'SAVED',
    }

    # Usage columns are placed right after the first seven common columns
    column_order = common_cols[:7] + ['USAGE_RANK', 'SOURCCODE', 'CONSERVED',
                                      'SOURCONSV', 'SOURCSAVED_USD'] + common_cols[7:]

    missing_cols = [col for col in common_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Input DataFrame is missing required columns: {missing_cols}")

    # Positional 0..n-1 index: it is the key used below to interleave the
    # four per-usage slices back into input-row order
    wide = df.reset_index(drop=True)
    common_part = wide[common_cols]

    # Build one slice per usage type, column-wise, so input dtypes are kept
    per_usage = []
    for usage in usage_types:
        usage_values = {}
        for out_col, suffix in value_cols.items():
            source_col = f'{usage}{suffix}'
            usage_values[out_col] = wide[source_col] if source_col in wide.columns else ''
        per_usage.append(
            common_part.assign(USAGE_RANK=f'{usage}SOURCCODE', **usage_values)
        )

    # Each slice carries the same 0..n-1 index; a stable sort on that index
    # groups the four records of every input row together while keeping the
    # P, S, T, Q order in which the slices were concatenated
    result_df = (
        pd.concat(per_usage)
        .sort_index(kind='mergesort')
        .reset_index(drop=True)
    )

    # Ensure columns are in the desired order
    return result_df[column_order]



In [101]:
# Transform the combined RECC table from wide to long format
# (one row per recommendation and usage rank)
iac_long_rec_df = transform_usage_data(iac_recc_df)

In [102]:
# Verify transformed data: spot-check two recommendations by SUPERID;
# each one should appear as four rows (P, S, T, Q usage ranks)
filtered_df = iac_long_rec_df.query('SUPERID in ["AM000202","AM000504"]')
filtered_df

Unnamed: 0,SUPERID,ID,AR_NUMBER,APPCODE,ARC2,IMPSTATUS,IMPCOST,USAGE_RANK,SOURCCODE,CONSERVED,SOURCONSV,SOURCSAVED_USD,REBATE,INCREMNTAL,FY,IC_CAPITAL,IC_OTHER,PAYBACK,BPTOOL
32,AM000202,AM0002,2,,2.7226,I,60000.0,PSOURCCODE,E1,1077960.0,11049.0,52212.0,N,N,1987,,,0.631021,
33,AM000202,AM0002,2,,2.7226,I,60000.0,SSOURCCODE,E2,10208.0,,42872.0,N,N,1987,,,0.631021,
34,AM000202,AM0002,2,,2.7226,I,60000.0,TSOURCCODE,,,,,N,N,1987,,,0.631021,
35,AM000202,AM0002,2,,2.7226,I,60000.0,QSOURCCODE,,,,,N,N,1987,,,0.631021,
152,AM000504,AM0005,4,,2.2449,N,960.0,PSOURCCODE,E2,83.0,,273.0,N,N,1987,,,2.232558,
153,AM000504,AM0005,4,,2.2449,N,960.0,SSOURCCODE,,,,,N,N,1987,,,2.232558,
154,AM000504,AM0005,4,,2.2449,N,960.0,TSOURCCODE,R2,,,157.0,N,N,1987,,,2.232558,
155,AM000504,AM0005,4,,2.2449,N,960.0,QSOURCCODE,,,,,N,N,1987,,,2.232558,


### Clean Data

#### Replace the old electricity source code "E1" with "EC"
Reason: E1 was replaced with EC, ED, and EF as of FY 95 (9/30/95)
<br/>Refer to: https://iac.university/technicalDocs/IAC_DatabaseManualv10.2.pdf

In [103]:
# Recode the retired electricity code "E1" as "EC", in the SOURCCODE column only.
# The result is rebound to the same name instead of using inplace=True, so the
# cell stays chainable and gives the same result when re-run.
# NOTE(review): the manual splits E1 into EC, ED and EF; mapping every E1 row to EC
# is a simplification made here - confirm it suits the downstream analysis.
iac_long_rec_df = iac_long_rec_df.replace({'SOURCCODE': {'E1': 'EC'}})

In [105]:
# Verify that "E1" values in the column "SOURCCODE" were replaced with "EC"
# (re-inspects the two recommendations checked after the transform)
iac_long_rec_df.query('SUPERID in ["AM000202","AM000504"]')

Unnamed: 0,SUPERID,ID,AR_NUMBER,APPCODE,ARC2,IMPSTATUS,IMPCOST,USAGE_RANK,SOURCCODE,CONSERVED,SOURCONSV,SOURCSAVED_USD,REBATE,INCREMNTAL,FY,IC_CAPITAL,IC_OTHER,PAYBACK,BPTOOL
32,AM000202,AM0002,2,,2.7226,I,60000.0,PSOURCCODE,EC,1077960.0,11049.0,52212.0,N,N,1987,,,0.631021,
33,AM000202,AM0002,2,,2.7226,I,60000.0,SSOURCCODE,E2,10208.0,,42872.0,N,N,1987,,,0.631021,
34,AM000202,AM0002,2,,2.7226,I,60000.0,TSOURCCODE,,,,,N,N,1987,,,0.631021,
35,AM000202,AM0002,2,,2.7226,I,60000.0,QSOURCCODE,,,,,N,N,1987,,,0.631021,
152,AM000504,AM0005,4,,2.2449,N,960.0,PSOURCCODE,E2,83.0,,273.0,N,N,1987,,,2.232558,
153,AM000504,AM0005,4,,2.2449,N,960.0,SSOURCCODE,,,,,N,N,1987,,,2.232558,
154,AM000504,AM0005,4,,2.2449,N,960.0,TSOURCCODE,R2,,,157.0,N,N,1987,,,2.232558,
155,AM000504,AM0005,4,,2.2449,N,960.0,QSOURCCODE,,,,,N,N,1987,,,2.232558,
