In [None]:
import pandas as pd
from pathlib import Path
import numpy as npq

### Adjust impcost to its reference-year (latest PPI year) value

#### Import the cleaned PPI and IAC datasets

In [None]:
# ------- locate the intermediate-data directory -------
relative_path = Path('../../data/intermediate_data/')  # location relative to the working directory
absolute_path = relative_path.resolve()  # the same location as an absolute path

# ------- load the tidy input tables -------
# order: IAC assessments, IAC recommendations, PPI series
assess_df, recc_df, ppi_df = (
    pd.read_csv(absolute_path / csv_name)
    for csv_name in ('iac_assess_tidy.csv', 'iac_recc_tidy.csv', 'ppi_tidy.csv')
)

In [None]:
# The newest fiscal year in the recommendations may not have a PPI value yet
# (not released, or not yet added to the dataset). In that case only that
# newest year is mapped onto the latest year that the PPI data does contain.
ppi_years = set(ppi_df['year'])        # every year present in the PPI data
reference_year = max(ppi_df['year'])   # latest year present in the PPI data
max_fy = recc_df['fy'].max()           # latest fiscal year in the recommendations


def _pick_base_year(fy):
    """Return the year whose PPI should be joined to a recommendation from `fy`."""
    if fy != max_fy or fy in ppi_years:
        return fy
    return reference_year


# helper column used as the join key against the PPI table
recc_df['base_year'] = recc_df['fy'].apply(_pick_base_year)

In [None]:
# attach each recommendation to the PPI row of its ARC code and base year
recc_join_cols = ['arc2', 'superid', 'base_year', 'impcost', 'fy']
recc_ppi_df = ppi_df.merge(
    recc_df[recc_join_cols],
    how='left',
    left_on=['arc2', 'year'],
    right_on=['arc2', 'base_year'],
)

# Keep only rows that carry an implementation cost. Rows missing both superid
# and impcost are necessarily missing impcost, so one filter covers both cases.
# Then remove exact duplicate rows and the helper join key.
recc_ppi_df = (
    recc_ppi_df
    .dropna(subset=['impcost'])
    .drop_duplicates()
    .drop(columns=['base_year'])
)
recc_ppi_df['fy'] = recc_ppi_df['fy'].astype(int)

# the helper join key is no longer needed on the recommendations table either
del recc_df['base_year']


In [None]:
# (rows, columns) of both tables; a previous run showed
# recc_df -> (652600, 19) and recc_ppi_df -> (103600, 10)
print(recc_df.shape, recc_ppi_df.shape, sep="\n")

In [None]:
# inspect the merged rows for ARC code 2.3522
recc_ppi_df.loc[recc_ppi_df['arc2'].eq(2.3522)]

In [None]:
# first five merged rows for fiscal year 2025
recc_ppi_df.loc[recc_ppi_df['fy'].eq(2025)].head()

In [None]:
# inspect the recommendation rows for ARC code 2.7142
recc_df.loc[recc_df['arc2'].eq(2.7142)]

## Calculate adjusted implementation cost based on the current reference year

In [None]:
# PPI of every ARC code in the reference year, renamed so it can sit next to
# the PPI of each recommendation's own year after the merge below
ppi_ref_year_df = (
    ppi_df.loc[ppi_df['year'] == reference_year, ['arc2', 'year', 'ppi']]
    .rename(columns={'year': 'reference_year', 'ppi': 'reference_ppi'})
)

# add reference_year and the reference-year ppi to every recommendation row
recc_ppi_df = pd.merge(
    recc_ppi_df,
    ppi_ref_year_df[['arc2', 'reference_year', 'reference_ppi']],
    on='arc2',
    how='left',
)

# impcost expressed in reference-year terms:
#     impcost_adj = impcost * reference_ppi / ppi
# Round the final product, not the PPI ratio. Rounding the ratio to two
# decimals first distorts the adjusted cost by up to ~0.5% and still leaves
# the product itself unrounded.
recc_ppi_df['impcost_adj'] = (
    recc_ppi_df['impcost'] * recc_ppi_df['reference_ppi'] / recc_ppi_df['ppi']
).round(2)

In [None]:
# reference-year PPI row(s) for ARC code 2.3522
ppi_ref_year_df.loc[ppi_ref_year_df['arc2'].eq(2.3522)]

In [None]:
# merged rows for superid ME013208
recc_ppi_df.loc[recc_ppi_df['superid'].eq('ME013208')]

In [None]:
# recommendation rows for superid ME013208, for comparison with the merged table
recc_df.loc[recc_df['superid'].eq('ME013208')]

In [None]:
# list the columns of both tables before integrating them
print(
    f"recc_df: {recc_df.columns}\n",
    f"recc_ppi_df: {recc_ppi_df.columns}",
    sep="\n",
)

In [None]:
# bring the PPI-derived columns into the recommendations table, matched on superid
ppi_derived_cols = ['superid', 'description', 'reference_year', 'reference_ppi', 'impcost_adj']
recc_df = recc_df.merge(recc_ppi_df[ppi_derived_cols], how='left', on='superid')

# place impcost_adj directly to the right of impcost
impcost_adj_pos = recc_df.columns.get_loc('impcost') + 1
recc_df.insert(impcost_adj_pos, 'impcost_adj', recc_df.pop('impcost_adj'))

# place description directly to the right of id
description_pos = recc_df.columns.get_loc('id') + 1
recc_df.insert(description_pos, 'description', recc_df.pop('description'))

In [None]:
# force impcost_adj to a numeric dtype (unparseable values become NaN), then keep four decimals
impcost_adj_numeric = pd.to_numeric(recc_df['impcost_adj'], errors='coerce')
recc_df['impcost_adj'] = impcost_adj_numeric.round(4)

In [None]:
# spot checks: row counts for one superid and one ARC code after the integration
recc_OR052903 = recc_df.loc[recc_df['superid'].eq('OR052903')]
print(recc_OR052903['superid'].count())  # expected 4 rows

recc_2_1111 = recc_df.loc[recc_df['arc2'].eq(2.1111)]
print(recc_2_1111['arc2'].count())  # expected 76 rows

In [None]:
# integrated recommendation rows for ARC code 2.8114
recc_df.loc[recc_df['arc2'].eq(2.8114)]

In [None]:
# A notebook cell renders only its last bare expression, so every lookup is
# passed to display() explicitly; otherwise the first two are evaluated and
# silently discarded. `display` is provided by the IPython/Jupyter kernel.
display(recc_ppi_df[recc_ppi_df['superid'] == 'AM043901'])
display(recc_ppi_df[recc_ppi_df['superid'] == 'SF053207'])

display(recc_df[recc_df['description'] == 'REPLACE BOILER'])

In [None]:
# write the recommendations table, now carrying the PPI-adjusted cost, to CSV without the index
output_csv = "../../data/intermediate_data/recc_integrated_ppi.csv"
recc_df.to_csv(output_csv, index=False)
