In [None]:
import pandas as pd
from pathlib import Path
import numpy as npq

### Adjust impcost to current year value

#### Import clean PPI and IAC datasets

In [None]:
# ------- define paths -------
# Folder holding the intermediate datasets, written relative to the working
# directory, together with the same location resolved to an absolute path.
relative_path = Path('../../data/intermediate_data/')
absolute_path = relative_path.resolve()

# ------- import data -------
# Read the three tidy CSV files in order: IAC assess, IAC recc and PPI.
assess_df, recc_df, ppi_df = (
    pd.read_csv(absolute_path / csv_name)
    for csv_name in ('iac_assess_tidy.csv', 'iac_recc_tidy.csv', 'ppi_tidy.csv')
)

In [None]:
# Attach the recommendation columns to the PPI rows that share the same arc2
# code and year (the recc table stores the year in its 'fy' column).
recc_ppi_df = (
    ppi_df
    .merge(
        recc_df[['arc2', 'superid', 'fy', 'impcost']],
        how='left',
        left_on=['arc2', 'year'],
        right_on=['arc2', 'fy'],
    )
    # Discard rows without an impcost. This also removes PPI rows that matched
    # no recommendation, since the left join leaves their superid / fy /
    # impcost all NA.
    .dropna(subset=['impcost'])
    .drop_duplicates()
    # 'fy' equals 'year' on every remaining (matched) row, so drop it.
    .drop(columns=['fy'])
)


In [None]:
# test: sanity-check the merged table for a single arc2 code
filtered = recc_ppi_df[recc_ppi_df['arc2'] == 2.1111]
# Print the number of rows; DataFrame.count() would instead print one
# non-null count per column.
print(len(filtered))  # should be 9 rows
filtered

## Calculate adjusted implementation cost based on the current reference year

In [None]:
# Reference year: the latest year present in the PPI table.
# Series.max() skips missing values, unlike the builtin max().
reference_year = ppi_df['year'].max()

# PPI rows of the reference year, one set of reference values per arc2 code.
# The filter uses reference_year rather than a literal year, so it follows the
# PPI data as new years are added.
ppi_ref_year_df = (
    ppi_df.loc[ppi_df['year'] == reference_year, ['arc2', 'year', 'ppi']]
    .rename(columns={'year': 'reference_year', 'ppi': 'reference_ppi'})
)

# Add reference_year and reference_ppi to recc_ppi_df, matched on arc2.
# Columns left over from an earlier run of this cell are removed first so that
# re-running it does not produce suffixed duplicates (reference_ppi_x / _y).
# arc2 codes with no PPI row in the reference year get NA reference values.
recc_ppi_df = (
    recc_ppi_df
    .drop(columns=['reference_year', 'reference_ppi', 'ref_year_impcost'],
          errors='ignore')
    .merge(ppi_ref_year_df[['arc2', 'reference_year', 'reference_ppi']],
           on='arc2',
           how='left')
)

# impcost expressed in the reference year: scale by reference_ppi / ppi.
recc_ppi_df['ref_year_impcost'] = recc_ppi_df['impcost'] * (
    recc_ppi_df['reference_ppi'] / recc_ppi_df['ppi']
)

In [None]:
# test: row count and contents for one arc2 code after the PPI adjustment
is_arc2_2_1111 = recc_ppi_df['arc2'] == 2.1111
filtered = recc_ppi_df.loc[is_arc2_2_1111]
print(filtered['arc2'].count())  # should be 9 rows
filtered

In [None]:
# integrate adjusted impcost into recc dataset
# Columns this cell brings over from recc_ppi_df, matched on superid.
added_columns = ['description', 'reference_year', 'reference_ppi', 'ref_year_impcost']

# Remove any copies of those columns left by an earlier run of this cell
# before merging; otherwise a re-run yields *_x / *_y columns and the pop()
# calls below raise KeyError. how='left' keeps every recc_df row.
recc_df = (
    recc_df
    .drop(columns=added_columns, errors='ignore')
    .merge(recc_ppi_df[['superid'] + added_columns],
           on='superid',
           how='left')
)

# Reposition two of the new columns (the merge appends them at the end):
# ref_year_impcost directly after impcost, description directly after id.
for column, anchor in (('ref_year_impcost', 'impcost'), ('description', 'id')):
    index = recc_df.columns.get_loc(anchor) + 1
    recc_df.insert(index, column, recc_df.pop(column))

In [None]:
# test: row counts in the integrated table for one superid and one arc2 code
superid_mask = recc_df['superid'] == 'OR052903'
recc_OR052903 = recc_df.loc[superid_mask]
print(recc_OR052903['superid'].count())  # expected 4 rows

arc2_mask = recc_df['arc2'] == 2.1111
recc_2_1111 = recc_df.loc[arc2_mask]
print(recc_2_1111['arc2'].count())  # expected 76 rows

In [None]:
# Spot-check individual records. Every frame is passed to display() because a
# notebook cell renders only its final bare expression, so the first two
# lookups would otherwise be evaluated and silently discarded.
display(
    recc_ppi_df[recc_ppi_df['superid'] == 'AM043901'],
    recc_ppi_df[recc_ppi_df['superid'] == 'SF053207'],
    recc_df[recc_df['description'] == 'REPLACE BOILER'],
)

In [None]:
# save integrated dataset
# Written into the same intermediate-data folder the inputs were read from,
# reusing absolute_path instead of repeating the folder as a string literal.
recc_df.to_csv(absolute_path / 'recc_integrated_ppi.csv', index=False)
