In [11]:
import pandas as pd
from pathlib import Path
import numpy as np

### Adjust impcost to reference-year value

#### Import clean ppi and iac datasets 

In [12]:
# ------- define paths -------
# Folder holding the intermediate datasets, given relative to the working directory.
relative_path = Path('../data/intermediate_data/')
# Absolute form of the same folder; every read below is built from it.
absolute_path = relative_path.resolve()


# ------- import data -------
# IAC assess dataset
assess_df = pd.read_csv(absolute_path.joinpath('iac_assess_tidy.csv'))
# IAC recc dataset
recc_df = pd.read_csv(absolute_path.joinpath('iac_recc_tidy.csv'))
# ppi dataset
ppi_df = pd.read_csv(absolute_path.joinpath('ppi_tidy.csv'))

In [13]:
# Attach recommendation rows (superid, impcost) to the PPI rows that share the same
# arc2 code and year (ppi 'year' is matched against recc 'fy'), then tidy the result.
recc_ppi_df = (
    ppi_df
    .merge(
        recc_df[['arc2', 'superid', 'fy', 'impcost']],
        left_on=['arc2', 'year'],
        right_on=['arc2', 'fy'],
        how='left',
    )
    # discard PPI rows for which the left join found no recc data at all
    .dropna(subset=['superid', 'fy', 'impcost'], how='all')
    # exclude rows where impcost = NA
    .dropna(subset=['impcost'], how='all')
    # collapse fully identical rows
    .drop_duplicates()
    # 'fy' duplicates 'year' after the join, so it is no longer needed
    .drop(columns=['fy'])
)


In [14]:
# test: inspect the merged rows for a single ARC code (2.1111)
filtered = recc_ppi_df.loc[recc_ppi_df['arc2'].eq(2.1111)]
print(filtered.count()) # should be 9 rows
filtered

arc2           9
description    9
series_id      9
industry       9
product        9
year           9
ppi            9
superid        9
impcost        9
dtype: int64


Unnamed: 0,arc2,description,series_id,industry,product,year,ppi,superid,impcost
129119,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2001,154.3,NC022707,7500.0
148410,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2003,160.2,GT076008,13200.0
148414,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2003,160.2,GT076310,5823.0
159369,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2004,163.6,GT077806,13000.0
159373,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2004,163.6,GT078012,800.0
159377,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2004,163.6,GT078410,29000.0
159381,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2004,163.6,GT079208,32100.0
159385,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2004,163.6,IA034003,4940.0
198531,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2008,186.1,OR052903,1000.0


In [15]:
# set a reference_year (the year all implementation costs are expressed in)
reference_year = 2018

# create a dataframe with ppi values in the reference year
# (filter on the reference_year variable so changing it above actually takes effect)
ppi_ref_year_df = (
    ppi_df.loc[ppi_df['year'] == reference_year, ['arc2', 'year', 'ppi']]
    .rename(columns={'year': 'reference_year', 'ppi': 'reference_ppi'})
)

# Remove columns left over from a previous run of this cell. Without this, re-running
# the cell makes the merge emit reference_ppi_x / reference_ppi_y and the calculation
# below fails with a KeyError.
recc_ppi_df = recc_ppi_df.drop(
    columns=['reference_year', 'reference_ppi', 'ref_year_impcost'],
    errors='ignore',
)

# add reference_year and reference year ppi values to recc_ppi_df
recc_ppi_df = pd.merge(
    recc_ppi_df,
    ppi_ref_year_df[['arc2', 'reference_year', 'reference_ppi']],
    on='arc2',
    how='left',
)

# Calculate impcost in the reference year: scale by the ratio of reference-year ppi
# to the ppi of the year the cost was recorded in
recc_ppi_df['ref_year_impcost'] = recc_ppi_df['impcost'] * (
    recc_ppi_df['reference_ppi'] / recc_ppi_df['ppi']
)

In [16]:
# test: the reference-year merge must not change the row count for ARC code 2.1111
filtered = recc_ppi_df.loc[recc_ppi_df['arc2'].eq(2.1111)]
print(filtered['arc2'].count()) # should be 9 rows
filtered

9


Unnamed: 0,arc2,description,series_id,industry,product,year,ppi,superid,impcost,reference_year,reference_ppi,ref_year_impcost
25874,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2001,154.3,NC022707,7500.0,2018,221.7,10776.085548
29885,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2003,160.2,GT076008,13200.0,2018,221.7,18267.41573
29886,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2003,160.2,GT076310,5823.0,2018,221.7,8058.421348
32145,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2004,163.6,GT077806,13000.0,2018,221.7,17616.748166
32146,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2004,163.6,GT078012,800.0,2018,221.7,1084.107579
32147,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2004,163.6,GT078410,29000.0,2018,221.7,39298.899756
32148,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2004,163.6,GT079208,32100.0,2018,221.7,43499.816626
32149,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2004,163.6,IA034003,4940.0,2018,221.7,6694.364303
40326,2.1111,CONTROL PRESSURE ON STEAMER OPERATIONS,PCU334513334513,Industrial process variable instruments,Industrial process variable instruments,2008,186.1,OR052903,1000.0,2018,221.7,1191.295003


In [17]:
# integrate adjusted impcost into recc dataset
# Drop any ref_year_impcost column left by an earlier run of this cell. Otherwise the
# merge would produce ref_year_impcost_x / ref_year_impcost_y and the pop() below
# would raise a KeyError.
recc_df = recc_df.drop(columns=['ref_year_impcost'], errors='ignore')
recc_df = pd.merge(
    recc_df,
    recc_ppi_df[['superid', 'ref_year_impcost']],
    on='superid',
    how='left',  # keep every recc row; rows without a match get NaN
)

# move ref_year_impcost column after impcost column
index = recc_df.columns.get_loc('impcost') + 1
recc_df.insert(index, 'ref_year_impcost', recc_df.pop('ref_year_impcost'))

In [18]:
# test
recc_OR052903 = recc_df[recc_df['superid']=='OR052903']
print(recc_OR052903['superid'].count()) # expected 4 rows

recc_2_1111 = recc_df[recc_df['arc2']==2.1111]
print(recc_2_1111['arc2'].count()) # expected 76 rows
recc_df

4
76


Unnamed: 0,superid,id,ar_number,appcode,arc2,impstatus,impcost,ref_year_impcost,source_rank,sourccode,conserved,sourconsv,saved,rebate,incremntal,fy,ic_capital,ic_other,payback,bptool
0,AM000101,AM0001,1,,2.8114,N,15000.0,15000.0,PSOURCCODE,EC,,,1828.0,N,N,1987,,,8.205689,
1,AM000101,AM0001,1,,2.8114,N,15000.0,15000.0,SSOURCCODE,,,,,N,N,1987,,,8.205689,
2,AM000101,AM0001,1,,2.8114,N,15000.0,15000.0,TSOURCCODE,,,,,N,N,1987,,,8.205689,
3,AM000101,AM0001,1,,2.8114,N,15000.0,15000.0,QSOURCCODE,,,,,N,N,1987,,,8.205689,
4,AM000102,AM0001,2,,2.7142,N,189.0,,PSOURCCODE,EC,7034.0,72.0,663.0,N,N,1987,,,0.285068,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652595,WV067708,WV0677,8,,2.4239,,540.0,,QSOURCCODE,,,,,N,N,2025,400.0,140.0,,AM+
652596,WV067709,WV0677,9,,2.2511,,446.0,,PSOURCCODE,E2,69.0,,290.0,N,N,2025,330.0,116.0,,SSTS
652597,WV067709,WV0677,9,,2.2511,,446.0,,SSOURCCODE,,,,,N,N,2025,330.0,116.0,,SSTS
652598,WV067709,WV0677,9,,2.2511,,446.0,,TSOURCCODE,,,,,N,N,2025,330.0,116.0,,SSTS


### Save the PPI-adjusted recc dataset

In [10]:
# Write recc_df (now carrying ref_year_impcost) to CSV, omitting the row index.
recc_df.to_csv(path_or_buf="../data/intermediate_data/recc_ppi.csv", index=False)