# Create a boxplot for IMPCOST 

Filters:
* Sector = Food Manufacturing (NAICS code = 311*)
* Period = 2014 - 2024 (last 10 years)
* Implemented vs Not Implemented
* ARCs = TBD
* State = CA
    * Question: do we want to offer comparative boxplots, or only an aggregate boxplot?

In [19]:
# Import libraries
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import janitor
from janitor import clean_names

In [93]:
# Load the input tables

# ------- locate the data folder -------
relative_path = Path('../data/intermediate_data/')  # folder given relative to the working directory
absolute_path = relative_path.resolve()  # same folder expressed as an absolute path


# ------- read the CSV files -------
# IAC assess dataset
assess_df = pd.read_csv(absolute_path.joinpath('iac_assess_tidy.csv'))
# IAC recc dataset
recc_integrated_ppi_df = pd.read_csv(absolute_path.joinpath('recc_integrated_ppi.csv'))

In [97]:
# Standardise the assess table's column names with pyjanitor's clean_names
# (default settings), called as a plain function on the frame.
assess_df = clean_names(assess_df)

In [98]:
# Discard recommendation rows in which all four of the listed columns are
# missing (how='all'); a row with at least one of them populated is kept.
recc_integrated_ppi_df = recc_integrated_ppi_df.dropna(
    how='all',
    subset=['sourccode', 'conserved','sourconsv','saved'],
)
# Spot-check a single recommendation record, selected by its superid.
recc_integrated_ppi_df.loc[lambda frame: frame['superid'] == 'AM043901']

Unnamed: 0,superid,id,ar_number,appcode,arc2,impstatus,impcost,ref_year_impcost,source_rank,sourccode,conserved,sourconsv,saved,rebate,incremntal,fy,ic_capital,ic_other,payback,bptool
13300,AM043901,AM0439,1,1.0,4.132,I,435000.0,,PSOURCCODE,R3,,,910000.0,N,N,2003,395000.0,40000.0,0.478022,


In [99]:
# Filter the assess table to the food production sector only (naics = 311*).
# naics is cast to str so the prefix match also works when the codes are
# stored as numbers. Only id/state/naics are kept, then exact duplicate rows
# are removed; the original row index is preserved.
#
# Built as one chained expression that returns a new frame: calling
# drop_duplicates(inplace=True) on a column slice of another frame can raise
# SettingWithCopyWarning, and the chained form is safe to re-run.
assess_311_df = (
    assess_df
    .loc[assess_df['naics'].astype(str).str.startswith('311'), ['id', 'state', 'naics']]
    .drop_duplicates()
)

In [100]:
# Display the 311* assessment lookup (id, state, naics) built in the previous cell.
assess_311_df

Unnamed: 0,id,state,naics
1235,AM0439,TX,311991.0
1282,AM0453,TX,311830.0
1293,AM0456,TX,311421.0
1300,AM0458,TX,311812.0
1347,AM0471,TX,311119.0
...,...,...,...
76951,WV0508,WV,311812.0
77106,WV0549,OH,311111.0
77341,WV0598,WV,311612.0
77365,WV0602,PA,311999.0


In [101]:
# Add Sector (naics) and State attributes to recc_integrated_ppi_df.
# The lookup table is assess_311_df, so only plants whose naics starts with
# 311 receive values; every other recommendation gets NaN from the left join.
#
# state/naics columns left over from a previous run of this cell are dropped
# first. Without that, re-running the cell makes merge() emit state_x/state_y
# and naics_x/naics_y, and the 'naics' lookup below raises KeyError.
# NOTE(review): if an id can appear more than once in assess_311_df, the join
# duplicates recommendation rows - confirm ids are unique there.
recc_integrated_ppi_df = pd.merge(
    recc_integrated_ppi_df.drop(columns=['state', 'naics'], errors='ignore'),
    assess_311_df[['state', 'naics', 'id']],
    on='id',
    how='left',
)

# Keep only the recommendations whose joined naics code starts with 311.
recc_integrated_ppi_311_df = recc_integrated_ppi_df[
    recc_integrated_ppi_df['naics'].astype(str).str.startswith('311')
]


In [102]:
# Look up one recommendation by superid; after the merge in the previous cell
# the row should carry the added state and naics columns.
recc_integrated_ppi_df[recc_integrated_ppi_df['superid']=='AM043901']

Unnamed: 0,superid,id,ar_number,appcode,arc2,impstatus,impcost,ref_year_impcost,source_rank,sourccode,...,saved,rebate,incremntal,fy,ic_capital,ic_other,payback,bptool,state,naics
4050,AM043901,AM0439,1,1.0,4.132,I,435000.0,,PSOURCCODE,R3,...,910000.0,N,N,2003,395000.0,40000.0,0.478022,,TX,311991.0


In [142]:
# List the first 15 distinct arc2 codes found in the 311* recommendations.
# A cell renders only its last expression, so this is the single statement
# kept here; it is the one that produces the displayed array.
recc_integrated_ppi_311_df['arc2'].unique()[:15]

array([4.132 , 4.652 , 3.7214, 2.1113, 3.4111, 2.4236, 2.2113, 2.2511,
       2.4231, 2.7142, 3.4154, 2.2135, 2.7124, 2.2136, 2.2523])

In [143]:
# Filter settings used by the cells below

# arc2 codes to keep
arc2_filter = [
    4.132, 4.652, 3.7214,
    2.1113, 3.4111, 2.4236,
    2.2113, 2.2511, 2.4231,
    2.7142, 3.4154, 2.2135,
    2.7124, 2.2136, 2.2523,
]

# NAICS prefix of the sector of interest
sector_filter = '311'

# First and last year of the period, stored as strings.
# NOTE(review): the notebook header states Period = 2014 - 2024 while the
# start year here is 2010 - confirm which one is intended.
period_filter_from, period_filter_to = '2010', '2024'

In [144]:
# Keep only the 311* recommendations whose arc2 code is listed in arc2_filter,
# then show the resulting frame.
filtered = recc_integrated_ppi_311_df.loc[
    lambda frame: frame['arc2'].isin(arc2_filter)
]
filtered

Unnamed: 0,superid,id,ar_number,appcode,arc2,impstatus,impcost,ref_year_impcost,source_rank,sourccode,...,saved,rebate,incremntal,fy,ic_capital,ic_other,payback,bptool,state,naics
4050,AM043901,AM0439,1,1.0,4.1320,I,435000.0,,PSOURCCODE,R3,...,910000.0,N,N,2003,395000.0,40000.0,0.478022,,TX,311991.0
4051,AM043902,AM0439,2,1.0,4.6520,I,1200.0,,PSOURCCODE,R1,...,79200.0,N,N,2003,,1200.0,0.015152,,TX,311991.0
4052,AM043903,AM0439,3,1.0,3.7214,I,220.0,,PSOURCCODE,P1,...,36600.0,N,N,2003,100.0,120.0,0.006011,,TX,311991.0
4053,AM043904,AM0439,4,1.0,4.6520,I,270.0,,PSOURCCODE,R1,...,31700.0,N,N,2003,,270.0,0.008517,,TX,311991.0
4054,AM043905,AM0439,5,2.0,2.1113,I,500.0,691.947566,PSOURCCODE,E2,...,15600.0,N,N,2003,,500.0,0.032051,,TX,311991.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246052,WV061005,WV0610,5,,2.4236,I,2340.0,,PSOURCCODE,EC,...,5010.0,N,N,2021,2240.0,100.0,,AM+,MD,311920.0
246053,WV061005,WV0610,5,,2.4236,I,2340.0,,SSOURCCODE,ED,...,427.0,N,N,2021,2240.0,100.0,,AM+,MD,311920.0
246062,WV061010,WV0610,10,,2.4231,I,100.0,,PSOURCCODE,EC,...,2448.0,N,N,2021,100.0,,,MM+,MD,311920.0
246063,WV061010,WV0610,10,,2.4231,I,100.0,,SSOURCCODE,ED,...,206.0,N,N,2021,100.0,,,MM+,MD,311920.0


In [None]:
# Test that the filter values are correct: every arc2 code still present in
# `filtered` must be one of the requested codes. The assertion makes the cell
# fail loudly instead of relying on a visual check of the printed values.
arc2_in_filtered = filtered['arc2'].unique()
unexpected_arc2 = set(arc2_in_filtered) - set(arc2_filter)
assert not unexpected_arc2, f"arc2 values outside arc2_filter: {sorted(unexpected_arc2)}"

print("Unique values in arc2:", arc2_in_filtered)
# Number of distinct arc2 codes that survived the filter.
len(arc2_in_filtered)

Unique values in arc2: [4.132  4.652  3.7214 2.1113 3.4111 2.4236 2.2113 2.2511 2.4231 2.7142
 3.4154 2.2135 2.7124 2.2136 2.2523]


15