# Create boxplots for ARC Implementation Cost and Payback distributions

### Filters:
* Sector = Food Manufacturing (NAICS code = 311*)
* Period = 2014 - 2024 (last 10 years)
* Implemented vs Not Implemented
* ARCs = TBD
* State = CA and AZ

## Implementation Cost Distribution Boxplot

Attribute: 'IMPCOST'



### Notebook generates the following boxplot:

In [None]:
from IPython.display import Image

# Path (relative to this notebook) of the aggregate implementation-cost boxplot;
# the plotting cells further down write a file with this name into '../assets/'
image_path = '../assets/impcost_boxplot_aggregate.png'

# Leave the Image object as the last expression so Jupyter renders it
Image(image_path)

In [None]:
# Import libraries
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import janitor
from janitor import clean_names

In [None]:
# Load the input tables

# ------- locate the data directory -------
relative_path = Path('../../data/intermediate_data/')  # relative to this notebook
absolute_path = relative_path.resolve()  # absolute form of the same directory


# ------- read the CSV inputs -------
# IAC assessment table
assess_df = pd.read_csv(absolute_path.joinpath('iac_assess_tidy.csv'))
# integrated recommendation table with adjusted implementation cost
recc_integrated_ppi_df = pd.read_csv(absolute_path.joinpath('recc_integrated_ppi.csv'))

In [None]:
# Normalise the assessment table's column names with pyjanitor
assess_df = clean_names(assess_df)

In [None]:
# Attach each assessment's state and NAICS code to the recommendation rows
# (left join on 'id' keeps every recommendation), then drop fully duplicated rows
integrated_ppi_df = (
    recc_integrated_ppi_df
    .merge(assess_df[['state', 'naics', 'id']], on='id', how='left')
    .drop_duplicates()
)

In [None]:
# Recommendations whose NAICS code starts with '311' (food production)
recc_integrated_ppi_311_df = integrated_ppi_df[integrated_ppi_df['naics'].astype(str).str.startswith('311')]

# Preview the first 15 distinct ARC codes found in this sector.
# The ad-hoc lookups for superid 'AM043901' and id 'AM0439' that used to sit here
# were dropped: their results were never displayed, since only the last expression
# of a cell is rendered.
recc_integrated_ppi_311_df['arc2'].unique()[:15]

In [None]:
# set filters
arc2_filter = [2.1224]  # ARC codes to keep (matched against the 'arc2' column)
sector_filter = '311'  # NAICS prefix (matched with startswith on the 'naics' column)
# NOTE(review): the notebook header states the period as 2014 - 2024, but the lower
# bound below is 2010 — confirm which one is intended
period_filter_from = 2010  # first fiscal year kept (inclusive)
period_filter_to = 2024  # last fiscal year kept (inclusive)

In [None]:
# Rows used for the plots: sector prefix, selected ARC codes and fiscal-year window
# (both year bounds are inclusive)
filtered_df = integrated_ppi_df.loc[
    integrated_ppi_df['naics'].astype(str).str.startswith(sector_filter)
    & integrated_ppi_df['arc2'].isin(arc2_filter)
    & integrated_ppi_df['fy'].between(period_filter_from, period_filter_to)
]

In [None]:
# <remove> when PPI values are updated for all ARCs
# keep only the rows that have an adjusted implementation cost
filtered_clean_df = filtered_df[filtered_df['ref_year_impcost'].notna()]

In [None]:
# test
# <remove> when PPI values are updated for all ARCs
filtered_clean_df.loc[filtered_clean_df['superid'].eq('AM057403')]

In [None]:
# Sanity-check that the filters left only the expected ARC codes and fiscal years
print("Unique values in arc2:", filtered_df['arc2'].unique())
# this line reports the fiscal years, so it is labelled 'fy' (it previously said 'arc2')
print("Unique values in fy:", filtered_df['fy'].unique())
print("Number of unique arc2 values:", len(filtered_df['arc2'].unique()))

filtered_df.head()

Managing Outliers for Better Visualization
Method: set `showfliers=False` in `sns.boxplot`. This keeps all data in the dataframe while hiding the outliers on the chart.
Seaborn uses the interquartile range (IQR) rule to determine outliers:

Outlier Calculation Using IQR:

First Quartile (Q1) = 25th percentile
Third Quartile (Q3) = 75th percentile
Interquartile Range (IQR) = Q3 - Q1
Outliers are defined as any values:
Below: Q1 - 1.5 × IQR
Above: Q3 + 1.5 × IQR

In [None]:
# Flag implementation-cost outliers with the 1.5 x IQR rule

impcost_values = filtered_clean_df["ref_year_impcost"]

# Quartiles and interquartile range of the adjusted implementation cost
Q1, Q3 = impcost_values.quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences beyond which a value counts as an outlier
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows falling outside the fences
outliers = filtered_clean_df.loc[impcost_values.lt(lower_bound) | impcost_values.gt(upper_bound)]

# Show the status and cost of each outlier
print(outliers[["impstatus", "ref_year_impcost"]])

In [None]:
# Define a path to save visualizations

# Define relative path
# (same '../assets/' folder that the image-display cell at the top of the notebook reads from)
relative_path_vis = Path('../assets/')
absolute_path_vis = relative_path_vis.resolve() # get absolute path; used as the target folder of the savefig calls below

In [None]:
sns.set_theme(style="white")
fig, ax = plt.subplots(figsize=(8, 6))

# Split the cleaned recommendations by implementation status
not_implemented = filtered_clean_df.loc[filtered_clean_df["impstatus"].eq("N")]
implemented = filtered_clean_df.loc[filtered_clean_df["impstatus"].eq("I")]

# One box per status on the shared axes: (rows, x-axis label, line colour, fill colour)
for status_rows, status_label, line_color, fill_color in (
    (not_implemented, "Not Implemented", '#C44E52', 'r'),
    (implemented, "Implemented", '#376A3E', '#376A3E'),
):
    sns.boxplot(
        data=status_rows,
        x=[status_label] * len(status_rows),
        y="ref_year_impcost",
        width=0.4,
        showfliers=False,  # points beyond the whiskers are not drawn
        boxprops={'edgecolor': line_color, 'facecolor': fill_color, 'alpha': 0.6},
        medianprops={'color': line_color, 'linewidth': 2},
        whiskerprops={'color': line_color, 'linewidth': 1.5},
        capprops={'color': line_color, 'linewidth': 1.5},
        ax=ax,
    )

# Thousands separators on the cost axis
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda value, _pos: f'{value:,.0f}'))

# Title, axis labels and tick-label sizes
ax.set_title("Comparison of Implementation Cost by Status", fontsize=16)
ax.set_xlabel("Status", fontsize=14)
ax.set_ylabel("Implementation Cost, US Dollars (2024 Adjusted)", fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Write the figure to the assets folder as a PNG
plt.savefig(absolute_path_vis.joinpath('impcost_boxplot_aggregate.png'), format='png')

plt.show()

In [None]:
# Display the cleaned, filtered frame that feeds the plots in this notebook
filtered_clean_df

## Payback Distribution Boxplot

In [None]:
sns.set_theme(style="white")
plt.figure(figsize=(8, 6))

# Split the cleaned recommendations by implementation status
not_implemented = filtered_clean_df[filtered_clean_df["impstatus"] == "N"]
implemented = filtered_clean_df[filtered_clean_df["impstatus"] == "I"]

# Draw one box per status with its own colours; the first call creates the axes
# and the second call draws onto the same axes
ax = sns.boxplot(data=not_implemented, x=["Not Implemented"]*len(not_implemented),
                y="payback", width=0.4, showfliers=False,
                boxprops={'edgecolor': '#C44E52', 'facecolor': 'r', 'alpha': 0.6},
                medianprops={'color': '#C44E52', 'linewidth': 2},
                whiskerprops={'color': '#C44E52', 'linewidth': 1.5},
                capprops={'color': '#C44E52', 'linewidth': 1.5})

sns.boxplot(data=implemented, x=["Implemented"]*len(implemented),
            y="payback", width=0.4, showfliers=False,
            boxprops={'edgecolor': '#376A3E', 'facecolor': '#376A3E', 'alpha': 0.6},
            medianprops={'color': '#376A3E', 'linewidth': 2},
            whiskerprops={'color': '#376A3E', 'linewidth': 1.5},
            capprops={'color': '#376A3E', 'linewidth': 1.5},
            ax=ax)

# Format the y-axis with decimals: the axis is labelled in years, so the
# whole-number format used for the cost plot would round fractional ticks
# (e.g. 0.5 and 1.5) to misleading or repeated labels.
# Two decimals below 10, one decimal from 10 upward.
ax.yaxis.set_major_formatter(
    mticker.FuncFormatter(lambda x, _: f'{x:.2f}' if x < 10 else f'{x:,.1f}')
)

# Adjust titles and labels
plt.title("Payback Period Distribution by Status", fontsize=16)
plt.xlabel("Status", fontsize=14)
plt.ylabel("Payback Period, Years", fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Save the plot to an image file (PNG)
plt.savefig(absolute_path_vis/'payback_boxplot_aggregate.png', format='png')

plt.show()

In [None]:
sns.set_theme(style="white")

# Box per implementation status, with fliers hidden
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(
    data=filtered_clean_df, x="impstatus", y="ref_year_impcost",
    hue="impstatus", palette={"N": "darkred", "I": "darkblue"},
    width=0.2, showfliers=False, legend=False,
    ax=ax,
)

# Rows inside the IQR fences computed earlier (lower_bound / upper_bound);
# only these are drawn as individual points
filtered_clean_df_no_outliers = filtered_clean_df.loc[
    filtered_clean_df['ref_year_impcost'].between(lower_bound, upper_bound)
]

# Jittered points overlaid on the boxes
sns.stripplot(
    data=filtered_clean_df_no_outliers, x="impstatus", y="ref_year_impcost",
    hue="impstatus", palette={"N": "red", "I": "darkgreen"},
    jitter=True, alpha=0.5, size=3, legend=False,
    ax=ax,
)

# Title and axis labels
ax.set_title("Comparison of Implementation Cost by Status (Outliers Hidden)")
ax.set_xlabel("Impstatus (N = Not implemented, I = Implemented)")
ax.set_ylabel("Implementation Cost, US Dollars (2024 Adjusted)")
plt.show()

In [None]:
sns.set_theme(style="dark")

# Implementation cost per fiscal year, split by implementation status
plt.figure(figsize=(12, 6))
sns.boxplot(data=filtered_clean_df, x="fy", y="ref_year_impcost", hue="impstatus",
            width=0.6, palette={"N": "darkred", "I": "darkblue"},
            showfliers=False,
            medianprops={'color': 'white', 'linewidth': 1})

# Title, axis labels and tick-label sizes
plt.title("Comparison of Implementation Costs by Status and Year", fontsize=16)
plt.xlabel("Fiscal Year", fontsize=14)
plt.ylabel("Implementation Cost, US Dollars (2024 Adjusted)", fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Build the legend once, mapping the status codes to readable labels.
# (An earlier plt.legend call was removed because this one replaced it, and the
# unused recomputation of the outlier-free frame was removed from this cell.)
legend_labels = {"N": "Not Implemented", "I": "Implemented"}
handles, labels = plt.gca().get_legend_handles_labels()  # handles created by the hue grouping
plt.legend(handles[:2], [legend_labels[label] for label in labels[:2]], title="Implementation Status",  fontsize=12)

# Save the plot to an image file (PNG)
plt.savefig(absolute_path_vis/'impcost_boxplot_by_year.png', format='png')

plt.show()

## Boxplots for a dashboard prototype
Note: plots use demo data 

In [None]:
# Mockup

# Load the demo dataset

# ------- locate the data directory -------
# NOTE(review): this base path ('../data/...') differs from the '../../data/...'
# used when loading the real inputs earlier in the notebook — confirm it is intentional
relative_path = Path('../data/intermediate_data/')
absolute_path = relative_path.resolve()  # rebinds the name set by the earlier loading cell


# ------- read the CSV -------
mockup_ppi_df = pd.read_csv(absolute_path.joinpath('iac_integrated_mockupv2.csv'))  # mock-up dataset

In [None]:
# Display the demo dataset as loaded
mockup_ppi_df

In [None]:
# Summary of the cost column as loaded (its label keeps the surrounding spaces),
# followed by the values for each implementation status
print(mockup_ppi_df[" adjusted_impcost "].describe())

for heading, status_code in (("\nNot Implemented Costs:", "N"), ("\nImplemented Costs:", "I")):
    print(heading)
    print(mockup_ppi_df.loc[mockup_ppi_df["impstatus"] == status_code, " adjusted_impcost "])

## Implementation Cost Distribution Boxplot

In [None]:
# White seaborn theme and a new 8 x 6 figure for the boxplot drawn later in this cell
sns.set_theme(style="white")
plt.figure(figsize=(8, 6))

# Clean and convert one cell of the cost column to a number
def convert_to_numeric(value):
    """Convert a single cost cell to ``float``.

    Strings have their thousands separators (commas) removed before conversion.
    Non-string values, such as NaN from a blank CSV field or a value pandas
    already parsed as a number, are passed straight to ``float`` instead of
    raising ``AttributeError`` on the missing ``.replace`` method.

    Args:
        value: the cell content, typically text such as ``'1,234.50'``.

    Returns:
        The value as a float (NaN stays NaN).

    Raises:
        ValueError: if a string does not represent a number once commas are removed.
    """
    if isinstance(value, str):
        value = value.replace(',', '')
    return float(value)

# Parse the cost column (its label keeps the surrounding spaces) into a numeric
# column stored under the clean label 'adjusted_impcost'
mockup_ppi_df['adjusted_impcost'] = mockup_ppi_df[" adjusted_impcost "].apply(convert_to_numeric)

# Split the demo data by implementation status
not_implemented = mockup_ppi_df[mockup_ppi_df["impstatus"] == "N"]
implemented = mockup_ppi_df[mockup_ppi_df["impstatus"] == "I"]

# Create boxplots separately with different properties.
# y is the converted numeric column; the cell previously plotted the unconverted
# " adjusted_impcost " column, so the conversion above had no effect on the chart.
ax = sns.boxplot(data=not_implemented, x=["Not Implemented"]*len(not_implemented),
                y="adjusted_impcost", width=0.4, showfliers=False,
                boxprops={'edgecolor': '#C44E52', 'facecolor': 'r', 'alpha': 0.6},
                medianprops={'color': '#C44E52', 'linewidth': 2},
                whiskerprops={'color': '#C44E52', 'linewidth': 1.5},
                capprops={'color': '#C44E52', 'linewidth': 1.5})

sns.boxplot(data=implemented, x=["Implemented"]*len(implemented),
            y="adjusted_impcost", width=0.4, showfliers=False,
            boxprops={'edgecolor': '#376A3E', 'facecolor': '#376A3E', 'alpha': 0.6},
            medianprops={'color': '#376A3E', 'linewidth': 2},
            whiskerprops={'color': '#376A3E', 'linewidth': 1.5},
            capprops={'color': '#376A3E', 'linewidth': 1.5},
            ax=ax)

# Format y-axis with commas (usable now that the plotted column is numeric)
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'{x:,.0f}'))

# Adjust titles and labels
plt.title("Comparison of Implementation Cost by Status", fontsize=16)
plt.xlabel("Status", fontsize=14)
plt.ylabel("Implementation Cost, US Dollars (2024 Adjusted)", fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Save the plot to an image file (PNG)
# NOTE(review): same file name as the figure saved from the real data earlier in
# this notebook, so this demo plot overwrites it — confirm that is intended
plt.savefig(absolute_path_vis/'impcost_boxplot_aggregate.png', format='png')

plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd

# White seaborn theme and a new 8 x 6 figure for the boxplot drawn later in this cell
sns.set_theme(style="white")
plt.figure(figsize=(8, 6))

# Clean and convert one cell of the cost column to a number
def convert_to_numeric(value):
    """Convert a single cost cell to ``float``.

    Commas (thousands separators) are removed from strings before conversion.
    Values that are not strings, e.g. NaN for an empty CSV field or an
    already-numeric value, go directly to ``float`` rather than failing with
    ``AttributeError`` because they have no ``.replace`` method.

    Args:
        value: the cell content, typically text such as ``'1,234.50'``.

    Returns:
        The value as a float (NaN stays NaN).

    Raises:
        ValueError: if a string is not a valid number after the commas are removed.
    """
    if isinstance(value, str):
        value = value.replace(',', '')
    return float(value)

# Numeric copy of the cost column, stored under a label without the surrounding spaces
mockup_ppi_df['adjusted_impcost'] = mockup_ppi_df[" adjusted_impcost "].apply(convert_to_numeric)

# Split the demo data by implementation status
not_implemented = mockup_ppi_df.loc[mockup_ppi_df["impstatus"].eq("N")]
implemented = mockup_ppi_df.loc[mockup_ppi_df["impstatus"].eq("I")]

# One box per status on the current axes: (rows, x-axis label, line colour, fill colour)
ax = plt.gca()
for status_rows, status_label, line_color, fill_color in (
    (not_implemented, "Not Implemented", '#C44E52', 'r'),
    (implemented, "Implemented", '#376A3E', '#376A3E'),
):
    sns.boxplot(
        data=status_rows,
        x=[status_label] * len(status_rows),
        y="adjusted_impcost",
        width=0.4,
        showfliers=False,  # points beyond the whiskers are not drawn
        boxprops={'edgecolor': line_color, 'facecolor': fill_color, 'alpha': 0.6},
        medianprops={'color': line_color, 'linewidth': 2},
        whiskerprops={'color': line_color, 'linewidth': 1.5},
        capprops={'color': line_color, 'linewidth': 1.5},
        ax=ax,
    )

# Thousands separators on the cost axis
ax.yaxis.set_major_formatter(mticker.FuncFormatter(lambda value, _pos: f'{value:,.0f}'))

# Title, axis labels and tick-label sizes
ax.set_title("Comparison of Implementation Cost by Status", fontsize=16)
ax.set_xlabel("Status", fontsize=14)
ax.set_ylabel("Implementation Cost, US Dollars (2024 Adjusted)", fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Fit the layout, write the PNG, then display
plt.tight_layout()
plt.savefig(absolute_path_vis.joinpath('impcost_boxplot_aggregate.png'), format='png')
plt.show()

## Payback Period Distribution Boxplot

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import numpy as np

sns.set_theme(style="white")
plt.figure(figsize=(8, 6))
ax = plt.gca()

# Keep only the rows that have a payback value
# (the column label keeps the surrounding spaces)
mockup_ppi_df_clean = mockup_ppi_df[mockup_ppi_df[' payback '].notna()]

# Split the demo data by implementation status
not_implemented = mockup_ppi_df_clean.loc[mockup_ppi_df_clean["impstatus"].eq("N")]
implemented = mockup_ppi_df_clean.loc[mockup_ppi_df_clean["impstatus"].eq("I")]

# One box per status on the shared axes: (rows, x-axis label, line colour, fill colour)
# NOTE(review): assumes ' payback ' is already numeric — confirm, since the cost
# column of this dataset needs a text-to-number conversion first
for status_rows, status_label, line_color, fill_color in (
    (not_implemented, "Not Implemented", '#C44E52', 'r'),
    (implemented, "Implemented", '#376A3E', '#376A3E'),
):
    sns.boxplot(
        data=status_rows,
        x=[status_label] * len(status_rows),
        y=" payback ",
        width=0.4,
        showfliers=False,  # points beyond the whiskers are not drawn
        boxprops={'edgecolor': line_color, 'facecolor': fill_color, 'alpha': 0.6},
        medianprops={'color': line_color, 'linewidth': 2},
        whiskerprops={'color': line_color, 'linewidth': 1.5},
        capprops={'color': line_color, 'linewidth': 1.5},
        ax=ax,
    )

# Tick formatter for the payback axis: two decimals below 10, one decimal from 10 upward
def format_payback(x, pos):
    """Return the tick label for value ``x``.

    Args:
        x: tick value.
        pos: tick position supplied by matplotlib's FuncFormatter (unused).

    Returns:
        ``x`` formatted with two decimal places when it is below 10,
        otherwise with one decimal place.
    """
    decimals = 2 if x < 10 else 1
    return f'{x:.{decimals}f}'

# Install the payback tick formatter on the y-axis
ax.yaxis.set_major_formatter(mticker.FuncFormatter(format_payback))

# Title, axis labels and tick-label sizes
ax.set_title("Payback Period Distribution by Status", fontsize=16)
ax.set_xlabel("Status", fontsize=14)
ax.set_ylabel("Payback Period, Years", fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

# Fit the layout, write the PNG, then display
plt.tight_layout()
plt.savefig(absolute_path_vis.joinpath('payback_boxplot_aggregate.png'), format='png')
plt.show()