In [1]:
import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the "RECC1" sheet of the IAC database workbook.
# The path is relative to the current working directory, so the notebook
# must be started from the folder that contains `data_raw/`.
relative_path = Path('data_raw/IAC_Database_20250112.xls')

# resolve to an absolute path so the printed value shows exactly where we look
absolute_path = relative_path.resolve()
print(absolute_path)

# fail early with an actionable message instead of a bare pandas traceback
if not absolute_path.is_file():
    raise FileNotFoundError(
        f"IAC database not found at {absolute_path}; "
        f"check that the working directory ({Path.cwd()}) contains 'data_raw/'"
    )

# import IAC database Excel workbook
df = pd.read_excel(relative_path, sheet_name='RECC1')

/Users/ejnewby/MEDS/capstone/industrialenergy_datainterface/src/data_raw/IAC_Database_20250112.xls


FileNotFoundError: [Errno 2] No such file or directory: 'data_raw/IAC_Database_20250112.xls'

In [None]:
# display the DataFrame read from the 'RECC1' sheet
df

In [None]:
# read all RECC tabs in a single call; passing a list of sheet names makes
# read_excel return a {sheet name: DataFrame} dict
recc_sheet_names = ['RECC1', 'RECC2', 'RECC3', 'RECC4', 'RECC5', 'RECC6']
recc_sheets = pd.read_excel(relative_path, sheet_name=recc_sheet_names)

# keep the per-sheet frames available under their original names
recc_2 = recc_sheets['RECC2']
recc_3 = recc_sheets['RECC3']
recc_4 = recc_sheets['RECC4']
recc_5 = recc_sheets['RECC5']
recc_6 = recc_sheets['RECC6']

# Build the combined frame from the freshly read sheets instead of appending
# to the existing `df`: appending would add RECC2..RECC6 again every time this
# cell is re-run.
df = pd.concat([recc_sheets[name] for name in recc_sheet_names], ignore_index=True)

In [None]:
# display the DataFrame after the RECC2..RECC6 sheets were appended
df

In [None]:
# Alternative import: read the whole workbook once and keep only the sheets
# whose name starts with 'RECC'.

# sheet_name=None makes read_excel return a {sheet name: DataFrame} dict
all_sheets = pd.read_excel(relative_path, sheet_name=None)

# keep the sheets that match the pattern
selected_sheets = {}
for sheet_name, sheet_df in all_sheets.items():
    if sheet_name.startswith('RECC'):
        selected_sheets[sheet_name] = sheet_df

# tag every row with the sheet it came from, then stack the sheets
tagged_sheets = []
for sheet_name, sheet_df in selected_sheets.items():
    tagged_sheets.append(sheet_df.assign(RECC=sheet_name))
iac_df = pd.concat(tagged_sheets, ignore_index=True)

# preview the combined frame
print(iac_df.head(10))   # first 10 rows
print(iac_df.tail(10))   # last 10 rows

In [None]:
# list every column name of the combined frame as a plain Python list
iac_df.columns.tolist()

In [None]:
# Build a one-row-per-column summary of iac_df: dtype, a sample value,
# longest string length (object columns only), distinct and missing counts.

column_info = []

for col in iac_df.columns:
    series = iac_df[col]
    non_null = series.dropna()
    dtype = str(series.dtype)  # store the dtype as plain text for display
    # an all-missing column has no sample to show; avoid IndexError on iloc[0]
    sample_value = non_null.iloc[0] if not non_null.empty else None
    # character length is only computed for object columns
    max_length = series.str.len().max() if dtype == 'object' else None
    unique_count = series.nunique()
    missing_count = series.isnull().sum()
    column_info.append({
        'Column': col,
        'Type': dtype,
        'Sample Value': sample_value,
        'Max Char Length': max_length,
        'Unique Values': unique_count,
        # previously computed but never reported
        'Missing Values': missing_count
    })

# display column info as a table
info_df = pd.DataFrame(column_info)
print(info_df)

In [None]:
# display the combined frame built from all sheets whose name starts with 'RECC'
iac_df

# --- Filter and Aggregate Data ---

In [None]:

# Count implemented ('I') and not implemented ('N') recommendations per
# fiscal year, one row per FY.
status_grouped = (
    df.loc[df['IMPSTATUS'].isin(['I', 'N'])]
    .groupby('FY')['IMPSTATUS']
    .value_counts()
    .unstack(fill_value=0)
    # guarantee both status columns exist, in a fixed order, even when one
    # status never occurs in the data
    .reindex(columns=['I', 'N'], fill_value=0)
    # rename by key rather than by position so labels cannot be swapped
    .rename(columns={'I': 'Implemented', 'N': 'Not Implemented'})
    .rename_axis(columns=None)
    .reset_index()
)


# --- Visualize the IAC Implementation Trends ---

In [None]:
# set the background style to white and remove gridlines
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "axes.grid": False})

# draw both yearly series on one explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x='FY', y='Implemented', data=status_grouped, label='Implemented', color='green', ax=ax)
sns.lineplot(x='FY', y='Not Implemented', data=status_grouped, label='Not Implemented', color='#FF7F7F', ax=ax)
ax.set_title('Comparison of Implemented vs. Not Implemented Recommendations Over Time')
ax.set_xlabel('Fiscal Year')
ax.set_ylabel('Number of Recommendations')
ax.legend()

# output folder for charts, relative to the current working directory
relative_path_assets = Path('assets')

# get absolute path
absolute_path_assets = relative_path_assets.resolve()
print(absolute_path_assets)

# savefig does not create missing folders, so make sure the target exists
absolute_path_assets.mkdir(parents=True, exist_ok=True)

# save the first chart (must happen before plt.show(), which may close the figure)
line_chart1_path = absolute_path_assets / "line_chart1.jpeg"
fig.savefig(line_chart1_path, format="jpeg", dpi=300, bbox_inches="tight")
print(f"Chart 1 saved to: {line_chart1_path}")

plt.show()


# --- Visualize IAC ARCs Implementation Cost vs Savings ---

In [None]:
# Keep only the rows whose IMPSTATUS is 'I' (implemented). Take a copy so the
# column added below is written to an independent frame rather than to a
# filtered slice of iac_df (which triggers SettingWithCopyWarning).
implemented_df = iac_df[iac_df['IMPSTATUS'].isin(['I'])].copy()

# define all columns with 'SAVED'
savings_columns = [col for col in implemented_df.columns if 'SAVED' in col]

# per-row total across all *SAVED columns (computed once)
total_savings = implemented_df[savings_columns].sum(axis=1)

# store the per-row total on the frame
implemented_df['total_savings'] = total_savings


In [None]:
# set the background style to white and remove gridlines
sns.set(style="whitegrid", rc={"axes.facecolor": "white", "axes.grid": False})

# Treat missing implementation costs as zero and (re)compute total savings.
# Column-level fillna(..., inplace=True) operates on a temporary Series and is
# not guaranteed to update the frame (it does not under pandas Copy-on-Write),
# so build a new frame with assign instead.
implemented_df = implemented_df.assign(
    IC_CAPITAL=implemented_df['IC_CAPITAL'].fillna(0),
    IC_OTHER=implemented_df['IC_OTHER'].fillna(0),
    # 'total_savings' itself does not match like='SAVED', so it is never
    # double-counted when this cell is re-run
    total_savings=implemented_df.filter(like='SAVED').sum(axis=1),
)

In [None]:
# yearly totals of capital cost, other cost and savings for implemented rows
cost_and_savings_columns = ['IC_CAPITAL', 'IC_OTHER', 'total_savings']
comparison_grouped = (
    implemented_df
    .groupby('FY')[cost_and_savings_columns]
    .sum()
    .reset_index()
)

# total cost per year = capital cost + other cost, added after aggregation
comparison_grouped['total_cost'] = comparison_grouped['IC_CAPITAL'].add(comparison_grouped['IC_OTHER'])

In [None]:
# express each year's cost and savings as a share of (cost + savings)
comparison_grouped['total'] = comparison_grouped['total_cost'] + comparison_grouped['total_savings']
comparison_grouped['cost_percentage'] = comparison_grouped['total_cost'] / comparison_grouped['total'] * 100
comparison_grouped['savings_percentage'] = comparison_grouped['total_savings'] / comparison_grouped['total'] * 100

# DataFrame.plot opens its own figure unless it is given an Axes, which would
# leave a separately created figure empty and ignore its figsize. Create the
# figure first and draw into its Axes.
fig, ax = plt.subplots(figsize=(12, 8))
comparison_grouped.plot(kind='bar', x='FY', stacked=True, y=['cost_percentage', 'savings_percentage'], ax=ax)

# reference line where cost and savings shares are equal
ax.axhline(y=50, color='r', linestyle='--')

# label the reference line just right of the last bar
ax.annotate('50%',
            xy=(len(comparison_grouped) - 1, 50),
            xytext=(10, 0),
            textcoords='offset points',
            color='r',
            va='center')

# show every nth label (e.g., every 2nd or 3rd label)
n = 3  # adjust this value based on your data density
ax.set_xticks(range(0, len(comparison_grouped), n))
ax.set_xticklabels(comparison_grouped['FY'][::n], rotation=45, ha='right')

ax.set_title('Relative Comparison of Cost vs. Savings for Implemented Recommendations')
ax.set_xlabel('Fiscal Year')
ax.set_ylabel('Percentage')

# Pair the display labels with the bar handles explicitly; passing labels
# alone relies on the order of artists on the Axes (which also holds the
# reference line) and can attach a label to the wrong element.
bar_handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=bar_handles,
          labels=['Cost %', 'Savings %'],
          title='Type',
          loc='upper left',
          bbox_to_anchor=(1.05, 1))

fig.tight_layout()
plt.show()

