In [1]:
import pandas as pd
import numpy as np

# Load the filtered client analysis workbook into a DataFrame (parsed with openpyxl).
analysis_file_df = pd.read_excel(
    "filtered_client_file_amfam.xls.xlsx",
    engine="openpyxl",
)
# Progress messages; one call, each message on its own line.
print("Processing...", "Analysis_file loaded in...", sep="\n")

Processing...
Analysis_file loaded in...


In [2]:
# Keep an untouched snapshot of the loaded data before any filtering.
# NOTE(review): re-running this cell without re-running the load cell makes
# original_df a copy of the already-filtered frame.
original_df = analysis_file_df.copy()

# Keep only the rows in which both score columns are populated.
analysis_file_df = analysis_file_df[
    analysis_file_df[["level_1", "level_2"]].notna().all(axis=1)
]

# Function to create score distributions
def create_score_distributions(df, level_1, level_2, groupby_columns):
    """Return each group's share of the records within its top-level group.

    Rows of ``df`` are grouped by ``groupby_columns`` and the non-null values
    of the ``level_1`` column are counted per group. Every count is then
    divided by the summed count of all groups sharing the same value of
    ``groupby_columns[0]``, so the shares add up to 1 within each value of the
    first grouping column.

    Parameters
    ----------
    df : pandas.DataFrame
        Records to summarise; must contain ``level_1`` and every column named
        in ``groupby_columns``.
    level_1 : str
        Name of the column whose non-null values are counted.
    level_2 : str
        Not used by the calculation; kept so existing call sites keep working.
    groupby_columns : list of str
        Grouping columns. The first entry defines the normalisation group.

    Returns
    -------
    pandas.DataFrame
        One column, ``"% distribution"``, holding fractions (not percentages),
        indexed by ``groupby_columns``.
    """
    counts = df.groupby(groupby_columns)[level_1].count()

    # transform("sum") returns totals on the same index as `counts`.
    # groupby.apply with a like-indexed result prepends the group key to the
    # index on pandas >= 2.0, which breaks alignment when the result is
    # assigned back, so it is deliberately not used here.
    totals = counts.groupby(level=groupby_columns[0]).transform("sum")

    return (counts / totals).to_frame("% distribution")

def create_score_distributions_l1_l2_intersection(df, level_1_col, level_2_col, groupby_columns, return_percentages=True):
    """Build a level-1 by level-2 matrix of record counts or shares.

    ``df`` is grouped by ``groupby_columns`` and the group sizes are pivoted
    so that each row is a (``groupby_columns[0]``, ``level_1_col``) pair and
    each remaining column is one value of ``level_2_col``. Combinations that
    do not occur are filled with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Records to summarise.
    level_1_col : str
        Column that forms the second row key of the matrix.
    level_2_col : str
        Column whose values become the matrix columns.
    groupby_columns : list of str
        Columns used to count records. The first entry is the first row key
        and the group within which shares are normalised.
    return_percentages : bool, default True
        If True, every cell is divided by the total count of its
        ``groupby_columns[0]`` group (fractions, not percentages).
        If False, the counts are returned.

    Returns
    -------
    pandas.DataFrame
        Flat frame whose first two columns are the row keys, followed by one
        column per ``level_2_col`` value.
    """
    group_key = groupby_columns[0]

    cell_counts = df.groupby(groupby_columns).size().reset_index(name="count")
    # pivot_table aggregates with its default (mean). With one row per
    # (group_key, level_1_col, level_2_col) combination that leaves the
    # counts unchanged.
    matrix = cell_counts.pivot_table(
        values="count",
        index=[group_key, level_1_col],
        columns=level_2_col,
        fill_value=0,
    ).reset_index()

    if return_percentages:
        score_columns = matrix.columns[2:]
        # Work on a float copy: writing fractions into integer columns through
        # .loc is deprecated in recent pandas releases.
        counts = matrix[score_columns].astype(float)
        # Total number of records in each row's group, aligned to the rows.
        group_totals = counts.sum(axis=1).groupby(matrix[group_key]).transform("sum")
        shares = counts.div(group_totals, axis=0)
        for column in score_columns:
            matrix[column] = shares[column]

    return matrix

# Function to calculate pilot stats
def calculate_pilot_stats(df, original_df):
    """Summarise score coverage, score statistics and state mix in one row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame used for the ``describe()`` statistics of ``level_1`` and
        ``level_2`` and for the normalised ``state`` counts.
    original_df : pandas.DataFrame
        Unfiltered frame. Its length is reported as "Total Records" and the
        share of its rows with a non-null ``level_1`` / ``level_2`` is
        reported as "% L1 Provided" / "% L2 Provided".

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns "Total Records",
        "% L1 Provided", "% L2 Provided", one "L1 ..." and "L2 ..." column per
        ``describe()`` statistic, and one "% <state>" column per state.
        Shares are fractions, not percentages.
    """
    total_records = len(original_df)

    def _share_provided(column):
        # Coverage is measured on the unfiltered frame: `df` may already have
        # had rows with missing scores removed, which would make both shares
        # identical. An empty frame yields NaN rather than a ZeroDivisionError.
        if total_records == 0:
            return float("nan")
        return original_df[column].notna().sum() / total_records

    coverage = pd.Series({
        "Total Records": total_records,
        "% L1 Provided": _share_provided("level_1"),
        "% L2 Provided": _share_provided("level_2"),
    })
    l1_summary = df["level_1"].describe().rename(lambda stat: f"L1 {stat.capitalize()}")
    l2_summary = df["level_2"].describe().rename(lambda stat: f"L2 {stat.capitalize()}")
    state_share = df["state"].value_counts(normalize=True).rename(lambda label: f"% {label}")

    pilot_stats = pd.concat([coverage, l1_summary, l2_summary, state_share])

    # Transpose so each statistic becomes a column of a single-row frame.
    return pilot_stats.to_frame().T

# Calculate pilot stats for the entire dataset
pilot_stats = calculate_pilot_stats(analysis_file_df, original_df)

# Distributions for the entire dataset.
# NOTE(review): the first grouping column here is the score itself, and
# create_score_distributions normalises within the first grouping column, so
# these shares are relative to each score value rather than to the whole
# dataset - confirm this is intended.
entire_dataset_l1 = create_score_distributions(analysis_file_df, "level_1", "l1_risk_level", ["level_1", "l1_risk_level"])
entire_dataset_l2 = create_score_distributions(analysis_file_df, "level_2", "l2_risk_level", ["level_2", "l2_risk_level"])

# Distributions per score_year for the dataset less CA
no_ca_df = analysis_file_df[analysis_file_df["state"] != "CA"]
entire_dataset_no_ca_l1 = create_score_distributions(no_ca_df, "level_1", "l1_risk_level", ["score_year", "level_1", "l1_risk_level"])
entire_dataset_no_ca_l2 = create_score_distributions(no_ca_df, "level_2", "l2_risk_level", ["score_year", "level_2", "l2_risk_level"])

# Distributions for each score_year
score_year_l1 = create_score_distributions(analysis_file_df, "level_1", "l1_risk_level", ["score_year", "level_1", "l1_risk_level"])
score_year_l2 = create_score_distributions(analysis_file_df, "level_2", "l2_risk_level", ["score_year", "level_2", "l2_risk_level"])

# The per-state L1 x L2 matrices are built where they are written to the
# workbook. A loop here only kept the last state's matrix and nothing read
# it, so it has been removed.

# L1 x L2 matrix (as shares) for the whole dataset
score_year_l1_l2 = create_score_distributions_l1_l2_intersection(analysis_file_df, "level_1", "level_2", ["score_year", "level_1", "level_2"])

print("Calculations complete...")

Calculations complete...


In [3]:
def _write_distribution_sheet(writer, sheet_name, distribution, count_matrix, percentage_matrix, index_label=None):
    """Stack a distribution table, a count matrix and a percentage matrix on one sheet.

    The distribution is written at the top of ``sheet_name`` (with its index),
    the count matrix starts ``distribution.shape[0] + 3`` rows down and the
    percentage matrix starts
    ``distribution.shape[0] + count_matrix.shape[0] + 6`` rows down; both
    matrices are written without their index.
    """
    distribution.to_excel(writer, sheet_name=sheet_name, index_label=index_label)

    count_startrow = distribution.shape[0] + 3
    count_matrix.to_excel(writer, sheet_name=sheet_name, startrow=count_startrow, index=False)

    percentage_startrow = distribution.shape[0] + count_matrix.shape[0] + 6
    percentage_matrix.to_excel(writer, sheet_name=sheet_name, startrow=percentage_startrow, index=False)


with pd.ExcelWriter("score_distributions.xlsx") as writer:
    # Matrixed L1 x L2 distributions (counts and shares) for the whole dataset
    score_year_l1_l2_count = create_score_distributions_l1_l2_intersection(analysis_file_df, "level_1", "level_2", ["score_year", "level_1", "level_2"], return_percentages=False)
    score_year_l1_l2_percentage = create_score_distributions_l1_l2_intersection(analysis_file_df, "level_1", "level_2", ["score_year", "level_1", "level_2"], return_percentages=True)

    # Score year tabs: distribution on top, then the count and percentage matrices
    _write_distribution_sheet(writer, "Score Year L1", score_year_l1, score_year_l1_l2_count, score_year_l1_l2_percentage)
    _write_distribution_sheet(writer, "Score Year L2", score_year_l2, score_year_l1_l2_count, score_year_l1_l2_percentage)

    # Write X-CA L1 and X-CA L2 sheets to the workbook
    entire_dataset_no_ca_l1.to_excel(writer, sheet_name="X-CA L1", index_label="Score Distributions")
    entire_dataset_no_ca_l2.to_excel(writer, sheet_name="X-CA L2", index_label="Score Distributions")

    # Distributions for each state. Missing states are skipped: comparing the
    # column with NaN matches no rows, so they would only yield empty subsets.
    for state in analysis_file_df["state"].dropna().unique():
        state_df = analysis_file_df[analysis_file_df["state"] == state]

        # Score year distributions for the state
        state_score_year_l1 = create_score_distributions(state_df, "level_1", "l1_risk_level", ["score_year", "level_1", "l1_risk_level"])
        state_score_year_l2 = create_score_distributions(state_df, "level_2", "l2_risk_level", ["score_year", "level_2", "l2_risk_level"])

        # Matrixed L1 x L2 distributions (counts and shares) for the state
        state_score_year_l1_l2 = create_score_distributions_l1_l2_intersection(state_df, "level_1", "level_2", ["score_year", "level_1", "level_2"], return_percentages=False)
        state_score_year_l1_l2_percentage = create_score_distributions_l1_l2_intersection(state_df, "level_1", "level_2", ["score_year", "level_1", "level_2"], return_percentages=True)

        _write_distribution_sheet(writer, f"State {state} L1", state_score_year_l1, state_score_year_l1_l2, state_score_year_l1_l2_percentage, index_label="Score Year Distributions")
        _write_distribution_sheet(writer, f"State {state} L2", state_score_year_l2, state_score_year_l1_l2, state_score_year_l1_l2_percentage, index_label="Score Year Distributions")

    # Save Pilot Stats to a sheet in the workbook
    pilot_stats.to_excel(writer, sheet_name="Pilot Stats", index=False)

print("File shell created...")

File shell created...


In [4]:
import openpyxl
import pandas as pd

# Load the L1 and L2 state baseline sheets from the preprocessing workbook.
res_l1_state_baselines, res_l2_state_baselines = (
    pd.read_excel(
        "State Preprocessing Distributions.xlsx",
        sheet_name=baseline_sheet,
        engine="openpyxl",
    )
    for baseline_sheet in ("Res. L1 State Baselines", "Res. L2 State Baselines")
)

# Function to check if a sheet exists in an Excel file
def sheet_exists(file_path, sheet_name):
    """Return True if the workbook at ``file_path`` has a sheet called ``sheet_name``.

    The workbook is opened in read-only mode and is always closed again
    before returning, so no file handle is left open on the workbook.
    """
    workbook = openpyxl.load_workbook(file_path, read_only=True)
    try:
        return sheet_name in workbook.sheetnames
    finally:
        # Read-only workbooks keep the file open until close() is called.
        workbook.close()

def _write_industry_baseline(worksheet, baselines):
    """Write the "Industry Baseline" heading and values into column E of a sheet."""
    worksheet.cell(row=1, column=5, value="Industry Baseline")

    # Blank out F2:O11 before writing.
    # NOTE(review): the count matrix starts three rows below the distribution
    # table, so for a short distribution this range can overlap it - confirm
    # the range is intended.
    for row in worksheet.iter_rows(min_row=2, min_col=6, max_row=11, max_col=15):
        for cell in row:
            cell.value = None

    # Baseline values go directly under the heading, starting in row 2.
    for row_number, value in enumerate(baselines, start=2):
        worksheet.cell(row=row_number, column=5, value=value)


# Load the Excel workbook
book = openpyxl.load_workbook("score_distributions.xlsx")

# The sheets do not change inside the loop, so look the names up once in the
# loaded workbook instead of re-opening the file for every state.
existing_sheets = set(book.sheetnames)

for state in analysis_file_df["state"].unique():
    state_l1_sheet = f"State {state} L1"
    state_l2_sheet = f"State {state} L2"

    if state_l1_sheet not in existing_sheets or state_l2_sheet not in existing_sheets:
        print(f"Worksheets for state '{state}' not found in score_distributions.xlsx")
        continue

    # Get the state's L1 and L2 baselines
    state_l1_baselines = res_l1_state_baselines.loc[res_l1_state_baselines["State"] == state, "L1 Distribution"].reset_index(drop=True)
    state_l2_baselines = res_l2_state_baselines.loc[res_l2_state_baselines["State"] == state, "L2 Distribution"].reset_index(drop=True)

    # Get the existing sheets and add the baselines to them
    ws_l1 = book[state_l1_sheet]
    ws_l2 = book[state_l2_sheet]
    _write_industry_baseline(ws_l1, state_l1_baselines)
    _write_industry_baseline(ws_l2, state_l2_baselines)

# Save the updated workbook
book.save("score_distributions.xlsx")
book.close()
print("All data appended to shell file successfully...")
print("the file is in the directory and is called score_distributions...")

All data appended to shell file successfully...
the file is in the directory and is called score_distributions...
