In [38]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Locations of the two result files being compared.
file_xlsx = 'NRes/Model_Results_FL_3.xlsx'  # Update the path accordingly
file_csv = 'NRes/Model_Results_Local_3.csv'  # Update the path accordingly

# Read the Excel results and the CSV results into DataFrames.
data_xlsx = pd.read_excel(file_xlsx)
data_csv = pd.read_csv(file_csv)

# The CSV metrics are handled as strings with a trailing '%':
# strip the sign, parse as float, and scale to a 0-1 fraction.
for metric in ('Macro Average Precision', 'Macro Average Recall'):
    data_csv[metric] = data_csv[metric].str.rstrip('%').astype('float') / 100

# Give the Excel columns the same names the CSV uses so both frames line up.
data_xlsx = data_xlsx.rename(
    columns={
        'Average Precision': 'Macro Average Precision',
        'Average Recall': 'Macro Average Recall',
    }
)

# Keep only the columns needed for the comparison.
comparison_columns = ['Agency', 'Macro Average Precision', 'Macro Average Recall']
data_xlsx_relevant = data_xlsx[comparison_columns]
data_csv_relevant = data_csv[comparison_columns]

# One row per agency: mean precision and mean recall.
aggregated_xlsx = data_xlsx_relevant.groupby('Agency').mean().reset_index()
aggregated_csv = data_csv_relevant.groupby('Agency').mean().reset_index()


def _f1_from(precision, recall):
    """Return 2 * (precision * recall) / (precision + recall), element-wise."""
    return 2 * (precision * recall) / (precision + recall)


# F1 is derived from the per-agency mean precision and mean recall.
aggregated_xlsx['Average F1_Score'] = _f1_from(
    aggregated_xlsx['Macro Average Precision'],
    aggregated_xlsx['Macro Average Recall'],
)
aggregated_csv['Average F1_Score'] = _f1_from(
    aggregated_csv['Macro Average Precision'],
    aggregated_csv['Macro Average Recall'],
)



In [40]:
def create_and_save_enhanced_f1_graph(pdf_filename, aggregated_data_xlsx, aggregated_data_csv):
    """Save a grouped bar chart of per-agency F1 scores to a one-page PDF.

    For every agency, two bars are drawn side by side: 'Federated' (from
    ``aggregated_data_xlsx``) to the left of the tick and 'Isolated' (from
    ``aggregated_data_csv``) to the right.

    Parameters
    ----------
    pdf_filename
        Destination handed unchanged to ``PdfPages``.
    aggregated_data_xlsx : pandas.DataFrame
        Data for the 'Federated' bars. Must contain the columns 'Agency' and
        'Average F1_Score', with one row per agency.
    aggregated_data_csv : pandas.DataFrame
        Data for the 'Isolated' bars, with the same columns and the same
        one-row-per-agency requirement.

    Notes
    -----
    * The x-axis shows the union of agencies from both frames, ordered by
      ``int(agency)``, so every agency label must be convertible with ``int``.
    * An agency that appears in only one frame gets a NaN height (no bar) for
      the other series instead of shifting the remaining bars.
    * Duplicate agencies within a frame make the alignment step fail; pass
      aggregated data.
    * ``plt.rcParams`` is updated globally (font size 24, normal weight) and
      the change remains in effect after the function returns.
    """
    plt.rcParams.update({'font.size': 24, 'font.weight': 'normal'})  # Update default rc settings

    agencies = sorted(
        set(aggregated_data_xlsx['Agency']) | set(aggregated_data_csv['Agency']),
        key=int,
    )

    # Align both series to the tick order. Plotting the raw columns would pair
    # heights with ticks by row position, which mislabels bars whenever a frame
    # is missing an agency or is ordered differently from `agencies`.
    federated_f1 = aggregated_data_xlsx.set_index('Agency')['Average F1_Score'].reindex(agencies)
    isolated_f1 = aggregated_data_csv.set_index('Agency')['Average F1_Score'].reindex(agencies)

    with PdfPages(pdf_filename) as pdf:
        fig, ax = plt.subplots(figsize=(10, 7))
        width = 0.35
        x_indexes = range(len(agencies))

        # align='edge' with a negative width puts the bar to the left of the
        # tick; the positive width puts its partner to the right.
        ax.bar(x_indexes, federated_f1.to_numpy(), width=-width, align='edge',
               label='Federated', color='skyblue', edgecolor='black')
        ax.bar(x_indexes, isolated_f1.to_numpy(), width=width, align='edge',
               label='Isolated', color='lightgreen', edgecolor='black')

        # Set x-axis labels and titles
        ax.set_xticks(x_indexes)
        ax.set_xticklabels(agencies)
        ax.set_xlabel('Agency')
        ax.set_ylabel('Average F1 Score')

        # Raise the upper y-limit by 12% to leave room for the legend above the bars
        current_ylim = ax.get_ylim()
        ax.set_ylim(current_ylim[0], current_ylim[1] * 1.12)

        # Add a legend
        ax.legend(loc='upper center', ncol=2)

        # Ensure layout is tight so everything fits without overlapping
        fig.tight_layout()

        # Save this figure to the PDF, then release it explicitly
        pdf.savefig(fig)
        plt.close(fig)

# Write the Federated-vs-Isolated F1 comparison chart; pass your own
# aggregated DataFrames here if they are named differently.
create_and_save_enhanced_f1_graph(
    pdf_filename='f1_score_comparison.pdf',
    aggregated_data_xlsx=aggregated_xlsx,
    aggregated_data_csv=aggregated_csv,
)