Given the attached Excel file: each sheet contains data about a different target disease.
Write and run Python code that performs the following analysis: for each target, calculate and display, per group (groups are defined by the 3 separate columns "source-shap", "source-KG" and "source-lit"), the average and the sum of cases where "Interesting Overall-pred" == 1.

So, there should be the average and the sum of "Interesting" cases for the rows where "source-shap"/"source-KG"/"source-lit" is 1 (3 groups), per disease/target/sheet.


* Warning/note - this is running on AI-evaluated candidates
* https://docs.google.com/spreadsheets/d/1BPWnCT4PCgAKh4IBYtZlE-KqgJF-3OsxTEa7Uu1fHhM/edit?usp=sharing
    * `Ablation Candidates - InterFeat` (Note: not run with InterFeat's LLM pipeline; rather, Gemini AI was run on the list)

In [1]:

import pandas as pd
import os

def _ensure_numeric(sheet_data, column):
    """Coerce ``sheet_data[column]`` to a numeric dtype in place if it is not one already.

    Values that cannot be parsed become NaN (``errors='coerce'``).

    Returns:
        None on success (or when no conversion was needed), otherwise the
        exception raised by ``pd.to_numeric``.
    """
    if pd.api.types.is_numeric_dtype(sheet_data[column]):
        return None
    try:
        sheet_data[column] = pd.to_numeric(sheet_data[column], errors='coerce')
    except (TypeError, ValueError) as e:
        return e
    print(f"  Info: Column '{column}' converted to numeric.")
    return None


def _summarize_sheet(sheet_data, sheet_name, interesting_col, group_cols):
    """Build the per-group summary table for one sheet.

    Returns:
        A DataFrame indexed by group column with columns 'Sum Cases' and 'Avg'
        (rounded to 2 decimals), or None when the sheet has to be skipped
        (missing columns, or the interesting column cannot be made numeric).
    """
    required_cols = group_cols + [interesting_col]
    missing_cols = [col for col in required_cols if col not in sheet_data.columns]
    if missing_cols:
        print(f"  Warning: Sheet '{sheet_name}' missing required columns: {missing_cols}. Skipping.")
        return None

    error = _ensure_numeric(sheet_data, interesting_col)
    if error is not None:
        print(f"  Warning: Could not convert column '{interesting_col}' to numeric in sheet '{sheet_name}'. Skipping. Error: {error}")
        return None

    rows = {}
    for group_col in group_cols:
        error = _ensure_numeric(sheet_data, group_col)
        if error is not None:
            print(f"  Warning: Could not convert column '{group_col}' to numeric. Skipping this group for sheet '{sheet_name}'. Error: {error}")
            # Same keys as the normal rows so the result table keeps exactly two columns.
            rows[group_col] = {'Sum Cases': 'Error', 'Avg': 'Error'}
            continue

        # NaN > 0 evaluates to False, so rows with a missing group flag are excluded.
        in_group = sheet_data[group_col] > 0
        # Missing interesting values are ignored by both the sum and the mean.
        interesting_values = sheet_data.loc[in_group, interesting_col].dropna()

        if interesting_values.empty:
            # Empty group (or no usable values): report zeros rather than NaN.
            interesting_sum, interesting_avg = 0, 0.0
        else:
            interesting_sum = interesting_values.sum()
            interesting_avg = interesting_values.mean()

        rows[group_col] = {'Sum Cases': interesting_sum, 'Avg': interesting_avg}

    return pd.DataFrame.from_dict(rows, orient='index').round(2)


def calculate_interesting_summary_per_group(excel_file="Ablation Candidates - merged.xlsx",
                                             interesting_col='Interesting Overall-pred',
                                             group_cols=None):
    """
    Summarise an 'interesting' column per source group for every sheet of an Excel file.

    For each sheet, and for each column listed in group_cols, the rows where the
    group column is greater than 0 are selected and the sum and mean of
    interesting_col over those rows are computed (NaN values are dropped first).
    The sum equals the number of rows with interesting_col == 1 only when that
    column holds 0/1 values.

    Parameters:
    - excel_file (str): Path to the Excel file.
    - interesting_col (str): The column to calculate sum and average for.
    - group_cols (iterable of str, str or None): Column names, each defining a group
      by its value being > 0. A single string is treated as one column name.
      Defaults to ['source-shap', 'source-KG', 'source-lit'].

    Returns:
    - dict: Keys are sheet names. Values are pandas DataFrames (index: group column,
            columns: 'Sum Cases', 'Avg') or, if processing a sheet raised, the string
            "Error: <message>". Sheets that are skipped (missing columns, or an
            interesting column that cannot be made numeric) are omitted.
            Returns None if the file is not found or cannot be opened.
    """
    # Resolve the default here instead of using a mutable list as the default argument,
    # and accept any iterable (tuple, pandas Index, ...) of column names.
    if group_cols is None:
        group_cols = ['source-shap', 'source-KG', 'source-lit']
    elif isinstance(group_cols, str):
        group_cols = [group_cols]
    else:
        group_cols = list(group_cols)

    if not os.path.exists(excel_file):
        print(f"Error: File not found at {excel_file}")
        return None

    try:
        excel_data = pd.ExcelFile(excel_file)
    except Exception as e:
        print(f"Error loading Excel file: {e}")
        return None

    all_sheets_summary = {}

    # The context manager closes the workbook handle even if a sheet fails.
    with excel_data:
        # Each sheet is one target disease.
        for sheet_name in excel_data.sheet_names:
            print(f"\nProcessing sheet: {sheet_name}...")
            try:
                sheet_summary = _summarize_sheet(
                    excel_data.parse(sheet_name), sheet_name, interesting_col, group_cols
                )
            except Exception as e:
                # Best effort: one broken sheet must not abort the others.
                print(f"  Error processing sheet '{sheet_name}': {e}")
                all_sheets_summary[sheet_name] = f"Error: {e}"
                continue

            if sheet_summary is not None:
                all_sheets_summary[sheet_name] = sheet_summary

    return all_sheets_summary

In [2]:

# --- Main Execution ---
# Workbook to analyse; it must be present in the working directory (or adjust the path).
file_path = "Ablation Candidates - merged.xlsx"

# One entry per sheet: a summary table, or an error message for a sheet that failed.
summaries = calculate_interesting_summary_per_group(excel_file=file_path)

if not summaries:
    # Covers both a None result and an empty result dict.
    print("Analysis could not be completed.")
else:
    print("\n--- Summary Results ---")
    for sheet, result_df in summaries.items():
        print(f"\nTarget Disease: {sheet}")
        # Tables and error messages are printed the same way.
        print(result_df)
    print("\n--- End of Summary ---")


Processing sheet: Gallstones...

Processing sheet: Oesophagus Cancer...

Processing sheet: Gout...

--- Summary Results ---

Target Disease: Gallstones
             Sum Cases   Avg
source-shap          1  0.07
source-KG            2  0.13
source-lit           3  0.20

Target Disease: Oesophagus Cancer
             Sum Cases   Avg
source-shap          1  0.07
source-KG            3  0.20
source-lit           3  0.20

Target Disease: Gout
             Sum Cases   Avg
source-shap          0  0.00
source-KG            0  0.00
source-lit           2  0.13

--- End of Summary ---


In [3]:
# excel_file="Ablation Candidates - merged.xlsx"
# interesting_col='Interesting Overall-pred'; 
# group_cols=['source-shap', 'source-KG', 'source-lit']