<a href="https://colab.research.google.com/github/Marcello-Mar/Susceptibility-Index-Script/blob/main/Marcello_Sensitivity_Index_Script_RC_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ----------------- IMPORTS -----------------
import pandas as pd
import numpy as np
from google.colab import files
from itertools import combinations
from scipy.stats import beta
from openpyxl import Workbook, load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional, Union

# ----------------- DATA CLASSES -----------------
@dataclass
class AntibioticResult:
    """Effectiveness statistics for one antibiotic within one isolate group.

    All percentages and confidence-interval bounds are stored on a 0-100
    scale.  "Local" figures use only the isolates actually tested (S or R)
    for this antibiotic; "global" figures use every isolate in the group.
    """
    # Antibiotic column name as it appears in the data sheet.
    name: str
    # S / (S + R) * 100; 0 when no isolate was tested.
    s_percentage: float
    # Confidence-interval bounds (percent) for the local S proportion.
    lower_ci: float
    upper_ci: float
    # Number of isolates marked 'S' / 'R' for this antibiotic.
    s_count: int
    r_count: int
    # (S + R) / total isolates * 100.
    tested_percentage: Optional[float] = None
    # S / total isolates * 100 (untested isolates count as failures).
    global_effectiveness: Optional[float] = None
    # Confidence-interval bounds (percent) for the global S proportion.
    lower_ci_global: Optional[float] = None
    upper_ci_global: Optional[float] = None

@dataclass
class CombinationResult:
    """Effectiveness statistics for a pair of antibiotics within one isolate group.

    All percentages and confidence-interval bounds are stored on a 0-100
    scale.  An isolate counts as a success when it is 'S' to at least one
    antibiotic of the pair.
    """
    # The two antibiotic column names forming the pair.
    combination: Tuple[str, str]
    # Successes / isolates tested for at least one of the pair * 100.
    effectiveness: float
    # Successes / all isolates in the group * 100.
    global_effectiveness: float
    # Confidence-interval bounds (percent) for the local effectiveness.
    lower_ci: float
    upper_ci: float
    # Number of isolates 'S' to at least one antibiotic of the pair.
    s_count: int
    # Number of isolates 'R' to BOTH antibiotics (note: the local confidence
    # interval is computed from tested - s_count, not from this field).
    r_count: int
    # Isolates tested for at least one of the pair / all isolates * 100.
    tested_percentage: float
    # Confidence-interval bounds (percent) for the global effectiveness.
    lower_ci_global: Optional[float] = None
    upper_ci_global: Optional[float] = None

# ----------------- CORE ANALYZER CLASS -----------------
class AntibioticAnalyzer:
    """Load a susceptibility sheet and compute antibiotic effectiveness statistics.

    Layout assumed by the code: the first column holds the microorganism
    name, every following column is one antibiotic, and a ``Gram_stain``
    column is appended while loading, so ``columns[1:-1]`` denotes the
    antibiotic columns.  Antibiotic cells are normalised to ``S``, ``R`` or
    ``N`` (``I`` is counted as ``S``; blank cells become ``N`` = not tested).
    """

    VALID_VALUES = {'S', 'R', 'N'}

    def __init__(self, data_file_path: str, gram_file_path: str, exclusion_file_path: Optional[str] = None, confidence_level: float = 0.95):
        """Load the Gram stain map and the (optionally filtered) isolate data.

        Args:
            data_file_path: Excel file with the susceptibility data.
            gram_file_path: Excel file mapping microorganisms to Gram stain;
                it is rewritten whenever the user classifies a new organism.
            exclusion_file_path: Optional Excel file whose first column lists
                microorganism names to drop from the analysis.
            confidence_level: Confidence level as a fraction in ``(0, 1]``.

        Raises:
            ValueError: If ``confidence_level`` is outside ``(0, 1]`` or the
                data file cannot be loaded or fails validation.
        """
        if not 0 < confidence_level <= 1:
            raise ValueError("Confidence level must be a fraction in the interval (0, 1].")
        self.data_file_path = data_file_path
        self.gram_file_path = gram_file_path
        self.exclusion_file_path = exclusion_file_path
        self.CONFIDENCE_LEVEL = confidence_level
        self.gram_stain_map = self._load_gram_stain_map()
        self.excluded_count = 0
        self.data = self._load_and_validate_data()

    def _load_gram_stain_map(self) -> Dict[str, str]:
        """Load the Gram stain map, or return an empty one if the file is missing.

        Keys are lower-cased organism names; values are the stain with any
        ``gram-`` prefix removed (e.g. ``'positive'``).  Rows with a blank
        name or stain are skipped so they cannot turn into a ``'nan'`` key
        that would substring-match unrelated organisms.
        """
        try:
            df = pd.read_excel(self.gram_file_path)
        except FileNotFoundError:
            print(f"Gram stain map file not found. A new one will be created at '{self.gram_file_path}' as you classify new bacteria.")
            return {}

        mapping: Dict[str, str] = {}
        for _, row in df.iterrows():
            organism, stain = row['Microorganism'], row['Gram_stain']
            if pd.isna(organism) or pd.isna(stain):
                continue
            key = str(organism).lower().strip()
            if key:  # an empty key would be a substring of every name
                mapping[key] = str(stain).lower().replace('gram-', '').strip()
        return mapping

    def _save_gram_stain_map(self):
        """Write the current Gram stain map back to ``gram_file_path``."""
        df = pd.DataFrame(list(self.gram_stain_map.items()), columns=['Microorganism', 'Gram_stain'])
        df['Gram_stain'] = 'Gram-' + df['Gram_stain']
        df.to_excel(self.gram_file_path, index=False)

    def _get_gram_stain(self, organism_name: str) -> str:
        """Return the Gram stain for ``organism_name``, asking the user if unknown.

        A map key matches when it is a substring of the lower-cased name; the
        first matching key (in map order) wins.  Unknown organisms trigger an
        interactive prompt, and the answer is stored and saved to disk.
        """
        organism_lower = str(organism_name).lower()

        for key, value in self.gram_stain_map.items():
            if key in organism_lower:
                return value

        print(f"\nMicroorganism '{organism_name}' not found in Gram stain map.")
        while True:
            response = input("Is it Gram-positive or Gram-negative? (type 'positive' or 'negative'): ").lower().strip()
            if response in ['positive', 'negative']:
                self.gram_stain_map[organism_lower] = response
                self._save_gram_stain_map()
                print(f"'{organism_name}' saved as Gram-{response}.")
                return response
            print("Invalid input. Please type 'positive' or 'negative'.")

    @staticmethod
    def _exclusion_mask(names: pd.Series, exclusion_terms) -> pd.Series:
        """Return a boolean mask of ``names`` containing any exclusion term.

        Matching is a case-insensitive *literal* substring test, so names
        with characters such as ``(`` or ``+`` are matched as written rather
        than being interpreted as a regular expression.  Blank and missing
        terms are ignored (an empty term would otherwise match every row).
        """
        mask = pd.Series(False, index=names.index, dtype=bool)
        haystack = names.astype(str)
        for term in exclusion_terms:
            if pd.isna(term):
                continue
            needle = str(term).strip()
            if not needle:
                continue
            mask |= haystack.str.contains(needle, case=False, regex=False)
        return mask

    def _apply_exclusions(self, data: pd.DataFrame) -> pd.DataFrame:
        """Drop isolates named in the exclusion file; best-effort.

        The first column of the exclusion sheet is used as the term list.
        Any failure is reported as a warning and the data is returned
        unfiltered with ``excluded_count`` reset to 0.
        """
        try:
            exclusion_df = pd.read_excel(self.exclusion_file_path)
            mask = self._exclusion_mask(data['Microorganism'], exclusion_df.iloc[:, 0].tolist())
            kept = data[~mask]
        except Exception as e:
            print(f"Warning: Could not load or apply exclusion list. Error: {e}")
            self.excluded_count = 0
            return data

        self.excluded_count = len(data) - len(kept)
        print(f"\nSuccessfully excluded {self.excluded_count} isolates based on the provided list.")
        return kept

    def _load_and_validate_data(self) -> pd.DataFrame:
        """Load, filter, classify, normalise and validate the main data file.

        Returns:
            DataFrame with ``Microorganism`` first, the antibiotic columns
            (values ``S``/``R``/``N``) in between and ``Gram_stain`` last.

        Raises:
            ValueError: For any loading or validation problem; the original
                exception is chained as the cause.
        """
        try:
            data = pd.read_excel(self.data_file_path)

            if 'Microorganism' not in data.columns:
                data.rename(columns={data.columns[0]: 'Microorganism'}, inplace=True)

            missing_names = data['Microorganism'].isna()
            if missing_names.any():
                # +2 mirrors _validate_data: header row plus 1-based numbering.
                rows = ", ".join(str(i + 2) for i in data.index[missing_names])
                raise ValueError(f"Missing microorganism name in Row(s): {rows}")

            if self.exclusion_file_path:
                data = self._apply_exclusions(data)

            # Work on an independent copy: the filtered frame may be a slice
            # of the original and must not be mutated through a view.
            data = data.copy()
            data['Gram_stain'] = data['Microorganism'].apply(self._get_gram_stain)

            for col in data.columns[1:-1]:
                values = data[col]
                if values.dtype == 'object':
                    values = values.str.strip().str.upper()
                # Cast to object first so an all-empty (float) column can
                # hold the 'N' placeholder without a dtype conflict.
                data[col] = values.astype(object).replace({'I': 'S'}).fillna('N')

            self._validate_data(data)
            return data

        except Exception as e:
            raise ValueError(f"Error loading file: {e}") from e

    def _validate_data(self, data: pd.DataFrame):
        """Raise ``ValueError`` if the data is empty or holds values outside ``VALID_VALUES``.

        Reported row numbers are the frame index + 2 (presumably the header
        row plus 1-based spreadsheet numbering).
        """
        if data.empty:
            raise ValueError("No valid data remaining after filtering. Please check your exclusion list.")

        antibiotic_cols = data.columns[1:-1]
        invalid_cells = [
            (i + 2, col) for i, row in data.iterrows()
            for col, value in row[antibiotic_cols].items() if value not in self.VALID_VALUES
        ]

        if invalid_cells:
            error_msg = "\n".join(
                f"Invalid value in Row {row}, Column {col}"
                for row, col in invalid_cells
            )
            raise ValueError(f"Invalid values detected:\n{error_msg}")

    def _calculate_confidence_interval(self, success_count: int, failure_count: int) -> Tuple[float, float]:
        """Return the exact (Clopper-Pearson) interval for a binomial proportion.

        Args:
            success_count: Number of successes.
            failure_count: Number of failures.

        Returns:
            ``(lower, upper)`` as fractions in ``[0, 1]``; ``(0.0, 0.0)`` when
            there are no observations.
        """
        if success_count + failure_count == 0:
            return 0.0, 0.0
        alpha = 1 - self.CONFIDENCE_LEVEL

        # The beta quantile is undefined (NaN) for a zero shape parameter;
        # the interval's limits at those boundaries are exactly 0 and 1.
        if success_count == 0:
            lower = 0.0
        else:
            lower = float(beta.ppf(alpha / 2, success_count, failure_count + 1))
        if failure_count == 0:
            upper = 1.0
        else:
            upper = float(beta.ppf(1 - alpha / 2, success_count + 1, failure_count))

        return lower, upper

    def _calculate_single_antibiotics(self, df: pd.DataFrame, threshold: float) -> "List[AntibioticResult]":
        """Compute per-antibiotic effectiveness for ``df``.

        Only antibiotics passing the testing ``threshold`` (percent) are
        included.  Results are sorted by ``s_percentage``, highest first.
        """
        if df.empty:
            return []

        filtered_data = self._apply_threshold_filter(df, threshold)
        total_samples = len(filtered_data)
        results = []

        for col in filtered_data.columns[1:-1]:
            s_count = int((filtered_data[col] == 'S').sum())
            r_count = int((filtered_data[col] == 'R').sum())
            total_tested = s_count + r_count

            s_percentage = (s_count / total_tested) * 100 if total_tested > 0 else 0
            lower_ci, upper_ci = self._calculate_confidence_interval(s_count, r_count)

            # Global view: every isolate not marked 'S' counts as a failure.
            lower_ci_global, upper_ci_global = self._calculate_confidence_interval(
                s_count, total_samples - s_count
            )

            results.append(AntibioticResult(
                name=col,
                s_percentage=s_percentage,
                lower_ci=lower_ci * 100,
                upper_ci=upper_ci * 100,
                s_count=s_count,
                r_count=r_count,
                tested_percentage=(total_tested / total_samples) * 100,
                global_effectiveness=(s_count / total_samples) * 100,
                lower_ci_global=lower_ci_global * 100,
                upper_ci_global=upper_ci_global * 100
            ))

        return sorted(results, key=lambda x: x.s_percentage, reverse=True)

    def _calculate_combinations(self, df: pd.DataFrame, threshold: float) -> "List[CombinationResult]":
        """Compute effectiveness of every antibiotic pair for ``df``.

        An isolate is a success when it is 'S' to at least one antibiotic of
        the pair, and "tested" when at least one of the two is not 'N'.
        Results are sorted by ``effectiveness``, highest first.
        """
        if df.empty:
            return []

        filtered_data = self._apply_threshold_filter(df, threshold)
        total_samples = len(filtered_data)
        results = []

        for comb in combinations(filtered_data.columns[1:-1], 2):
            col1, col2 = comb
            first, second = filtered_data[col1], filtered_data[col2]

            tested_count = int(((first != 'N') | (second != 'N')).sum())
            s_count = int(((first == 'S') | (second == 'S')).sum())

            effectiveness = (s_count / tested_count * 100) if tested_count > 0 else 0
            lower_ci, upper_ci = self._calculate_confidence_interval(s_count, tested_count - s_count)
            lower_ci_global, upper_ci_global = self._calculate_confidence_interval(
                s_count, total_samples - s_count
            )

            results.append(CombinationResult(
                combination=comb,
                effectiveness=effectiveness,
                global_effectiveness=(s_count / total_samples) * 100,
                lower_ci=lower_ci * 100,
                upper_ci=upper_ci * 100,
                s_count=s_count,
                # Reported r_count is "resistant to both", which can be
                # smaller than tested_count - s_count used for the interval.
                r_count=int(((first == 'R') & (second == 'R')).sum()),
                tested_percentage=(tested_count / total_samples) * 100,
                lower_ci_global=lower_ci_global * 100,
                upper_ci_global=upper_ci_global * 100
            ))

        return sorted(results, key=lambda x: x.effectiveness, reverse=True)

    def _apply_threshold_filter(self, df: pd.DataFrame, threshold: float) -> pd.DataFrame:
        """Keep only antibiotics tested (S or R) in at least ``threshold`` percent of rows."""
        if df.empty:
            return pd.DataFrame()

        threshold /= 100
        valid_columns = [
            col for col in df.columns[1:-1]
            if (df[col].isin(['S', 'R']).sum() / len(df)) >= threshold
        ]
        return df[['Microorganism'] + valid_columns + ['Gram_stain']]

    def analyze(self, threshold: float) -> "Dict[str, Dict[str, Union[List[AntibioticResult], List[CombinationResult]]]]":
        """Run the single and combination analyses for all, Gram+ and Gram- isolates.

        Returns:
            Mapping of group label to ``{'Singles': [...], 'Combinations': [...]}``.
        """
        stain = self.data['Gram_stain']
        groups = {
            'All Isolates': self.data,
            'Gram-Positive Isolates': self.data[stain == 'positive'],
            'Gram-Negative Isolates': self.data[stain == 'negative'],
        }
        return {
            label: {
                'Singles': self._calculate_single_antibiotics(subset, threshold),
                'Combinations': self._calculate_combinations(subset, threshold),
            }
            for label, subset in groups.items()
        }

    def _top_resistance(self, gram_label: str, limit: int = 10) -> pd.DataFrame:
        """Return the ``limit`` antibiotics with the highest resistance rate in one Gram group.

        Antibiotics never tested in the group are omitted.  The frame always
        has the columns ``Antibiotic`` and ``Resistance %``, even when empty.
        """
        subset = self.data[self.data['Gram_stain'] == gram_label]
        rows = []
        for col in self.data.columns[1:-1]:
            tested_count = (subset[col].isin(['S', 'R'])).sum()
            if tested_count > 0:
                r_count = (subset[col] == 'R').sum()
                rows.append({'Antibiotic': col, 'Resistance %': round((r_count / tested_count) * 100, 2)})

        # Explicit columns keep sort_values from raising KeyError when the
        # group is empty or nothing was tested.
        frame = pd.DataFrame(rows, columns=['Antibiotic', 'Resistance %'])
        return frame.sort_values(by='Resistance %', ascending=False).head(limit)

    def generate_summary_data(self) -> Dict[str, pd.DataFrame]:
        """Build the tables shown on the report's summary sheet.

        Returns:
            Mapping with the keys ``'Exclusion Count'``,
            ``'Gram Stain Distribution'``, ``'Top 10 Pathogens'``,
            ``'Top 10 Resistant Gram-Positives'`` and
            ``'Top 10 Resistant Gram-Negatives'``.
        """
        summary = {}
        total_count = len(self.data)

        # 1. Exclusion Count
        summary['Exclusion Count'] = pd.DataFrame(
            {'Number of Excluded Isolates': [self.excluded_count]}
        ).T

        # 2. Gram Stain Distribution
        gram_counts = self.data['Gram_stain'].value_counts()
        summary['Gram Stain Distribution'] = pd.DataFrame(
            {
                'Count': gram_counts,
                'Percentage': (gram_counts / total_count * 100).round(2)
            }
        ).sort_index().T

        # 3. Top 10 Pathogens
        top_pathogens = self.data['Microorganism'].value_counts().head(10)
        summary['Top 10 Pathogens'] = pd.DataFrame(
            {
                'Count': top_pathogens,
                'Percentage': (top_pathogens / total_count * 100).round(2)
            }
        )

        # 4. Top 10 Most Resistant Antibiotics (Gram+ and Gram-)
        summary['Top 10 Resistant Gram-Positives'] = self._top_resistance('positive')
        summary['Top 10 Resistant Gram-Negatives'] = self._top_resistance('negative')

        return summary

# ----------------- EXPORTER CLASS -----------------
class ResultsExporter:
    """Write analysis results and the summary tables to an Excel workbook."""

    @staticmethod
    def _write_section(ws, title: str, frame: pd.DataFrame, *, index: bool, header: bool) -> None:
        """Append a title row followed by the rows of ``frame`` to worksheet ``ws``."""
        ws.append([title])
        for row in dataframe_to_rows(frame, index=index, header=header):
            ws.append(row)

    @staticmethod
    def to_excel(all_results: Dict, summary_data: Dict, filename: str = "antibiotic_results.xlsx") -> None:
        """Export all segmented results and a summary to an Excel file with multiple sheets.

        Args:
            all_results: Mapping of group label to a dict with ``'Singles'``
                and ``'Combinations'`` result lists.
            summary_data: Mapping holding the five summary tables, keyed
                ``'Exclusion Count'``, ``'Gram Stain Distribution'``,
                ``'Top 10 Pathogens'``, ``'Top 10 Resistant Gram-Positives'``
                and ``'Top 10 Resistant Gram-Negatives'``.
            filename: Path of the workbook to write; the file is then handed
                to the Colab download helper.
        """
        wb = Workbook()
        ws_summary = wb.create_sheet('Summary', 0)

        # (section title, summary_data key, write index?, write header?)
        sections = [
            ('Excluded Isolates', 'Exclusion Count', True, False),
            ('Gram Stain Distribution', 'Gram Stain Distribution', True, True),
            ('Top 10 Pathogens', 'Top 10 Pathogens', True, True),
            ('Top 10 Most Resistant Antibiotics (Gram-Positives)', 'Top 10 Resistant Gram-Positives', False, True),
            ('Top 10 Most Resistant Antibiotics (Gram-Negatives)', 'Top 10 Resistant Gram-Negatives', False, True),
        ]
        for position, (title, key, with_index, with_header) in enumerate(sections):
            if position:
                ws_summary.append([])  # spacer between sections
            ResultsExporter._write_section(
                ws_summary, title, summary_data[key], index=with_index, header=with_header
            )

        group_names = {
            'All Isolates': 'All',
            'Gram-Positive Isolates': 'Gram+',
            'Gram-Negative Isolates': 'Gram-'
        }

        for analysis_type, data in all_results.items():
            # Unknown labels fall back to a truncated form of the label so the
            # sheet title stays within Excel's 31-character limit.
            short_name = group_names.get(analysis_type, str(analysis_type)[:20])

            ws_single = wb.create_sheet(f"{short_name} - Singles")
            single_df = pd.DataFrame([vars(r) for r in data['Singles']])
            for r in dataframe_to_rows(single_df, index=False, header=True):
                ws_single.append(r)

            ws_comb = wb.create_sheet(f"{short_name} - Combs")
            comb_df = pd.DataFrame([vars(r) for r in data['Combinations']])
            # A group without any combination yields a frame with no columns;
            # only reshape the pair column when it actually exists.
            if 'combination' in comb_df.columns:
                comb_df['Combination'] = comb_df['combination'].apply(lambda x: f"{x[0]} + {x[1]}")
                comb_df.drop('combination', axis=1, inplace=True)
            for r in dataframe_to_rows(comb_df, index=False, header=True):
                ws_comb.append(r)

        # Remove the default sheet that Workbook() creates.
        del wb['Sheet']

        wb.save(filename)
        files.download(filename)
        print(f"\nResults successfully exported to '{filename}' with multiple sheets.")

# ----------------- MAIN FUNCTION -----------------
def _upload_excel_file() -> str:
    """Open the Colab upload dialog and return the name of the uploaded ``.xlsx`` file.

    When several files are uploaded at once, the first one is used.

    Raises:
        ValueError: If nothing was uploaded or the file is not ``.xlsx``.
    """
    uploaded = files.upload()
    if not uploaded:
        # A cancelled dialog returns an empty mapping.
        raise ValueError("No file was uploaded.")
    file_path = next(iter(uploaded))
    if not file_path.lower().endswith('.xlsx'):
        raise ValueError("Please upload valid Excel files (.xlsx)")
    return file_path


def main():
    """Run the interactive Colab workflow: upload files, analyse, export.

    Input problems (bad file type, non-numeric or out-of-range numbers,
    data validation failures) are reported as ``Input Error`` messages; any
    other exception is reported as an unexpected error.  Nothing is raised
    to the caller.
    """
    try:
        print("Antibiotic Resistance Analysis Tool v4.0")
        print("----------------------------------------")

        print("\nPlease upload your main antibiotic susceptibility data file (e.g., my_data.xlsx).")
        data_file_path = _upload_excel_file()

        print("\nNow, please upload your Gram stain map file (e.g., gram_stain_map.xlsx).")
        print("If you don't have one, just upload an empty Excel file with the headers 'Microorganism' and 'Gram_stain'.")
        gram_file_path = _upload_excel_file()

        exclusion_file_path = None
        exclude_choice = input("\nDo you want to exclude any microorganisms from the analysis? (yes/no): ").lower().strip()
        if exclude_choice in ('yes', 'y'):
            print("Please upload your exclusion list file (e.g., commensal_list.xlsx).")
            exclusion_file_path = _upload_excel_file()

        threshold = float(input("\nEnter minimum testing threshold (0-100): "))
        # The comparison is also False for NaN, so that is rejected as well.
        if not 0 <= threshold <= 100:
            raise ValueError("Threshold must be between 0 and 100.")

        confidence_input = float(input("Enter confidence level (e.g., 95 for 95%): "))
        if not 0 < confidence_input <= 100:
            raise ValueError("Confidence level must be between 0 and 100.")
        confidence_level = confidence_input / 100.0

        print("\nAnalyzing data...")
        analyzer = AntibioticAnalyzer(data_file_path, gram_file_path, exclusion_file_path, confidence_level=confidence_level)
        all_results = analyzer.analyze(threshold)
        summary_data = analyzer.generate_summary_data()

        print("Exporting results...")
        ResultsExporter.to_excel(all_results, summary_data)

    except ValueError as ve:
        print(f"\nInput Error: {str(ve)}")
    except Exception as e:
        print(f"\nAn unexpected error occurred: {str(e)}")
        if e.args:
            print("Details:", *e.args)

# Start the interactive workflow only when this code is run directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Antibiotic Resistance Analysis Tool v4.0
----------------------------------------

Please upload your main antibiotic susceptibility data file (e.g., my_data.xlsx).


Saving Febrile_Neutropenia_2015_2023.xlsx to Febrile_Neutropenia_2015_2023.xlsx

Now, please upload your Gram stain map file (e.g., gram_stain_map.xlsx).
If you don't have one, just upload an empty Excel file with the headers 'Microorganism' and 'Gram_stain'.


Saving gram_stain_map.xlsx to gram_stain_map.xlsx

Do you want to exclude any microorganisms from the analysis? (yes/no): yes
Please upload your exclusion list file (e.g., commensal_list.xlsx).


Saving commensal_list.xlsx to commensal_list.xlsx

Enter minimum testing threshold (0-100): 30
Enter confidence level (e.g., 95 for 95%): 95

Analyzing data...

Successfully excluded 5 isolates based on the provided list.

Microorganism 'bacilli gram negativi' not found in Gram stain map.
Is it Gram-positive or Gram-negative? (type 'positive' or 'negative'): negative
'bacilli gram negativi' saved as Gram-negative.
Exporting results...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Results successfully exported to 'antibiotic_results.xlsx' with multiple sheets.
