# Mutagenicity and CARC complete

In [1]:
%reset -f # variables reset

## Data and libraries load

Load xlsx files from the extraction pipeline.

In [2]:
import pandas as pd
from difflib import SequenceMatcher
import re
from pathlib import Path
import numpy as np

# Paths of the two Excel tables to compare; both tables should contain the
# same columns. Uncomment the pair that matches the study under test.

# Testing for Muta
#file_truth_path = '/content/Muta_Truth.xlsx'
#file_compared_path = '/content/Muta_Human.xlsx'
#file_compared_path = '/content/Muta_LLM.xlsx'

# Testing for CARC
file_truth_path, file_compared_path = (
    '/content/CARC_Truth.xlsx',
    '/content/CARC_LLM.xlsx',
)

# Testing for LD
#file_truth_path = '/content/Michael_Truth.xlsx'
#file_compared_path = '/content/Michael_LLM.xlsx'

# File names without directory and extension
filename_truth, filename_compared = (
    Path(path).stem for path in (file_truth_path, file_compared_path)
)

# Read both workbooks into DataFrames
table_truth, table_compared = (
    pd.read_excel(path) for path in (file_truth_path, file_compared_path)
)


## Applying glossaries

Glossaries define which terms are equivalent, so that differently worded values with the same meaning are validated as a match.

In [3]:
# Glossary / picklist: per column, a mapping from an accepted variant to the
# canonical term it should be replaced with before the comparison.
# Study-type entries translate guideline identifiers to the study-type wording;
# data-owner entries collapse several spellings of the same owner.

# Glossary for Muta and CARC
glossary = {
    "Study Type": {
        # Muta glossary
        "Bacterial Reverse Mutation Test (Ames Test)": "Ames Test",

        # CARC glossary
        "OECD 453": "Combined chronic toxicity/ carcinogenicity",
        "OECD 451": "Carcinogenicity",
        "OECD 452": "Chronic toxicity",
        "US-EPA 83-5": "Combined chronic toxicity/ carcinogenicity"
    },
    "Data Owner": {
        "Arysta": "Arysta Life Sciences",
        "Helm": "HAG",
        "Helm AG": "HAG",
        "Albaugh":"Albaugh Europe Sàrl"
    },
}


# Alternative glossary for exercising the "column not found" report:
# glossary = {
#     "Study Type": {
#         "Bacterial Reverse Mutation Test (Ames Test)": "Ames Test"
#     },
#     "Result": {
#         "Positive (+)": "Positive",
#         "Negative (-)": "Negative"
#     }
# }

# Normalise both tables the same way: trim whitespace around the column names,
# then order the rows by the first column and renumber them from zero.

table_truth.columns = table_truth.columns.str.strip()
truth_sort_column = table_truth.columns[0]
table_truth = table_truth.sort_values(by=truth_sort_column).reset_index(drop=True)

table_compared.columns = table_compared.columns.str.strip()
compared_sort_column = table_compared.columns[0]
table_compared = table_compared.sort_values(by=compared_sort_column).reset_index(drop=True)

# Apply the glossary to every glossary column that exists in a table and
# report, per table, which columns were handled and which were absent.
tables = [("truth", table_truth), ("compared", table_compared)]

for table_name, frame in tables:
    applied_columns = [col for col in glossary if col in frame.columns]
    missing_columns = [col for col in glossary if col not in frame.columns]

    for col in applied_columns:
        frame[col] = frame[col].replace(glossary[col])

    if missing_columns:
        print(f"Table {table_name}: The following glossary columns were not found in the DataFrame: {missing_columns}")
    if applied_columns:
        print(f"Table {table_name}: Glossary was applied to {applied_columns}")

Table truth: Glossary was applied to ['Study Type', 'Data Owner']
Table compared: Glossary was applied to ['Study Type', 'Data Owner']


## Columns processing

In [4]:
class ColumnProcessor:
    """Interface shared by all column processors.

    A processor normalises a raw cell value (``process``) and classifies a
    pair of normalised values (``compare``). Both methods raise
    NotImplementedError here and must be overridden by subclasses.
    """

    _ABSTRACT_MESSAGE = "Subclasses should implement this!"

    def process(self, value):
        """Normalise one raw cell value; must be overridden."""
        raise NotImplementedError(ColumnProcessor._ABSTRACT_MESSAGE)

    def compare(self, val1, val2):
        """Classify a (truth, compared) pair of values; must be overridden."""
        raise NotImplementedError(ColumnProcessor._ABSTRACT_MESSAGE)

class NumericColumnProcessor(ColumnProcessor):
    """Processor for columns that hold a single number per cell.

    Values are normalised to floats rounded to two decimals and compared for
    exact equality. A missing cell is represented by the empty string.
    """

    def process(self, value):
        """Normalise one cell.

        Returns:
            "" for a missing cell (NaN, "N/A" or ""), the value as a float
            rounded to two decimals when it is convertible, otherwise the
            original value unchanged (a message is printed in that case).
        """
        if pd.isna(value) or value == "N/A" or value == "":
            return ""
        try:
            return round(float(value), 2)
        except (ValueError, TypeError) as e:
            # ValueError: non-numeric text; TypeError: an object float() cannot
            # take at all (e.g. a timestamp). Either way keep the raw value so
            # that one odd cell does not abort the whole run.
            print(f"Error converting val1 ('{value}') to float: {e}")
            return value

    def compare(self, val1, val2):
        """Classify a (truth, compared) pair of processed values.

        Returns:
            "Correct" when both are missing or equal, "False Positive" when
            only the truth is missing, "False Negative" when only the compared
            value is missing, "Incorrect" otherwise.
        """
        truth_missing = val1 == ""
        compared_missing = val2 == ""
        if truth_missing and compared_missing:
            return "Correct"
        if truth_missing:
            return "False Positive"
        if compared_missing:
            return "False Negative"
        return "Correct" if val1 == val2 else "Incorrect"


class StringColumnProcessor(ColumnProcessor):
    """Processor for free-text columns that must match exactly after normalisation.

    Text that is a plain number is handed to a NumericColumnProcessor so that
    e.g. "97.6%" and "97.6" normalise to the same value.
    """

    def __init__(self):
        self.numeric_processor = NumericColumnProcessor()

    def process(self, value):
        """Normalise one cell.

        The text is stripped, lower-cased, freed of brackets and percent
        signs, and all whitespace is removed. If the remainder is a finite
        number it is processed by the numeric processor.

        Returns:
            "" for a missing cell, the numeric processor's result for
            number-like text, otherwise the normalised string.
        """
        if pd.isna(value) or value == "N/A" or value == "":
            return ""
        value_str = str(value).strip().lower()
        value_str = re.sub(r"[()\[\]{}%]", "", value_str)
        value_str = re.sub(r"\s", "", value_str)
        try:
            number = float(value_str)
        except ValueError:
            # Not number-like: keep the normalised text
            return value_str
        if not np.isfinite(number):
            # float() also accepts the words "nan", "inf" and "infinity";
            # such a cell is text (e.g. a name), not a measurement.
            return value_str
        return self.numeric_processor.process(number)

    def compare(self, val1, val2):
        """Classify a (truth, compared) pair of processed values.

        Processed values are either "" (missing), floats or normalised
        strings, so the numeric processor's missing-value handling and strict
        equality test apply to all of them.
        """
        return self.numeric_processor.compare(val1, val2)


class StringNoisyColumnProcessor(StringColumnProcessor):
    """String processor that tolerates small spelling differences.

    Pre-processing is inherited from StringColumnProcessor; only the
    comparison differs.
    """

    def __init__(self, similarity_threshold=0.75):
        """
        Args:
            similarity_threshold: SequenceMatcher ratio that must be exceeded
                for two values to count as "Correct".
        """
        super().__init__()
        self.similarity_threshold = similarity_threshold

    def compare(self, val1, val2):
        """Classify a (truth, compared) pair of processed values.

        Returns:
            "Correct" when both are missing or their similarity ratio exceeds
            the threshold, "False Positive" / "False Negative" when exactly
            one side is missing, "Incorrect" otherwise.
        """
        if val1 == "" and val2 != "":
            return "False Positive"
        elif val1 != "" and val2 == "":
            return "False Negative"
        elif val1 == "" and val2 == "":
            return "Correct"
        # The inherited process() returns floats for number-like text, and
        # SequenceMatcher needs sequences, hence the str() conversion.
        similarity = SequenceMatcher(None, str(val1), str(val2)).ratio()
        return "Correct" if similarity > self.similarity_threshold else "Incorrect"


class List_of_Numbers_ColumnProcessor(ColumnProcessor):
    """Processor for cells holding several numbers whose order is irrelevant."""

    def process(self, value):
        """Extract the numbers contained in one cell.

        Returns:
            "" for a missing cell (NaN, "N/A" or ""); a one-element list when
            the whole cell converts to a float; otherwise the list of all
            unsigned integers/decimals found in the cell's text (possibly
            empty). The separator used between the numbers does not matter.
        """
        if pd.isna(value) or value == "N/A" or value == "":
            return ""
        try:
            # A cell that is a single number as a whole
            return [float(value)]
        except (ValueError, TypeError):
            pass  # Not a standalone number; extract the numbers from the text

        # str() so that a non-string cell cannot make re.findall raise TypeError.
        # Every regex match is digits with an optional decimal part, so the
        # float() conversion below cannot fail.
        return [float(num) for num in re.findall(r'\d+(?:\.\d+)?', str(value))]

    def compare(self, truth_list, comparison_list):
        """Compare two processed cells as sets (order and duplicates ignored).

        Returns:
            "Correct" when both contain the same elements, "False Positive"
            when the compared cell only has extra elements, "False Negative"
            when it only lacks elements, "Incorrect" when it has both extra
            and lacking elements.
        """
        # A missing cell ("") becomes the empty set
        truth_set = set(truth_list)
        comparison_set = set(comparison_list)

        missing_elements = truth_set - comparison_set
        extra_elements = comparison_set - truth_set

        if missing_elements and extra_elements:
            return "Incorrect"
        if extra_elements:
            return "False Positive"
        if missing_elements:
            return "False Negative"
        return "Correct"

class Ordered_List_of_Numbers_ColumnProcessor(List_of_Numbers_ColumnProcessor):
    """List-of-numbers processor for which the order of the numbers matters.

    Pre-processing is inherited from List_of_Numbers_ColumnProcessor.
    """

    def compare(self, truth_list, comparison_list):
        """Compare two processed cells, taking element order into account.

        Returns:
            "False Positive" when the compared cell only has extra elements,
            "False Negative" when it only lacks elements, "Incorrect" when the
            sequences differ in any other way (content or order), "Correct"
            when they are identical.
        """
        # process() yields "" for a missing cell but [] for text without any
        # number; turn both into an empty list so they compare as equal.
        truth_list = list(truth_list)
        comparison_list = list(comparison_list)

        truth_set = set(truth_list)
        comparison_set = set(comparison_list)

        missing_elements = truth_set - comparison_set
        extra_elements = comparison_set - truth_set

        # Only extra elements in the compared list
        if extra_elements and not missing_elements:
            return "False Positive"

        # Only missing elements in the compared list
        if missing_elements and not extra_elements:
            return "False Negative"

        # Any remaining difference, including a different order
        if truth_list != comparison_list:
            return "Incorrect"

        return "Correct"


class List_of_Strings_ColumnProcessor(List_of_Numbers_ColumnProcessor):
    """Processor for cells holding several text items separated by "," or ";".

    The set-based compare() is inherited from List_of_Numbers_ColumnProcessor;
    every item is normalised with a StringColumnProcessor.
    """

    def __init__(self):
        self.string_processor = StringColumnProcessor()

    def process(self, value):
        """Split one cell into its normalised items.

        Returns:
            "" for a missing cell (NaN, "N/A" or ''); otherwise the list of
            items after normalisation. Items that normalise to "" (blank
            parts, "N/A", bare brackets) are dropped.
        """
        if pd.isna(value) or value == "N/A" or value == '':
            return ""
        processed_parts = (
            self.string_processor.process(part)
            for part in re.split(r"[,;]+", str(value))
        )
        # Compare against "" explicitly: a numeric item such as 0.0 is falsy
        # but must be kept.
        return [item for item in processed_parts if item != ""]


### Assigning columns to specific pre-processing routines

In [5]:
# processors specific for Muta
# processors = {
#     "Authors": StringColumnProcessor(),
#     "Publication Year": NumericColumnProcessor(),
#     "Study Identification": StringColumnProcessor(),
#     "Data Owner": StringNoisyColumnProcessor(),
#     "Study Type": StringColumnProcessor(),
#     "S.typhimurium strain (TA)": List_of_Numbers_ColumnProcessor(),
#     "E.coli strain": List_of_Strings_ColumnProcessor(),
#     "Dose Range (preincubation) [µg/plate] (min,max)": Ordered_List_of_Numbers_ColumnProcessor(),
#     "Dose Range (plate-incorporation) [µg/plate] (min,max)": Ordered_List_of_Numbers_ColumnProcessor(),
#     "Batchlot": StringColumnProcessor(),
#     "Purity [%]": NumericColumnProcessor(),
#     "Known Impurities [percentage value, name]": List_of_Strings_ColumnProcessor(),
#     "Metabolic Activation (plate-incorporation)": StringColumnProcessor(),
#     "Metabolic Activation (preincubation)": StringColumnProcessor(),
#     "Results": StringColumnProcessor(),
#     "Status": StringColumnProcessor(),
# }

# combined processors

# Columns shared by both study types
common_processors = {
    "Authors": StringColumnProcessor(),
    "Publication Year": NumericColumnProcessor(),
    "Study Identification": StringColumnProcessor(),
    "Data Owner": StringNoisyColumnProcessor(),
    "Study Type": StringColumnProcessor(),
    # List-valued here; the Muta-only setup used StringColumnProcessor()
    "Batchlot": List_of_Strings_ColumnProcessor(),
    # List-valued here; the Muta-only setup used NumericColumnProcessor()
    "Purity [%]": List_of_Numbers_ColumnProcessor(),
}

# Columns specific to the CARC study
carc_processors = {
    "Duration (months)": NumericColumnProcessor(),
    "Route": StringColumnProcessor(),
    "Dietary doses (ppm)": Ordered_List_of_Numbers_ColumnProcessor(),
    "Achieved doses in males (mg/kg)": Ordered_List_of_Numbers_ColumnProcessor(),
    "Achieved doses in females (mg/kg)": Ordered_List_of_Numbers_ColumnProcessor(),
    "NO(A)EL": NumericColumnProcessor(),
    "LO(A)EL": NumericColumnProcessor(),
}

# Columns specific to the Muta study
muta_processors = {
    "S.typhimurium strain (TA)": List_of_Numbers_ColumnProcessor(),
    "E.coli strain": List_of_Strings_ColumnProcessor(),
    "Dose Range (preincubation) [µg/plate] (min,max)": Ordered_List_of_Numbers_ColumnProcessor(),
    "Dose Range (plate-incorporation) [µg/plate] (min,max)": Ordered_List_of_Numbers_ColumnProcessor(),
    "Known Impurities [percentage value, name]": List_of_Strings_ColumnProcessor(),
    "Metabolic Activation (plate-incorporation)": StringColumnProcessor(),
    "Metabolic Activation (preincubation)": StringColumnProcessor(),
    "Results": StringColumnProcessor(),
    "Status": StringColumnProcessor(),
}

# Column name -> processor used to normalise and compare that column
processors = {**common_processors, **carc_processors, **muta_processors}

## Comparison process

### Comparison and storage

In [6]:
# A column can only be compared when it has a processor assigned and is also
# present in the compared table; any other column is reported and left out
# instead of stopping the run with a KeyError.
skipped_columns = [
    column for column in table_truth.columns
    if column not in processors or column not in table_compared.columns
]
if skipped_columns:
    print(f"Columns skipped (no processor assigned or missing from the compared table): {skipped_columns}")
columns = pd.Index([column for column in table_truth.columns if column not in skipped_columns])

# Rows are paired by position, so a different number of rows leaves the
# trailing rows of the longer table without a partner.
if len(table_truth) != len(table_compared):
    print(
        f"Row count mismatch: truth has {len(table_truth)} rows, compared has "
        f"{len(table_compared)} rows; only the first "
        f"{min(len(table_truth), len(table_compared))} row pairs are compared"
    )

# Per-column counters for each possible comparison verdict
confusion_matrices = {col: {"Correct": 0, "Incorrect": 0, "False Positive": 0, "False Negative": 0} for col in columns}

# Process and compare the data column by column
results_by_column = {}
for column in columns:
    processor = processors[column]
    processed_table1 = table_truth[column].apply(processor.process)
    processed_table2 = table_compared[column].apply(processor.process)
    results_by_column[column] = [
        processor.compare(val1, val2) for val1, val2 in zip(processed_table1, processed_table2)
    ]

    # Update the confusion matrix of this column
    for result in results_by_column[column]:
        confusion_matrices[column][result] += 1

# One verdict per row and compared column
comparison_results = pd.DataFrame(results_by_column, columns=columns)

### Formatting the result

In [7]:
# Side-by-side table: for every column the truth value, the compared value and
# the comparison verdict, in that order.
# NOTE: the processed (normalised) values are not stored in this table.
combined_output = pd.DataFrame()
for column in columns:
    for suffix, source in (
        ("truth", table_truth),
        ("compared", table_compared),
        ("comparison", comparison_results),
    ):
        combined_output[f'{column}_{suffix}'] = source[column]

# The four verdicts counted in every confusion matrix
outcome_labels = ("Correct", "Incorrect", "False Positive", "False Negative")

# Accuracy per column: share of "Correct" among all verdicts (0 when empty)
for matrix in confusion_matrices.values():
    verdict_total = sum(matrix[label] for label in outcome_labels)
    matrix["Accuracy"] = matrix["Correct"] / verdict_total if verdict_total > 0 else 0

# Overall counts: sum over all columns, leaving out an "Overall" entry that may
# exist from an earlier run of this cell.
# "Correct" could be split further into true positives / true negatives; that
# is omitted for the current applications.
overall_metrics = {
    label: sum(
        matrix[label]
        for column, matrix in confusion_matrices.items()
        if column != "Overall"
    )
    for label in outcome_labels
}

overall_total = sum(overall_metrics.values())
overall_accuracy = overall_metrics["Correct"] / overall_total if overall_total > 0 else 0

# Store the overall metrics next to the per-column ones
overall_metrics["Accuracy"] = overall_accuracy
confusion_matrices["Overall"] = overall_metrics

# One row per column (plus "Overall"), one column per metric
confusion_matrices_df = pd.DataFrame(confusion_matrices).T

### Visual inspection of the results

In [8]:
# Show the complete side-by-side table; pandas' display limits are lifted only
# inside the with-block.
from IPython.display import display

full_display_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)
with pd.option_context(*full_display_options):
    display(combined_output)

Unnamed: 0,Authors_truth,Authors_compared,Authors_comparison,Publication Year_truth,Publication Year_compared,Publication Year_comparison,Study Identification_truth,Study Identification_compared,Study Identification_comparison,Batchlot_truth,Batchlot_compared,Batchlot_comparison,Purity [%]_truth,Purity [%]_compared,Purity [%]_comparison,Data Owner_truth,Data Owner_compared,Data Owner_comparison,Study Type_truth,Study Type_compared,Study Type_comparison,Duration (months)_truth,Duration (months)_compared,Duration (months)_comparison,Route_truth,Route_compared,Route_comparison,Dietary doses (ppm)_truth,Dietary doses (ppm)_compared,Dietary doses (ppm)_comparison,Achieved doses in males (mg/kg)_truth,Achieved doses in males (mg/kg)_compared,Achieved doses in males (mg/kg)_comparison,Achieved doses in females (mg/kg)_truth,Achieved doses in females (mg/kg)_compared,Achieved doses in females (mg/kg)_comparison,NO(A)EL_truth,NO(A)EL_compared,NO(A)EL_comparison,LO(A)EL_truth,LO(A)EL_compared,LO(A)EL_comparison
0,Atkinson et al.,Atkinson et al.,Correct,1993,1993,Correct,TOX9750499,TOX9750499,Correct,"229-JaK-5-1,229-JaK-142-6","229-Jak-5-1, 229-Jak-142-6",Correct,98.9% and 98.7%,"98.9%, 98.7%",Correct,Cheminova,Cheminova,Correct,Combined chronic toxicity/ carcinogenicity,"US-EPA Pesticide Assessment Guidelines Subdivision F, 83-5 (1982)",Incorrect,24,24,Correct,feeding,feeding,Correct,No data,"0, 10, 100, 300, 1000",False Positive,0101003001000,"0, 10, 101, 306, 1007",Incorrect,0101003001000,"0, 10, 103, 311, 1018",Incorrect,100.0,100.0,Correct,300.0,300.0,Correct
1,Brammer,Brammer,Correct,2001,2001,Correct,ASB2012-11488,ASB2012-11488,Correct,P30,P30,Correct,97.6%,97.6,Correct,Syngenta,Syngenta,Correct,Combined chronic toxicity/ carcinogenicity,Combined chronic toxicity/ carcinogenicity,Correct,24,24,Correct,feeding,feeding,Correct,"0, 2000, 6000, 20000","0, 2000, 6000, 20000",Correct,01213611214,"0, 121, 361, 1214",Correct,01454371498,"0, 145, 437, 1498",Correct,361.0,361.0,Correct,1214.0,1214.0,Correct
2,Enomoto,Enomoto,Correct,1997,1997,Correct,"ASB2012-11484, ASB2012-11485, ASB2012-11486, ASB2012-11487","ASB2012-11484, ASB2012-11485, ASB2012-11486, ASB2012-11487",Correct,"T-941209, T-950308","T-941209, T-950308",Correct,97.56% and 94.61%,"97.56%, 94.61%",Correct,Arysta Life Sciences,Arysta Life Sciences,Correct,Combined chronic toxicity/ carcinogenicity,Combined chronic toxicity/ carcinogenicity,Correct,24,24,Correct,feeding,feeding,Correct,"0, 3000, 10000, 30000","0, 3000, 10000, 30000",Correct,01043541127,"0, 104, 354, 1127",Correct,01153931247,"0, 115, 393, 1247",Correct,104.0,104.0,Correct,354.0,354.0,Correct
3,Kumar,Kumar,Correct,2001,2001,Correct,ASB2012-11491,ASB2012-11491,Correct,01/06/97,01/06/97,Correct,>95%,> 95 % (w/w),Correct,Feinchemie Schwebda GmbH,Feinchemie Schwebda GmbH,Correct,Carcinogenicity,Carcinogenicity,Correct,18,18,Correct,feeding,feeding,Correct,"0, 100, 1000, 10000","0, 100, 1000, 10000",Correct,"0,14.5,149.7,1453","0, 14.5, 149.7, 1453",Correct,"0, 15.0, 151.2, 1466.8","0, 15.0, 151.2, 1466.8",Correct,150.0,149.7,Incorrect,1453.0,1453.0,Correct
4,Lankas,Lankas,Correct,1981,1981,Correct,TOX2000-595,TOX2000-595,Correct,XHJ-64,XHJ-64,Correct,98.7%,98.7,Correct,Monsanto,Monsanto,Correct,Combined chronic toxicity/ carcinogenicity,Combined chronic toxicity/ carcinogenicity,Correct,26,26,Correct,feeding,feeding,Correct,030100300,"0, 30, 100, 300",Correct,"0,3.05,10.3,31.49","0, 3.05, 10.30, 31.49",Correct,"0,3.37,11.22,34.02","0, 3.37, 11.22, 34.02",Correct,31.5,31.5,Correct,,,Correct
5,Milburn,Milburn,Correct,1996,1996,Correct,TOX2000-1998,TOX2000-1998,Correct,P24,P24,Correct,95.6%,95.6%,Correct,Syngenta,Syngenta,Correct,Chronic toxicity,Chronic toxicity,Correct,12,12,Correct,feeding,feeding,Correct,"0, 2000, 8000, 20000","0, 2000, 8000, 20000",Correct,01415601409,"0, 141, 560, 1409",Correct,01676711664,"0, 167, 671, 1664",Correct,141.0,141.0,Correct,560.0,560.0,Correct
6,Stout and Ruecker,Stout and Ruecker,Correct,1990,1990,Correct,TOX9300244,TOX9300244,Correct,XLH-264,XLH-264,Correct,96.5%,96.5%,Correct,Monsanto,Monsanto,Correct,Combined chronic toxicity/ carcinogenicity,Combined chronic toxicity/ carcinogenicity,Correct,24,24,Correct,feeding,feeding,Correct,"0, 2000, 8000, 20000","0, 2000, 8000, 20000",Correct,089362940,"0, 89, 362, 940",Correct,01134571183,"0, 113, 457, 1183",Correct,89.0,89.0,Correct,362.0,940.0,Incorrect
7,Sugimoto,Sugimoto,Correct,1997,1997,Correct,ASB2012-11493,ASB2012-11493,Correct,"T-941209, T-950308","T-941209, T-950308",Correct,97.56% and 94.61%,"97.56%, 94.61%",Correct,Arysta LifeScience,Arysta LifeScience,Correct,Carcinogenicity,Carcinogenicity,Correct,18,18,Correct,feeding,feeding,Correct,"0, 1600, 8000, 40000","0, 1600, 8000, 40000",Correct,"0,165,838.1,4348","0, 165.0, 838.1, 4348",Correct,"0,153.2,786.8,4116","0, 153.2, 786.8, 4116",Correct,153.0,153.0,Correct,786.8,,False Negative
8,Suresh,Suresh,Correct,1996,1996,Correct,TOX9651587,TOX9651587,Correct,60; 046,"60, 046",Correct,96.8% and 96.8%,"96.8%, 96.8%",Correct,Feinchemie Schwebda GmbH,Feinchemie Schwebda GmbH,Correct,Combined chronic toxicity/ carcinogenicity,Combined chronic toxicity/ carcinogenicity,Correct,24,24,Correct,feeding,feeding,Correct,"0, 100, 1000, 10000","0, 100, 1000, 10000",Correct,"0,6.3,59.4,595.2","0, 7.4, 73.9, 740.6",Incorrect,"0,8.6,88.5,886.0","0, 8.6, 88.5, 886.0",Correct,60.0,60.0,Correct,595.2,,False Negative
9,Wood et al.,Wood et al.,Correct,2009,2009,Correct,ASB2012-11490,ASB2012-11490,Correct,H05H016A,H05H016A,Correct,95.7%,95.7,Correct,,,Correct,Combined chronic toxicity/ carcinogenicity,Combined chronic toxicity/ carcinogenicity,Correct,24,24,Correct,feeding,feeding,Correct,"0, 1500, 5000, 15000,17000,19000,21000,24000","0, 1500, 5000, 15000, 17000, 19000, 21000, 24000",Correct,"0,85.5,285.2,1077.4","0, 85.5, 285.2, 1077.4",Correct,"0,104.5,348.6,1381.9","0, 104.5, 348.6, 1381.9",Correct,285.0,285.0,Correct,1077.0,1229.7,Incorrect


In [None]:
# Show the complete confusion-matrix table; pandas' display limits are lifted
# only inside the with-block.
untruncated_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.width', None,
    'display.max_colwidth', None,
)
with pd.option_context(*untruncated_display):
    display(confusion_matrices_df)

Unnamed: 0,Correct,Incorrect,False Positive,False Negative,Accuracy
Authors,11.0,0.0,0.0,0.0,1.0
Publication Year,11.0,0.0,0.0,0.0,1.0
Study Identification,11.0,0.0,0.0,0.0,1.0
Batchlot,11.0,0.0,0.0,0.0,1.0
Purity [%],11.0,0.0,0.0,0.0,1.0
Data Owner,11.0,0.0,0.0,0.0,1.0
Study Type,10.0,1.0,0.0,0.0,0.909091
Duration (months),11.0,0.0,0.0,0.0,1.0
Route,11.0,0.0,0.0,0.0,1.0
Dietary doses (ppm),10.0,0.0,1.0,0.0,0.909091


### Saving the result as a separate file

In [9]:
# Write both result tables into one workbook named after the two input files:
# the side-by-side table without its index, the confusion matrices with theirs.
results_path = f'Comparison_Results_{filename_truth}_{filename_compared}.xlsx'
with pd.ExcelWriter(results_path) as workbook:
    combined_output.to_excel(workbook, sheet_name='Combined Results', index=False)
    confusion_matrices_df.to_excel(workbook, sheet_name='Confusion Matrices')