In [3]:
import os
import re
import json
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
import spacy
import json

# Additional imports for synonyms and lemmatization
from owlready2 import *
from owlready2.pymedtermino2 import *
from owlready2.pymedtermino2.umls import *
import stanza

ModuleNotFoundError: No module named 'owlready2'

In [None]:
import os
import re
import json
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
import spacy
import json

# Additional imports for synonyms and lemmatization
from pymedtermino2 import get_ontology  # Ensure PyMedTermino2 is installed and configured
import stanza

# Build the Stanza pipeline a single time at module level so every caller reuses it.
# The download call and the pipeline share the same package/processor selection.
_STANZA_OPTIONS = {'package': 'mimic', 'processors': 'tokenize,pos,lemma'}
stanza.download('en', **_STANZA_OPTIONS)
biomed_nlp = stanza.Pipeline(lang='en', **_STANZA_OPTIONS)

# Text normalisation helper
def clean_text(text):
    """
    Normalise a raw answer string.

    Newlines become spaces, asterisks are dropped, and every run of
    whitespace is collapsed to a single space (leading/trailing whitespace
    is removed as a consequence).

    Args:
        text (str): Raw text.
    Returns:
        str: The cleaned text.
    """
    without_markup = text.replace('\n', ' ').replace('*', '')
    # split() with no argument splits on any whitespace run and drops empties
    return ' '.join(without_markup.split())

# Function to fetch synonyms
# Per-process memo of completed lookups, keyed by the entity string.
# The same entity is looked up many times across answers and iterations.
_SYNONYM_CACHE = {}


def get_synonyms(entity):
    """
    Retrieve the terms of the first SNOMED CT concept matching an entity.

    The first search hit's preferred label(s) and its synonym terms are
    returned together, de-duplicated and in that order. Completed lookups
    (including "no match") are cached per entity; failed lookups are not
    cached, so a later call retries.

    NOTE(review): the file imports `get_ontology` from `pymedtermino2`, while
    Owlready2 exposes it as `owlready2.get_ontology` -- confirm the import.

    Args:
        entity (str): The clinical entity for which synonyms are needed.
    Returns:
        list: Label and synonym strings for the entity; an empty list when
        nothing matches or when the lookup raises (the error is printed).
    """
    if entity in _SYNONYM_CACHE:
        # Hand out a copy so callers cannot mutate the cached list.
        return list(_SYNONYM_CACHE[entity])

    try:
        PYM = get_ontology("http://PYM/").load()
        SNOMEDCT_US = PYM["SNOMEDCT_US"]
        matches = SNOMEDCT_US.search(entity)  # Search for the entity in SNOMED CT
        terms = []
        if matches:
            concept = matches[0]  # Take the first match
            # `label` holds only the preferred term; the alternative terms
            # live on `synonyms`, so both are needed for synonym matching.
            candidates = list(concept.label) + list(getattr(concept, "synonyms", None) or [])
            for term in candidates:
                term = str(term)
                if term not in terms:
                    terms.append(term)
    except Exception as e:
        # Best-effort: a failed lookup must not abort the whole evaluation.
        print(f"Error retrieving synonyms for {entity}: {e}")
        return []

    _SYNONYM_CACHE[entity] = terms
    return list(terms)

# Function to compute f1 with synonyms
def compute_f1_with_synonyms(entities_true, entities_answer):
    """
    Compute precision, recall, and F1 score using synonym-based matching.

    Each entity is expanded to the set {entity} plus its synonyms (via
    `get_synonyms`). A true entity and an answer entity are compatible when
    their expanded sets share at least one term. The number of matches is the
    size of a maximum one-to-one matching between compatible pairs, so the
    result does not depend on set iteration order and never under-counts
    because an early greedy choice consumed an answer entity that a later
    true entity needed.

    Args:
        entities_true (set): Set of entities from the true sentence.
        entities_answer (set): Set of entities from the predicted sentence.

    Returns:
        precision, recall, f1: Computed scores (each 0.0 when its denominator
        would be zero).
    """
    def expand_with_synonyms(entities):
        # Include the original entity as well, for direct matches
        return {entity: set(get_synonyms(entity)) | {entity} for entity in entities}

    # Expand entities with synonyms
    true_synonyms = expand_with_synonyms(entities_true)
    answer_synonyms = expand_with_synonyms(entities_answer)

    # Sorted orders make the traversal deterministic across runs.
    true_order = sorted(true_synonyms)
    answer_order = sorted(answer_synonyms)

    # For each true entity, the answer entities it could be paired with.
    candidates = {
        true_entity: [
            answer_entity
            for answer_entity in answer_order
            if not true_synonyms[true_entity].isdisjoint(answer_synonyms[answer_entity])
        ]
        for true_entity in true_order
    }

    assigned_true = {}  # answer entity -> true entity currently paired with it

    def try_assign(true_entity, visited):
        # Augmenting-path step: take a free answer entity, or re-route the
        # true entity currently holding it to another compatible answer.
        for answer_entity in candidates[true_entity]:
            if answer_entity in visited:
                continue
            visited.add(answer_entity)
            holder = assigned_true.get(answer_entity)
            if holder is None or try_assign(holder, visited):
                assigned_true[answer_entity] = true_entity
                return True
        return False

    intersection_count = 0
    for true_entity in true_order:
        if try_assign(true_entity, set()):
            intersection_count += 1

    # Calculate precision, recall, and F1 score
    precision = intersection_count / len(entities_answer) if len(entities_answer) > 0 else 0.0
    recall = intersection_count / len(entities_true) if len(entities_true) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, f1

# Lemmatization helper
def lemmatize_entities(entities):
    """
    Lemmatize every entity string with the module-level Stanza pipeline
    (`biomed_nlp`, built with the 'mimic' package).

    Each entity is run through the pipeline and rebuilt as the
    space-separated lemmas of all its words. Entities that lemmatize to the
    same string collapse into one element of the result.

    Args:
        entities (set): A set of entity strings to be lemmatized.
    Returns:
        set: A set of lemmatized entity strings.
    """
    def _lemmatize(phrase):
        parsed = biomed_nlp(phrase)
        lemmas = []
        for sent in parsed.sentences:
            for token in sent.words:
                lemmas.append(token.lemma)
        return " ".join(lemmas)

    return {_lemmatize(phrase) for phrase in entities}

def _exact_match_scores(entities_true, entities_answer):
    """Return (precision, recall, f1) for exact string overlap of two entity sets."""
    overlap = len(entities_true & entities_answer)
    precision = overlap / len(entities_answer) if entities_answer else 0.0
    recall = overlap / len(entities_true) if entities_true else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1


def parsing_and_computing_f1(input_answer_dir, model_list, rephrased=False):
    """
    Score model answers against reference answers at the entity level.

    For every model in `model_list`, category id "1".."6" and iteration 1..3,
    the file
    `<input_answer_dir>/raw/<model>/<model>_answers_category_<c>.<i>_HIV_EQ.json`
    is read if it exists (missing files are reported and skipped). Each item's
    'true_answer' and 'answer' are cleaned, entities are extracted with the
    scispaCy model, lower-cased, and three score triples are computed:
    exact match, synonym match, and synonym match after lemmatization.

    All records are written to './evaluation_results/f1_results.json'.

    Args:
        input_answer_dir (str): Directory that contains the `raw/<model>/` folders.
        model_list (list): Model names to evaluate.
        rephrased (bool): Accepted for interface compatibility; not used here.
    Returns:
        None
    """
    # List of scispaCy models to use
    nlp_models = {
        'en_core_sci_lg': spacy.load('en_core_sci_lg'),
        # 'en_core_sci_scibert': spacy.load('en_core_sci_scibert')
    }

    # Initialize data collection
    data_records = []
    category_ids = [str(num) for num in range(1, 7)]
    iteration_numbers = [1, 2, 3]

    # Process each file with each scispaCy model
    for nlp_name, nlp in nlp_models.items():
        print(f"Processing with model: {nlp_name}")
        for model in model_list:
            # The per-model directory does not depend on category/iteration.
            input_answer_model = os.path.join(input_answer_dir, f"raw/{model}/")
            for category_id in category_ids:
                for iteration_number in iteration_numbers:
                    # Construct file names
                    answer_file_name = f"{model}_answers_category_{category_id}.{iteration_number}_HIV_EQ.json"
                    file_path = os.path.join(input_answer_model, answer_file_name)

                    # Check if the file exists
                    if not os.path.exists(file_path):
                        print(f"File not found: {file_path}")
                        continue

                    # Load JSON data; explicit encoding avoids depending on
                    # the platform's locale default.
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)

                    # Process each answer pair
                    for question_index, item in enumerate(data):
                        # `or ''` also covers keys that are present but null,
                        # which would otherwise crash the string cleaning.
                        true_answer = item.get('true_answer') or ''
                        answer = item.get('answer') or ''

                        # Clean the 'true_answer' and 'answer' strings
                        true_answer_clean = clean_text(true_answer)
                        answer_clean = clean_text(answer)

                        # Extract entities using scispaCy
                        doc_true = nlp(true_answer_clean)
                        doc_answer = nlp(answer_clean)
                        entities_true = set(ent.text.lower() for ent in doc_true.ents)
                        entities_answer = set(ent.text.lower() for ent in doc_answer.ents)

                        # --- Original exact match F1 ---
                        precision, recall, f1 = _exact_match_scores(entities_true, entities_answer)

                        # --- F1 with synonyms ---
                        synonyms_precision, synonyms_recall, synonyms_f1 = compute_f1_with_synonyms(
                            entities_true, entities_answer
                        )

                        # --- F1 with synonyms + lemmatization ---
                        entities_true_lemmatized = lemmatize_entities(entities_true)
                        entities_answer_lemmatized = lemmatize_entities(entities_answer)
                        synonyms_lemma_precision, synonyms_lemma_recall, synonyms_lemma_f1 = compute_f1_with_synonyms(
                            entities_true_lemmatized, entities_answer_lemmatized
                        )

                        # Append results to data_records
                        data_records.append({
                            'nlp_model': nlp_name,
                            'model': model,
                            'category_id': category_id,
                            'iteration_number': iteration_number,
                            'question_index': question_index,
                            'precision': precision,
                            'recall': recall,
                            'f1_score': f1,
                            'synonyms_precision': synonyms_precision,
                            'synonyms_recall': synonyms_recall,
                            'synonyms_f1': synonyms_f1,
                            'synonyms_lemmatized_precision': synonyms_lemma_precision,
                            'synonyms_lemmatized_recall': synonyms_lemma_recall,
                            'synonyms_lemmatized_f1': synonyms_lemma_f1
                        })

    # Save data_records to a JSON file
    os.makedirs('./evaluation_results', exist_ok=True)
    with open('./evaluation_results/f1_results.json', 'w', encoding='utf-8') as f:
        json.dump(data_records, f, indent=4)

if __name__ == "__main__":
    # `__file__` is not defined when this runs as a notebook cell, so fall
    # back to the current working directory in that case.
    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        script_dir = os.getcwd()

    # The component must be relative: os.path.join discards everything before
    # an absolute component, so '/model_answers/' would ignore script_dir.
    model_answers_files_path = os.path.join(script_dir, 'model_answers')
    model_list = ["Claude"]

    # Run the F1 score calculation
    parsing_and_computing_f1(model_answers_files_path, model_list)

In [None]:
def lemmatize_clinical_text(text):
    """
    Lemmatize clinical text using a Stanza pipeline built with the 'mimic' package.

    The pipeline is constructed on the first call and stored on the function
    object, so later calls reuse it instead of reloading the models each time.

    Args:
        text (str): Input clinical text to be lemmatized.
    Returns:
        str: Lemmatized text (space-separated lemmas of every word, in order).
    """
    import stanza

    # Build the pipeline once; constructing it is far more expensive than running it.
    nlp = getattr(lemmatize_clinical_text, "_pipeline", None)
    if nlp is None:
        nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma', package='mimic')
        lemmatize_clinical_text._pipeline = nlp

    # Process the text
    doc = nlp(text)

    # Reconstruct lemmatized text
    return " ".join(word.lemma for sentence in doc.sentences for word in sentence.words)

In [None]:
# Demo: run the lemmatizer on one sample model answer and print the input
# next to the lemmatized output for visual comparison.
# The sample still contains markdown emphasis (`**...**`); it is passed to
# lemmatize_clinical_text as-is, without going through clean_text first.
example_text = "Based on the patient's symptoms and laboratory results, the most likely diagnosis is Bacillary Angiomatosis (BA), a skin infection caused by Bartonella henselae or Bartonella quintana, commonly seen in immunocompromised patients, particularly those with HIV/AIDS. The presence of white patches on the palate that can be scraped off suggests oral thrush, a fungal infection, but the primary focus is on treating the skin lesions. The patient's low CD4+ T-lymphocyte count (98/mm3) indicates severe immunosuppression, and the biopsy results showing vascular proliferation and small black bacteria on Warthin-Starry stain confirm the diagnosis of BA. The most appropriate pharmacotherapy for Bacillary Angiomatosis in this patient is: 1. **Doxycycline** (100 mg orally twice a day) for 3-4 months, as it is the first-line treatment for BA. 2. **Erythromycin** (500 mg orally four times a day) can be used as an alternative, especially in patients with intolerance to doxycycline. It is also essential to consider antiretroviral therapy (ART) to manage the patient's HIV infection, but this is not directly related to the treatment of BA. In addition to pharmacotherapy, the patient's overall health, including her nutritional status, smoking, and substance use, should be addressed to prevent further complications and improve her quality of life."
lemmatized_output = lemmatize_clinical_text(example_text)
print("Original Text:", example_text)
print("Lemmatized Text:", lemmatized_output)