In [1]:
# This notebook generates a dictionary of synonyms from the true answers.
# Due to library conflicts, the code cannot be run in a single conda environment. Therefore, the code is divided into two chunks to be run in separate environments.
import os
import json
import stanza
import spacy

# Stanza model configuration, shared by the download step and the pipeline.
STANZA_LANG = 'en'
STANZA_PACKAGE = 'mimic'
STANZA_PROCESSORS = 'tokenize,pos,lemma'

# Fetch the models for the selected package, then build the pipeline that
# lemmatize_entities() uses through the module-level name `biomed_nlp`.
stanza.download(STANZA_LANG, package=STANZA_PACKAGE, processors=STANZA_PROCESSORS)
biomed_nlp = stanza.Pipeline(lang=STANZA_LANG, processors=STANZA_PROCESSORS, package=STANZA_PACKAGE)


def lemmatize_entities(entities):
    """
    Reduce every entity string to its lemmatized form.

    Each entity is passed through the module-level Stanza pipeline
    (`biomed_nlp`); the lemmas of all its words, across all sentences, are
    joined with single spaces.

    Args:
        entities (set): Entity strings to lemmatize.
    Returns:
        set: The lemmatized strings. Entities that share a lemmatized form
            collapse into a single entry.
    """
    def _lemmatize(phrase):
        # Flatten the sentence/word hierarchy into one list of lemmas.
        parsed = biomed_nlp(phrase)
        lemmas = []
        for sent in parsed.sentences:
            for token in sent.words:
                lemmas.append(token.lemma)
        return " ".join(lemmas)

    return {_lemmatize(phrase) for phrase in entities}

def clean_text(text):
    """Flatten text onto one line: drop asterisks and normalize whitespace.

    Newlines are turned into spaces, every '*' is deleted, and any run of
    whitespace is collapsed to a single space with the ends trimmed.
    """
    # One pass: newline -> space, asterisk -> removed.
    flattened = text.translate(str.maketrans({'\n': ' ', '*': None}))
    # Splitting on whitespace and re-joining squeezes out the extra spaces.
    words = flattened.split()
    return ' '.join(words)

# Question files are split across five categories, numbered 1-5.
category_ids = [str(num) for num in range(1, 6)]

all_entities_true = set()

# Load the scispaCy model once up front: it is the same for every category,
# and reloading a large model on each loop iteration is needlessly slow.
nlp = spacy.load('en_core_sci_lg')

for category_id in category_ids:
    file_path = f".././deploy_medical_llm_evaluation/questions_files/HIV_evaluation_questionare_category_{category_id}.json"
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Extract and lemmatize the named entities of every reference answer.
    for idx, item in enumerate(data):
        answer = item.get('true_answer', '')
        # A missing, null or non-text answer has nothing to extract; skip it
        # with a message instead of crashing inside clean_text().
        if not isinstance(answer, str) or not answer.strip():
            print(f"Skipping question {idx} in {file_path}: missing or non-text 'true_answer'")
            continue

        answer_clean = clean_text(answer)
        doc_answer = nlp(answer_clean)

        # Lower-case entity mentions so the same term is not counted twice.
        entities_true = set(ent.text.lower() for ent in doc_answer.ents)

        entities_true_lemmatized = lemmatize_entities(entities_true)
        all_entities_true.update(entities_true_lemmatized)

# An entity that lemmatizes to nothing carries no information.
all_entities_true.discard('')

entities_list = sorted(all_entities_true)
output_file_path = "./extracted_entities.txt"

# Save the entities as a single line of quoted terms separated by ", ".
# No separator is written after the last term, so a reader that splits on
# ", " gets clean entries only.
# NOTE: an entity that itself contains ", " is still ambiguous in this format.
with open(output_file_path, 'w') as file:
    file.write(", ".join(f"'{entity}'" for entity in entities_list))

print(f"Entities list has been saved to {output_file_path}")

ModuleNotFoundError: No module named 'stanza'

In [None]:
import json
import os

from openai import OpenAI

file_path_txt = "./extracted_entities.txt"

with open(file_path_txt, "r") as file:
    raw_entities_list = file.read().strip().split(", ")

# Entries look like "'term'". Depending on how the file was written, the last
# entry may still carry a trailing comma, and an empty file produces a single
# empty entry. Strip separator residue and quotes, then drop blank terms so no
# malformed or empty term is sent to the model.
cleaned_list = [
    term
    for term in (item.strip().rstrip(",").strip().strip("'") for item in raw_entities_list)
    if term
]

# SECURITY: the API key is read from the environment and must never be
# hardcoded in the notebook. Any key that was previously committed in this
# cell should be considered compromised and rotated.
gpt4_api_key = os.environ.get("OPENAI_API_KEY")
if not gpt4_api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

# NOTE(review): this URL is defined but not passed to the client below, so
# requests go to the client's default endpoint — confirm which is intended.
gpt4_base_url = "http://148.187.108.173:8080"

client = OpenAI(api_key=gpt4_api_key)

# Send terms in small batches to keep each reply short and parseable.
batch_size = 10
batches = [cleaned_list[i:i + batch_size] for i in range(0, len(cleaned_list), batch_size)]

synonym_prompt = "You are working as a synonym dictionary for precise medical terms. For medical terms, especially medications, synonyms should only include chemical names, brand names, or closely related alternative scientific terms—avoid broader pharmacological categories. For each input term, provide a list of synonyms. Your answer should be provided in the following format (give maximum a set of 5 synonyms and don’t generate any other information):\n{\n    \"term_1\": [\"synonym_1\", \"synonym_2\", \"synonym_3\"],\n    \"term_2\": [\"synonym_1\", \"synonym_2\", \"synonym_3\", \"synonym_4\", \"synonym_5\"]\n}"

final_synonyms_dict = {}
# Batches whose reply could not be used; reported after the loop so missing
# terms in the output file do not go unnoticed.
failed_batches = []

for batch in batches:
    input_terms = ", ".join(f"'{term}'" for term in batch)
    try:
        res = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": synonym_prompt},
                {"role": "user", "content": input_terms}
            ],
            stream=False,
            temperature=0,
        )
        assistant_reply = res.choices[0].message.content.strip()

        # Parse the GPT response as JSON
        synonyms_dict = json.loads(assistant_reply)
        # The prompt asks for a JSON object; anything else cannot be merged.
        if not isinstance(synonyms_dict, dict):
            raise ValueError(f"expected a JSON object, got {type(synonyms_dict).__name__}")

        # Update the final dictionary
        final_synonyms_dict.update(synonyms_dict)

    except json.JSONDecodeError as e:
        failed_batches.append(batch)
        print(f"Error decoding GPT response: {e}")
    except Exception as e:
        # Deliberate best-effort: one failing batch must not abort the run.
        failed_batches.append(batch)
        print(f"Error generating synonyms for batch {batch}: {e}")

if failed_batches:
    print(f"{len(failed_batches)} of {len(batches)} batches failed; their terms are missing from the dictionary: {failed_batches}")

# Save the synonyms dictionary to a JSON file
output_file = "synonyms_dictionary.json"
with open(output_file, 'w') as f:
    json.dump(final_synonyms_dict, f, indent=4)

print(f"Synonyms dictionary saved to {output_file}")