In [409]:
import pandas as pd
import ast
from scipy.stats import norm
import numpy as np
from abbreviations import schwartz_hearst

# Transform NER to abstract level annotation 

In [410]:
pairs = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text='The emergency room (ER) was busy')
pairs

{'ER': 'emergency room'}

In [411]:
bert_annotated_files_path_prefix = "./predictions/rebuttal/bert/"
annotated_files_path_prefix = "./predictions/rebuttal/"
annotated_files_path_suffix = "20240529"
path_to_test_set = "../data/annotated_data/data_splits/stratified_entities/ct_neuro_test_merged_153.csv"

## Load annotations and Target Test Set

In [412]:
hugging_face_model_name_biolink = "michiyasunaga/BioLinkBERT-base"
model_name_str_biolink = hugging_face_model_name_biolink.split("/")[1]

In [413]:
hugging_face_model_name_biobert = "dmis-lab/biobert-v1.1"
model_name_str_biobert = hugging_face_model_name_biobert.split("/")[1]

In [414]:
hugging_face_model_name_bertbase = "bert-base-uncased"
model_name_str_bertbase = "bert-base-uncased"

In [415]:
# read annotated target dataset 
ds_annotated_full = pd.read_csv(path_to_test_set)
col_name_target_annot = "ner_manual_ct_target"
# select only relevant columns
df = ds_annotated_full[['nct_id', 'text', col_name_target_annot]]

In [416]:
biolinkbert_col = f'ner_prediction_{model_name_str_biolink}_normalized'
df_biolinkbert = pd.read_csv(bert_annotated_files_path_prefix + f"ct_neuro_test_annotated_{model_name_str_biolink}_{annotated_files_path_suffix}.csv")[['nct_id', biolinkbert_col]]
df = pd.merge(df, df_biolinkbert, on='nct_id', how='left')

In [417]:
biobert_col = f'ner_prediction_{model_name_str_biobert}_normalized'
file_to_read = bert_annotated_files_path_prefix + f"ct_neuro_test_annotated_{model_name_str_biobert}_20240531_NEW_Grouping.csv"
file_to_read
df_biobert = pd.read_csv(file_to_read)[['nct_id', biobert_col]]
df = pd.merge(df, df_biobert, on='nct_id', how='left')


In [418]:
bertbase_col = f'ner_prediction_{model_name_str_bertbase}_normalized'
df_bertbase = pd.read_csv(bert_annotated_files_path_prefix + f"ct_neuro_test_annotated_{model_name_str_bertbase}_{annotated_files_path_suffix}.csv")[['nct_id', bertbase_col]]
df = pd.merge(df, df_bertbase, on='nct_id', how='left')

In [419]:
df.head()

Unnamed: 0,nct_id,text,ner_manual_ct_target,ner_prediction_BioLinkBERT-base_normalized,ner_prediction_biobert-v1.1_normalized,ner_prediction_bert-base-uncased_normalized
0,NCT04045665,Anticoagulation for New-Onset Post-Operative A...,"[(0, 15, 'DRUG', 'Anticoagulation'), (20, 64, ...","[(0, 15, 'DRUG', 'anticoagulation'), (20, 64, ...","[(0, 15, 'DRUG', 'Anticoagulation'), (20, 64, ...","[(0, 4, 'DRUG', 'anti'), (20, 64, 'CONDITION',..."
1,NCT02311036,Effect and Outcome Predictors on Functional Re...,"[(59, 87, 'PHYSICAL', 'Comprehensive Rehabilit...","[(59, 87, 'OTHER', 'comprehensive rehabilitati...","[(59, 87, 'OTHER', 'Comprehensive Rehabilitati...","[(59, 87, 'OTHER', 'comprehensive rehabilitati..."
2,NCT05369793,Clinical Study Evaluating the Safety and Effic...,"[(53, 64, 'DRUG', 'Roflumilast'), (68, 83, 'CO...","[(53, 64, 'DRUG', 'roflumilast'), (68, 83, 'CO...","[(53, 64, 'DRUG', 'Roflumilast'), (68, 83, 'CO...","[(53, 64, 'DRUG', 'roflumilast'), (68, 83, 'CO..."
3,NCT04483310,Meditation-Relaxation (MR Therapy) for Sleep P...,"[(0, 21, 'BEHAVIOURAL', 'Meditation-Relaxation...","[(0, 4, 'PHYSICAL', 'medi'), (4, 21, 'BEHAVIOU...","[(0, 23, 'BEHAVIOURAL', 'Meditation - Relaxati...","[(0, 21, 'OTHER', 'meditation - relaxation'), ..."
4,NCT01418976,IMPROVING WALKING AND BALANCE IN VETERANS WITH...,"[(47, 69, 'CONDITION', 'TRAUMATIC BRAIN INJURY...","[(47, 69, 'CONDITION', 'traumatic brain injury...","[(33, 34, 'CONDITION', 'V'), (47, 69, 'CONDITI...","[(47, 69, 'CONDITION', 'traumatic brain injury..."


### add regex

In [420]:
df_regex = pd.read_csv(annotated_files_path_prefix + f"ct_neuro_test_annotated_regex_{annotated_files_path_suffix}.csv")[['nct_id', 'tokens','ner_prediction_regex_normalized']]
# Convert 'tokens' and 'ner_prediction_regex_normalized' columns to lists
df_regex['tokens'] = df_regex['tokens'].apply(lambda x: eval(x))
df_regex['ner_prediction_regex_normalized'] = df_regex['ner_prediction_regex_normalized'].apply(lambda x: eval(x))
df_regex

Unnamed: 0,nct_id,tokens,ner_prediction_regex_normalized
0,NCT04045665,"[Anticoagulation, for, New, -, Onset, Post, -,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,NCT02311036,"[Effect, and, Outcome, Predictors, on, Functio...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,NCT05369793,"[Clinical, Study, Evaluating, the, Safety, and...","[O, O, O, O, O, O, O, O, B-DRUG, O, O, O, O, O..."
3,NCT04483310,"[Meditation, -, Relaxation, (, MR, Therapy, ),...","[O, O, O, O, O, O, O, O, B-COND, I-COND, O, O,..."
4,NCT01418976,"[IMPROVING, WALKING, AND, BALANCE, IN, VETERAN...","[O, O, O, B-DRUG, O, O, O, O, B-COND, I-COND, ..."
...,...,...,...
148,NCT00433667,"[Epilepsy, Phase, III, Trial, |, The, purpose,...","[B-COND, O, O, O, O, O, O, O, O, O, O, O, O, O..."
149,NCT02139436,"[Hybrid, -, FES, Exercise, to, Prevent, Cardio...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
150,NCT02751905,"[A, Phase, 1, ,, Open, -, Label, Study, to, Ev...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
151,NCT03170856,"[The, Effects, of, a, Sub, -, maximal, Exercis...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-C..."


In [421]:
def extract_entities(tokens, annotations):
    entities = {'conditions': set(), 'drugs': set()}
    current_entity = None
    current_entity_tokens = []
    
    for token, annotation in zip(tokens, annotations):
        if annotation.startswith('B-'):
            #print(token, annotation)
            # If current entity is not None, add it to the corresponding list
            if current_entity:
                entities[current_entity].add(' '.join(current_entity_tokens).lower())
                current_entity_tokens = []
                
            # Determine the entity type
            entity_type = annotation.split('-')[1]
            current_entity = 'drugs' if entity_type == 'DRUG' else 'conditions'
            current_entity_tokens.append(token)
        elif annotation.startswith('I-'):
            # If there's a following token, add it to the current entity
            current_entity_tokens.append(token)
        else:
            if current_entity:
                entities[current_entity].add(' '.join(current_entity_tokens).lower())
                current_entity_tokens = []
            # If there's no entity, reset current_entity
            current_entity = None
            
    # Add the last entity if it exists
    if current_entity:
        entities[current_entity].add(' '.join(current_entity_tokens))
        
    # Convert sets to strings separated by '|' symbol
    conditions_str = '|'.join(entities['conditions'])
    drugs_str = '|'.join(entities['drugs'])
        
    return conditions_str, drugs_str
    #return entities

# Apply the function to each row
df_regex[['unique_condition_regex', 'unique_drug_regex']] = df_regex.apply(lambda row: pd.Series(extract_entities(row['tokens'], row['ner_prediction_regex_normalized'])), axis=1)


In [422]:
df_regex.head(10)

Unnamed: 0,nct_id,tokens,ner_prediction_regex_normalized,unique_condition_regex,unique_drug_regex
0,NCT04045665,"[Anticoagulation, for, New, -, Onset, Post, -,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",,
1,NCT02311036,"[Effect, and, Outcome, Predictors, on, Functio...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",,
2,NCT05369793,"[Clinical, Study, Evaluating, the, Safety, and...","[O, O, O, O, O, O, O, O, B-DRUG, O, O, O, O, O...",diabetic neuropathy,roflumilast
3,NCT04483310,"[Meditation, -, Relaxation, (, MR, Therapy, ),...","[O, O, O, O, O, O, O, O, B-COND, I-COND, O, O,...",paralyses|narcolepsy|sleep paralysis,aim
4,NCT01418976,"[IMPROVING, WALKING, AND, BALANCE, IN, VETERAN...","[O, O, O, B-DRUG, O, O, O, O, B-COND, I-COND, ...",brain injury,balance
5,NCT03689491,"[Investigating, the, Effects, of, Combining, r...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",stroke,no
6,NCT01075672,"[Outcomes, of, Cognitive, Behavioral, Therapy,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",,
7,NCT00701363,"[A, Prospective, ,, International, ,, Multi, -...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",,lanreotide|octreotide
8,NCT05534880,"[Analysis, of, Cortical, Activity, in, Individ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",,met|as
9,NCT02897024,"[A, Comparison, :, High, Intense, Periodic, vs...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-COND...",cerebral palsy,cp


In [423]:
df_regex = df_regex[['nct_id', 'unique_condition_regex', 'unique_drug_regex']]

In [424]:
df.columns

Index(['nct_id', 'text', 'ner_manual_ct_target',
       'ner_prediction_BioLinkBERT-base_normalized',
       'ner_prediction_biobert-v1.1_normalized',
       'ner_prediction_bert-base-uncased_normalized'],
      dtype='object')

In [425]:
df = pd.merge(df, df_regex, on='nct_id', how='left')
df.head()

Unnamed: 0,nct_id,text,ner_manual_ct_target,ner_prediction_BioLinkBERT-base_normalized,ner_prediction_biobert-v1.1_normalized,ner_prediction_bert-base-uncased_normalized,unique_condition_regex,unique_drug_regex
0,NCT04045665,Anticoagulation for New-Onset Post-Operative A...,"[(0, 15, 'DRUG', 'Anticoagulation'), (20, 64, ...","[(0, 15, 'DRUG', 'anticoagulation'), (20, 64, ...","[(0, 15, 'DRUG', 'Anticoagulation'), (20, 64, ...","[(0, 4, 'DRUG', 'anti'), (20, 64, 'CONDITION',...",,
1,NCT02311036,Effect and Outcome Predictors on Functional Re...,"[(59, 87, 'PHYSICAL', 'Comprehensive Rehabilit...","[(59, 87, 'OTHER', 'comprehensive rehabilitati...","[(59, 87, 'OTHER', 'Comprehensive Rehabilitati...","[(59, 87, 'OTHER', 'comprehensive rehabilitati...",,
2,NCT05369793,Clinical Study Evaluating the Safety and Effic...,"[(53, 64, 'DRUG', 'Roflumilast'), (68, 83, 'CO...","[(53, 64, 'DRUG', 'roflumilast'), (68, 83, 'CO...","[(53, 64, 'DRUG', 'Roflumilast'), (68, 83, 'CO...","[(53, 64, 'DRUG', 'roflumilast'), (68, 83, 'CO...",diabetic neuropathy,roflumilast
3,NCT04483310,Meditation-Relaxation (MR Therapy) for Sleep P...,"[(0, 21, 'BEHAVIOURAL', 'Meditation-Relaxation...","[(0, 4, 'PHYSICAL', 'medi'), (4, 21, 'BEHAVIOU...","[(0, 23, 'BEHAVIOURAL', 'Meditation - Relaxati...","[(0, 21, 'OTHER', 'meditation - relaxation'), ...",paralyses|narcolepsy|sleep paralysis,aim
4,NCT01418976,IMPROVING WALKING AND BALANCE IN VETERANS WITH...,"[(47, 69, 'CONDITION', 'TRAUMATIC BRAIN INJURY...","[(47, 69, 'CONDITION', 'traumatic brain injury...","[(33, 34, 'CONDITION', 'V'), (47, 69, 'CONDITI...","[(47, 69, 'CONDITION', 'traumatic brain injury...",brain injury,balance


### extract unique entities from BERT annotations

In [426]:
# aggregate annotations and sum up how often they were annotated
def extract_summary(annotation_list):
    annotation_list = eval(annotation_list)
    summary = {}
    for annotation in annotation_list:
        _, _, entity_type, entity_name = annotation
        entity_name = entity_name.lower()
        if entity_type not in summary:
            summary[entity_type] = {}
        if entity_name not in summary[entity_type]:
            summary[entity_type][entity_name] = 0
        summary[entity_type][entity_name] += 1
    return summary

In [427]:
df['summary_target_labels'] = df[col_name_target_annot].apply(extract_summary)

In [428]:
df.head(5)

Unnamed: 0,nct_id,text,ner_manual_ct_target,ner_prediction_BioLinkBERT-base_normalized,ner_prediction_biobert-v1.1_normalized,ner_prediction_bert-base-uncased_normalized,unique_condition_regex,unique_drug_regex,summary_target_labels
0,NCT04045665,Anticoagulation for New-Onset Post-Operative A...,"[(0, 15, 'DRUG', 'Anticoagulation'), (20, 64, ...","[(0, 15, 'DRUG', 'anticoagulation'), (20, 64, ...","[(0, 15, 'DRUG', 'Anticoagulation'), (20, 64, ...","[(0, 4, 'DRUG', 'anti'), (20, 64, 'CONDITION',...",,,"{'DRUG': {'anticoagulation': 1, 'oral anticoag..."
1,NCT02311036,Effect and Outcome Predictors on Functional Re...,"[(59, 87, 'PHYSICAL', 'Comprehensive Rehabilit...","[(59, 87, 'OTHER', 'comprehensive rehabilitati...","[(59, 87, 'OTHER', 'Comprehensive Rehabilitati...","[(59, 87, 'OTHER', 'comprehensive rehabilitati...",,,{'PHYSICAL': {'comprehensive rehabilitation': ...
2,NCT05369793,Clinical Study Evaluating the Safety and Effic...,"[(53, 64, 'DRUG', 'Roflumilast'), (68, 83, 'CO...","[(53, 64, 'DRUG', 'roflumilast'), (68, 83, 'CO...","[(53, 64, 'DRUG', 'Roflumilast'), (68, 83, 'CO...","[(53, 64, 'DRUG', 'roflumilast'), (68, 83, 'CO...",diabetic neuropathy,roflumilast,"{'DRUG': {'roflumilast': 2}, 'CONDITION': {'ty..."
3,NCT04483310,Meditation-Relaxation (MR Therapy) for Sleep P...,"[(0, 21, 'BEHAVIOURAL', 'Meditation-Relaxation...","[(0, 4, 'PHYSICAL', 'medi'), (4, 21, 'BEHAVIOU...","[(0, 23, 'BEHAVIOURAL', 'Meditation - Relaxati...","[(0, 21, 'OTHER', 'meditation - relaxation'), ...",paralyses|narcolepsy|sleep paralysis,aim,"{'BEHAVIOURAL': {'meditation-relaxation': 1, '..."
4,NCT01418976,IMPROVING WALKING AND BALANCE IN VETERANS WITH...,"[(47, 69, 'CONDITION', 'TRAUMATIC BRAIN INJURY...","[(47, 69, 'CONDITION', 'traumatic brain injury...","[(33, 34, 'CONDITION', 'V'), (47, 69, 'CONDITI...","[(47, 69, 'CONDITION', 'traumatic brain injury...",brain injury,balance,"{'CONDITION': {'traumatic brain injury': 1, 'c..."


In [429]:
print(df['summary_target_labels'][6])

{'BEHAVIOURAL': {'cognitive behavioral therapy': 1, 'cbt': 1, 'cognitive-behavioral therapy': 1}}


In [430]:
# Define a function to extract the unique conditions, drugs, and others from the 'ner_manual_final_annotated_ds' column
def extract_unique_entities_count(annotation_list, abbreviation_definition_pairs):
    unique_conditions = set()
    unique_drugs = set()
    unique_others = set()
    annotation_list = eval(annotation_list)
    for annotation in annotation_list:
        _, _, entity_type, entity_name = annotation
        if entity_name in abbreviation_definition_pairs:
            #print("Skipping entity {} as it is an ABBR".format(entity_name))
            continue
        entity_name = entity_name.lower()
        if entity_type == 'CONDITION':
            unique_conditions.add(entity_name)
        elif entity_type == 'DRUG':
            unique_drugs.add(entity_name)
        elif entity_type == 'OTHER':
            unique_others.add(entity_name)
    return len(unique_conditions), len(unique_drugs), len(unique_others)

def extract_unique_entities(annotation_list, abbreviation_definition_pairs, model="linkbert"):
    unique_conditions = set()
    unique_control = set()
    unique_physical = set()
    unique_behavioural = set()
    unique_surgical = set()
    unique_radiotherapy = set()
    unique_other = set()
    unique_drugs = set()

    annotation_list = eval(annotation_list)
    
    for annotation in annotation_list:
        _, _, entity_type, entity_name = annotation
        if entity_name.startswith("##"):
            continue  # THERE IS AN ISSUE WITH BIOBERT and BERT
        if (len(entity_name) == 1 or len(entity_name) == 2) and model == "biobert":
            continue  # ASSUME TOKENIZER ERROR IN BIOBERT
        # REPLACE ABBREVIATIONS WITH FULL FORM
        if entity_name in abbreviation_definition_pairs:
            entity_name = abbreviation_definition_pairs[entity_name] 
        if entity_name.upper() in abbreviation_definition_pairs:
            entity_name = abbreviation_definition_pairs[entity_name.upper()]
        entity_name = entity_name.lower()
        
        # Categorizing entities based on their type
        if entity_type == 'CONDITION':
            unique_conditions.add(entity_name)
        elif entity_type == 'CONTROL':
            unique_control.add(entity_name)
        elif entity_type == 'PHYSICAL':
            unique_physical.add(entity_name)
        elif entity_type == 'BEHAVIOURAL':
            unique_behavioural.add(entity_name)
        elif entity_type == 'SURGICAL':
            unique_surgical.add(entity_name)
        elif entity_type == 'RADIOTHERAPY':
            unique_radiotherapy.add(entity_name)
        elif entity_type == 'DRUG':
            unique_drugs.add(entity_name)
        elif entity_type == 'OTHER':
            unique_other.add(entity_name)
        
    # Preparing the output
    conditions_str = "|".join(sorted(list(unique_conditions)))
    control_str = "|".join(sorted(list(unique_control)))
    physical_str = "|".join(sorted(list(unique_physical)))
    behavioural_str = "|".join(sorted(list(unique_behavioural)))
    surgical_str = "|".join(sorted(list(unique_surgical)))
    radiotherapy_str = "|".join(sorted(list(unique_radiotherapy)))
    other_str = "|".join(sorted(list(unique_other)))
    drugs_str = "|".join(sorted(list(unique_drugs)))


    # Returning a tuple containing all the unique entities by category
    return conditions_str, drugs_str, control_str, physical_str, behavioural_str, surgical_str, radiotherapy_str, other_str


# Placeholder function to demonstrate applying the Schwartz-Hearst algorithm (Replace with actual implementation)
def extract_abbreviation_definition_pairs(doc_text):
    pairs = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=doc_text)
    return pairs

In [431]:
# Create a new column 'abbreviation_definition_pairs' using the 'apply' function
df['abbreviation_definition_pairs'] = df['text'].apply(extract_abbreviation_definition_pairs)

model_names = ["target", model_name_str_biolink, model_name_str_biobert, model_name_str_bertbase]  
model_cols = [col_name_target_annot, biolinkbert_col, biobert_col, bertbase_col]
i = 0
for model_name in model_names:
    results = df.apply(lambda row: extract_unique_entities(row[model_cols[i]], row['abbreviation_definition_pairs'], model=model_name), axis=1)
    df[f'unique_condition_{model_name}'] = results.apply(lambda x: x[0])
    df[f'unique_drug_{model_name}'] = results.apply(lambda x: x[1])
    df[f'unique_control_{model_name}'] = results.apply(lambda x: x[2])
    df[f'unique_physical_{model_name}'] = results.apply(lambda x: x[3])
    df[f'unique_behavioural_{model_name}'] = results.apply(lambda x: x[4])
    df[f'unique_surgical_{model_name}'] = results.apply(lambda x: x[5])
    df[f'unique_radiotherapy_{model_name}'] = results.apply(lambda x: x[6])
    df[f'unique_other_{model_name}'] = results.apply(lambda x: x[7])
    i += 1


In [432]:
df.head()

Unnamed: 0,nct_id,text,ner_manual_ct_target,ner_prediction_BioLinkBERT-base_normalized,ner_prediction_biobert-v1.1_normalized,ner_prediction_bert-base-uncased_normalized,unique_condition_regex,unique_drug_regex,summary_target_labels,abbreviation_definition_pairs,...,unique_radiotherapy_biobert-v1.1,unique_other_biobert-v1.1,unique_condition_bert-base-uncased,unique_drug_bert-base-uncased,unique_control_bert-base-uncased,unique_physical_bert-base-uncased,unique_behavioural_bert-base-uncased,unique_surgical_bert-base-uncased,unique_radiotherapy_bert-base-uncased,unique_other_bert-base-uncased
0,NCT04045665,Anticoagulation for New-Onset Post-Operative A...,"[(0, 15, 'DRUG', 'Anticoagulation'), (20, 64, ...","[(0, 15, 'DRUG', 'anticoagulation'), (20, 64, ...","[(0, 15, 'DRUG', 'Anticoagulation'), (20, 64, ...","[(0, 4, 'DRUG', 'anti'), (20, 64, 'CONDITION',...",,,"{'DRUG': {'anticoagulation': 1, 'oral anticoag...","{'OAC': 'oral anticoagulation', 'POAF': 'post-...",...,,,coronary artery bypass graft|isolated coronary...,anti|anticoagulation|oral anticoagulation,,,,,,
1,NCT02311036,Effect and Outcome Predictors on Functional Re...,"[(59, 87, 'PHYSICAL', 'Comprehensive Rehabilit...","[(59, 87, 'OTHER', 'comprehensive rehabilitati...","[(59, 87, 'OTHER', 'Comprehensive Rehabilitati...","[(59, 87, 'OTHER', 'comprehensive rehabilitati...",,,{'PHYSICAL': {'comprehensive rehabilitation': ...,{},...,,comprehensive rehabilitation,post - acute care - cerebrovascular diseases,,,,,,,comprehensive rehabilitation
2,NCT05369793,Clinical Study Evaluating the Safety and Effic...,"[(53, 64, 'DRUG', 'Roflumilast'), (68, 83, 'CO...","[(53, 64, 'DRUG', 'roflumilast'), (68, 83, 'CO...","[(53, 64, 'DRUG', 'Roflumilast'), (68, 83, 'CO...","[(53, 64, 'DRUG', 'roflumilast'), (68, 83, 'CO...",diabetic neuropathy,roflumilast,"{'DRUG': {'roflumilast': 2}, 'CONDITION': {'ty...",{},...,,,diabetic neuropathy|type 2 diabetic,roflumilast,,,,,,
3,NCT04483310,Meditation-Relaxation (MR Therapy) for Sleep P...,"[(0, 21, 'BEHAVIOURAL', 'Meditation-Relaxation...","[(0, 4, 'PHYSICAL', 'medi'), (4, 21, 'BEHAVIOU...","[(0, 23, 'BEHAVIOURAL', 'Meditation - Relaxati...","[(0, 21, 'OTHER', 'meditation - relaxation'), ...",paralyses|narcolepsy|sleep paralysis,aim,"{'BEHAVIOURAL': {'meditation-relaxation': 1, '...",{},...,,,na|narcolepsy|sleep paralyses|sleep paralysis,,sham,,,,,meditation - relaxation|meditation relaxation ...
4,NCT01418976,IMPROVING WALKING AND BALANCE IN VETERANS WITH...,"[(47, 69, 'CONDITION', 'TRAUMATIC BRAIN INJURY...","[(47, 69, 'CONDITION', 'traumatic brain injury...","[(33, 34, 'CONDITION', 'V'), (47, 69, 'CONDITI...","[(47, 69, 'CONDITION', 'traumatic brain injury...",brain injury,balance,"{'CONDITION': {'traumatic brain injury': 1, 'c...","{'IMT': 'Intensive Mobility Training', 'TBI': ...",...,,,"chronic, mild - to - moderate, traumatic brain...",,,intensive mobility training,,,,


In [433]:
filtered_columns = ['nct_id'] + [col for col in df.columns if 'unique' in col]
df_unique_labels = df[filtered_columns]

In [434]:
df_unique_labels.head(4)

Unnamed: 0,nct_id,unique_condition_regex,unique_drug_regex,unique_condition_target,unique_drug_target,unique_control_target,unique_physical_target,unique_behavioural_target,unique_surgical_target,unique_radiotherapy_target,...,unique_radiotherapy_biobert-v1.1,unique_other_biobert-v1.1,unique_condition_bert-base-uncased,unique_drug_bert-base-uncased,unique_control_bert-base-uncased,unique_physical_bert-base-uncased,unique_behavioural_bert-base-uncased,unique_surgical_bert-base-uncased,unique_radiotherapy_bert-base-uncased,unique_other_bert-base-uncased
0,NCT04045665,,,coronary artery bypass graft|isolated coronary...,anticoagulants|anticoagulation|antiplatelet th...,,,,,,...,,,coronary artery bypass graft|isolated coronary...,anti|anticoagulation|oral anticoagulation,,,,,,
1,NCT02311036,,,-cerebrovascular diseases,,,comprehensive rehabilitation,,,,...,,comprehensive rehabilitation,post - acute care - cerebrovascular diseases,,,,,,,comprehensive rehabilitation
2,NCT05369793,diabetic neuropathy,roflumilast,diabetic neuropathy|type 2 diabetic,roflumilast,,,,,,...,,,diabetic neuropathy|type 2 diabetic,roflumilast,,,,,,
3,NCT04483310,paralyses|narcolepsy|sleep paralysis,aim,narcolepsy|sleep paralyses|sleep paralysis,,sham,,meditation relaxation therapy|meditation-relax...,,,...,,,na|narcolepsy|sleep paralyses|sleep paralysis,,sham,,,,,meditation - relaxation|meditation relaxation ...


In [435]:
# Your function to remove spaces around specific characters
def cleanup(text):
    if isinstance(text, str):  # Check if the cell value is a string
        text = text.replace(" ' ", "'")  # Remove spaces around '
        text = text.replace("' s", "'s")  # Remove spaces around '
        text = text.replace(" - ", "-")  # Remove spaces around -
        text = text.replace(" / ", "/")  # Remove spaces around /
        text = text.replace("( ", "(")  # Remove spaces around (
        text = text.replace(" )", ")")  # Remove spaces around )
        #text = text.replace("place ", "placebo")  # Remove spaces around )
    return text

# Apply the function to each cell in the DataFrame
df_unique_labels = df_unique_labels.applymap(cleanup)

  df_unique_labels = df_unique_labels.applymap(cleanup)


In [436]:
df_unique_labels.head(2)

Unnamed: 0,nct_id,unique_condition_regex,unique_drug_regex,unique_condition_target,unique_drug_target,unique_control_target,unique_physical_target,unique_behavioural_target,unique_surgical_target,unique_radiotherapy_target,...,unique_radiotherapy_biobert-v1.1,unique_other_biobert-v1.1,unique_condition_bert-base-uncased,unique_drug_bert-base-uncased,unique_control_bert-base-uncased,unique_physical_bert-base-uncased,unique_behavioural_bert-base-uncased,unique_surgical_bert-base-uncased,unique_radiotherapy_bert-base-uncased,unique_other_bert-base-uncased
0,NCT04045665,,,coronary artery bypass graft|isolated coronary...,anticoagulants|anticoagulation|antiplatelet th...,,,,,,...,,,coronary artery bypass graft|isolated coronary...,anti|anticoagulation|oral anticoagulation,,,,,,
1,NCT02311036,,,-cerebrovascular diseases,,,comprehensive rehabilitation,,,,...,,comprehensive rehabilitation,post-acute care-cerebrovascular diseases,,,,,,,comprehensive rehabilitation


### add AACT labels intervention

In [437]:
df_unique_labels.shape

(153, 35)

In [438]:
df_aact_1 = pd.read_csv("../data/data_aact_sample/combined_neuro_trials_with_interventions_20240412.csv")[['nct_id', 'intervention_name', 'intervention_type']]
df_aact_2 = pd.read_csv("../data/data_aact_sample/aact_neuro_samples__20240513_minority_classes_annotated.csv")[['nct_id', 'intervention_names', 'intervention_types']]
df_aact_3 = pd.read_csv("../data/data_aact_sample/aact_neuro_samples_20240422_non_drug_biological_dietary_annotated.csv")[['nct_id', 'intervention_names', 'intervention_types']]
df_aact_2.rename(columns={'intervention_names': 'intervention_name', 'intervention_types': 'intervention_type'}, inplace=True)
df_aact_3.rename(columns={'intervention_names': 'intervention_name', 'intervention_types': 'intervention_type'}, inplace=True)
                 
df_aact = pd.concat([df_aact_1, df_aact_2, df_aact_3], ignore_index=True)

df_aact_filtered = df_aact[df_aact['nct_id'].isin(df_unique_labels['nct_id'])].drop_duplicates()
df_aact_filtered

Unnamed: 0,nct_id,intervention_name,intervention_type
1546,NCT03529708,3D CRT plus SBRT boost,Radiation
6740,NCT02311036,Comprehensive Rehabilitation,Other
7834,NCT04232163,Arm Boot Camp,Behavioral
9409,NCT04164810,Hydrotherapy,Other
9415,NCT04164810,Physical Therapy,Other
...,...,...,...
504210,NCT01824472,CBT|CC|CPAP|sham CPAP,Behavioral|Device
504432,NCT03799887,0% unweighed BWSTT|10% unweighed BWSTT|20% unw...,Behavioral
504512,NCT04331392,Educational Videos|Spatial Navigation Interven...,Behavioral
504585,NCT04992910,Functional electrical stimulation through medi...,Device|Other


In [439]:
mapping = {
    'Behavioral': 'behavioural',
    'Device': 'other',
    'Dietary Supplement': 'other',
    'Drug': 'drug',
    'Biological': 'drug',
    'Genetic': 'other',
    'Other': 'other',
    'Device|Other': 'other',
    'Procedure': 'other',
    'Other|Procedure': 'other',
    'Drug|Radiation': 'radiotherapy',
    'Radiation': 'radiotherapy',
    'Combination Product': 'other'
}

# Replace values using the mapping
df_aact_filtered['intervention_type'] = df_aact_filtered['intervention_type'].replace(mapping)
df_aact_filtered.loc[df_aact_filtered['intervention_name'].str.contains('physical', case=False), 'intervention_type'] = 'physical'
df_aact_filtered.loc[df_aact_filtered['intervention_name'].str.contains('education', case=False), 'intervention_type'] = 'behavioural'
df_aact_filtered.loc[df_aact_filtered['intervention_name'].str.contains('sham', case=False), 'intervention_type'] = 'control'
df_aact_filtered.loc[df_aact_filtered['intervention_name'].str.contains('control', case=False), 'intervention_type'] = 'control'
df_aact_filtered.loc[df_aact_filtered['intervention_name'].str.contains('placebo', case=False), 'intervention_type'] = 'control'
#df_aact_filtered = df_aact_filtered[df_aact_filtered['intervention_type'] != 'drug']

df_aact_filtered.head()

Unnamed: 0,nct_id,intervention_name,intervention_type
1546,NCT03529708,3D CRT plus SBRT boost,radiotherapy
6740,NCT02311036,Comprehensive Rehabilitation,other
7834,NCT04232163,Arm Boot Camp,behavioural
9409,NCT04164810,Hydrotherapy,other
9415,NCT04164810,Physical Therapy,physical


In [440]:
set(df_aact_filtered['intervention_type'])

{'behavioural', 'control', 'drug', 'other', 'physical', 'radiotherapy'}

In [441]:
# Pivot the DataFrame
pivot_df = pd.pivot_table(df_aact_filtered, index='nct_id', columns='intervention_type', values='intervention_name', aggfunc=lambda x: '|'.join(x))

# Rename the columns
pivot_df.columns = [f'unique_{col}_aact' for col in pivot_df.columns]

# Reset index
pivot_df.reset_index(inplace=True)
for col in pivot_df.columns:
    if col != 'nct_id':
        pivot_df[col] = pivot_df[col].apply(lambda x: x.lower() if isinstance(x, str) else x)
        
pivot_df['unique_surgical_aact'] = ''
pivot_df = pivot_df.fillna('')

pivot_df

Unnamed: 0,nct_id,unique_behavioural_aact,unique_control_aact,unique_drug_aact,unique_other_aact,unique_physical_aact,unique_radiotherapy_aact,unique_surgical_aact
0,NCT00029146,,,best medical therapy,extracranial-intracranial bypass surgery,,,
1,NCT00073853,,,,autologous incubated macrophages (cell therapy),,,
2,NCT00105638,"educational program, by telephone",,,,,,
3,NCT00135993,,,rotigotine,,,,
4,NCT00164658,familial risk assessment and personalized prev...,,,,,,
...,...,...,...,...,...,...,...,...
148,NCT05670158,,,blood sampling,perilymph sampling,,,
149,NCT05706831,,,,combinatory effect of music stimulation and ne...,,,
150,NCT05716074,,,,supervised exercise|home exercise,,,
151,NCT05741853,video-implemented script training for aphasia ...,,,,,,


In [442]:
df_eval_aact = df_unique_labels.copy()
df_eval_aact = pd.merge(df_eval_aact, pivot_df, on='nct_id', how='left')
df_eval_aact = df_eval_aact.fillna('')

df_eval_aact.head()

Unnamed: 0,nct_id,unique_condition_regex,unique_drug_regex,unique_condition_target,unique_drug_target,unique_control_target,unique_physical_target,unique_behavioural_target,unique_surgical_target,unique_radiotherapy_target,...,unique_surgical_bert-base-uncased,unique_radiotherapy_bert-base-uncased,unique_other_bert-base-uncased,unique_behavioural_aact,unique_control_aact,unique_drug_aact,unique_other_aact,unique_physical_aact,unique_radiotherapy_aact,unique_surgical_aact
0,NCT04045665,,,coronary artery bypass graft|isolated coronary...,anticoagulants|anticoagulation|antiplatelet th...,,,,,,...,,,,,,antiplatelet-only strategy|oral anticoagulant ...,,,,
1,NCT02311036,,,-cerebrovascular diseases,,,comprehensive rehabilitation,,,,...,,,comprehensive rehabilitation,,,,comprehensive rehabilitation,,,
2,NCT05369793,diabetic neuropathy,roflumilast,diabetic neuropathy|type 2 diabetic,roflumilast,,,,,,...,,,,,,alpha lipoic acid|roflumilast,,,,
3,NCT04483310,paralyses|narcolepsy|sleep paralysis,aim,narcolepsy|sleep paralyses|sleep paralysis,,sham,,meditation relaxation therapy|meditation-relax...,,,...,,,meditation-relaxation|meditation relaxation th...,mr therapy.|breathing-distraction exercise,,,,,,
4,NCT01418976,brain injury,balance,"chronic, mild-to-moderate, traumatic brain inj...",,,intensive mobility training,,,,...,,,,intensive mobility training (imt),,,,,,


### add AACT conditions

In [443]:
df_aact_1 = pd.read_csv("../data/data_aact_sample/combined_neuro_trials_with_interventions_20240412.csv")[['nct_id', 'Neurological Disease']]

df_aact_2 = pd.read_csv("../data/data_aact_sample/aact_neuro_samples__20240513_minority_classes_annotated.csv")[['nct_id', 'conditions']]
df_aact_3 = pd.read_csv("../data/data_aact_sample/aact_neuro_samples_20240422_non_drug_biological_dietary_annotated.csv")[['nct_id', 'conditions']]
df_aact_1.rename(columns={'Neurological Disease': 'conditions'}, inplace=True)

#df_aact_3.rename(columns={'intervention_names': 'intervention_name', 'intervention_types': 'intervention_type'}, inplace=True)
                 
df_aact = pd.concat([df_aact_1, df_aact_2, df_aact_3], ignore_index=True)
df_aact = df_aact.groupby('nct_id')['conditions'].agg('|'.join).reset_index()

df_aact_filtered = df_aact[df_aact['nct_id'].isin(df_unique_labels['nct_id'])].drop_duplicates()
df_aact_filtered.rename(columns={'conditions': 'unique_condition_aact'}, inplace=True)

def process_conditions(conditions):
    # Splitting the conditions, converting to lowercase, removing duplicates, and re-joining
    unique_conditions = set(condition.strip().lower() for condition in conditions.split('|'))
    return '|'.join(unique_conditions)

# Apply the function to the 'unique_condition_aact' column
df_aact_filtered['unique_condition_aact'] = df_aact_filtered['unique_condition_aact'].apply(process_conditions)

df_aact_filtered

Unnamed: 0,nct_id,unique_condition_aact
554,NCT00029146,"stroke|ischemic attack, transient|cerebral inf..."
960,NCT00073853,spinal cord injury
1220,NCT00105638,dementia|alzheimer disease
1528,NCT00135993,restless legs syndrome
1811,NCT00164658,stroke|coronary heart disease|diabetes|ovarian...
...,...,...
41749,NCT05670158,sensorineural hearing loss
42003,NCT05706831,stroke|disorder of consciousness
42062,NCT05716074,amyotrophic lateral sclerosis
42248,NCT05741853,neurocognitive disorders|primary progressive a...


In [444]:
df_eval = df_eval_aact.copy()
df_eval = pd.merge(df_eval, df_aact_filtered, on='nct_id', how='left')
df_eval = df_eval.fillna('')

df_eval.head()

Unnamed: 0,nct_id,unique_condition_regex,unique_drug_regex,unique_condition_target,unique_drug_target,unique_control_target,unique_physical_target,unique_behavioural_target,unique_surgical_target,unique_radiotherapy_target,...,unique_radiotherapy_bert-base-uncased,unique_other_bert-base-uncased,unique_behavioural_aact,unique_control_aact,unique_drug_aact,unique_other_aact,unique_physical_aact,unique_radiotherapy_aact,unique_surgical_aact,unique_condition_aact
0,NCT04045665,,,coronary artery bypass graft|isolated coronary...,anticoagulants|anticoagulation|antiplatelet th...,,,,,,...,,,,,antiplatelet-only strategy|oral anticoagulant ...,,,,,stroke|bleeding|atrial fibrillation
1,NCT02311036,,,-cerebrovascular diseases,,,comprehensive rehabilitation,,,,...,,comprehensive rehabilitation,,,,comprehensive rehabilitation,,,,cerebrovascular diseases
2,NCT05369793,diabetic neuropathy,roflumilast,diabetic neuropathy|type 2 diabetic,roflumilast,,,,,,...,,,,,alpha lipoic acid|roflumilast,,,,,type 2 diabetes (adult onset)|diabetic neuropa...
3,NCT04483310,paralyses|narcolepsy|sleep paralysis,aim,narcolepsy|sleep paralyses|sleep paralysis,,sham,,meditation relaxation therapy|meditation-relax...,,,...,,meditation-relaxation|meditation relaxation th...,mr therapy.|breathing-distraction exercise,,,,,,,sleep paralysis|narcolepsy type 1
4,NCT01418976,brain injury,balance,"chronic, mild-to-moderate, traumatic brain inj...",,,intensive mobility training,,,,...,,,intensive mobility training (imt),,,,,,,"difficulties, ambulation|brain injuries, traum..."


In [445]:
df_eval.shape

(153, 43)

### add GPT

In [446]:
df_gpt_3_5 = pd.read_csv("./predictions/rebuttal/ct_neuro_test_annotated_gpt-3.5-turbo_ALL_types_20240529_clean.csv")
df_gpt_4 = pd.read_csv("./predictions/rebuttal/ct_neuro_test_annotated_gpt-4_ALL_types_20240529_clean.csv")

def clean_and_rename_columns(df, model_name):
    # Filter out columns starting with "Unnamed"
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df = df.loc[:, ~df.columns.str.contains('text')]

    
    # Rename the columns based on your criteria
    new_column_names = {
        'nct_id': 'nct_id'
    }

    # For each of the remaining columns, create a new column name based on your template
    for col in df.columns:
        #print(col)
        if col.startswith(f'{model_name}_predictions'):
            if 'condition' in col or 'drug' in col:
                entity_type = col.split('_')[-2] + "_" + col.split('_')[-1]
            else:
                # Extract the entity type from the column name
                entity_type = col.split('_')[-1]
            # Construct the new column name
            new_column_name = f'unique_{entity_type}_{model_name}'
            new_column_names[col] = new_column_name

    # Apply the new column names
    df.rename(columns=new_column_names, inplace=True)
    
    return df

# Clean and rename columns for each DataFrame
df_gpt_3_5_cleaned = clean_and_rename_columns(df_gpt_3_5, "gpt-3.5-turbo")
df_gpt_4_cleaned = clean_and_rename_columns(df_gpt_4, "gpt-4")

In [447]:
df_gpt_3_5_cleaned.head(2)

Unnamed: 0,nct_id,unique_behavioural_gpt-3.5-turbo,unique_surgical_gpt-3.5-turbo,unique_radiotherapy_gpt-3.5-turbo,unique_physical_gpt-3.5-turbo,unique_other_gpt-3.5-turbo,unique_control_gpt-3.5-turbo,unique_condition_v1_gpt-3.5-turbo,unique_drug_v1_gpt-3.5-turbo,unique_condition_v2_gpt-3.5-turbo,unique_drug_v2_gpt-3.5-turbo
0,NCT00029146,none,extracranial-intracranial bypass surgery,none,best medical therapy|extracranial-intracranial...,none,best medical therapy,carotid occlusion|stroke|none,best medical therapy|none|extracranial-intracr...,carotid occlusion|increased cerebral oxygen ex...,none
1,NCT00073853,none,autologous incubated macrophages,none,autologous incubated macrophages,autologous incubated macrophages,none,none|spinal cord injuries,autologous incubated macrophages|procord,complete spinal cord injuries|loss of sensory ...,autologous incubated macrophages


In [448]:
df_gpt_4_cleaned.head(2)

Unnamed: 0,nct_id,unique_behavioural_gpt-4,unique_surgical_gpt-4,unique_radiotherapy_gpt-4,unique_physical_gpt-4,unique_other_gpt-4,unique_control_gpt-4,unique_condition_v1_gpt-4,unique_drug_v1_gpt-4,unique_condition_v2_gpt-4,unique_drug_v2_gpt-4
0,NCT00029146,,extracranial-intracranial bypass surgery,,extracranial-intracranial bypass surgery,best medical therapy|positron emission tomogra...,best medical therapy,carotid occlusion|stroke,none,carotid occlusion|increased cerebral oxygen ex...,none
1,NCT00073853,none,none,none,autologous incubated macrophages (procord) tre...,autologous incubated macrophages (procord),none,complete spinal cord injuries,autologous incubated macrophages|procord,complete spinal cord injuries|loss of sensory ...,autologous incubated macrophages (procord)


In [449]:
df_gpt_4_cleaned.shape

(153, 11)

In [450]:
df_eval = pd.merge(df_eval, df_gpt_3_5_cleaned, on='nct_id', how='left')
df_eval = pd.merge(df_eval, df_gpt_4_cleaned, on='nct_id', how='left')
#df_eval.drop('unique_conditions_target', axis=1, inplace=True)

In [451]:
df_eval.head()

Unnamed: 0,nct_id,unique_condition_regex,unique_drug_regex,unique_condition_target,unique_drug_target,unique_control_target,unique_physical_target,unique_behavioural_target,unique_surgical_target,unique_radiotherapy_target,...,unique_behavioural_gpt-4,unique_surgical_gpt-4,unique_radiotherapy_gpt-4,unique_physical_gpt-4,unique_other_gpt-4,unique_control_gpt-4,unique_condition_v1_gpt-4,unique_drug_v1_gpt-4,unique_condition_v2_gpt-4,unique_drug_v2_gpt-4
0,NCT04045665,,,coronary artery bypass graft|isolated coronary...,anticoagulants|anticoagulation|antiplatelet th...,,,,,,...,none,coronary artery bypass graft (cabg) surgery,none,none,oral anticoagulation (oac)|antiplatelets|antic...,background antiplatelet therapy,atrial fibrillation|coronary artery bypass gra...,oral anticoagulation (oac)|antiplatelets|antic...,new-onset post-operative atrial fibrillation|t...,none
1,NCT02311036,,,-cerebrovascular diseases,,,comprehensive rehabilitation,,,,...,,,,,none,,cerebrovascular diseases,none,cerebrovascular diseases,
2,NCT05369793,diabetic neuropathy,roflumilast,diabetic neuropathy|type 2 diabetic,roflumilast,,,,,,...,,,,roflumilast,roflumilast,,diabetic neuropathy|type 2 diabetes,roflumilast,glycemic parameters|insulin resistance|type 2 ...,roflumilast
3,NCT04483310,paralyses|narcolepsy|sleep paralysis,aim,narcolepsy|sleep paralyses|sleep paralysis,,sham,,meditation relaxation therapy|meditation-relax...,,,...,meditation relaxation therapy,,,meditation relaxation therapy,sham|meditation relaxation therapy,sham,sleep paralysis|narcolepsy,none,sleep paralysis|narcolepsy,none
4,NCT01418976,brain injury,balance,"chronic, mild-to-moderate, traumatic brain inj...",,,intensive mobility training,,,,...,"3 hours of rehabilitation per day for 20 days,...",none,none,"3 hours of rehabilitation per day for 20 days,...",magnetic resonance imaging (mri) exam|rehabili...,none,traumatic brain injury,none,locomotor impairment|gait and balance issues|t...,intensive mobility training (imt)


In [452]:
df_eval.shape

(153, 63)

In [453]:
file_with_all_annotations = "./predictions/rebuttal/all_models_aggregated_all_entity_types.csv"

In [454]:
df_eval.to_csv(file_with_all_annotations)

## Map to numerical arrays

In [455]:
import pandas as pd
import difflib

# Example dataframe
data = {
    'unique_condition_target': ['','comprehensive rehabilitation','stroke|heart attack', 'diabetes|hypertension', 'carbon-14 bia 28-6156|bia 28-6156', 'the alzheimer', 'mild schizophrenia| schizophrenia'],
    'unique_condition_BioLinkBERT-base': ['','','stroke|heart failure', 'hypertension|diabetes type 2', 'bia 28-6156', 'the diabetes', 'mild to severe schizophrenia']
}

df = pd.DataFrame(data)

# Function to check for partial matches
def partial_match_simple(predicted, target):
    predicted_parts = predicted.lower().split()
    target_parts = target.lower().split()
    return any(pred_part in target_part or target_part in pred_part for pred_part in predicted_parts for target_part in target_parts)

# Function to check for partial matches using difflib
def partial_match(predicted, target, cutoff=0.6):
    # Split the conditions into individual terms
    predicted_terms = predicted.split('|')
    target_terms = target.split('|')
    
    # Check for matches across all terms
    matches = []
    for pred in predicted_terms:
        # Use difflib to find close matches, with a cutoff for match quality
        match_found = any(difflib.get_close_matches(pred, target_terms, n=1, cutoff=cutoff))
        matches.append(match_found)
    
    # Return True if any match is found
    return any(matches)

def handle_empty_arrays(target_entities, predicted_entities):
    # Construct the results based on whether target_entities or predicted_entities are empty
    if target_entities == [''] and predicted_entities == ['']:
        return pd.Series([
            [0],  # No target entities to match against, so exact match is [0]
            [0],  # No predicted entities to match against, so exact match is [0]
            [0],  # No target entities for partial matches, so partial match is [0]
            [0]   # No predicted entities for partial matches, so partial match is [0]
        ], index=[
            f'target_{entity}_annotations_exact_{model}',
            f'predicted_{entity}_annotations_exact_{model}',
            f'target_{entity}_annotations_partial_{model}',
            f'predicted_{entity}_annotations_partial_{model}'
        ])
    elif target_entities == ['']:
        return pd.Series([
            [0] * len(predicted_entities),  
            [1] * len(predicted_entities),  
            [0] * len(predicted_entities), 
            [1] * len(predicted_entities)  
        ], index=[
            f'target_{entity}_annotations_exact_{model}',
            f'predicted_{entity}_annotations_exact_{model}',
            f'target_{entity}_annotations_partial_{model}',
            f'predicted_{entity}_annotations_partial_{model}'
        ])
    # Handle case where predicted_entities is empty, if required
    else: # predicted_entities == ['']:
        return pd.Series([
            [1] * len(target_entities), 
            [0] * len(target_entities),  
            [1] * len(target_entities),  
            [0] * len(target_entities)   
        ], index=[
            f'target_{entity}_annotations_exact_{model}',
            f'predicted_{entity}_annotations_exact_{model}',
            f'target_{entity}_annotations_partial_{model}',
            f'predicted_{entity}_annotations_partial_{model}'
        ])

# Function to compute exact and partial annotations for predicted and target conditions
def compute_annotations(row, entity, model):

    target_col = f'unique_{entity.replace("_v1","").replace("_v2","")}_target'
    model_col = f'unique_{entity}_{model}'
    
    target_entities = row[target_col].split('|')
    predicted_entities = row[model_col].split('|')

    if target_entities == [''] or predicted_entities == ['']:
        return handle_empty_arrays(target_entities, predicted_entities)

    # Create a set of all unique entities from both target and predicted for consistent indexing
    all_entities = set(target_entities + predicted_entities)

    # Target annotations (exact)
    target_annotations_exact = [1 if entity in target_entities else 0 for entity in all_entities]

    # Predicted annotations (exact)
    predicted_annotations_exact = [1 if entity in predicted_entities else 0 for entity in all_entities]

    # Target annotations (partial)
    target_annotations_partial = [1 if any(partial_match(entity, pred) for pred in predicted_entities) else 0 for entity in all_entities]

    # Predicted annotations (partial)
    predicted_annotations_partial = [1 if any(partial_match(pred, entity) for entity in target_entities) else 0 for pred in all_entities]

    return pd.Series([
        target_annotations_exact,
        predicted_annotations_exact,
        target_annotations_partial,
        predicted_annotations_partial
    ], index=[
        f'target_{entity}_annotations_exact_{model}',
        f'predicted_{entity}_annotations_exact_{model}',
        f'target_{entity}_annotations_partial_{model}',
        f'predicted_{entity}_annotations_partial_{model}'
    ])

# List of entity types and models based on your dataframe structure
entities = ['condition']#, 'drug', 'control', 'physical', 'behavioural', 'surgical', 'radiotherapy', 'other']
models = ['BioLinkBERT-base']#, 'biobert-v1.1', 'bert-base-uncased', 'aact']

# Apply the annotation functions to each row for each entity-model combination
for entity in entities:
    for model in models:
        annotations_cols = [
            f'target_{entity}_annotations_exact_{model}',
            f'predicted_{entity}_annotations_exact_{model}',
            f'target_{entity}_annotations_partial_{model}',
            f'predicted_{entity}_annotations_partial_{model}'
        ]
        df[annotations_cols] = df.apply(lambda row: compute_annotations(row, entity, model), axis=1)
df

Unnamed: 0,unique_condition_target,unique_condition_BioLinkBERT-base,target_condition_annotations_exact_BioLinkBERT-base,predicted_condition_annotations_exact_BioLinkBERT-base,target_condition_annotations_partial_BioLinkBERT-base,predicted_condition_annotations_partial_BioLinkBERT-base
0,,,[0],[0],[0],[0]
1,comprehensive rehabilitation,,[1],[0],[1],[0]
2,stroke|heart attack,stroke|heart failure,"[1, 1, 0]","[1, 0, 1]","[1, 0, 1]","[1, 1, 0]"
3,diabetes|hypertension,hypertension|diabetes type 2,"[1, 1, 0]","[1, 0, 1]","[1, 1, 1]","[1, 1, 1]"
4,carbon-14 bia 28-6156|bia 28-6156,bia 28-6156,"[1, 1]","[0, 1]","[1, 1]","[1, 1]"
5,the alzheimer,the diabetes,"[1, 0]","[0, 1]","[0, 1]","[1, 0]"
6,mild schizophrenia| schizophrenia,mild to severe schizophrenia,"[0, 1, 1]","[1, 0, 0]","[1, 1, 1]","[1, 1, 1]"


In [456]:
df_eval_arrays = df_eval.copy()
df_eval_arrays.replace([pd.NA, pd.NaT, 'none', 'None', 'NONE'], '', inplace=True)
df_eval_arrays.head(2)

Unnamed: 0,nct_id,unique_condition_regex,unique_drug_regex,unique_condition_target,unique_drug_target,unique_control_target,unique_physical_target,unique_behavioural_target,unique_surgical_target,unique_radiotherapy_target,...,unique_behavioural_gpt-4,unique_surgical_gpt-4,unique_radiotherapy_gpt-4,unique_physical_gpt-4,unique_other_gpt-4,unique_control_gpt-4,unique_condition_v1_gpt-4,unique_drug_v1_gpt-4,unique_condition_v2_gpt-4,unique_drug_v2_gpt-4
0,NCT04045665,,,coronary artery bypass graft|isolated coronary...,anticoagulants|anticoagulation|antiplatelet th...,,,,,,...,,coronary artery bypass graft (cabg) surgery,,,oral anticoagulation (oac)|antiplatelets|antic...,background antiplatelet therapy,atrial fibrillation|coronary artery bypass gra...,oral anticoagulation (oac)|antiplatelets|antic...,new-onset post-operative atrial fibrillation|t...,
1,NCT02311036,,,-cerebrovascular diseases,,,comprehensive rehabilitation,,,,...,,,,,,,cerebrovascular diseases,,cerebrovascular diseases,


In [457]:
df_eval_arrays.shape

(153, 63)

In [458]:
entities = ['condition', 'drug', 'control', 'physical', 'behavioural', 'surgical', 'radiotherapy', 'other']
models = ['BioLinkBERT-base', 'biobert-v1.1', 'bert-base-uncased', 'gpt-3.5-turbo', 'gpt-4','aact','regex']

# Initialize an empty list to collect DataFrames
new_columns = []
# Apply the annotation functions to each row for each entity-model combination
for entity in entities:
    for model in models:
        if (entity == 'condition' or entity == 'drug') and 'gpt' in model:
            continue    
        if (entity != 'condition' and entity != 'drug') and 'regex' in model:
            continue     
        annotations_cols = [
            f'target_{entity}_annotations_exact_{model}',
            f'predicted_{entity}_annotations_exact_{model}',
            f'target_{entity}_annotations_partial_{model}',
            f'predicted_{entity}_annotations_partial_{model}'
        ]
        #df_eval_arrays[annotations_cols] = df_eval_arrays.apply(lambda row: compute_annotations(row, entity, model), axis=1)
        # Apply function and collect results in a temporary DataFrame
        df_temp = df_eval_arrays.apply(lambda row: compute_annotations(row, entity, model), axis=1)
        new_columns.append(df_temp[annotations_cols])

print(len(new_columns))
# Concatenate all new columns into the original DataFrame
df_eval_arrays = pd.concat([df_eval_arrays] + new_columns, axis=1)

entities = ['condition_v1', 'condition_v2', 'drug_v1', 'drug_v2']
models = ['gpt-3.5-turbo', 'gpt-4']
for entity in entities:
    for model in models:    
        annotations_cols = [
            f'target_{entity}_annotations_exact_{model}',
            f'predicted_{entity}_annotations_exact_{model}',
            f'target_{entity}_annotations_partial_{model}',
            f'predicted_{entity}_annotations_partial_{model}'
        ]
        df_eval_arrays[annotations_cols] = df_eval_arrays.apply(lambda row: compute_annotations(row, entity, model), axis=1)


46


In [459]:
df_eval_arrays.shape

(153, 279)

In [460]:
df_eval_arrays

Unnamed: 0,nct_id,unique_condition_regex,unique_drug_regex,unique_condition_target,unique_drug_target,unique_control_target,unique_physical_target,unique_behavioural_target,unique_surgical_target,unique_radiotherapy_target,...,target_drug_v1_annotations_partial_gpt-4,predicted_drug_v1_annotations_partial_gpt-4,target_drug_v2_annotations_exact_gpt-3.5-turbo,predicted_drug_v2_annotations_exact_gpt-3.5-turbo,target_drug_v2_annotations_partial_gpt-3.5-turbo,predicted_drug_v2_annotations_partial_gpt-3.5-turbo,target_drug_v2_annotations_exact_gpt-4,predicted_drug_v2_annotations_exact_gpt-4,target_drug_v2_annotations_partial_gpt-4,predicted_drug_v2_annotations_partial_gpt-4
0,NCT04045665,,,coronary artery bypass graft|isolated coronary...,anticoagulants|anticoagulation|antiplatelet th...,,,,,,...,"[1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1]","[0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]","[0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]","[0, 0, 0, 0, 0]","[1, 1, 1, 1, 1]","[0, 0, 0, 0, 0]"
1,NCT02311036,,,-cerebrovascular diseases,,,comprehensive rehabilitation,,,,...,[0],[0],[0],[0],[0],[0],[0],[0],[0],[0]
2,NCT05369793,diabetic neuropathy,roflumilast,diabetic neuropathy|type 2 diabetic,roflumilast,,,,,,...,[1],[1],[1],[1],[1],[1],[1],[1],[1],[1]
3,NCT04483310,paralyses|narcolepsy|sleep paralysis,aim,narcolepsy|sleep paralyses|sleep paralysis,,sham,,meditation relaxation therapy|meditation-relax...,,,...,[0],[0],[0],[0],[0],[0],[0],[0],[0],[0]
4,NCT01418976,brain injury,balance,"chronic, mild-to-moderate, traumatic brain inj...",,,intensive mobility training,,,,...,[0],[0],[0],[0],[0],[0],[0],[1],[0],[1]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,NCT00433667,seizures|epilepsy,as,epilepsy|partial onset seizures,rwj-333369,,,,,,...,[1],[1],[1],[1],[1],[1],[1],[1],[1],[1]
149,NCT02139436,immobility,as,acute sci|cardiovascular declines|cardiovascul...,,arms-only exercise|arms-only-rt|time (wait-lis...,fes-rt|functional electrical stimulation (fes)...,,,,...,[0],[0],[0],[1],[0],[1],[0],[0],[0],[0]
150,NCT02751905,,balance|whole blood,,[14c]-biib074|biib074,,,,,,...,"[1, 1]","[1, 1]","[1, 1]","[1, 0]","[1, 1]","[1, 1]","[1, 1]","[1, 0]","[1, 1]","[1, 1]"
151,NCT03170856,concussion,,concussion,,,sub-maximal exercise program|sub-maximal exerc...,,,,...,[0],[0],[0],[0],[0],[0],[0],[0],[0],[0]


In [461]:
file_with_all_annotations = "./predictions/rebuttal/all_models_aggregated_all_entity_types_numarical_arrays.csv"
df_eval_arrays.to_csv(file_with_all_annotations)

In [331]:
entity = 'control'
model = 'gpt-4'
annotations_cols = [
            f'unique_{entity}_target',
            f'unique_{entity}_{model}',
            f'target_{entity}_annotations_exact_{model}',
            f'predicted_{entity}_annotations_exact_{model}',
            f'target_{entity}_annotations_partial_{model}',
            f'predicted_{entity}_annotations_partial_{model}'
        ]
df_eval_arrays[annotations_cols]

Unnamed: 0,unique_control_target,unique_control_gpt-4,target_control_annotations_exact_gpt-4,predicted_control_annotations_exact_gpt-4,target_control_annotations_partial_gpt-4,predicted_control_annotations_partial_gpt-4
0,,background antiplatelet therapy,[0],[1],[0],[1]
1,,,[0],[0],[0],[0]
2,,,[0],[0],[0],[0]
3,sham,sham,[1],[1],[1],[1]
4,,,[0],[0],[0],[0]
...,...,...,...,...,...,...
148,,,[0],[0],[0],[0]
149,arms-only exercise|arms-only-rt|time (wait-lis...,arms-only-rt|time control,"[1, 1, 1, 1]","[0, 0, 1, 1]","[1, 1, 1, 1]","[1, 1, 1, 1]"
150,,,[0],[0],[0],[0]
151,,,[0],[0],[0],[0]


In [332]:
df_eval_arrays[annotations_cols].to_csv("./predictions/rebuttal/all_models_aggregated_all_entity_types_numarical_arrays_TEST.csv")

## demo partial match

In [467]:
def partial_match(predicted, target, cutoff=0.6):
    # Split the conditions into individual terms
    predicted_terms = predicted.split('|')
    target_terms = target.split('|')
    
    # Check for matches across all terms
    matches = []
    for pred in predicted_terms:
        print(pred, target_terms)
        # Use difflib to find close matches, with a cutoff for match quality
        match_found = any(difflib.get_close_matches(pred, target_terms, n=1, cutoff=cutoff))
        matches.append(match_found)
    
    # Return True if any match is found
    return any(matches)

In [468]:
partial_match("diabetes type 2", "type 2 diabetes")

diabetes type 2 ['type 2 diabetes']


False

In [469]:
partial_match("scleriosis", "multiple sclerosis")

scleriosis ['multiple sclerosis']


True

In [470]:
partial_match("scleriosis", "sclerosis multiple")

scleriosis ['sclerosis multiple']


True

In [480]:
partial_match("age-related hearing loss", "hearing loss")

age-related hearing loss ['hearing loss']


True

In [478]:
from difflib import SequenceMatcher

s = SequenceMatcher(lambda x: x == " ",
                    "diabetes type 2",
                    "type 2 diabetes")


In [479]:
s.ratio()

0.5333333333333333