In [1]:
import pandas as pd
import ast
import numpy as np
import os

# Abstract level CT annotation

In [2]:
from abbreviations import schwartz_hearst

In [3]:
pairs = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text='The emergency room (ER) was busy')
pairs

{'ER': 'emergency room'}

In [4]:
annotated_files_path_prefix = "./predictions/"

## Load BERT model annotations

In [5]:
hugging_face_model_name_biolink = "michiyasunaga/BioLinkBERT-base"
model_name_str_biolink = hugging_face_model_name_biolink.split("/")[1]

In [6]:
annotated_files_path = "./data/annotated_aact/ner_outputs/ner_annotations_BioLinkBERT-base_46376_20240621.csv"
#annotated_files_path_second_batch = "data/annotated_aact/ct_neuro_test_annotated_BioLinkBERT-base_20240320.csv"

In [7]:
biolinkbert_col = f'ner_prediction_{model_name_str_biolink}_normalized'
df = pd.read_csv(annotated_files_path)[['nct_id', 'text', biolinkbert_col]]
print(df.shape)

(46376, 3)


In [8]:
df.head(10)

Unnamed: 0,nct_id,text,ner_prediction_BioLinkBERT-base_normalized
0,NCT03890861,Reducing African Americans' Alzheimer's Diseas...,"[(28, 47, 'CONDITION', ""alzheimer ' s disease""..."
1,NCT03060096,Stepped-Care Telehealth for Distress in Cancer...,"[(0, 4, 'OTHER', 'step'), (4, 12, 'BEHAVIOURAL..."
2,NCT04525742,COVID-19 Pandemic From the Perspective of Pare...,"[(0, 17, 'CONDITION', 'covid - 19 pandemic'), ..."
3,NCT02324634,Early Electrical Stimulation to the Wrist Exte...,"[(0, 41, 'OTHER', 'early electrical stimulatio..."
4,NCT06036368,"6-weeks, Open-label, Single-Site Study to Eval...","[(92, 153, 'OTHER', 'home - based peroneal ele..."
5,NCT05573763,Evaluation of the F&P Toffee Nasal Pillows Mas...,"[(18, 47, 'OTHER', 'f & p toffee nasal pillows..."
6,NCT05440214,Targeting Emotion Dysregulation to Reduce Suic...,"[(10, 31, 'OTHER', 'emotion dysregulation'), (..."
7,NCT03479970,Rehabilitation of Social Cognition in Subjects...,"[(52, 74, 'CONDITION', 'traumatic brain injury..."
8,NCT04846790,The Effect of a Combined Nature-based and Virt...,"[(16, 24, 'OTHER', 'combined'), (25, 74, 'OTHE..."
9,NCT02463188,Promoting Sleep to Prevent Substance Use in Ad...,"[(119, 120, 'BEHAVIOURAL', '-'), (132, 171, 'B..."


### Aggregate annotations to unique abstract level entities

In [9]:
# aggregate annotations and sum up how often they were annotated
def extract_summary(annotation_list):
    annotation_list = eval(annotation_list)
    summary = {}
    for annotation in annotation_list:
        _, _, entity_type, entity_name = annotation
        entity_name = entity_name.lower()
        if entity_type not in summary:
            summary[entity_type] = {}
        if entity_name not in summary[entity_type]:
            summary[entity_type][entity_name] = 0
        summary[entity_type][entity_name] += 1
    return summary

In [10]:
# Define a function to extract the unique conditions, drugs, and others from the 'ner_manual_final_annotated_ds' column
def extract_unique_entities_count(annotation_list, abbreviation_definition_pairs):
    unique_conditions = set()
    unique_drugs = set()
    unique_others = set()
    annotation_list = eval(annotation_list)
    for annotation in annotation_list:
        _, _, entity_type, entity_name = annotation
        if entity_name in abbreviation_definition_pairs:
            #print("Skipping entity {} as it is an ABBR".format(entity_name))
            continue
        entity_name = entity_name.lower()
        if entity_type == 'CONDITION':
            unique_conditions.add(entity_name)
        elif entity_type == 'DRUG':
            unique_drugs.add(entity_name)
        elif entity_type == 'OTHER':
            unique_others.add(entity_name)
    return len(unique_conditions), len(unique_drugs), len(unique_others)

def extract_unique_entities(nct_id, annotation_list, abbreviation_definition_pairs, model="linkbert", keep_drug_interventions_only=True):
    unique_conditions = set()
    unique_interventions = set()
    interventions_type = set()
   
    try:
        annotation_list = eval(annotation_list)
    except SyntaxError as e:
        print(nct_id)
        print(annotation_list)
        print("Syntax error in eval:", e)
        return "issues processing line"
    
    for annotation in annotation_list:
        _, _, entity_type, entity_name = annotation
        if entity_name.startswith("##"):
            continue ## THERE IS AN ISSUE WITH BIOBERT and BERT
        if (len(entity_name) == 1 or len(entity_name)==2) and model=="biobert":
            continue ## ASSUME TOKENIZER ERROR IN BIOBERT
        # REPLACE ABBREVIATIONS WITH FULL FORM
        if entity_name in abbreviation_definition_pairs:
            #print("Skipping entity {} as it is an ABBR".format(entity_name))
            entity_name = abbreviation_definition_pairs[entity_name] 
            #continue
        if entity_name.upper() in abbreviation_definition_pairs:
            #print("Skipping entity {} as it is an ABBR".format(entity_name))
            entity_name = abbreviation_definition_pairs[entity_name.upper()] 
        entity_name = entity_name.lower()
        if entity_type == 'CONDITION':
            unique_conditions.add(entity_name)
        elif keep_drug_interventions_only and entity_type == 'DRUG':
            unique_interventions.add(entity_name)
            interventions_type.add(entity_type)
        elif not keep_drug_interventions_only:
            unique_interventions.add(entity_name)
            interventions_type.add(entity_type)
        
    return "|".join(list(unique_conditions)), "|".join(list(unique_interventions)), "|".join(list(interventions_type))

# Placeholder function to demonstrate applying the Schwartz-Hearst algorithm (Replace with actual implementation)
def extract_abbreviation_definition_pairs(doc_text):
    pairs = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=doc_text)
    return pairs

In [11]:
# Add a new column 'BERT failed' and initialize it with 0
df['BERT failed'] = 0

# Replace the rows containing 'Failed NER extraction!' with empty strings and set 'BERT failed' to 1
mask = df['ner_prediction_BioLinkBERT-base_normalized'] == 'Failed NER extraction!'
df.loc[mask, 'ner_prediction_BioLinkBERT-base_normalized'] = ''
df.loc[mask, 'BERT failed'] = 1

In [12]:
df.shape

(46376, 4)

In [13]:
# Create a new column 'abbreviation_definition_pairs' using the 'apply' function
df['abbreviation_definition_pairs'] = df['text'].apply(extract_abbreviation_definition_pairs)

# Apply the function to each row and create new columns 'num_unique_conditions', 'num_unique_drugs', and 'num_unique_others'
df[f'unique_conditions_{model_name_str_biolink}_predictions'], df[f'unique_interventions_{model_name_str_biolink}_predictions'], df[f'unique_interventions_type_{model_name_str_biolink}_predictions'] = zip(*df.apply(lambda row: extract_unique_entities(row['nct_id'], row[biolinkbert_col], row['abbreviation_definition_pairs']), axis=1))

#df['num_unique_conditions'], df['num_unique_drugs'], df['num_unique_others'] = zip(*df.apply(lambda row: extract_unique_entities_count(row[col_name_target_annot], row['abbreviation_definition_pairs']), axis=1))


NCT03268187

Syntax error in eval: invalid syntax (<string>, line 0)
NCT02118610

Syntax error in eval: invalid syntax (<string>, line 0)
NCT05834855

Syntax error in eval: invalid syntax (<string>, line 0)
NCT03442166

Syntax error in eval: invalid syntax (<string>, line 0)
NCT01201967

Syntax error in eval: invalid syntax (<string>, line 0)
NCT03582293

Syntax error in eval: invalid syntax (<string>, line 0)
NCT06140355

Syntax error in eval: invalid syntax (<string>, line 0)
NCT04538521

Syntax error in eval: invalid syntax (<string>, line 0)
NCT02554487

Syntax error in eval: invalid syntax (<string>, line 0)
NCT06213766

Syntax error in eval: invalid syntax (<string>, line 0)
NCT04176302

Syntax error in eval: invalid syntax (<string>, line 0)
NCT03810898

Syntax error in eval: invalid syntax (<string>, line 0)
NCT01744548

Syntax error in eval: invalid syntax (<string>, line 0)
NCT05491122

Syntax error in eval: invalid syntax (<string>, line 0)


In [14]:
df.head(2)

Unnamed: 0,nct_id,text,ner_prediction_BioLinkBERT-base_normalized,BERT failed,abbreviation_definition_pairs,unique_conditions_BioLinkBERT-base_predictions,unique_interventions_BioLinkBERT-base_predictions,unique_interventions_type_BioLinkBERT-base_predictions
0,NCT03890861,Reducing African Americans' Alzheimer's Diseas...,"[(28, 47, 'CONDITION', ""alzheimer ' s disease""...",0,{},african|alzheimer ' s disease,,
1,NCT03060096,Stepped-Care Telehealth for Distress in Cancer...,"[(0, 4, 'OTHER', 'step'), (4, 12, 'BEHAVIOURAL...",0,{},cancer|depressive|fatigue|post - treatment can...,,


In [15]:
df_unique_labels = df[['nct_id',
                       f'unique_conditions_{model_name_str_biolink}_predictions', f'unique_interventions_{model_name_str_biolink}_predictions', f'unique_interventions_type_{model_name_str_biolink}_predictions']]

In [16]:
df_unique_labels.head(2)

Unnamed: 0,nct_id,unique_conditions_BioLinkBERT-base_predictions,unique_interventions_BioLinkBERT-base_predictions,unique_interventions_type_BioLinkBERT-base_predictions
0,NCT03890861,african|alzheimer ' s disease,,
1,NCT03060096,cancer|depressive|fatigue|post - treatment can...,,


In [17]:
# Function to remove spaces around ' and -
def remove_spaces_around_apostrophe_and_dash(text):
    text = text.replace(" ' ", "'")  # Remove spaces around '
    text = text.replace("' s", "'s")  # Remove spaces around '
    text = text.replace(" - ", "-")  # Remove spaces around -
    text = text.replace(" / ", "/")  # Remove spaces around /
    text = text.replace("( ", "(")  # Remove spaces around (
    text = text.replace(" )", ")")  # Remove spaces around -
    return text

df_unique_labels[f'unique_conditions_{model_name_str_biolink}_predictions'] = df_unique_labels[f'unique_conditions_{model_name_str_biolink}_predictions'].apply(remove_spaces_around_apostrophe_and_dash)

df_unique_labels[f'unique_interventions_{model_name_str_biolink}_predictions'] = df_unique_labels[f'unique_interventions_{model_name_str_biolink}_predictions'].apply(remove_spaces_around_apostrophe_and_dash)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique_labels[f'unique_conditions_{model_name_str_biolink}_predictions'] = df_unique_labels[f'unique_conditions_{model_name_str_biolink}_predictions'].apply(remove_spaces_around_apostrophe_and_dash)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unique_labels[f'unique_interventions_{model_name_str_biolink}_predictions'] = df_unique_labels[f'unique_interventions_{model_name_str_biolink}_predictions'].apply(remove_spaces_around_apostrophe_and_dash)


In [18]:
df_unique_labels.head(2)

Unnamed: 0,nct_id,unique_conditions_BioLinkBERT-base_predictions,unique_interventions_BioLinkBERT-base_predictions,unique_interventions_type_BioLinkBERT-base_predictions
0,NCT03890861,african|alzheimer's disease,,
1,NCT03060096,cancer|depressive|fatigue|post-treatment cance...,,


### add AACT

In [26]:
df_aact_labels= pd.read_csv("./data/raw_aact/combined_neuro_trials_with_interventions_20240313.csv")
set(df_aact_labels['intervention_type'])


{'Behavioral',
 'Biological',
 'Combination Product',
 'Device',
 'Diagnostic Test',
 'Dietary Supplement',
 'Drug',
 'Genetic',
 'Other',
 'Procedure',
 'Radiation'}

In [27]:
#df_aact_labels_2 = pd.read_csv("../data/data_aact_sample/aact_neuro_samples_second_batch_202309171159_annotated.csv")
#df_aact_labels = pd.concat([df_aact_labels_1, df_aact_labels_2], ignore_index=True)
df_aact_labels.rename(columns={'Neurological Disease': 'aact_conditions'}, inplace=True)
df_aact_labels.rename(columns={'intervention_name': 'aact_intervention_names'}, inplace=True)
df_aact_labels.rename(columns={'intervention_type': 'aact_intervention_types'}, inplace=True)

# Function to replace values in aact_intervention_names based on aact_intervention_types
def replace_values(row):
    if 'Drug' not in row['aact_intervention_types'] and 'Genetic' not in row['aact_intervention_types'] and 'Biological' not in row['aact_intervention_types'] and 'Dietary Supplement' not in row['aact_intervention_types']:
        return ""
    else:
        return row['aact_intervention_names']

# Apply the custom function to replace values in aact_intervention_names
df_aact_labels['aact_intervention_names'] = df_aact_labels.apply(replace_values, axis=1)
df_aact_labels['aact_intervention_names'] = df_aact_labels['aact_intervention_names'].str.replace('|Placebo|', '')
df_aact_labels['aact_intervention_names'] = df_aact_labels['aact_intervention_names'].str.replace('|Placebo', '')
df_aact_labels['aact_intervention_names'] = df_aact_labels['aact_intervention_names'].str.replace('Placebo|', '')

df_aact_labels.head(10)

Unnamed: 0,nct_id,aact_conditions,Disease Class,brief_title,study_official_title,brief_summary_description,start_date,completion_date,phase,study_type,overall_status,country_name,aact_intervention_names,aact_intervention_types
0,NCT03890861,"Dementia, Alzheimer Type",Neurodegenerative Diseases,Reducing African Americans' Alzheimer's Diseas...,Reducing African Americans' Alzheimer's Diseas...,The RAATE proposal is designed to determine th...,2019-08-09,2026-11-30,Not Applicable,Interventional,Recruiting,United States,,Behavioral
1,NCT03890861,"Dementia, Alzheimer Type",Neurodegenerative Diseases,Reducing African Americans' Alzheimer's Diseas...,Reducing African Americans' Alzheimer's Diseas...,The RAATE proposal is designed to determine th...,2019-08-09,2026-11-30,Not Applicable,Interventional,Recruiting,United States,,Behavioral
2,NCT03890861,"Dementia, Alzheimer Type",Neurodegenerative Diseases,Reducing African Americans' Alzheimer's Diseas...,Reducing African Americans' Alzheimer's Diseas...,The RAATE proposal is designed to determine th...,2019-08-09,2026-11-30,Not Applicable,Interventional,Recruiting,United States,,Behavioral
3,NCT03890861,"Dementia, Alzheimer Type",Neurodegenerative Diseases,Reducing African Americans' Alzheimer's Diseas...,Reducing African Americans' Alzheimer's Diseas...,The RAATE proposal is designed to determine th...,2019-08-09,2026-11-30,Not Applicable,Interventional,Recruiting,United States,,Behavioral
4,NCT03890861,"Dementia, Alzheimer Type",Neurodegenerative Diseases,Reducing African Americans' Alzheimer's Diseas...,Reducing African Americans' Alzheimer's Diseas...,The RAATE proposal is designed to determine th...,2019-08-09,2026-11-30,Not Applicable,Interventional,Recruiting,United States,,Behavioral
5,NCT03890861,"Dementia, Alzheimer Type",Neurodegenerative Diseases,Reducing African Americans' Alzheimer's Diseas...,Reducing African Americans' Alzheimer's Diseas...,The RAATE proposal is designed to determine th...,2019-08-09,2026-11-30,Not Applicable,Interventional,Recruiting,United States,,Behavioral
6,NCT03060096,Depressive Symptoms,Psychiatry and Psychology Category,Stepped-Care Telehealth for Distress in Cancer...,Stepped-Care Telehealth for Distress in Cancer...,Mental health issues in post-treatment adult c...,2018-07-19,2023-12-31,Not Applicable,Interventional,Completed,United States,,Other
7,NCT03060096,Depressive Symptoms,Psychiatry and Psychology Category,Stepped-Care Telehealth for Distress in Cancer...,Stepped-Care Telehealth for Distress in Cancer...,Mental health issues in post-treatment adult c...,2018-07-19,2023-12-31,Not Applicable,Interventional,Completed,United States,,Other
8,NCT03060096,Depressive Symptoms,Psychiatry and Psychology Category,Stepped-Care Telehealth for Distress in Cancer...,Stepped-Care Telehealth for Distress in Cancer...,Mental health issues in post-treatment adult c...,2018-07-19,2023-12-31,Not Applicable,Interventional,Completed,United States,,Other
9,NCT03060096,Depressive Symptoms,Psychiatry and Psychology Category,Stepped-Care Telehealth for Distress in Cancer...,Stepped-Care Telehealth for Distress in Cancer...,Mental health issues in post-treatment adult c...,2018-07-19,2023-12-31,Not Applicable,Interventional,Completed,United States,,Other


In [28]:
df_unique_labels_with_aact = pd.merge(df_unique_labels, df_aact_labels, on='nct_id', how='left')
df_unique_labels_with_aact.head()

Unnamed: 0,nct_id,unique_conditions_BioLinkBERT-base_predictions,unique_interventions_BioLinkBERT-base_predictions,unique_interventions_type_BioLinkBERT-base_predictions,aact_conditions,Disease Class,brief_title,study_official_title,brief_summary_description,start_date,completion_date,phase,study_type,overall_status,country_name,aact_intervention_names,aact_intervention_types
0,NCT03890861,african|alzheimer's disease,,,"Dementia, Alzheimer Type",Neurodegenerative Diseases,Reducing African Americans' Alzheimer's Diseas...,Reducing African Americans' Alzheimer's Diseas...,The RAATE proposal is designed to determine th...,2019-08-09,2026-11-30,Not Applicable,Interventional,Recruiting,United States,,Behavioral
1,NCT03890861,african|alzheimer's disease,,,"Dementia, Alzheimer Type",Neurodegenerative Diseases,Reducing African Americans' Alzheimer's Diseas...,Reducing African Americans' Alzheimer's Diseas...,The RAATE proposal is designed to determine th...,2019-08-09,2026-11-30,Not Applicable,Interventional,Recruiting,United States,,Behavioral
2,NCT03890861,african|alzheimer's disease,,,"Dementia, Alzheimer Type",Neurodegenerative Diseases,Reducing African Americans' Alzheimer's Diseas...,Reducing African Americans' Alzheimer's Diseas...,The RAATE proposal is designed to determine th...,2019-08-09,2026-11-30,Not Applicable,Interventional,Recruiting,United States,,Behavioral
3,NCT03890861,african|alzheimer's disease,,,"Dementia, Alzheimer Type",Neurodegenerative Diseases,Reducing African Americans' Alzheimer's Diseas...,Reducing African Americans' Alzheimer's Diseas...,The RAATE proposal is designed to determine th...,2019-08-09,2026-11-30,Not Applicable,Interventional,Recruiting,United States,,Behavioral
4,NCT03890861,african|alzheimer's disease,,,"Dementia, Alzheimer Type",Neurodegenerative Diseases,Reducing African Americans' Alzheimer's Diseas...,Reducing African Americans' Alzheimer's Diseas...,The RAATE proposal is designed to determine th...,2019-08-09,2026-11-30,Not Applicable,Interventional,Recruiting,United States,,Behavioral


In [37]:
df_unique_labels_with_aact_to_save = df_unique_labels_with_aact[['nct_id', 'unique_conditions_BioLinkBERT-base_predictions', 'unique_interventions_BioLinkBERT-base_predictions', 'aact_conditions', 'aact_intervention_names']]

def merge_pipe_fields_case_insensitive(*fields):
    """
    Merge fields like 'ketamine|Ketamine' with:
    - strip whitespace
    - remove empty entries
    - dedupe case-insensitively
    - keep the FIRST original form seen
    """
    seen_lower = set()
    result = []

    for field in fields:
        if isinstance(field, str) and field.strip():
            parts = [p.strip() for p in field.split("|") if p.strip()]
            for item in parts:
                key = item.lower()
                if key not in seen_lower:
                    seen_lower.add(key)
                    result.append(item)

    return "|".join(result)

df = df_unique_labels_with_aact_to_save.copy()

df["combined_conditions"] = df.apply(
    lambda row: merge_pipe_fields_case_insensitive(
        row["unique_conditions_BioLinkBERT-base_predictions"],
        row["aact_conditions"]
    ),
    axis=1
)

df["combined_interventions"] = df.apply(
    lambda row: merge_pipe_fields_case_insensitive(
        row["unique_interventions_BioLinkBERT-base_predictions"],
        row["aact_intervention_names"]
    ),
    axis=1
)
df.head()

Unnamed: 0,nct_id,unique_conditions_BioLinkBERT-base_predictions,unique_interventions_BioLinkBERT-base_predictions,aact_conditions,aact_intervention_names,combined_conditions,combined_interventions
0,NCT03890861,african|alzheimer's disease,,"Dementia, Alzheimer Type",,"african|alzheimer's disease|Dementia, Alzheime...",
1,NCT03890861,african|alzheimer's disease,,"Dementia, Alzheimer Type",,"african|alzheimer's disease|Dementia, Alzheime...",
2,NCT03890861,african|alzheimer's disease,,"Dementia, Alzheimer Type",,"african|alzheimer's disease|Dementia, Alzheime...",
3,NCT03890861,african|alzheimer's disease,,"Dementia, Alzheimer Type",,"african|alzheimer's disease|Dementia, Alzheime...",
4,NCT03890861,african|alzheimer's disease,,"Dementia, Alzheimer Type",,"african|alzheimer's disease|Dementia, Alzheime...",


In [None]:
pred_cols = [
    "combined_conditions",
    "combined_interventions",
]

# Convert empty strings or whitespace-only strings to NaN
df[pred_cols] = df[pred_cols].replace(r"^\s*$", np.nan, regex=True)

# Drop rows where ANY of the prediction columns is empty
df_to_save = df.dropna(subset=pred_cols, how="any")

df_to_save = df_to_save.rename(columns={
    "combined_conditions": "unique_conditions_linkbert_predictions",
    "combined_interventions": "unique_interventions_linkbert_predictions"
})
df_to_save = df_to_save.drop_duplicates(subset=["nct_id"], keep="first")

df_to_save.shape

(19725, 7)

In [39]:
df_to_save.head()

Unnamed: 0,nct_id,unique_conditions_BioLinkBERT-base_predictions,unique_interventions_BioLinkBERT-base_predictions,aact_conditions,aact_intervention_names,unique_interventions_linkbert_predictions,unique_conditions_linkbert_predictions
58,NCT03502551,suicidal|suicidal ideation,ketamine,Suicide,Ketamine,suicidal|suicidal ideation|Suicide,ketamine
61,NCT05216770,laryngeal dystonia|voice tremor,,Tremor,Laryngeal sensory block with topical bupivacaine,laryngeal dystonia|voice tremor|Tremor,Laryngeal sensory block with topical bupivacaine
101,NCT03348735,np|subacute|pain|localized neuropathic pain|ch...,lidocaine|capsaicin,Neuropathic Pain,Lidocaine patch 5%,np|subacute|pain|localized neuropathic pain|ch...,lidocaine|capsaicin|Lidocaine patch 5%
115,NCT05995600,systemic lupus|definite|antiphospholipid syndr...,clopidogrel|warfarin|aspirin,Transient Ischemic Attack,Antiplatelet Drug,systemic lupus|definite|antiphospholipid syndr...,clopidogrel|warfarin|aspirin|Antiplatelet Drug
168,NCT02137993,schizophreniform disorder|schizoaffective diso...,zyprexa,Schizophrenia,A-prexa,schizophreniform disorder|schizoaffective diso...,zyprexa|A-prexa


In [40]:
annotated_files_path_prefix

'./predictions/'

In [41]:
df_to_save.to_csv(os.path.join("data/annotated_aact/ner_outputs", f"unique_conditions_interventions_{model_name_str_biolink}_predictions.csv"), index=False)

## Conditions

In [39]:
conditions_db = pd.read_csv("./data/neuro_diseases_terminology/diseases_dictionary_mesh_icd_2024.csv")

In [40]:
conditions_db[conditions_db['MeSH Common name']=='Depression']

Unnamed: 0.1,Unnamed: 0,ICD Node URI,ICD Parent URI,Mesh ID,MeSH Tree Number,ICD Title,MeSH Common name,MeSH Disease Class,ICD Disease Class,MeSH Synonyms
8141,8141,,,,,,Depression,Psychiatry and Psychology Category,,Depressive Symptoms | Depressive Symptom | Sym...


In [41]:
def add_variant(canonical_name, variant, drug_variant_to_canonical):
    #print(drug_variant_to_canonical)
    if variant not in drug_variant_to_canonical:
        drug_variant_to_canonical[variant] = set()
    drug_variant_to_canonical[variant].add(canonical_name)
    return drug_variant_to_canonical

In [42]:
def generate_conditions_lookup_dictionary(df):
    synonyms_dict = {}
    
    for index, row in df.iterrows():
        icd_title = row['ICD Title']
        mesh_name = row['MeSH Common name']
        if pd.notna(row['MeSH Synonyms']):
            synonyms_list = row['MeSH Synonyms'].split('|')
            for synonym in synonyms_list:
                synonym = synonym.strip().lower()
                mesh_name = mesh_name.lower()
                synonyms_dict = add_variant(mesh_name, synonym, synonyms_dict)
        elif pd.notna(row['ICD Title']):
            icd_title = icd_title.lower()
            synonyms_dict = add_variant(icd_title, icd_title, synonyms_dict) 
        elif pd.notna(row['MeSH Common name']):
            mesh_nam = mesh_name.lower()
            synonyms_dict = add_variant(mesh_name, mesh_name, synonyms_dict)
    
    return synonyms_dict

In [43]:
synonyms_dict = generate_conditions_lookup_dictionary(conditions_db)

In [44]:
synonyms_dict.get("depressive symptoms")

{'depression'}

In [45]:
synonyms_dict.get("depression")

In [55]:
import re
df_conditions = df_unique_labels_with_aact[["nct_id", f'unique_conditions_{model_name_str_biolink}_predictions', "aact_conditions", "Disease Class"]] #, "aact_conditions"
df_conditions.head(-5)

Unnamed: 0,nct_id,unique_conditions_BioLinkBERT-base_predictions,aact_conditions,Disease Class
0,NCT03890861,alzheimer's disease|african,"Dementia, Alzheimer Type",Neurodegenerative Diseases
1,NCT03890861,alzheimer's disease|african,"Dementia, Alzheimer Type",Neurodegenerative Diseases
2,NCT03890861,alzheimer's disease|african,"Dementia, Alzheimer Type",Neurodegenerative Diseases
3,NCT03890861,alzheimer's disease|african,"Dementia, Alzheimer Type",Neurodegenerative Diseases
4,NCT03890861,alzheimer's disease|african,"Dementia, Alzheimer Type",Neurodegenerative Diseases
...,...,...,...,...
418544,NCT01036581,addiction|drug addiction,Drug Abuse,Diseases Category
418545,NCT01036581,addiction|drug addiction,Nicotine Dependence,Diseases Category
418546,NCT01036581,addiction|drug addiction,Nicotine Dependence,Diseases Category
418547,NCT01036581,addiction|drug addiction,Nicotine Dependence,Diseases Category


### Normalize representations via dictionary

In [56]:
def lookup_canonical(conditions_list, synonyms_dict):
    canonical_list = []
    match_count = 0  # Count for conditions matching the dictionary
    processed_count = 0  # Count for all processed conditions
    same_condition_count = 0  # Count for conditions that are the same as looked up
    if isinstance(conditions_list, str):
        for condition in conditions_list.split('|'):
            original_condition = condition  # Keep the original condition
            condition = condition.lower().strip()
            if condition in {"none", "", "none."}:
                continue  # Skip irrelevant or placeholder conditions
            processed_count += 1  # Increment for every non-skipped condition
            if condition in synonyms_dict:
                if any(syn == original_condition for syn in synonyms_dict[condition]):
                    same_condition_count += 1
                canonical_list.extend(synonyms_dict[condition])
                match_count += 1  # Increment only when a match is found
            else:
                canonical_list.append(condition)
      
    else:
        return '', 0, 0, 0  # Return empty string and zero counts if input is not valid
    return '|'.join(canonical_list), match_count, processed_count, same_condition_count

In [69]:
def process_dataframe(df, synonyms_dict, aact_source_col, entity_type="conditions"):
    match_counts = {}
    processed_counts = {}
    same_condition_counts = {}

    column = f'canonical_{model_name_str_biolink}_{entity_type}'
    results = df[f'unique_{entity_type}_{model_name_str_biolink}_predictions'].apply(lambda x: lookup_canonical(x, synonyms_dict))
    df.loc[:, column] = [result[0] for result in results]
    match_counts[column] = sum(result[1] for result in results)  # Sum all matches for the column
    processed_counts[column] = sum(result[2] for result in results)  # Sum all processed conditions for the column
    same_condition_counts[column] = sum(result[3] for result in results)  # Sum cases where condition was unchanged

    column = f'canonical_aact_{entity_type}'
    results = df[aact_source_col].apply(lambda x: lookup_canonical(x, synonyms_dict))
    df.loc[:, column] = [result[0] for result in results]
    match_counts[column] = sum(result[1] for result in results)  # Sum all matches for the column
    processed_counts[column] = sum(result[2] for result in results)  # Sum all processed conditions for the column
    same_condition_counts[column] = sum(result[3] for result in results)  # Sum cases where condition was unchanged

    return df, match_counts, processed_counts, same_condition_counts

In [70]:
df_conditions_mapped, match_counts, processed_counts, same_condition_counts = process_dataframe(df_conditions, synonyms_dict, "aact_conditions")

In [71]:
df_conditions_mapped.head(-5)

Unnamed: 0,nct_id,unique_conditions_BioLinkBERT-base_predictions,aact_conditions,Disease Class,canonical_BioLinkBERT-base_conditions,canonical_aact_conditions
0,NCT03890861,alzheimer's disease|african,"Dementia, Alzheimer Type",Neurodegenerative Diseases,alzheimer disease|african,alzheimer disease
1,NCT03890861,alzheimer's disease|african,"Dementia, Alzheimer Type",Neurodegenerative Diseases,alzheimer disease|african,alzheimer disease
2,NCT03890861,alzheimer's disease|african,"Dementia, Alzheimer Type",Neurodegenerative Diseases,alzheimer disease|african,alzheimer disease
3,NCT03890861,alzheimer's disease|african,"Dementia, Alzheimer Type",Neurodegenerative Diseases,alzheimer disease|african,alzheimer disease
4,NCT03890861,alzheimer's disease|african,"Dementia, Alzheimer Type",Neurodegenerative Diseases,alzheimer disease|african,alzheimer disease
...,...,...,...,...,...,...
418544,NCT01036581,addiction|drug addiction,Drug Abuse,Diseases Category,addiction|substance-related disorders,substance-related disorders
418545,NCT01036581,addiction|drug addiction,Nicotine Dependence,Diseases Category,addiction|substance-related disorders,tobacco use disorder|nicotine dependence
418546,NCT01036581,addiction|drug addiction,Nicotine Dependence,Diseases Category,addiction|substance-related disorders,tobacco use disorder|nicotine dependence
418547,NCT01036581,addiction|drug addiction,Nicotine Dependence,Diseases Category,addiction|substance-related disorders,tobacco use disorder|nicotine dependence


In [72]:
counts_df_condition = pd.DataFrame({
    'Processed Condition': processed_counts,
    'Matched Condition': match_counts,
    #'Same Condition Counts': same_condition_counts
})

counts_df_condition['% Mapped Condition'] = round((counts_df_condition['Matched Condition'] / counts_df_condition['Processed Condition']) * 100, 2)
counts_df_condition.reset_index(inplace=True)
counts_df_condition.rename(columns={'index': 'Annotations Source'}, inplace=True)

counts_df_condition

Unnamed: 0,Annotations Source,Processed Condition,Matched Condition,% Mapped Condition
0,canonical_BioLinkBERT-base_conditions,1144452,363777,31.79
1,canonical_aact_conditions,418554,379866,90.76


## Drugs

In [73]:
import csv
import re

### Normalize representations via dictionary

In [74]:
path_prefix = "./data"

In [75]:
variant_regex = re.compile(r'^[A-Za-z0-9,]+[ -]?[A-Za-z0-9\-]+(?:[ -][A-Z])?$')
drug_variant_to_canonical = {}
drug_canonical_to_data = {}

def add_variant(canonical_name, variant):
    if variant not in drug_variant_to_canonical:
        drug_variant_to_canonical[variant] = set()
    drug_variant_to_canonical[variant].add(canonical_name)


def add_drug(id, synonyms):
    synonyms = [s.strip() for s in synonyms]

    #TODO: add using an exclusion list as a parameter option to the function
    #if re.sub("[- ].+", "", synonyms[0].upper()) in exclusions:
    #    return
    if not variant_regex.match(synonyms[0]):
        return
    if synonyms[0] not in drug_canonical_to_data:
        drug_canonical_to_data[synonyms[0]] = {"name": synonyms[0], "synonyms": set()}
    if id.startswith("a"):
        drug_canonical_to_data[synonyms[0]]["medline_plus_id"] = id
    elif id.startswith("https://www.nhs.uk"):
        drug_canonical_to_data[synonyms[0]]["nhs_url"] = id
    elif id.startswith("https://en.wikipedia"):
        drug_canonical_to_data[synonyms[0]]["wikipedia_url"] = id
    elif id.startswith("DB"):
        drug_canonical_to_data[synonyms[0]]["drugbank_id"] = id
    else:
        drug_canonical_to_data[synonyms[0]]["mesh_id"] = id
    for variant in synonyms:
        #if re.sub(" .+", "", variant.upper()) in exclusions:
        #    return
        if variant_regex.match(variant):
            drug_canonical_to_data[synonyms[0]]["synonyms"].add(variant)
            add_variant(synonyms[0], variant.lower())
            #add_variant(synonyms[0], variant)
            #add_variant(synonyms[0], variant.upper())
            #if variant.lower() in words_to_allow_lower_case:    

with open(path_prefix + "/drug_names_terminology/drugs_dictionary_medlineplus.csv", 'r', encoding="utf-8") as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    headers = None
    for row in spamreader:
        if not headers:
            headers = row
            continue
        id = row[0]
        name = row[1]
        synonyms = row[2].split(r"|")
        name = re.sub(
            " (Injection|Oral Inhalation|Transdermal|Ophthalmic|Topical|Vaginal Cream|Nasal Spray|Transdermal Patch|Rectal)",
            "", name)
        name = name.lower()
        if name == "abobotulinumtoxina":
            print(row[1], synonyms)

        add_drug(id, [name] + synonyms)

AbobotulinumtoxinA Injection ['Dysport', 'BoNT-A']


In [76]:

with open(path_prefix + "/drug_names_terminology/drugs_dictionary_nhs.csv", 'r', encoding="utf-8") as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    headers = None
    for row in spamreader:
        if not headers:
            headers = row
            continue
        id = row[0]
        name = row[1]
        synonyms = row[2].split(r"|")
        name = name.lower()
        add_drug(id, [name] + synonyms)


with open(path_prefix + "/drug_names_terminology/drugs_dictionary_wikipedia.csv", 'r', encoding="utf-8") as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    headers = None
    for row in spamreader:
        if not headers:
            headers = row
            continue
        id = row[0]
        name = row[1]
        synonyms = row[2].split(r"|")
        name = name.lower()
        add_drug(id, [name] + synonyms)
        
with open(path_prefix + "/drug_names_terminology/drugs_dictionary_mesh.csv", 'r', encoding="utf-8") as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    headers = None
    for row in spamreader:
        if not headers:
            headers = row
            continue
        id = row[0]
        name = row[1]
        synonyms = row[2].split(r"\|")
        name = name.lower()
        add_drug(id, [name] + synonyms)

# adding for the full db with product names included as synonyms
# TODO: create a parametrized function from this, not hard-coded inline as it is
is_new_format = False
if is_new_format:
    with open(path_prefix + "/drug_names_terminology/drugdb_full_database_parsed.csv", 'r', encoding="utf-8") as csvfile:
        spamreader = csv.reader(csvfile, delimiter=';')
        headers = None
        for row in spamreader:
            #print(row)
            if not headers:
                headers = row
                continue
            id = row[0]
            name = row[1]
            synonyms = row[4].split(r"|")
            products = row[5].split(r"|")
            syn_prod = synonyms + products
            name = name.lower()
            add_drug(id, [name] + syn_prod)
# no product names considered
else:
    with open(path_prefix + "/drug_names_terminology/drugbank vocabulary.csv", 'r', encoding="utf-8") as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',')
        headers = None
        for row in spamreader:
            if not headers:
                headers = row
                continue
            id = row[0]
            name = row[2]
            synonyms = row[5].split(r"|")
            name = name.lower()
            add_drug(id, [name] + synonyms)

In [77]:
drug_variant_to_canonical.get("exelon")

{'rivastigmine', 'rivastigmine patch'}

In [79]:
df_interventions = df_unique_labels_with_aact[["nct_id",  f'unique_interventions_{model_name_str_biolink}_predictions', 'aact_intervention_names', 'aact_intervention_types']] #'aact_intervention_names'


In [80]:
df_interventions.head(2)

Unnamed: 0,nct_id,unique_interventions_BioLinkBERT-base_predictions,aact_intervention_names,aact_intervention_types
0,NCT03890861,,,Behavioral
1,NCT03890861,,,Behavioral


In [86]:
df_interventions_mapped, match_counts, processed_counts, same_condition_counts = process_dataframe(df_interventions, drug_variant_to_canonical, "aact_intervention_names", "interventions")

In [87]:
df_interventions_mapped.head()

Unnamed: 0,nct_id,unique_interventions_BioLinkBERT-base_predictions,aact_intervention_names,aact_intervention_types,canonical_BioLinkBERT-base_interventions,canonical_aact_interventions
0,NCT03890861,,,Behavioral,,
1,NCT03890861,,,Behavioral,,
2,NCT03890861,,,Behavioral,,
3,NCT03890861,,,Behavioral,,
4,NCT03890861,,,Behavioral,,


In [101]:
df_interventions_mapped[df_interventions_mapped['nct_id'] == 'NCT01997255']

Unnamed: 0,nct_id,unique_interventions_BioLinkBERT-base_predictions,aact_intervention_names,aact_intervention_types,canonical_BioLinkBERT-base_interventions,canonical_aact_interventions
340171,NCT01997255,everolimus|rad 001|afinitor,Everolimus,Drug,everolimus|rad 001|everolimus,everolimus
340172,NCT01997255,everolimus|rad 001|afinitor,Everolimus,Drug,everolimus|rad 001|everolimus,everolimus
340173,NCT01997255,everolimus|rad 001|afinitor,Everolimus,Drug,everolimus|rad 001|everolimus,everolimus


In [89]:
counts_df_interventions = pd.DataFrame({
    'Processed Intervention': processed_counts,
    'Matched Intervention': match_counts,
    #'Same Condition Counts': same_condition_counts
})

counts_df_interventions['% Mapped Intervention'] = round((counts_df_interventions['Matched Intervention'] / counts_df_interventions['Processed Intervention']) * 100, 2)
counts_df_interventions.reset_index(inplace=True)
counts_df_interventions.rename(columns={'index': 'Annotations Source'}, inplace=True)

counts_df_interventions

Unnamed: 0,Annotations Source,Processed Intervention,Matched Intervention,% Mapped Intervention
0,canonical_BioLinkBERT-base_interventions,523646,278653,53.21
1,canonical_aact_interventions,236885,79382,33.51


## Combine drugs and conditions

In [90]:
# Perform an inner join on the 'nct_id' column
merged_df = pd.merge(df_interventions_mapped, df_conditions_mapped, on='nct_id', how='left')
# Remove all duplicates, keeping only rows that are unique across all columns
merged_df = merged_df.drop_duplicates()

merged_df_canonical = merged_df[['nct_id','canonical_BioLinkBERT-base_interventions','canonical_aact_interventions', 'aact_intervention_types', 'canonical_BioLinkBERT-base_conditions', 'canonical_aact_conditions',"Disease Class"]]
# Display the merged DataFrame
merged_df_canonical.head(10)

Unnamed: 0,nct_id,canonical_BioLinkBERT-base_interventions,canonical_aact_interventions,aact_intervention_types,canonical_BioLinkBERT-base_conditions,canonical_aact_conditions,Disease Class
0,NCT03890861,,,Behavioral,alzheimer disease|african,alzheimer disease,Neurodegenerative Diseases
36,NCT03060096,,,Other,mental|post-treatment|cancer|distress|anxiety|...,depression,Psychiatry and Psychology Category
72,NCT04525742,,,Other,covid-19 pandemic|disabled,muscular dystrophies,Neuromuscular Diseases
75,NCT04525742,,,Other,covid-19 pandemic|disabled,cerebral palsy,Central Nervous System Diseases
108,NCT02324634,,,Device,paral|joint deformities|pain|stroke|paretic|mu...,stroke,Central Nervous System Diseases
117,NCT06036368,,,Device,essential tremor|parkinson disease|parkinson's...,essential tremor,Central Nervous System Diseases
120,NCT06036368,,,Device,essential tremor|parkinson disease|parkinson's...,parkinson disease,Central Nervous System Diseases
153,NCT05573763,,,Device,,"sleep apnea, obstructive",Sleep Wake Disorders
162,NCT05440214,,,Behavioral,emotion dysregulation|psychotic disorders|psyc...,psychotic disorders,Psychiatry and Psychology Category
164,NCT05440214,,,Behavioral,emotion dysregulation|psychotic disorders|psyc...,suicide,Psychiatry and Psychology Category


In [91]:
merged_df.shape

(81907, 11)

In [92]:
len(set(merged_df['nct_id']))

46376

### Keep DRUG interventions

In [93]:
# Remove rows where both 'canonical_BioLinkBERT-base_interventions' and 'canonical_aact_interventions' are empty
filtered_df = merged_df_canonical.loc[~((merged_df_canonical['canonical_BioLinkBERT-base_interventions'].isna() | merged_df_canonical['canonical_BioLinkBERT-base_interventions'].eq('')) & (merged_df_canonical['canonical_aact_interventions'].isna() | merged_df_canonical['canonical_aact_interventions'].eq('')))]
filtered_df = filtered_df[~filtered_df['canonical_aact_interventions'].str.contains('placebo', na=False)]

filtered_df.head(10)

Unnamed: 0,nct_id,canonical_BioLinkBERT-base_interventions,canonical_aact_interventions,aact_intervention_types,canonical_BioLinkBERT-base_conditions,canonical_aact_conditions,Disease Class
458,NCT03502551,ketamine,ketamine,Drug,suicidal ideation|suicidal,suicide,Psychiatry and Psychology Category
468,NCT05216770,,laryngeal sensory block with topical bupivacaine,Drug,voice tremor|laryngeal dystonia,tremor,Neurologic Manifestations
685,NCT03348735,capsaicin|lidocaine,lidocaine patch 5%,Drug,pain|subacute|localized neuropathic pain|chron...,neuropathic pain|neuralgia,Neuromuscular Diseases
694,NCT03348735,capsaicin|lidocaine,capsaicin 8% patch,Drug,pain|subacute|localized neuropathic pain|chron...,neuropathic pain|neuralgia,Neuromuscular Diseases
703,NCT03348735,capsaicin|lidocaine,pregabalin,Drug,pain|subacute|localized neuropathic pain|chron...,neuropathic pain|neuralgia,Neuromuscular Diseases
779,NCT05995600,clopidogrel|warfarin|aspirin|acetylsalicylic acid,antiplatelet drug,Drug,antiphospholipid syndrome|aps-stroke|antiphosp...,"ischemic attack, transient",Central Nervous System Diseases
785,NCT05995600,clopidogrel|warfarin|aspirin|acetylsalicylic acid,antiplatelet drug,Drug,antiphospholipid syndrome|aps-stroke|antiphosp...,cerebrovascular disorders,Central Nervous System Diseases
791,NCT05995600,clopidogrel|warfarin|aspirin|acetylsalicylic acid,warfarin,Drug,antiphospholipid syndrome|aps-stroke|antiphosp...,"ischemic attack, transient",Central Nervous System Diseases
797,NCT05995600,clopidogrel|warfarin|aspirin|acetylsalicylic acid,warfarin,Drug,antiphospholipid syndrome|aps-stroke|antiphosp...,cerebrovascular disorders,Central Nervous System Diseases
1508,NCT02137993,olanzapine,a-prexa,Drug,schizoaffective disorder|psychotic disorders|p...,schizophrenia,Diseases of the nervous system


In [94]:
len(set(filtered_df['nct_id'])), filtered_df.shape

(19632, (36007, 7))

In [134]:
len(set(filtered_df['nct_id'])), filtered_df.shape

(19607, (57475, 7))

### aggregate annotations per row

In [95]:
# Function to join unique values
def join_unique(values):
    return '|'.join(set(values))

# Group by 'nct_id' and other relevant columns, then join interventions and conditions with '|', ensuring uniqueness
grouped_df = filtered_df.groupby(['nct_id', 'canonical_BioLinkBERT-base_interventions', 'canonical_BioLinkBERT-base_conditions'], as_index=False).agg({
    'canonical_aact_interventions': join_unique,
    'aact_intervention_types': join_unique,
    'canonical_aact_conditions': join_unique,
    'Disease Class': join_unique
})


In [96]:
# making sure there are no duplicate entities
grouped_df['canonical_BioLinkBERT-base_interventions'] = grouped_df['canonical_BioLinkBERT-base_interventions'].apply(lambda x: '|'.join(sorted(set(x.split('|')))))
grouped_df['canonical_BioLinkBERT-base_conditions'] = grouped_df['canonical_BioLinkBERT-base_conditions'].apply(lambda x: '|'.join(sorted(set(x.split('|')))))
grouped_df = grouped_df[['nct_id', 'canonical_BioLinkBERT-base_interventions', 'canonical_aact_interventions', 'aact_intervention_types', 'canonical_BioLinkBERT-base_conditions', 'canonical_aact_conditions', 'Disease Class']]
grouped_df['canonical_aact_interventions'] = grouped_df['canonical_aact_interventions'].str.lstrip('|')

grouped_df.head(10)

Unnamed: 0,nct_id,canonical_BioLinkBERT-base_interventions,canonical_aact_interventions,aact_intervention_types,canonical_BioLinkBERT-base_conditions,canonical_aact_conditions,Disease Class
0,NCT00000117,intravenous immunoglobulin|ivig,immunoglobulin,Drug,multiple sclerosis|optic neuritis,optic neuritis,Cranial Nerve Diseases
1,NCT00000146,corticosteroid,prednisone|methylprednisolone,Drug,multiple sclerosis|optic neuritis,optic neuritis|multiple sclerosis,Demyelinating Diseases|Cranial Nerve Diseases
2,NCT00000147,corticosteroid,prednisone|methylprednisolone,Drug,multiple sclerosis|optic neuritis,optic neuritis|multiple sclerosis,Demyelinating Diseases|Cranial Nerve Diseases
3,NCT00000151,acetylsalicylic acid|aspirin,aspirin|acetylsalicylic acid,Procedure|Drug,diabetes mellitus|diabetic retinopathy|early d...,blindness,Neurologic Manifestations
4,NCT00000170,atropine,atropine,Drug|Device,amblyopia|anisometropia|moderate amblyopia|str...,amblyopia,Neurologic Manifestations
5,NCT00000171,melatonin,melatonin,Drug,alzheimer disease|sleep|sleep disturbances,alzheimer disease|dyssomnias,Sleep Wake Disorders|Neurodegenerative Diseases
6,NCT00000172,galantamine,galantamine,Drug,alzheimer disease,alzheimer disease,Neurodegenerative Diseases
7,NCT00000173,donepezil|donepezil hcl|vitamin e,vitamin e|donepezil,Drug,alzheimer disease|cognitive decline|dementia|m...,alzheimer disease,Neurodegenerative Diseases
8,NCT00000174,rivastigmine|rivastigmine patch,rivastigmine,Drug,alzheimer '|alzheimer disease|dementia|disease...,alzheimer disease|cognition disorders,Psychiatry and Psychology Category|Neurodegene...
9,NCT00000175,estrogen|hormone|testosterone,testosterone|estrogen,Drug,,mood disorders|cognition disorders,Psychiatry and Psychology Category


In [102]:
len(grouped_df)

19632

In [100]:
grouped_df.to_csv(f'data/annotated_aact/ner_outputs/aggregated_ner_annotations_basic_dict_mapped_{len(grouped_df)}.csv')

### flattened

In [103]:
len(set(grouped_df['nct_id'])), grouped_df.shape

(19632, (19632, 7))

In [104]:
# Splitting 'canonical_BioLinkBERT-base_interventions' and 'canonical_BioLinkBERT-base_conditions' into separate rows
filtered_df['canonical_BioLinkBERT-base_interventions'] = filtered_df['canonical_BioLinkBERT-base_interventions'].str.split('|')
filtered_df['canonical_BioLinkBERT-base_conditions'] = filtered_df['canonical_BioLinkBERT-base_conditions'].str.split('|')
filtered_df['canonical_aact_conditions'] = filtered_df['canonical_aact_conditions'].str.split('|')

# Exploding both columns to create new rows for each value
df_exploded = filtered_df.explode('canonical_BioLinkBERT-base_interventions')
df_exploded = df_exploded.explode('canonical_BioLinkBERT-base_conditions')
df_exploded = df_exploded.explode('canonical_aact_conditions')
df_exploded = df_exploded.drop_duplicates()
# Resetting the index to have a continuous index after exploding
df_exploded.reset_index(drop=True, inplace=True)

In [105]:
df_exploded.head()

Unnamed: 0,nct_id,canonical_BioLinkBERT-base_interventions,canonical_aact_interventions,aact_intervention_types,canonical_BioLinkBERT-base_conditions,canonical_aact_conditions,Disease Class
0,NCT03502551,ketamine,ketamine,Drug,suicidal ideation,suicide,Psychiatry and Psychology Category
1,NCT03502551,ketamine,ketamine,Drug,suicidal,suicide,Psychiatry and Psychology Category
2,NCT05216770,,laryngeal sensory block with topical bupivacaine,Drug,voice tremor,tremor,Neurologic Manifestations
3,NCT05216770,,laryngeal sensory block with topical bupivacaine,Drug,laryngeal dystonia,tremor,Neurologic Manifestations
4,NCT03348735,capsaicin,lidocaine patch 5%,Drug,pain,neuropathic pain,Neuromuscular Diseases


In [106]:
len(set(df_exploded['nct_id'])), df_exploded.shape

(19632, (230854, 7))

In [108]:
df_exploded.to_csv(f'data/annotated_aact/ner_outputs/flat_ner_annotations_basic_dict_mapped_{len(set(df_exploded['nct_id']))}.csv')

# Annotate Dict NeuroTrialNER test set

In [115]:
drug_testset = pd.read_csv(f"data/annotated_aact/neurotrial_ner_test_set/target_ner_drug_to_link_{100}.csv", index_col=0)
condition_testset = pd.read_csv(f"data/annotated_aact/neurotrial_ner_test_set/target_ner_conditions_to_link_{345}.csv", index_col=0)

In [134]:
drug_testset.head()

Unnamed: 0,nct_id,unique_drug_target,unique_drug_target_mapped,unique_drug_target_mapped_dictionary
0,NCT04045665,anticoagulants,n.a.,n.a.
0,NCT04045665,anticoagulation,n.a.,n.a.
0,NCT04045665,antiplatelet therapy,n.a.,n.a.
0,NCT04045665,antiplatelets,n.a.,n.a.
0,NCT04045665,oral anticoagulation,n.a.,n.a.


In [127]:
def lookup_canonical(conditions_list, synonyms_dict):
    canonical_list = []
    match_count = 0  # Count for conditions matching the dictionary
    processed_count = 0  # Count for all processed conditions
    same_condition_count = 0  # Count for conditions that are the same as looked up
    if isinstance(conditions_list, str):
        for condition in conditions_list.split('|'):
            original_condition = condition  # Keep the original condition
            condition = condition.lower().strip()
            if condition in {"none", "", "none."}:
                continue  # Skip irrelevant or placeholder conditions
            processed_count += 1  # Increment for every non-skipped condition
            if condition in synonyms_dict:
                if any(syn == original_condition for syn in synonyms_dict[condition]):
                    same_condition_count += 1
                canonical_list.extend(synonyms_dict[condition])
                match_count += 1  # Increment only when a match is found
            else:
                canonical_list.append("n.a.")
      
    else:
        return '', 0, 0, 0  # Return empty string and zero counts if input is not valid
    return '|'.join(canonical_list), match_count, processed_count, same_condition_count

In [131]:
def process_dataframe(df, synonyms_dict, col_with_annotations):
    match_counts = {}
    processed_counts = {}
    same_condition_counts = {}

    column = col_with_annotations
    mapped_col = column + "_mapped_dictionary"
    results = df[col_with_annotations].apply(lambda x: lookup_canonical(x, synonyms_dict))
    df.loc[:, mapped_col] = [result[0] for result in results]
    match_counts[column] = sum(result[1] for result in results)  # Sum all matches for the column
    processed_counts[column] = sum(result[2] for result in results)  # Sum all processed conditions for the column
    same_condition_counts[column] = sum(result[3] for result in results)  # Sum cases where condition was unchanged

    return df, match_counts, processed_counts, same_condition_counts

In [132]:
df_drugs_mapped, match_counts, processed_counts, same_condition_counts = process_dataframe(drug_testset, drug_variant_to_canonical, "unique_drug_target")

In [142]:
len(drug_variant_to_canonical)

25933

In [133]:
df_drugs_mapped

Unnamed: 0,nct_id,unique_drug_target,unique_drug_target_mapped,unique_drug_target_mapped_dictionary
0,NCT04045665,anticoagulants,n.a.,n.a.
0,NCT04045665,anticoagulation,n.a.,n.a.
0,NCT04045665,antiplatelet therapy,n.a.,n.a.
0,NCT04045665,antiplatelets,n.a.,n.a.
0,NCT04045665,oral anticoagulation,n.a.,n.a.
...,...,...,...,...
152,NCT03661411,antiplatelet,n.a.,n.a.
152,NCT03661411,aspirin,aspirin|acetylsalicylic acid,aspirin|acetylsalicylic acid
152,NCT03661411,clopidogrel,clopidogrel,clopidogrel
152,NCT03661411,r-tpa,n.a.,n.a.


In [139]:
df_drugs_mapped.to_csv(f"data/annotated_aact/neurotrial_ner_test_set/dictionary_entities_linked/neurotrial_ner_linked_drugs_dictionary.csv")

In [135]:
counts_df_interventions = pd.DataFrame({
    'Processed Intervention': processed_counts,
    'Matched Intervention': match_counts,
    #'Same Condition Counts': same_condition_counts
})

counts_df_interventions['% Mapped Intervention'] = round((counts_df_interventions['Matched Intervention'] / counts_df_interventions['Processed Intervention']) * 100, 2)
counts_df_interventions.reset_index(inplace=True)
counts_df_interventions.rename(columns={'index': 'Annotations Source'}, inplace=True)

counts_df_interventions

Unnamed: 0,Annotations Source,Processed Intervention,Matched Intervention,% Mapped Intervention
0,unique_drug_target,100,52,52.0


In [136]:
df_conditions_mapped, match_counts, processed_counts, same_condition_counts = process_dataframe(condition_testset, synonyms_dict, "unique_condition_target")

In [141]:
len(synonyms_dict)

18458

In [137]:
df_conditions_mapped.head()

Unnamed: 0,nct_id,unique_condition_target,unique_condition_target_mapped_dictionary
0,NCT04045665,coronary artery bypass graft,n.a.
0,NCT04045665,isolated coronary artery bypass graft (cabg) s...,n.a.
0,NCT04045665,new-onset post-operative atrial fibrillation,n.a.
0,NCT04045665,post-operative atrial fibrillation,n.a.
1,NCT02311036,-cerebrovascular diseases,n.a.


In [138]:
counts_df_condition = pd.DataFrame({
    'Processed Intervention': processed_counts,
    'Matched Intervention': match_counts,
    #'Same Condition Counts': same_condition_counts
})

ounts_df_condition['% Mapped Intervention'] = round((counts_df_condition['Matched Intervention'] / ounts_df_condition['Processed Intervention']) * 100, 2)
ounts_df_condition.reset_index(inplace=True)
ounts_df_condition.rename(columns={'index': 'Annotations Source'}, inplace=True)

ounts_df_condition

Unnamed: 0,Annotations Source,Processed Intervention,Matched Intervention,% Mapped Intervention
0,unique_condition_target,345,123,35.65


In [140]:
df_conditions_mapped.to_csv(f"data/annotated_aact/neurotrial_ner_test_set/dictionary_entities_linked/neurotrial_ner_linked_conditions_dictionary.csv")