In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import numpy as np
import re
from pathlib import Path


## Prepare AACT Data for NER Annotation

In [6]:
# Load the combined trials export, keep only the columns needed for NER,
# and drop rows that repeat the same (id, title, summary) combination.
ner_source_columns = ['nct_id', 'study_official_title', 'brief_summary_description']
aact_data = (
    pd.read_csv("data/raw_aact/combined_neuro_trials_with_interventions_20240313.csv")
    .loc[:, ner_source_columns]
    .drop_duplicates()
)
aact_data.shape

(46376, 3)

In [7]:
# Number of distinct trial ids; compare with the row count shown above.
aact_data['nct_id'].nunique(dropna=False)

46376

In [8]:
# Build the NER input text as "<official title>| <brief summary>";
# missing titles or summaries contribute an empty string.
title_part = aact_data['study_official_title'].fillna('')
summary_part = aact_data['brief_summary_description'].fillna('')
aact_data['text'] = title_part + '| ' + summary_part

# Preview the result
aact_data.head()

Unnamed: 0,nct_id,study_official_title,brief_summary_description,text
0,NCT03890861,Reducing African Americans' Alzheimer's Diseas...,The RAATE proposal is designed to determine th...,Reducing African Americans' Alzheimer's Diseas...
6,NCT03060096,Stepped-Care Telehealth for Distress in Cancer...,Mental health issues in post-treatment adult c...,Stepped-Care Telehealth for Distress in Cancer...
12,NCT04525742,COVID-19 Pandemic From the Perspective of Pare...,Pandemic period could affect the disabled chil...,COVID-19 Pandemic From the Perspective of Pare...
15,NCT02324634,Early Electrical Stimulation to the Wrist Exte...,Stroke is the largest cause of adult disabilit...,Early Electrical Stimulation to the Wrist Exte...
18,NCT06036368,"6-weeks, Open-label, Single-Site Study to Eval...","This is a 6-week exploratory clinical study, d...","6-weeks, Open-label, Single-Site Study to Eval..."


In [9]:
# Write the NER input table; the row count is embedded in the file name.
n_texts = len(aact_data)
ner_input_csv = f"data/aact_for_ner/aact_texts_{n_texts}.csv"
aact_data.to_csv(ner_input_csv)

# Prepare annotated data for t-SNE embedding

In [37]:
# Reload the combined trials export, this time keeping the descriptive
# metadata columns (titles, start date, summary), de-duplicated.
reference_columns = ['nct_id', 'brief_title', 'start_date', 'study_official_title', 'brief_summary_description']
reference_data_new = (
    pd.read_csv("data/raw_aact/combined_neuro_trials_with_interventions_20240313.csv")
    .loc[:, reference_columns]
    .drop_duplicates()
)


In [38]:
# Preview the first rows of the reference trial metadata.
reference_data_new.head()

Unnamed: 0,nct_id,brief_title,start_date,study_official_title,brief_summary_description
0,NCT03890861,Reducing African Americans' Alzheimer's Diseas...,2019-08-09,Reducing African Americans' Alzheimer's Diseas...,The RAATE proposal is designed to determine th...
6,NCT03060096,Stepped-Care Telehealth for Distress in Cancer...,2018-07-19,Stepped-Care Telehealth for Distress in Cancer...,Mental health issues in post-treatment adult c...
12,NCT04525742,COVID-19 Pandemic and Parents of Disabled Chil...,2020-07-05,COVID-19 Pandemic From the Perspective of Pare...,Pandemic period could affect the disabled chil...
15,NCT02324634,Early Electrical Stimulation to Prevent Compli...,2015-06-01,Early Electrical Stimulation to the Wrist Exte...,Stroke is the largest cause of adult disabilit...
18,NCT06036368,Study to Evaluate Safety and Efficacy of Peron...,2023-09-15,"6-weeks, Open-label, Single-Site Study to Eval...","This is a 6-week exploratory clinical study, d..."


In [39]:
# Shape of the de-duplicated reference metadata.
reference_data_new.shape

(46376, 5)

In [40]:
# Base directory from which the NER annotations and SNOMED mapping files are read.
data_path = Path("data/annotated_aact/")

In [41]:
# Load the aggregated NER annotations that the SNOMED mappings are joined onto.
ner_annotations_csv = data_path / "ner_outputs/aggregated_ner_annotations_basic_dict_mapped_19632.csv"
df_annotations_all = pd.read_csv(ner_annotations_csv)


In [42]:
# Annotation source to analyse and the entity types to process.
model_prefix = "combined_union"
entity_types = ["conditions", "interventions"]

# Numbers embedded in the mapping file names, one per entity type in the order
# of `entity_types` (presumably mapped row counts -- confirm against the exports).
# Any prefix other than "linkbert" / "aact" falls back to the default pair.
mapped_size_by_prefix = {
    "linkbert": [18212, 16917],
    "aact": [19632, 18720],
}
mapped_size = mapped_size_by_prefix.get(model_prefix, [19632, 19632])

In [43]:
# Loop through each entity type, read the corresponding file, and perform the join
# Join the SNOMED hierarchy mapping of every entity type onto the NER annotations.
# NOTE: this cell rebinds `df_annotations_all`; re-running it without reloading the
# annotations first merges the same columns again (pandas then adds _x/_y suffixes).
for entity_type, mapped_nr in zip(entity_types, mapped_size):
    # The file name encodes the model prefix, the entity type and the mapped size.
    file_path = data_path / f"snomed_linking_outputs/mapped_to_hierarchy/aact_bert_combined/hierarchical_mapping_to_snomed_{model_prefix}_{entity_type}_aggregated_{mapped_nr}.csv"

    # The first CSV column is the saved index, not data.
    df_entity = pd.read_csv(file_path, index_col=0)

    # Left join keeps every annotated trial, even when it has no mapping row.
    df_annotations_all = df_annotations_all.merge(df_entity, on='nct_id', how='left')

# Build the combined NER column for EVERY entity type: BioLinkBERT value first,
# falling back to the AACT value where BioLinkBERT is missing. Previously this ran
# once after the loop with the leaked loop variable, so only the last entity type
# ("interventions") received a combined column.
# NOTE(review): a later cell uses the 'canonical_combined' prefix for every model_prefix
# other than "linkbert"/"aact" (e.g. "combined_union"), but these columns are only created
# for "combined" -- confirm whether this condition should be broadened.
if model_prefix == "combined":
    for entity_type in entity_types:
        df_annotations_all[f'canonical_combined_{entity_type}'] = df_annotations_all[f'canonical_BioLinkBERT-base_{entity_type}'].combine_first(df_annotations_all[f'canonical_aact_{entity_type}'])


In [44]:
# Shape after the hierarchy mappings were joined in.
df_annotations_all.shape

(19632, 20)

In [45]:
# Preview the annotations together with the joined SNOMED mapping columns.
df_annotations_all.head()

Unnamed: 0.1,Unnamed: 0,nct_id,canonical_BioLinkBERT-base_interventions,canonical_aact_interventions,aact_intervention_types,canonical_BioLinkBERT-base_conditions,canonical_aact_conditions,Disease Class,linkbert_snomed_term_canonical_conditions,linkbert_top_concept_canonical_first_conditions,aact_snomed_term_canonical_conditions,aact_top_concept_canonical_first_conditions,combined_union_snomed_term_canonical_conditions,combined_union_top_concept_canonical_first_conditions,linkbert_snomed_term_canonical_interventions,linkbert_top_concept_canonical_first_interventions,aact_snomed_term_canonical_interventions,aact_top_concept_canonical_first_interventions,combined_union_snomed_term_canonical_interventions,combined_union_top_concept_canonical_first_interventions
0,0,NCT00000117,intravenous immunoglobulin|ivig,immunoglobulin,Drug,multiple sclerosis|optic neuritis,optic neuritis,Cranial Nerve Diseases,Multiple sclerosis (disorder)|Optic neuritis (...,Multiple sclerosis (disorder)|Optic neuritis (...,Optic neuritis (disorder),Optic neuritis (disorder),Multiple sclerosis (disorder)|Optic neuritis (...,Multiple sclerosis (disorder)|Optic neuritis (...,Immunoglobulin (substance),Immunoglobulin (substance),Immunoglobulin (substance),Blood component (substance),Immunoglobulin (substance),Blood component (substance)|Immunoglobulin (su...
1,1,NCT00000146,corticosteroid,prednisone|methylprednisolone,Drug,multiple sclerosis|optic neuritis,optic neuritis|multiple sclerosis,Demyelinating Diseases|Cranial Nerve Diseases,Multiple sclerosis (disorder)|Optic neuritis (...,Multiple sclerosis (disorder)|Optic neuritis (...,Optic neuritis (disorder)|Multiple sclerosis (...,Optic neuritis (disorder)|Multiple sclerosis (...,Multiple sclerosis (disorder)|Optic neuritis (...,Multiple sclerosis (disorder)|Optic neuritis (...,Corticosteroid and/or corticosteroid derivativ...,Corticosteroid and/or corticosteroid derivativ...,Prednisone (substance)|Methylprednisolone (sub...,Corticosteroid and/or corticosteroid derivativ...,Corticosteroid and/or corticosteroid derivativ...,Corticosteroid and/or corticosteroid derivativ...
2,2,NCT00000147,corticosteroid,prednisone|methylprednisolone,Drug,multiple sclerosis|optic neuritis,optic neuritis|multiple sclerosis,Demyelinating Diseases|Cranial Nerve Diseases,Multiple sclerosis (disorder)|Optic neuritis (...,Multiple sclerosis (disorder)|Optic neuritis (...,Optic neuritis (disorder)|Multiple sclerosis (...,Optic neuritis (disorder)|Multiple sclerosis (...,Multiple sclerosis (disorder)|Optic neuritis (...,Multiple sclerosis (disorder)|Optic neuritis (...,Corticosteroid and/or corticosteroid derivativ...,Corticosteroid and/or corticosteroid derivativ...,Prednisone (substance)|Methylprednisolone (sub...,Corticosteroid and/or corticosteroid derivativ...,Corticosteroid and/or corticosteroid derivativ...,Corticosteroid and/or corticosteroid derivativ...
3,3,NCT00000151,acetylsalicylic acid|aspirin,aspirin|acetylsalicylic acid,Procedure|Drug,diabetes mellitus|diabetic retinopathy|early d...,blindness,Neurologic Manifestations,Diabetes mellitus (disorder)|Retinopathy due t...,Diabetes mellitus (disorder)|Retinal disorder ...,Legal blindness (disorder),Legal blindness (disorder),Diabetes mellitus (disorder)|Legal blindness (...,Diabetes mellitus (disorder)|Legal blindness (...,Aspirin (substance),Analgesic (substance),Aspirin (substance),Analgesic (substance),Aspirin (substance),Analgesic (substance)
4,4,NCT00000170,atropine,atropine,Drug|Device,amblyopia|anisometropia|moderate amblyopia|str...,amblyopia,Neurologic Manifestations,Amblyopia (disorder)|Anisometropia (disorder)|...,Amblyopia (disorder)|Anisometropia (disorder)|...,Amblyopia (disorder),Amblyopia (disorder),Amblyopia (disorder)|Anisometropia (disorder)|...,Amblyopia (disorder)|Anisometropia (disorder)|...,Atropine (substance),Bronchodilator (substance),Atropine (substance),Antidote (substance),Atropine (substance),Antidote (substance)|Bronchodilator (substance)


In [46]:
# Switches selecting which annotation columns are used downstream.
evaluate_snomed_link = True
hierarchical_mapping_conditions = True
hierarchical_mapping_interventions = False

# Column prefix of the raw NER output for the chosen model; every prefix other
# than "linkbert" / "aact" maps to the combined columns.
target_ner_column_prefix = {
    "linkbert": 'canonical_BioLinkBERT-base',
    "aact": 'canonical_aact',
}.get(model_prefix, 'canonical_combined')

if not evaluate_snomed_link:
    # Use the raw NER annotations directly.
    target_interventions_column = f'{target_ner_column_prefix}_interventions'
    target_conditions_column = f'{target_ner_column_prefix}_conditions'
    annotations_type = model_prefix
else:
    # Hierarchical mapping uses the top-concept column, otherwise the plain SNOMED term column.
    _column_kind = {True: 'top_concept_canonical_first', False: 'snomed_term_canonical'}
    # Keyed by (conditions hierarchical?, interventions hierarchical?).
    _type_suffix = {
        (True, True): 'sapbert_hierarchical',
        (True, False): 'sapbert_cond_hier',
        (False, True): 'sapbert_interv_hier',
        (False, False): 'sapbert',
    }
    _cond_hier = bool(hierarchical_mapping_conditions)
    _interv_hier = bool(hierarchical_mapping_interventions)

    target_interventions_column = f'{model_prefix}_{_column_kind[_interv_hier]}_interventions'
    target_conditions_column = f'{model_prefix}_{_column_kind[_cond_hier]}_conditions'
    annotations_type = f'{model_prefix}_{_type_suffix[(_cond_hier, _interv_hier)]}'


In [47]:
# Keep the trial id plus the two selected annotation columns, then attach the
# trial metadata with a left join so every annotated trial is retained.
selected_annotation_columns = ['nct_id', target_interventions_column, target_conditions_column]
df_normalized_annotations = df_annotations_all.loc[:, selected_annotation_columns]
reference_data_with_target_annotations = pd.merge(
    df_normalized_annotations,
    reference_data_new,
    how='left',
    on='nct_id',
)
reference_data_with_target_annotations.head(2)

Unnamed: 0,nct_id,combined_union_snomed_term_canonical_interventions,combined_union_top_concept_canonical_first_conditions,brief_title,start_date,study_official_title,brief_summary_description
0,NCT00000117,Immunoglobulin (substance),Multiple sclerosis (disorder)|Optic neuritis (...,Intravenous Immunoglobulin Therapy in Optic Ne...,1995-08-31,,To determine whether high-dose intravenous imm...
1,NCT00000146,Corticosteroid and/or corticosteroid derivativ...,Multiple sclerosis (disorder)|Optic neuritis (...,Optic Neuritis Treatment Trial (ONTT),1988-07-31,,To assess the beneficial and adverse effects o...


In [48]:
# Shape of the annotation subset: the id column plus the two selected target columns.
df_normalized_annotations.shape

(19632, 3)

In [50]:
# Shape after the left join with the trial metadata.
reference_data_with_target_annotations.shape

(19632, 7)

In [54]:
# Parse 'start_date' and derive the calendar year in which each trial started.
start_dates = pd.to_datetime(reference_data_with_target_annotations['start_date'])
reference_data_with_target_annotations = reference_data_with_target_annotations.assign(
    start_date=start_dates,
    start_year=start_dates.dt.year,
)

# Keep trials that started from 2000 up to (but not including) 2024; rows with a
# missing start year fail both comparisons and are dropped as well.
start_year = reference_data_with_target_annotations['start_year']
started_in_range = (start_year < 2024) & (start_year >= 2000)
reference_data_with_target_annotations = reference_data_with_target_annotations[started_in_range]

In [56]:
# Shape after restricting to start years 2000 through 2023.
reference_data_with_target_annotations.shape

(18609, 8)

In [57]:
# Display the filtered frame (the rendered output is truncated for long frames).
reference_data_with_target_annotations

Unnamed: 0,nct_id,combined_union_snomed_term_canonical_interventions,combined_union_top_concept_canonical_first_conditions,brief_title,start_date,study_official_title,brief_summary_description,start_year
89,NCT00000307,Naltrexone (substance),Disorder caused by alcohol (disorder)|Disorder...,Naltrexone as Adjunct in Alcoholic Cocaine Dep...,2003-04-30,Naltrexone as an Adjunct in Alcoholic Cocaine ...,The purpose of this study is to evaluate naltr...,2003.0
106,NCT00000333,Atropine (substance)|Benzatropine (substance)|...,Cocaine|Disorder caused by cocaine (disorder),Evaluation of Benztropine for Cocaine Craving - 2,2001-05-31,Evaluation of Efficacy of Benztropine for Coca...,The purpose of this study is to compare the ef...,2001.0
145,NCT00000428,Amitriptyline (substance)|Fluoxetine (substanc...,Fibromyalgia (disorder),Combining N-of-1 Trials to Assess Fibromyalgia...,2000-09-30,Combining N-of-1 Trials to Assess Fibromyalgia...,This study will compare the effectiveness of c...,2000.0
146,NCT00000439,Sodium valproate (substance)|Valproate (substa...,Bipolar disorder (disorder)|Disorder caused by...,Drug Treatment for Alcoholics With Bipolar Dis...,2000-10-31,Efficacy of Valproate Maintenance in Bipolar A...,The purpose of this study is to test the effec...,2000.0
220,NCT00001956,Ketorolac (substance)|Lidocaine (substance)|Mi...,Central pain syndrome (disorder)|Disorder of m...,Influence of Genetics in Pain Sensitivity,2000-01-31,,The purpose of this study is to learn more abo...,2000.0
...,...,...,...,...,...,...,...,...
19606,NCT06282640,Blood group antigen V (substance)|Ergocalcifer...,Cts|Median neuropathy (disorder)|Pain|Peripher...,Comparıson Of Electromyography Results Before ...,2021-12-30,Comparıson Of Electromyography Results Before ...,"Therefore, we aimed to evaluate the effectiven...",2021.0
19613,NCT06287502,2-methyl-3-hydroxybutyrate (substance),Degenerative disorder of muscle (disorder)|Ost...,Efficacy of Structured Exercise-Nutritional In...,2022-09-09,Efficacy of Structured Exercise-Nutritional In...,"This is a prospective parallel group, double-b...",2022.0
19616,NCT06289335,Dexmedetomidine (substance)|Ondansetron (subst...,Complication of surgical procedure (disorder)|...,Dexmedetomidine Compared to Ondansetron for Po...,2020-08-05,Dexmedetomidine Compared With Ondansetron in T...,Evaluate the efficiency of management with int...,2020.0
19623,NCT06292351,"Dimetan (substance)|N-methyl-1-(1,3-benzodioxo...",Alzheimer's disease (disorder)|Dementia (disor...,DMB-I in the Treatment of Alzheimer Type Dementia,2023-12-27,Multicenter Randomized Double-blind Placebo-co...,The purpose of this study is to assess the eff...,2023.0


In [58]:
# Write the filtered, annotated trials; the file name carries the model prefix and row count.
n_annotated_trials = len(reference_data_with_target_annotations)
annotations_csv = f"data/annotated_aact/snomed_linking_outputs/{model_prefix}_annotations_with_details_{n_annotated_trials}.csv"
reference_data_with_target_annotations.to_csv(annotations_csv)