#### Attempt to map the conditions and interventions to CURIEs

In [1]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
import re
import collections
import os
import json
import numpy as np

In [2]:
%pip install thefuzz

Note: you may need to restart the kernel to use updated packages.


In [3]:
from thefuzz import fuzz # fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

In [4]:
# for testing
data_extracted = "/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/2023-04-03_extracted"

In [178]:
# read in pipe-delimited files
conditions_df = pd.read_csv(data_extracted + '/conditions.txt', sep='|', index_col=False, header=0)
interventions_df = pd.read_csv(data_extracted + '/interventions.txt', sep='|', index_col=False, header=0)
browse_conditions_df = pd.read_csv(data_extracted + '/browse_conditions.txt', sep='|', index_col=False, header=0)
browse_interventions_df = pd.read_csv(data_extracted + '/browse_interventions.txt', sep='|', index_col=False, header=0)

browse_interventions_df.head()

Unnamed: 0,id,nct_id,mesh_term,downcase_mesh_term,mesh_type
0,57856435,NCT00246285,Risperidone,risperidone,mesh-list
1,57856584,NCT04626778,Hydrogen Peroxide,hydrogen peroxide,mesh-list
2,57856732,NCT00219908,Mitoxantrone,mitoxantrone,mesh-list
3,57856807,NCT00220064,Vitamins,vitamins,mesh-ancestor
4,57857560,NCT04612894,Apatinib,apatinib,mesh-list


In [296]:
conditions_df.head()

Unnamed: 0,id,nct_id,name,downcase_name
0,31338340,NCT01819987,Overweight,overweight
1,31338518,NCT01821599,Rehabilitation,rehabilitation
2,31338693,NCT01823731,Ovarian Cancer,ovarian cancer
3,31338879,NCT01825850,Healthy,healthy
4,31339149,NCT01829022,Myoma,myoma


In [164]:
browse_conditions_df.head()
browse_conditions = browse_conditions_df[["nct_id", "downcase_mesh_term", "mesh_type"]]

Unnamed: 0,id,nct_id,mesh_term,downcase_mesh_term,mesh_type
0,113697939,NCT00185796,Syndrome,syndrome,mesh-list
1,113698103,NCT00185913,Neoplasms,neoplasms,mesh-ancestor
2,113698185,NCT03516604,Depression,depression,mesh-list
3,113698265,NCT03318952,Pain,pain,mesh-ancestor
4,113698420,NCT02789800,Tauopathies,tauopathies,mesh-ancestor


In [179]:
# every clinical trial has at least one MeSH-list (leaf), and may have 0 or more MeSH-ancestors (see MeSH analytics script)
# subset browse_conditions to get only the MeSH-list terms or leaves
condition_mesh_leaves = browse_conditions_df[browse_conditions_df["mesh_type"].str.contains("mesh-list")]
condition_mesh_leaves


Unnamed: 0,id,nct_id,mesh_term,downcase_mesh_term,mesh_type
0,113697939,NCT00185796,Syndrome,syndrome,mesh-list
2,113698185,NCT03516604,Depression,depression,mesh-list
5,113698730,NCT00461539,Depression,depression,mesh-list
6,113698920,NCT05324137,Polyps,polyps,mesh-list
11,113700140,NCT00176514,Mucositis,mucositis,mesh-list
...,...,...,...,...,...
2849547,111234990,NCT00733525,Bulimia Nervosa,bulimia nervosa,mesh-list
2849551,111234994,NCT02653131,Short Bowel Syndrome,short bowel syndrome,mesh-list
2849552,111234995,NCT02653131,Syndrome,syndrome,mesh-list
2849564,111204205,NCT04834908,Infections,infections,mesh-list


In [297]:
conditions = conditions_df[["nct_id", "downcase_name"]]
conditions

Unnamed: 0,nct_id,downcase_name
0,NCT01819987,overweight
1,NCT01821599,rehabilitation
2,NCT01823731,ovarian cancer
3,NCT01825850,healthy
4,NCT01829022,myoma
...,...,...
771537,NCT00323960,juvenile dermatomyositis
771538,NCT00323037,congestive heart failure
771539,NCT00246857,primary immune deficiency
771540,NCT00128297,breast cancer


In [181]:
# some conditions have a MESH term match, even if it doesn't show up in the same clinical trial
# add those matches based on MESH term alone (irrespective of clinical trial)
all_mesh_conditions = browse_conditions.downcase_mesh_term.unique()
print(len(all_mesh_conditions))

4315


In [299]:
# how many unique conditions do we have to map
len(conditions.downcase_name.unique())

102760

In [182]:
# map any matching MESH term condition to the conditions 
mask = np.isin(conditions['downcase_name'], all_mesh_conditions)
conditions_exact_mapped['mesh_conditions_exact_mapped'] = np.where(mask, conditions_exact_mapped['downcase_name'], np.nan)

with pd.option_context("display.max_rows", 30000):
    display(conditions_exact_mapped[:3000])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conditions_exact_mapped['mesh_conditions_exact_mapped'] = np.where(mask, conditions_exact_mapped['downcase_name'], np.nan)


Unnamed: 0,nct_id,downcase_name,mesh_conditions_exact_mapped
0,NCT01819987,overweight,overweight
1,NCT01821599,rehabilitation,
2,NCT01823731,ovarian cancer,
3,NCT01825850,healthy,
4,NCT01829022,myoma,myoma
5,NCT01838148,spect,
6,NCT01840995,pain,pain
7,NCT01841983,sleep,
8,NCT01846520,ureter cancer,
9,NCT01849302,obesity,obesity


In [184]:
conditions_unmapped = conditions_exact_mapped[conditions_exact_mapped['mesh_conditions_exact_mapped'].isnull()] # get the rows where mesh_term is empty bc there was no match there
conditions_unmapped

Unnamed: 0,nct_id,downcase_name,mesh_conditions_exact_mapped
1,NCT01821599,rehabilitation,
2,NCT01823731,ovarian cancer,
3,NCT01825850,healthy,
5,NCT01838148,spect,
7,NCT01841983,sleep,
...,...,...,...
771528,NCT00413998,mitral regurgitation,
771537,NCT00323960,juvenile dermatomyositis,
771538,NCT00323037,congestive heart failure,
771539,NCT00246857,primary immune deficiency,


## Use fuzzy string matching to get MESH terms matches that are identical to downcase_name, but rearranged

In [185]:
# fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

def get_token_sort_ratio(str1, str2):
    try:
        return fuzz.token_sort_ratio(str1, str2)
    except:
        return None
    
sort_ratio = np.vectorize(get_token_sort_ratio)

def get_token_set_ratio(str1, str2):
    try:
        return fuzz.token_set_ratio(str1, str2)
    except:
        return None  
set_ratio = np.vectorize(get_token_set_ratio)

def get_similarity_score(str1, str2):
    try:
        return fuzz.ratio(str1, str2)
    except:
        return None
sim_score = np.vectorize(get_similarity_score)



In [186]:
mesh_conditions_per_study = pd.DataFrame(browse_conditions[["nct_id", "downcase_mesh_term", "mesh_type"]].groupby("nct_id")["downcase_mesh_term"].apply(list))
with pd.option_context("max_colwidth", None):
    display(mesh_conditions_per_study[:30])


Unnamed: 0_level_0,downcase_mesh_term
nct_id,Unnamed: 1_level_1
NCT00000102,"[adrenal hyperplasia, congenital, adrenogenital syndrome, adrenocortical hyperfunction, hyperplasia, pathologic processes, disorders of sex development, urogenital abnormalities, congenital abnormalities, genetic diseases, inborn, steroid metabolism, inborn errors, metabolism, inborn errors, metabolic diseases, adrenal gland diseases, endocrine system diseases, gonadal disorders]"
NCT00000104,"[poisoning, lead poisoning, chemically-induced disorders, heavy metal poisoning]"
NCT00000105,"[tetanus, clostridium infections, gram-positive bacterial infections, bacterial infections, bacterial infections and mycoses, infections]"
NCT00000106,"[rheumatic diseases, collagen diseases, musculoskeletal diseases, connective tissue diseases]"
NCT00000107,"[heart defects, congenital, congenital abnormalities, heart diseases, cardiovascular diseases, cardiovascular abnormalities]"
NCT00000108,"[cardiovascular diseases, coronary disease, myocardial ischemia, heart diseases, vascular diseases]"
NCT00000111,"[mouth diseases, stomatognathic diseases]"
NCT00000112,"[melanosis, acanthosis nigricans, glucose intolerance, hyperglycemia, glucose metabolism disorders, metabolic diseases, hyperpigmentation, pigmentation disorders, skin diseases]"
NCT00000113,"[myopia, refractive errors, eye diseases]"
NCT00000114,"[retinitis, retinitis pigmentosa, retinal diseases, eye diseases, eye diseases, hereditary, retinal dystrophies, retinal degeneration, genetic diseases, inborn]"


In [300]:
print(len(conditions_unmapped.downcase_name.unique()))
conditions_unmapped

99160


In [244]:
conditions_unmapped_all_mesh_terms = pd.merge(conditions_unmapped[["nct_id", "downcase_name"]], # only these columns needed bc all other columns have null anyway (no match from previous merge)
                                                mesh_conditions_per_study,
                                                how='left',
                                                left_on=['nct_id'],
                                                right_on = ['nct_id'])

# some clinical trials are missing from browse_conditions (those nct_ids are not present in the browse_conditions text) They have NaN in the downcase_mesh_term column
conditions_unmapped_all_mesh_terms = conditions_unmapped_all_mesh_terms[~conditions_unmapped_all_mesh_terms['downcase_mesh_term'].isnull()] # subset or delete rows where either column is empty/Nonetype bc fuzzymatching functions will throw error if handling
conditions_unmapped_all_mesh_terms


Unnamed: 0,nct_id,downcase_name,downcase_mesh_term
1,NCT01823731,ovarian cancer,"[ovarian neoplasms, carcinoma, ovarian epithel..."
3,NCT01838148,spect,"[coronary artery disease, myocardial ischemia,..."
5,NCT01846520,ureter cancer,"[carcinoma, pancreatic neoplasms, sarcoma, sto..."
7,NCT02985749,pdd,"[disease, autistic disorder, autism spectrum d..."
9,NCT01865357,brain mri,"[multiple sclerosis, syndrome, sclerosis, dise..."
...,...,...,...
545872,NCT00413998,mitral regurgitation,"[coronary artery disease, mitral valve insuffi..."
545873,NCT00323960,juvenile dermatomyositis,"[dermatomyositis, polymyositis, myositis, musc..."
545874,NCT00323037,congestive heart failure,"[heart failure, heart diseases, cardiovascular..."
545875,NCT00246857,primary immune deficiency,"[genetic diseases, inborn, primary immunodefic..."


In [245]:
conditions_unmapped_all_mesh_terms = conditions_unmapped_all_mesh_terms.explode('downcase_mesh_term')
conditions_unmapped_all_mesh_terms

Unnamed: 0,nct_id,downcase_name,downcase_mesh_term
1,NCT01823731,ovarian cancer,ovarian neoplasms
1,NCT01823731,ovarian cancer,"carcinoma, ovarian epithelial"
1,NCT01823731,ovarian cancer,endocrine gland neoplasms
1,NCT01823731,ovarian cancer,neoplasms by site
1,NCT01823731,ovarian cancer,neoplasms
...,...,...,...
545876,NCT00128297,breast cancer,neoplasms
545876,NCT00128297,breast cancer,breast diseases
545876,NCT00128297,breast cancer,skin diseases
545876,NCT00128297,breast cancer,neoplastic processes


In [246]:
conditions_unmapped_all_mesh_terms = conditions_unmapped_all_mesh_terms[~conditions_unmapped_all_mesh_terms['downcase_name'].isnull()] # subset or delete rows where either column is empty/Nonetype bc fuzzymatching functions will throw error if handling
conditions_unmapped_all_mesh_terms = conditions_unmapped_all_mesh_terms[~conditions_unmapped_all_mesh_terms['downcase_mesh_term'].isnull()] # subset or delete rows where either column is empty/Nonetype bc fuzzymatching functions will throw error if handling
print(len(conditions_unmapped_all_mesh_terms))

conditions_unmapped_all_mesh_terms["sort_ratio"] = sort_ratio(conditions_unmapped_all_mesh_terms[["downcase_mesh_term"]].values, conditions_unmapped_all_mesh_terms[["downcase_name"]].values)
# conditions_unmapped_all_mesh_terms["set_ratio"] = set_ratio(conditions_unmapped_all_mesh_terms[["downcase_mesh_term"]].values, conditions_unmapped_all_mesh_terms[["downcase_name"]].values)
conditions_unmapped_all_mesh_terms["sim_score"] = sim_score(conditions_unmapped_all_mesh_terms[["downcase_mesh_term"]].values, conditions_unmapped_all_mesh_terms[["downcase_name"]].values)

conditions_unmapped_all_mesh_terms


5176844


Unnamed: 0,nct_id,downcase_name,downcase_mesh_term,sort_ratio,sim_score
1,NCT01823731,ovarian cancer,ovarian neoplasms,65,65
1,NCT01823731,ovarian cancer,"carcinoma, ovarian epithelial",57,42
1,NCT01823731,ovarian cancer,endocrine gland neoplasms,31,41
1,NCT01823731,ovarian cancer,neoplasms by site,32,26
1,NCT01823731,ovarian cancer,neoplasms,35,17
...,...,...,...,...,...
545876,NCT00128297,breast cancer,neoplasms,27,27
545876,NCT00128297,breast cancer,breast diseases,64,64
545876,NCT00128297,breast cancer,skin diseases,38,31
545876,NCT00128297,breast cancer,neoplastic processes,42,42


In [286]:
conditions_mesh_fuzz_scored = conditions_unmapped_all_mesh_terms[(conditions_unmapped_all_mesh_terms['sim_score'] > 88) | (conditions_unmapped_all_mesh_terms['sort_ratio'] > 88)]
conditions_mesh_fuzz_scored = conditions_mesh_fuzz_scored.sort_values(by = ['nct_id', 'downcase_name'], ascending = [True, True], na_position = 'first')

conditions_mesh_fuzz_scored

Unnamed: 0,nct_id,downcase_name,downcase_mesh_term,sort_ratio,sim_score
17708,NCT00000102,congenital adrenal hyperplasia,"adrenal hyperplasia, congenital",100,62
191694,NCT00000126,ischemic optic neuropathy,"optic neuropathy, ischemic",100,63
172984,NCT00000127,ischemic optic neuropathy,"optic neuropathy, ischemic",100,63
191695,NCT00000132,open-angle glaucoma,"glaucoma, open-angle",100,51
20691,NCT00000140,proliferative vitreoretinopathy,"vitreoretinopathy, proliferative",100,54
...,...,...,...,...,...
332368,NCT05793684,obstructive sleep apnea,"sleep apnea, obstructive",100,47
332366,NCT05793697,chronic obstructive pulmonary disease,"pulmonary disease, chronic obstructive",100,51
332350,NCT05793827,traumatic brain injury,"brain injuries, traumatic",91,47
332333,NCT05793983,"cirrhosis, liver",liver cirrhosis,100,58


In [287]:
conditions_mesh_fuzz_scored = conditions_mesh_fuzz_scored.sort_values('sort_ratio', ascending=False).drop_duplicates(['nct_id', 'downcase_name']).sort_index()
conditions_mesh_fuzz_scored = conditions_mesh_fuzz_scored.sort_values(['nct_id'], ascending=False)

print(len(conditions_mesh_fuzz_scored))
with pd.option_context("display.max_rows", 30000):
    display(conditions_mesh_fuzz_scored[:3000])
    

73181


Unnamed: 0,nct_id,downcase_name,downcase_mesh_term,sort_ratio,sim_score
332335,NCT05793983,"liver failure, acute on chronic",acute-on-chronic liver failure,100,46
332333,NCT05793983,"cirrhosis, liver",liver cirrhosis,100,58
332350,NCT05793827,traumatic brain injury,"brain injuries, traumatic",91,47
332366,NCT05793697,chronic obstructive pulmonary disease,"pulmonary disease, chronic obstructive",100,51
332368,NCT05793684,obstructive sleep apnea,"sleep apnea, obstructive",100,47
332403,NCT05793268,chronic hepatitis b,"hepatitis b, chronic",100,56
332412,NCT05793190,alcoholic liver disease,"liver diseases, alcoholic",98,54
332418,NCT05793151,head and neck squamous cell carcinoma,squamous cell carcinoma of head and neck,96,60
332425,NCT05793125,hypertension in pregnancy,"hypertension, pregnancy-induced",91,79
332448,NCT05792878,chronic hepatitis b,"hepatitis b, chronic",100,56


In [None]:
### 100% reassortment score indicates that the mapped mesh term is probably correct bc liver failure, acute is considered identical to acute liver failure
### Look for examples where there reassorted score is high, but there is no comma, to see whether mappings are legitimate
no_comma_reassortment_score_check = conditions_mesh_fuzz_scored[~conditions_mesh_fuzz_scored.astype(str).apply(lambda x: x.str.contains(','), axis=1).any(axis=1)]
no_comma_reassortment_score_check.to_csv('reassorted_high_no_comma.tsv', sep ='\t')
no_comma_reassortment_score_check

In [301]:
keys = list(conditions_mesh_fuzz_scored[["nct_id", "downcase_name"]].columns.values)
i1 = conditions_unmapped.set_index(keys).index
i2 = conditions_mesh_fuzz_scored.set_index(keys).index
print("length of unmapped conditions unmapped after fuzzy matching: {}".format(len(conditions_unmapped.downcase_name.unique())))

conditions_unmapped = conditions_unmapped[~i1.isin(i2)]
conditions_unmapped


length of unmapped conditions unmapped after fuzzy matching: 94632


In [197]:
# conditions_mesh_fuzz_mapped = pd.merge(conditions_unmapped[["nct_id", "downcase_name"]], # only these columns needed bc all other columns have null anyway (no match from previous merge)
#                                         conditions_mesh_fuzz_scored,
#                                         how='left',
#                                         left_on=['nct_id', 'downcase_name'],
#                                         right_on = ['nct_id', 'downcase_name'])

In [199]:
with pd.option_context("display.max_rows", 30000):
    display(conditions_mesh_fuzz_mapped[:3000])

Unnamed: 0,nct_id,downcase_name,downcase_mesh_term,sort_ratio,sim_score
0,NCT01821599,rehabilitation,,,
1,NCT01823731,ovarian cancer,,,
2,NCT01825850,healthy,,,
3,NCT01838148,spect,,,
4,NCT01841983,sleep,,,
5,NCT01846520,ureter cancer,,,
6,NCT01850576,hiv,,,
7,NCT02985749,pdd,,,
8,NCT01862146,fmd,,,
9,NCT01865357,brain mri,,,


In [None]:
conditions_unmapped = conditions_unmapped[["nct_id", "downcase_name"]][~conditions_no_fuzz['downcase_name'].isnull()] # subset or delete rows where either column is empty/Nonetype bc fuzzymatching functions will throw error if handling


In [None]:
conditions_unmapped_all_mesh_terms = conditions_unmapped_all_mesh_terms[~conditions_unmapped_all_mesh_terms['downcase_name'].isnull()] # subset or delete rows where either column is empty/Nonetype bc fuzzymatching functions will throw error if handling


In [77]:
conditions_reassorted_mapped = conditions_unmapped_all_mesh_terms[conditions_unmapped_all_mesh_terms["reassorted_ratio"] > 95]
with pd.option_context("display.max_rows", 1000):
    display(conditions_reassorted_mapped[:20])
print(len(conditions_reassorted_mapped))

Unnamed: 0,nct_id,downcase_name,downcase_mesh_term,reassorted_ratio
0,NCT00000102,congenital adrenal hyperplasia,"adrenal hyperplasia, congenital",100
7,NCT00000126,ischemic optic neuropathy,"optic neuropathy, ischemic",100
8,NCT00000127,ischemic optic neuropathy,"optic neuropathy, ischemic",100
11,NCT00000132,open-angle glaucoma,"glaucoma, open-angle",100
18,NCT00000140,proliferative vitreoretinopathy,"vitreoretinopathy, proliferative",100
22,NCT00000144,open-angle glaucoma,"glaucoma, open-angle",100
24,NCT00000149,open-angle glaucoma,"glaucoma, open-angle",100
26,NCT00000370,social phobia,"phobia, social",100
28,NCT00000375,major depressive disorder,"depressive disorder, major",100
35,NCT00000389,social phobia,"phobia, social",100


51671


In [78]:
conditions_unmapped

Unnamed: 0,nct_id,name,downcase_name
25683,NCT00000102,Congenital Adrenal Hyperplasia,congenital adrenal hyperplasia
228818,NCT00000105,Cancer,cancer
249384,NCT00000110,Obesity,obesity
203653,NCT00000112,Diabetes,diabetes
203651,NCT00000112,Obesity,obesity
...,...,...,...
472887,NCT05794061,Psychiatric Disorder,psychiatric disorder
472886,NCT05794061,Dementia,dementia
472885,NCT05794061,Cognitive Impairment,cognitive impairment
472884,NCT05794074,Nutrition Deficiency (Xanth Deficiency) Due to...,nutrition deficiency (xanth deficiency) due to...


In [79]:
conditions_95_reassorted_mapped = pd.merge(conditions_unmapped, # only these columns needed bc all other columns have null anyway (no match from previous merge)
                                            conditions_reassorted_mapped,
                                            how='left',
                                            left_on=['nct_id', 'downcase_name'],
                                            right_on = ['nct_id', 'downcase_name'])
conditions_95_reassorted_mapped


Unnamed: 0,nct_id,name,downcase_name,downcase_mesh_term,reassorted_ratio
0,NCT00000102,Congenital Adrenal Hyperplasia,congenital adrenal hyperplasia,"adrenal hyperplasia, congenital",100.0
1,NCT00000105,Cancer,cancer,,
2,NCT00000110,Obesity,obesity,,
3,NCT00000112,Diabetes,diabetes,,
4,NCT00000112,Obesity,obesity,,
...,...,...,...,...,...
563768,NCT05794061,Psychiatric Disorder,psychiatric disorder,,
563769,NCT05794061,Dementia,dementia,,
563770,NCT05794061,Cognitive Impairment,cognitive impairment,,
563771,NCT05794074,Nutrition Deficiency (Xanth Deficiency) Due to...,nutrition deficiency (xanth deficiency) due to...,,


In [80]:
conditions_unmapped = conditions_95_reassorted_mapped[conditions_95_reassorted_mapped['reassorted_ratio'].isnull()] # get the rows where mesh_term is empty bc there was no match there
conditions_unmapped

Unnamed: 0,nct_id,name,downcase_name,downcase_mesh_term,reassorted_ratio
1,NCT00000105,Cancer,cancer,,
2,NCT00000110,Obesity,obesity,,
3,NCT00000112,Diabetes,diabetes,,
4,NCT00000112,Obesity,obesity,,
5,NCT00000115,"Macular Edema, Cystoid","macular edema, cystoid",,
...,...,...,...,...,...
563768,NCT05794061,Psychiatric Disorder,psychiatric disorder,,
563769,NCT05794061,Dementia,dementia,,
563770,NCT05794061,Cognitive Impairment,cognitive impairment,,
563771,NCT05794074,Nutrition Deficiency (Xanth Deficiency) Due to...,nutrition deficiency (xanth deficiency) due to...,,


In [None]:
# find the best match from MESH list terms per term (one with highest ratio)


In [20]:
conditions_unmapped_list = conditions_unmapped.downcase_name.to_list()[:10]

NameError: name 'conditions_unmapped' is not defined

In [134]:
conditions_fuzzmatched = conditions_unmapped_fuzzmatch.dropna(subset=['downcase_mesh_term'])
conditions_fuzzmatched[:20]

Unnamed: 0,nct_id,downcase_name,downcase_mesh_term
0,NCT00000102,congenital adrenal hyperplasia,"[adrenal hyperplasia, congenital, adrenogenita..."
1,NCT00000105,cancer,"[tetanus, clostridium infections, gram-positiv..."
3,NCT00000112,diabetes,"[melanosis, acanthosis nigricans, glucose into..."
4,NCT00000112,obesity,"[melanosis, acanthosis nigricans, glucose into..."
5,NCT00000115,"macular edema, cystoid","[retinal diseases, macular edema, edema, macul..."
6,NCT00000123,myopia,"[astigmatism, refractive errors, eye diseases]"
7,NCT00000126,ischemic optic neuropathy,"[ischemia, optic nerve diseases, optic neuropa..."
8,NCT00000127,ischemic optic neuropathy,"[optic nerve diseases, optic neuropathy, ische..."
9,NCT00000128,esophoria,"[strabismus, esotropia, eye diseases, ocular m..."
10,NCT00000128,myopia,"[strabismus, esotropia, eye diseases, ocular m..."


In [141]:
len(conditions_fuzzmatched)

462404

In [140]:
for index, row in conditions_fuzzmatched.iterrows():
#     print(row["downcase_mesh_term"])
    for term in row["downcase_mesh_term"]:
        reassorted_ratio = ratio(row["downcase_name"], term)
#         print([row["downcase_name"], term])
#         print(reassorted_ratio)
        if reassorted_ratio == 100:
            conditions_fuzzmatched.loc[index,"reassorted_mesh_match"] = term

conditions_fuzzmatched[:30]    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conditions_fuzzmatched.loc[index,"reassorted_mesh_match"] = term


KeyboardInterrupt: 

In [118]:
# find studies for which there is only 1 condition listed (from unmapped conditions)
conditions_unmapped_per_study = pd.DataFrame(conditions_unmapped.groupby("nct_id")["downcase_name"].apply(list))
with pd.option_context("display.max_rows", 1000):
    display(conditions_unmapped_per_study[:20])


Unnamed: 0_level_0,downcase_name
nct_id,Unnamed: 1_level_1
NCT00000102,[congenital adrenal hyperplasia]
NCT00000105,[cancer]
NCT00000110,[obesity]
NCT00000112,"[diabetes, obesity]"
NCT00000115,"[macular edema, cystoid]"
NCT00000123,[myopia]
NCT00000126,[ischemic optic neuropathy]
NCT00000127,[ischemic optic neuropathy]
NCT00000128,"[esophoria, myopia]"
NCT00000132,[open-angle glaucoma]


In [10]:
conditions_unmapped_per_study["condition_counts"] = conditions_unmapped_per_study["downcase_name"].str.len()
conditions_unmapped_per_study

Unnamed: 0_level_0,downcase_name,condition_counts
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1
NCT00000102,[congenital adrenal hyperplasia],1
NCT00000105,[cancer],1
NCT00000110,[obesity],1
NCT00000112,"[diabetes, obesity]",2
NCT00000115,"[macular edema, cystoid]",1
...,...,...
NCT05794035,"[skin cancer, non-melanoma]",1
NCT05794048,"[pancreatic tumor, hepatocarcinoma]",2
NCT05794061,"[psychiatric disorder, dementia, cognitive imp...",3
NCT05794074,[nutrition deficiency (xanth deficiency) due t...,1


In [11]:
unmapped_single_conditions = conditions_unmapped_per_study[conditions_unmapped_per_study["condition_counts"] == 1]
unmapped_single_conditions

Unnamed: 0_level_0,downcase_name,condition_counts
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1
NCT00000102,[congenital adrenal hyperplasia],1
NCT00000105,[cancer],1
NCT00000110,[obesity],1
NCT00000115,"[macular edema, cystoid]",1
NCT00000123,[myopia],1
...,...,...
NCT05793944,[pregnancy],1
NCT05794009,[exercise therapy],1
NCT05794035,"[skin cancer, non-melanoma]",1
NCT05794074,[nutrition deficiency (xanth deficiency) due t...,1


In [13]:
mapped_single_conditions = pd.merge(unmapped_single_conditions, condition_mesh_leaves[["nct_id", "downcase_mesh_term"]], left_on=["nct_id"], right_on=["nct_id"])
with pd.option_context("display.max_rows", 1000):
    display(mapped_single_conditions[:20])
    

Unnamed: 0,nct_id,downcase_name,condition_counts,downcase_mesh_term
0,NCT00000102,[congenital adrenal hyperplasia],1,"adrenal hyperplasia, congenital"
1,NCT00000102,[congenital adrenal hyperplasia],1,adrenogenital syndrome
2,NCT00000102,[congenital adrenal hyperplasia],1,adrenocortical hyperfunction
3,NCT00000102,[congenital adrenal hyperplasia],1,hyperplasia
4,NCT00000105,[cancer],1,tetanus
5,NCT00000115,"[macular edema, cystoid]",1,macular edema
6,NCT00000115,"[macular edema, cystoid]",1,edema
7,NCT00000123,[myopia],1,astigmatism
8,NCT00000126,[ischemic optic neuropathy],1,ischemia
9,NCT00000126,[ischemic optic neuropathy],1,optic nerve diseases


### Conclusion: there are multiple candidate MeSH terms (leaf nodes only, not ancestors) available per 1 condition

# Use MetaMap to find more candidate matches

In [14]:
CAS_SERVERURL = "https://utslogin.nlm.nih.gov/cas/v1"
II_SKR_SERVERURL = 'https://ii.nlm.nih.gov/cgi-bin/II/UTS_Required'
METAMAP_INTERACTIVE_URL = II_SKR_SERVERURL + "/API_MM_interactive.pl"
stserverurl = "https://utslogin.nlm.nih.gov/cas/v1/tickets"
tgtserverurl = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
apikey = '54041f07-fc66-4558-b038-b46ca8bdcc6b'
serviceurl = METAMAP_INTERACTIVE_URL
ksource = '2020AB'

def get_service_ticket(serverurl, ticket_granting_ticket, serviceurl):
    """ Obtain a Single-Use Proxy Ticket (also known as service ticket).
    Request for a Service Ticket:
        POST /cas/v1/tickets/{TGT id} HTTP/1.0
    data:
           service={form encoded parameter for the service url}
    Sucessful Response:
        200 OK
        ST-1-FFDFHDSJKHSDFJKSDHFJKRUEYREWUIFSD2132
    @param serverurl authentication server
    @param ticketGrantingTicket a Proxy Granting Ticket.
    @param serviceurl url of service with protected resources
    @return authentication ticket for service. """
    resp = requests.post("{}/{}".format(serverurl, ticket_granting_ticket),
                         {"service": serviceurl})
    if resp.status_code == 200:
        return resp.content
    return 'Error: status: {}'.format(resp.content)


def extract_tgt_ticket(htmlcontent):
    "Extract ticket granting ticket from HTML."
    # print('htmlcontent: {}'.format(htmlcontent))
    html = HTML(html=htmlcontent)
    # get form element
    elements = html.xpath("//form")
    # print('html response: {}'.format(etree.tostring(html.lxml).decode()))
    # print('action attribure: {}'.format(elements[0].attrs['action']))
    # extract ticket granting ticket out of 'action' attribute
    if elements != []:
        return elements[0].attrs['action'].split('/')[-1]
    else:
        return "form element missing from ticket granting ticket response"

def get_ticket(cas_serverurl, apikey, serviceurl):
    # set ticket granting ticket server url
    tgtserverurl = cas_serverurl + "/api-key"
    # set service ticket server url
    stserverurl = cas_serverurl + "/tickets"
    tgt = get_ticket_granting_ticket(tgtserverurl, apikey)
    return get_service_ticket(stserverurl, tgt, serviceurl)

def get_ticket_granting_ticket(tgtserverurl, apikey):
    # http://serviceurl/cas/v1/tickets/{TGT id}
    response = requests.post(tgtserverurl, {'apikey': apikey},
                             headers={'Accept': 'test/plain'})
    return extract_tgt_ticket(response.content)

def extract_tgt_ticket(htmlcontent):
    "Extract ticket granting ticket from HTML."    
    soup = BeautifulSoup(htmlcontent)
#     print(soup.find('form').get("action"))
    cas_url = soup.find("form").get("action")
    "Extract ticket granting ticket out of 'action' attribute"
#     tgt = cas_url.rsplit('/')[-1]
    return cas_url.rsplit('/')[-1]
    
ticket = get_ticket(CAS_SERVERURL, apikey, serviceurl)

In [16]:
def get_redirect_target(resp):
        """Receives a Response. Returns a redirect URI or ``None``"""
        # Due to the nature of how requests processes redirects this method will
        # be called at least once upon the original response and at least twice
        # on each subsequent redirect response (if any).
        # If a custom mixin is used to handle this logic, it may be advantageous
        # to cache the redirect location onto the response object as a private
        # attribute.
        if resp.is_redirect:
            location = resp.headers["location"]
            # Currently the underlying http module on py3 decode headers
            # in latin1, but empirical evidence suggests that latin1 is very
            # rarely used with non-ASCII characters in HTTP headers.
            # It is more likely to get UTF8 header rather than latin1.
            # This causes incorrect handling of UTF8 encoded location headers.
            # To solve this, we re-encode the location in latin1.
#             print(location)
            location = location.encode("latin1")
#             print(location)
#             print(to_native_string(location, "utf8"))
            return to_native_string(location, "utf8")
        return None

if response.status_code == 302:
    newurl = s.get_redirect_target(response)
    response = s.post(newurl, form, headers=headers, params=params, allow_redirects=False)

In [17]:
# get concepts to map
conditions_unmapped.downcase_name.to_list()[:10]

['congenital adrenal hyperplasia',
 'cancer',
 'obesity',
 'diabetes',
 'obesity',
 'macular edema, cystoid',
 'myopia',
 'ischemic optic neuropathy',
 'ischemic optic neuropathy',
 'esophoria']

In [18]:
conditions_unmapped

Unnamed: 0,nct_id,name,downcase_name,mesh_term,downcase_mesh_term,mesh_type
25683,NCT00000102,Congenital Adrenal Hyperplasia,congenital adrenal hyperplasia,,,
228818,NCT00000105,Cancer,cancer,,,
249384,NCT00000110,Obesity,obesity,,,
203653,NCT00000112,Diabetes,diabetes,,,
203651,NCT00000112,Obesity,obesity,,,
...,...,...,...,...,...,...
472887,NCT05794061,Psychiatric Disorder,psychiatric disorder,,,
472886,NCT05794061,Dementia,dementia,,,
472885,NCT05794061,Cognitive Impairment,cognitive impairment,,,
472884,NCT05794074,Nutrition Deficiency (Xanth Deficiency) Due to...,nutrition deficiency (xanth deficiency) due to...,,,


In [19]:
len(conditions_unmapped['downcase_name'].to_list())

563678

In [20]:
len(set(conditions_unmapped['downcase_name'].to_list()))

99253

In [23]:
conditions_unmapped_list = list(set(conditions_unmapped.downcase_name.to_list())) # get unique unmapped terms
conditions_unmapped_list = [s + "\r\n" for s in conditions_unmapped_list] # you must include newline to process each term separately!
print(len(conditions_unmapped_list))

99253


In [21]:
def split_list_by_length(lst):
    result = []
    current_sublist = []
    current_length = 0
    for item in lst:
        item_length = len(item)
        if current_length + item_length > 9990: # max is 10,000 char allowed by MetaMap
            result.append(current_sublist)
            current_sublist = []
            current_length = 0
        current_sublist.append(item)
        current_length += item_length
    result.append(current_sublist)
    return result

In [26]:
chunked_conditions_unmapped = split_list_by_length(conditions_unmapped_list)
chunked_conditions_unmapped[:1]

[['kearns sayer\r\n',
  'hereditary angioedema - type 1\r\n',
  'respiratory muscles\r\n',
  'hepatic tumor\r\n',
  'parent-child relationship\r\n',
  'caesarean scar pregnancy\r\n',
  'sleep hygiene\r\n',
  'ganglioneuroblastoma of central nervous system\r\n',
  'g6pd\r\n',
  'antineoplastic adverse reaction\r\n',
  'isoflurane\r\n',
  'ocular tumor\r\n',
  'triple negative breast cancer, tnbc\r\n',
  'risk of bone fracture occurrences\r\n',
  'infants\r\n',
  'attenuated or transient psychosis\r\n',
  'erythema migrans\r\n',
  'malignant pleural mesothelioma, advanced\r\n',
  'degenerative\r\n',
  'ataxia\r\n',
  'non alcoholic fatty liver\r\n',
  'myoma of uterus\r\n',
  'macular degeneration, choroidal neovascularization\r\n',
  'newborn, infant, disease\r\n',
  'viral; infection, coxsackie(virus)\r\n',
  'functional dysphonia\r\n',
  'advanced recurrent ovarian tumors\r\n',
  'primary focal hyperhidrosis of the hands\r\n',
  'gallbladder adenocarcinoma, biliary type\r\n',
  'sacra

In [82]:
"""
-I = return CUIs/identifiers
-i = ignore word order
-C = relaxed model
-z = term processing
-f = give numbers to the final mappings (adds 1. to the first mapped concept, 2. to the 2nd, and so on...)
-c = give numbers to the candidates (adds 1. to the first candidate, 2. to the 2nd, and so on...)
--sldi = read each term in list separately, do not lump into large phrase
-N = MMI formatted output

https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/ListOfTerms.pdf
https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/TermProcessing.pdf
https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MM_2016_Usage.pdf
List of MetaMap semantic types: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt
"""

cwd_absolute_path = os.path.dirname(os.getcwd())
outputs_folder = "outputs/version_2"
outputs_path = os.path.join(absolute_path, outputs_folder)

# args = ['--sldi -i -I -C -z -J acab,anab,bhvr,cgab,clna,dsyn,fndg,inpo,mobd,neop,patf,sosy'] # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
args = ['--sldi -i -I -C -z'] # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
form = {}
form['KSOURCE'] = ksource
form['COMMAND_ARGS'] = args
headers = {'Accept': 'application/json'}

mm_conditions = {}
cui_pattern = r"C\d+(?=:)"
name_pattern = r"(?<=:)[^[]+"
semtype_pattern = r"\[(.*?)\]"

# with open('../outputs/version_2/metamapped_conditions_.txt') as output:
# with open(os.path.join(outputs_path, "metamapped_conditions.txt"), 'w') as output:

for chunk in chunked_conditions_unmapped[:3]: # demo for testing
    chunk = chunk[:20] # demo for testing
#     for chunk in chunked_conditions_unmapped:
    service_ticket = get_ticket(CAS_SERVERURL, apikey, serviceurl)
    form['APIText'] = chunk
    params = {'ticket': service_ticket}
    response = s.post(serviceurl, form, headers=headers, params=params, allow_redirects=False)
    if response.status_code == 302:
        newurl = s.get_redirect_target(response)
        response = s.post(newurl, form, headers=headers, params=params, allow_redirects=False)

    for line in response.text.splitlines():
#         output.write(line)
        if not any(s in line for s in ["Meta Mapping", "Processing", "/dmzfiler/"]):
#             mm_dict = {}
            if "Phrase:" in line:
                cuis_per_input = []
                mm_input = line.split(":")[1].strip()
                print(mm_input)
            else:
                cui_match = re.findall(cui_pattern, line)
                if cui_match:
                    name_match = re.findall(name_pattern, line)
                    semtype_match = re.findall(semtype_pattern, line)
                    cui_info = [cui_match[0].strip(), name_match[0].strip(), semtype_match[0].strip()]
                    cuis_per_input.append(cui_info)    

                mm_conditions[mm_input] = cuis_per_input
    
mm_conditions


kearns sayer
hereditary angioedema - type 1
respiratory muscles
hepatic tumor
parent-child relationship
caesarean scar pregnancy
sleep hygiene
ganglioneuroblastoma of central nervous system
g6pd
antineoplastic adverse reaction
isoflurane
ocular tumor
triple negative breast cancer, tnbc
risk of bone fracture occurrences
infants
attenuated or transient psychosis
erythema migrans
malignant pleural mesothelioma, advanced
degenerative
ataxia
full thickness supraspinatus tendon tear
post endodontic pain
community-acquired mrsa infections
low blood pressure
transmission
urinary infection
mini-stroke
flus
infective pneumonia
rhinosinusitis acute
ovarian reserve
femoral neuropathy in haemophilic patients
anaplastic lymphoma kinase positive large b-cell lymphoma
hyperhidrosis palmaris et plantaris
complex regional pain syndrome
antibodies; anti-d
metastatic non-small cell lung cancer
pure autonomic failure
congenital torticollis
disruption or dehiscence of closure of skull or craniotomy
lens; an

{'bone status': [['C0262950',
   'BONE (Skeletal bone)',
   'Body Part, Organ, or Organ Component'],
  ['C0449438', 'Status', 'Qualitative Concept']],
 'kearns sayer': [],
 'hereditary angioedema - type 1': [['C0398775',
   'Hereditary angioedema - type 1 (Hereditary C1 esterase inhibitor deficiency - deficient factor)',
   'Disease or Syndrome'],
  ['C2717905',
   'Hereditary angio oedema Type 1 (Hereditary Angioedema Types I and II)',
   'Disease or Syndrome']],
 'respiratory muscles': [['C0021724',
   'Respiratory Muscles (Structure of intercostal muscle)',
   'Body Part, Organ, or Organ Component'],
  ['C0035231', 'Respiratory Muscles', 'Body Part, Organ, or Organ Component']],
 'hepatic tumor': [['C0023903',
   'Hepatic tumour (Liver neoplasms)',
   'Neoplastic Process']],
 'parent-child relationship': [['C0030542',
   'Parent-Child Relationship',
   'Social Behavior'],
  ['C1705423',
   'Parent-Child Relationship (isa Relationship)',
   'Idea or Concept'],
  ['C2826537',
   'Pare

In [95]:
conditions_unmapped_list = list(set(conditions_unmapped.downcase_name.to_list())) # get unique unmapped terms
conditions_unmapped_list[:100]

['kearns sayer',
 'hereditary angioedema - type 1',
 'respiratory muscles',
 'hepatic tumor',
 'parent-child relationship',
 'caesarean scar pregnancy',
 'sleep hygiene',
 'ganglioneuroblastoma of central nervous system',
 'g6pd',
 'antineoplastic adverse reaction',
 'isoflurane',
 'ocular tumor',
 'triple negative breast cancer, tnbc',
 'risk of bone fracture occurrences',
 'infants',
 'attenuated or transient psychosis',
 'erythema migrans',
 'malignant pleural mesothelioma, advanced',
 'degenerative',
 'ataxia',
 'non alcoholic fatty liver',
 'myoma of uterus',
 'macular degeneration, choroidal neovascularization',
 'newborn, infant, disease',
 'viral; infection, coxsackie(virus)',
 'functional dysphonia',
 'advanced recurrent ovarian tumors',
 'primary focal hyperhidrosis of the hands',
 'gallbladder adenocarcinoma, biliary type',
 'sacral myelomeningocele',
 'gender differences',
 'home blood pressure monitoring',
 'slippery pulse',
 'neonatal phototherapy',
 'gene transfer',
 'th

In [104]:
conditions_unmapped_list = list(set(conditions_unmapped.downcase_name.to_list())) # get unique unmapped terms
nr_url = 'https://name-resolution-sri.renci.org/lookup'

nr_dict =  {}
for condition in conditions_unmapped_list[:3]:
    params = {'string':condition,'limit':3}
    response = requests.post(nr_url,params=params)
#     print(response.json())
#     print(json.dumps(response.json(),indent=2))
    jres = response.json()
#     print(jres)
#     jres = json.load(res)
    nr_dict[condition] = jres
    
    
nr_dict

{'kearns sayer': {'MONDO:0010787': ['kearns-sayer syndrome',
   'kearns sayer syndrome',
   'kearns sayers syndrome',
   'KSS',
   'KSS',
   'OCS syndrome',
   'Kearns-Sayre',
   'Kearn Syndrome',
   'Kearns Syndrome',
   'kearns syndrome',
   "Kearns' Syndrome",
   'Syndrome, Kearns',
   "Syndrome, Kearns'",
   'CPEO WITH MYOPATHY',
   'CPEO with Myopathy',
   'CPEO with myopathy',
   'Myopathy, CPEO with',
   'kearn sayre syndrome',
   'CPEO with Myopathies',
   'kearn sayer syndrome',
   'kearn sayers syndrome',
   'Kearns Sayre Syndrome',
   'kearns-sayre syndrome',
   'Kearns-Sayre syndrome',
   'kearns sayre syndrome',
   'Kearns-Sayre syndrome',
   'Kearns-Sayre Syndrome',
   'KEARNS-SAYRE SYNDROME',
   'Myopathies, CPEO with',
   'Syndrome, Kearns-Sayre',
   'Syndrome, Kearns Sayre',
   'Sayre Syndrome, Kearns',
   'mitochondrial Cytopathy',
   'CPEO with ragged-Red fibers',
   'KSS - Kearns-Sayre syndrome',
   'Oculocraniosomatic Syndrome',
   'OCULOCRANIOSOMATIC SYNDROME',
  

In [None]:
diseases = ['planned rrso', 'human bocavirus', 'articular cartilage defects of knee', 'pustular psoriasis', 'chronic migraine headache']

params = {'string':diseases,'limit':5}
response = requests.post(nr_url,params=params)
print(json.dumps(response.json(),indent=2))


In [142]:
diseases = ['planned rrso', 'human bocavirus', 'articular cartilage defects of knee', 'pustular psoriasis', 'chronic migraine headache']

params = {'string':"RxCUI 2555",'limit':5}
response = requests.post(nr_url,params=params)
print(json.dumps(response.json(),indent=2))

{}


In [85]:
with open(os.path.join(outputs_path, "metamapped_conditions.txt"), 'w') as output:
    for key, val in mm_conditions.items():
        print(key)
#             output.write(key + "\t" + val[0] + "\t" + val[1] + "\t" + val[2])


bone status
kearns sayer
hereditary angioedema - type 1
respiratory muscles
hepatic tumor
parent-child relationship
caesarean scar pregnancy
sleep hygiene
ganglioneuroblastoma of central nervous system
g6pd
antineoplastic adverse reaction
isoflurane
ocular tumor
triple negative breast cancer, tnbc
risk of bone fracture occurrences
infants
attenuated or transient psychosis
erythema migrans
malignant pleural mesothelioma, advanced
degenerative
ataxia
full thickness supraspinatus tendon tear
post endodontic pain
community-acquired mrsa infections
low blood pressure
transmission
urinary infection
mini-stroke
flus
infective pneumonia
rhinosinusitis acute
ovarian reserve
femoral neuropathy in haemophilic patients
anaplastic lymphoma kinase positive large b-cell lymphoma
hyperhidrosis palmaris et plantaris
complex regional pain syndrome
antibodies; anti-d
metastatic non-small cell lung cancer
pure autonomic failure
congenital torticollis
disruption or dehiscence of closure of skull or craniot

In [45]:
mm_conditions

{'degenerative joint disease of the ankle': [[['C0409931'],
   ['Degenerative joint disease of ankle (Osteoarthritis of ankle) '],
   ['Disease or Syndrome']],
  [[], [], []]],
 'kearns sayer': [[[], [], []]],
 'hereditary angioedema - type 1': [[['C0398775'],
   ['Hereditary angioedema - type 1 (Hereditary C1 esterase inhibitor deficiency - deficient factor) '],
   ['Disease or Syndrome']],
  [['C2717905'],
   ['Hereditary angio oedema Type 1 (Hereditary Angioedema Types I and II) '],
   ['Disease or Syndrome']],
  [[], [], []]],
 'respiratory muscles': [[['C0021724'],
   ['Respiratory Muscles (Structure of intercostal muscle) '],
   ['Body Part, Organ, or Organ Component']],
  [['C0035231'],
   ['Respiratory Muscles '],
   ['Body Part, Organ, or Organ Component']],
  [[], [], []]],
 'hepatic tumor': [[['C0023903'],
   ['Hepatic tumour (Liver neoplasms) '],
   ['Neoplastic Process']],
  [[], [], []]],
 'parent-child relationship': [[['C0030542'],
   ['Parent-Child Relationship '],
   

# TRY FUZZY MAPPING USING BOTH CANDIDATES FROM METAMAP AND MESH

In [24]:
conditions_unmapped_per_study

Unnamed: 0_level_0,downcase_name,condition_counts
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1
NCT00000102,[congenital adrenal hyperplasia],1
NCT00000105,[cancer],1
NCT00000110,[obesity],1
NCT00000112,"[diabetes, obesity]",2
NCT00000115,"[macular edema, cystoid]",1
...,...,...
NCT05794035,"[skin cancer, non-melanoma]",1
NCT05794048,"[pancreatic tumor, hepatocarcinoma]",2
NCT05794061,"[psychiatric disorder, dementia, cognitive imp...",3
NCT05794074,[nutrition deficiency (xanth deficiency) due t...,1


In [25]:
condition_mesh_leaves

Unnamed: 0,id,nct_id,mesh_term,downcase_mesh_term,mesh_type
0,113697939,NCT00185796,Syndrome,syndrome,mesh-list
2,113698185,NCT03516604,Depression,depression,mesh-list
5,113698730,NCT00461539,Depression,depression,mesh-list
6,113698920,NCT05324137,Polyps,polyps,mesh-list
11,113700140,NCT00176514,Mucositis,mucositis,mesh-list
...,...,...,...,...,...
2849547,111234990,NCT00733525,Bulimia Nervosa,bulimia nervosa,mesh-list
2849551,111234994,NCT02653131,Short Bowel Syndrome,short bowel syndrome,mesh-list
2849552,111234995,NCT02653131,Syndrome,syndrome,mesh-list
2849564,111204205,NCT04834908,Infections,infections,mesh-list


In [27]:
# merge all available MeSH terms for each condition (that is still unmapped)
condition_mesh_terms_per_study = pd.DataFrame(condition_mesh_leaves[["nct_id", "downcase_mesh_term"]].groupby("nct_id")["downcase_mesh_term"].apply(list))
condition_mesh_terms_per_study


Unnamed: 0_level_0,downcase_mesh_term
nct_id,Unnamed: 1_level_1
NCT00000102,"[adrenal hyperplasia, congenital, adrenogenita..."
NCT00000104,"[poisoning, lead poisoning]"
NCT00000105,[tetanus]
NCT00000106,"[rheumatic diseases, collagen diseases]"
NCT00000107,"[heart defects, congenital, congenital abnorma..."
...,...
NCT05793996,"[anemia, iron-deficiency]"
NCT05794022,"[myocardial infarction, infarction]"
NCT05794035,[skin neoplasms]
NCT05794048,"[pancreatic neoplasms, carcinoma, hepatocellular]"


In [28]:
unmapped_condition_candidates = pd.merge(conditions_unmapped_per_study, condition_mesh_terms_per_study, left_on=["nct_id"], right_on=["nct_id"])
unmapped_condition_candidates

Unnamed: 0_level_0,downcase_name,condition_counts,downcase_mesh_term
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT00000102,[congenital adrenal hyperplasia],1,"[adrenal hyperplasia, congenital, adrenogenita..."
NCT00000105,[cancer],1,[tetanus]
NCT00000112,"[diabetes, obesity]",2,"[acanthosis nigricans, glucose intolerance]"
NCT00000115,"[macular edema, cystoid]",1,"[macular edema, edema]"
NCT00000123,[myopia],1,[astigmatism]
...,...,...,...
NCT05793983,"[liver failure, acute on chronic, infections, ...",5,"[liver cirrhosis, liver diseases, liver failur..."
NCT05793996,"[chronic heart failure, iron deficiency, latent]",2,"[anemia, iron-deficiency]"
NCT05794035,"[skin cancer, non-melanoma]",1,[skin neoplasms]
NCT05794048,"[pancreatic tumor, hepatocarcinoma]",2,"[pancreatic neoplasms, carcinoma, hepatocellular]"


In [None]:
# FOLLOW THIS TUTORIAL:
https://www.datacamp.com/tutorial/fuzzy-string-python

In [15]:
single_mapped_conditions = condition_mesh_leaves[condition_mesh_leaves.nct_id.isin(unmapped_single_conditions.index)]
single_mapped_conditions

Unnamed: 0,id,nct_id,mesh_term,downcase_mesh_term,mesh_type
0,113697939,NCT00185796,Syndrome,syndrome,mesh-list
2,113698185,NCT03516604,Depression,depression,mesh-list
5,113698730,NCT00461539,Depression,depression,mesh-list
6,113698920,NCT05324137,Polyps,polyps,mesh-list
14,113700556,NCT00165893,Back Pain,back pain,mesh-list
...,...,...,...,...,...
2849543,111234985,NCT00733447,Heart Failure,heart failure,mesh-list
2849546,111234988,NCT00733525,Bulimia,bulimia,mesh-list
2849547,111234990,NCT00733525,Bulimia Nervosa,bulimia nervosa,mesh-list
2849564,111204205,NCT04834908,Infections,infections,mesh-list


In [13]:
# add the single MeSH term that exists for studies with only 1 condition listed
single_mapped_conditions = pd.merge(conditions_unmapped, unmapped_single_conditions, left_on=['nct_id'], right_on = ['nct_id'])
single_mapped_conditions

Unnamed: 0,nct_id,name,downcase_name_x,mesh_term,downcase_mesh_term,mesh_type,downcase_name_y,condition_counts
0,NCT00000102,Congenital Adrenal Hyperplasia,congenital adrenal hyperplasia,,,,[congenital adrenal hyperplasia],1
1,NCT00000105,Cancer,cancer,,,,[cancer],1
2,NCT00000110,Obesity,obesity,,,,[obesity],1
3,NCT00000115,"Macular Edema, Cystoid","macular edema, cystoid",,,,"[macular edema, cystoid]",1
4,NCT00000123,Myopia,myopia,,,,[myopia],1
...,...,...,...,...,...,...,...,...
241954,NCT05793944,Pregnancy,pregnancy,,,,[pregnancy],1
241955,NCT05794009,Exercise Therapy,exercise therapy,,,,[exercise therapy],1
241956,NCT05794035,"Skin Cancer, Non-Melanoma","skin cancer, non-melanoma",,,,"[skin cancer, non-melanoma]",1
241957,NCT05794074,Nutrition Deficiency (Xanth Deficiency) Due to...,nutrition deficiency (xanth deficiency) due to...,,,,[nutrition deficiency (xanth deficiency) due t...,1


In [None]:
# what are the MeSH terms for a study with exactly 1 condition listed. Are there any clinical trials with only 1 condition where there's only 1 MeSH term for it?
condition_mesh_leaves_per_study = pd.DataFrame(condition_mesh_leaves[["nct_id", "downcase_mesh_term"]].groupby('nct_id')['downcase_mesh_term'].apply(list))
condition_mesh_leaves_per_study['mesh_leaf_list_count'] = condition_mesh_leaves_per_study['downcase_mesh_term'].str.len()
singular_condition_mesh_leaves = condition_mesh_leaves_per_study.loc[condition_mesh_leaves_per_study["mesh_leaf_list_count"] == 1] 
single_conditions = singular_condition_mesh_leaves['downcase_mesh_term'].str.get(0)

single_conditions


In [None]:
# find the studies with only 1 condition listed, and merge the studies with only one term listed with the condition_mesh_leaves with only 1 
conditions_single_mapped = pd.merge(conditions_unmapped, single_conditions, left_on=['nct_id'], right_on = ['nct_id'])
conditions_single_mapped[:20]
