#### Attempt to map the conditions and interventions to CURIEs

In [334]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
import re
import collections

In [23]:
%pip install thefuzz

Collecting thefuzz
  Using cached thefuzz-0.19.0-py2.py3-none-any.whl (17 kB)
Installing collected packages: thefuzz
Successfully installed thefuzz-0.19.0
Note: you may need to restart the kernel to use updated packages.


In [85]:
# for testing
data_extracted = "/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/2023-04-03_extracted"

In [86]:
# read in pipe-delimited files
conditions_df = pd.read_csv(data_extracted + '/conditions.txt', sep='|', index_col=False, header=0)
interventions_df = pd.read_csv(data_extracted + '/interventions.txt', sep='|', index_col=False, header=0)
browse_conditions_df = pd.read_csv(data_extracted + '/browse_conditions.txt', sep='|', index_col=False, header=0)
browse_interventions_df = pd.read_csv(data_extracted + '/browse_interventions.txt', sep='|', index_col=False, header=0)

browse_interventions_df.head()

Unnamed: 0,id,nct_id,mesh_term,downcase_mesh_term,mesh_type
0,57856435,NCT00246285,Risperidone,risperidone,mesh-list
1,57856584,NCT04626778,Hydrogen Peroxide,hydrogen peroxide,mesh-list
2,57856732,NCT00219908,Mitoxantrone,mitoxantrone,mesh-list
3,57856807,NCT00220064,Vitamins,vitamins,mesh-ancestor
4,57857560,NCT04612894,Apatinib,apatinib,mesh-list


In [87]:
conditions_df.head()

Unnamed: 0,id,nct_id,name,downcase_name
0,31338340,NCT01819987,Overweight,overweight
1,31338518,NCT01821599,Rehabilitation,rehabilitation
2,31338693,NCT01823731,Ovarian Cancer,ovarian cancer
3,31338879,NCT01825850,Healthy,healthy
4,31339149,NCT01829022,Myoma,myoma


In [88]:
browse_conditions_df.head()

Unnamed: 0,id,nct_id,mesh_term,downcase_mesh_term,mesh_type
0,113697939,NCT00185796,Syndrome,syndrome,mesh-list
1,113698103,NCT00185913,Neoplasms,neoplasms,mesh-ancestor
2,113698185,NCT03516604,Depression,depression,mesh-list
3,113698265,NCT03318952,Pain,pain,mesh-ancestor
4,113698420,NCT02789800,Tauopathies,tauopathies,mesh-ancestor


In [89]:
# every clinical trial has at least one MeSH-list (leaf), and may have 0 or more MeSH-ancestors (see MeSH analytics script)
# subset browse_conditions to get only the MeSH-list terms or leaves
condition_mesh_leaves = browse_conditions_df[browse_conditions_df["mesh_type"].str.contains("mesh-list")]
condition_mesh_leaves


Unnamed: 0,id,nct_id,mesh_term,downcase_mesh_term,mesh_type
0,113697939,NCT00185796,Syndrome,syndrome,mesh-list
2,113698185,NCT03516604,Depression,depression,mesh-list
5,113698730,NCT00461539,Depression,depression,mesh-list
6,113698920,NCT05324137,Polyps,polyps,mesh-list
11,113700140,NCT00176514,Mucositis,mucositis,mesh-list
...,...,...,...,...,...
2849547,111234990,NCT00733525,Bulimia Nervosa,bulimia nervosa,mesh-list
2849551,111234994,NCT02653131,Short Bowel Syndrome,short bowel syndrome,mesh-list
2849552,111234995,NCT02653131,Syndrome,syndrome,mesh-list
2849564,111204205,NCT04834908,Infections,infections,mesh-list


In [90]:
conditions_exact_mapped = pd.merge(conditions_df[['nct_id', 'name', 'downcase_name']],
                             browse_conditions_df[['nct_id', 'mesh_term', 'downcase_mesh_term', 'mesh_type']],
                             how='left',
                             left_on=['nct_id', 'downcase_name'],
                             right_on = ['nct_id', 'downcase_mesh_term'])
                             
conditions_exact_mapped = conditions_exact_mapped.sort_values(by=['nct_id'])

In [130]:
conditions_unmapped = conditions_exact_mapped[conditions_exact_mapped['mesh_term'].isnull()]
conditions_unmapped

Unnamed: 0,nct_id,name,downcase_name,mesh_term,downcase_mesh_term,mesh_type
25683,NCT00000102,Congenital Adrenal Hyperplasia,congenital adrenal hyperplasia,,,
228818,NCT00000105,Cancer,cancer,,,
249384,NCT00000110,Obesity,obesity,,,
203653,NCT00000112,Diabetes,diabetes,,,
203651,NCT00000112,Obesity,obesity,,,
...,...,...,...,...,...,...
472887,NCT05794061,Psychiatric Disorder,psychiatric disorder,,,
472886,NCT05794061,Dementia,dementia,,,
472885,NCT05794061,Cognitive Impairment,cognitive impairment,,,
472884,NCT05794074,Nutrition Deficiency (Xanth Deficiency) Due to...,nutrition deficiency (xanth deficiency) due to...,,,


In [92]:
# find studies for which there is only 1 condition listed (from unmapped conditions)
conditions_unmapped_per_study = pd.DataFrame(conditions_unmapped.groupby("nct_id")["downcase_name"].apply(list))
conditions_unmapped_per_study

Unnamed: 0_level_0,downcase_name
nct_id,Unnamed: 1_level_1
NCT00000102,[congenital adrenal hyperplasia]
NCT00000105,[cancer]
NCT00000110,[obesity]
NCT00000112,"[diabetes, obesity]"
NCT00000115,"[macular edema, cystoid]"
...,...
NCT05794035,"[skin cancer, non-melanoma]"
NCT05794048,"[pancreatic tumor, hepatocarcinoma]"
NCT05794061,"[psychiatric disorder, dementia, cognitive imp..."
NCT05794074,[nutrition deficiency (xanth deficiency) due t...


In [93]:
conditions_unmapped_per_study["condition_counts"] = conditions_unmapped_per_study["downcase_name"].str.len()
conditions_unmapped_per_study

Unnamed: 0_level_0,downcase_name,condition_counts
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1
NCT00000102,[congenital adrenal hyperplasia],1
NCT00000105,[cancer],1
NCT00000110,[obesity],1
NCT00000112,"[diabetes, obesity]",2
NCT00000115,"[macular edema, cystoid]",1
...,...,...
NCT05794035,"[skin cancer, non-melanoma]",1
NCT05794048,"[pancreatic tumor, hepatocarcinoma]",2
NCT05794061,"[psychiatric disorder, dementia, cognitive imp...",3
NCT05794074,[nutrition deficiency (xanth deficiency) due t...,1


In [94]:
unmapped_single_conditions = conditions_unmapped_per_study[conditions_unmapped_per_study["condition_counts"] == 1]
unmapped_single_conditions

Unnamed: 0_level_0,downcase_name,condition_counts
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1
NCT00000102,[congenital adrenal hyperplasia],1
NCT00000105,[cancer],1
NCT00000110,[obesity],1
NCT00000115,"[macular edema, cystoid]",1
NCT00000123,[myopia],1
...,...,...
NCT05793944,[pregnancy],1
NCT05794009,[exercise therapy],1
NCT05794035,"[skin cancer, non-melanoma]",1
NCT05794074,[nutrition deficiency (xanth deficiency) due t...,1


In [95]:
mapped_single_conditions = pd.merge(unmapped_single_conditions, condition_mesh_leaves[["nct_id", "downcase_mesh_term"]], left_on=["nct_id"], right_on=["nct_id"])
with pd.option_context("display.max_rows", 1000):
    display(mapped_single_conditions[:100])
    

Unnamed: 0,nct_id,downcase_name,condition_counts,downcase_mesh_term
0,NCT00000102,[congenital adrenal hyperplasia],1,"adrenal hyperplasia, congenital"
1,NCT00000102,[congenital adrenal hyperplasia],1,adrenogenital syndrome
2,NCT00000102,[congenital adrenal hyperplasia],1,adrenocortical hyperfunction
3,NCT00000102,[congenital adrenal hyperplasia],1,hyperplasia
4,NCT00000105,[cancer],1,tetanus
5,NCT00000115,"[macular edema, cystoid]",1,macular edema
6,NCT00000115,"[macular edema, cystoid]",1,edema
7,NCT00000123,[myopia],1,astigmatism
8,NCT00000126,[ischemic optic neuropathy],1,ischemia
9,NCT00000126,[ischemic optic neuropathy],1,optic nerve diseases


### Conclusion: there are multiple candidate MeSH terms (leaf nodes only, not ancestors) available per 1 condition

# Use MetaMap to find more candidate matches

In [80]:
CAS_SERVERURL = "https://utslogin.nlm.nih.gov/cas/v1"
II_SKR_SERVERURL = 'https://ii.nlm.nih.gov/cgi-bin/II/UTS_Required'
METAMAP_INTERACTIVE_URL = II_SKR_SERVERURL + "/API_MM_interactive.pl"
stserverurl = "https://utslogin.nlm.nih.gov/cas/v1/tickets"
tgtserverurl = "https://utslogin.nlm.nih.gov/cas/v1/api-key"
apikey = '54041f07-fc66-4558-b038-b46ca8bdcc6b'
serviceurl = METAMAP_INTERACTIVE_URL
ksource = '2020AB'

def get_service_ticket(serverurl, ticket_granting_ticket, serviceurl):
    """ Obtain a Single-Use Proxy Ticket (also known as service ticket).
    Request for a Service Ticket:
        POST /cas/v1/tickets/{TGT id} HTTP/1.0
    data:
           service={form encoded parameter for the service url}
    Sucessful Response:
        200 OK
        ST-1-FFDFHDSJKHSDFJKSDHFJKRUEYREWUIFSD2132
    @param serverurl authentication server
    @param ticketGrantingTicket a Proxy Granting Ticket.
    @param serviceurl url of service with protected resources
    @return authentication ticket for service. """
    resp = requests.post("{}/{}".format(serverurl, ticket_granting_ticket),
                         {"service": serviceurl})
    if resp.status_code == 200:
        return resp.content
    return 'Error: status: {}'.format(resp.content)


def extract_tgt_ticket(htmlcontent):
    "Extract ticket granting ticket from HTML."
    # print('htmlcontent: {}'.format(htmlcontent))
    html = HTML(html=htmlcontent)
    # get form element
    elements = html.xpath("//form")
    # print('html response: {}'.format(etree.tostring(html.lxml).decode()))
    # print('action attribure: {}'.format(elements[0].attrs['action']))
    # extract ticket granting ticket out of 'action' attribute
    if elements != []:
        return elements[0].attrs['action'].split('/')[-1]
    else:
        return "form element missing from ticket granting ticket response"

def get_ticket(cas_serverurl, apikey, serviceurl):
    # set ticket granting ticket server url
    tgtserverurl = cas_serverurl + "/api-key"
    # set service ticket server url
    stserverurl = cas_serverurl + "/tickets"
    tgt = get_ticket_granting_ticket(tgtserverurl, apikey)
    return get_service_ticket(stserverurl, tgt, serviceurl)

def get_ticket_granting_ticket(tgtserverurl, apikey):
    # http://serviceurl/cas/v1/tickets/{TGT id}
    response = requests.post(tgtserverurl, {'apikey': apikey},
                             headers={'Accept': 'test/plain'})
    return extract_tgt_ticket(response.content)

def extract_tgt_ticket(htmlcontent):
    "Extract ticket granting ticket from HTML."    
    soup = BeautifulSoup(htmlcontent)
#     print(soup.find('form').get("action"))
    cas_url = soup.find("form").get("action")
    "Extract ticket granting ticket out of 'action' attribute"
#     tgt = cas_url.rsplit('/')[-1]
    return cas_url.rsplit('/')[-1]
    
ticket = get_ticket(CAS_SERVERURL, apikey, serviceurl)

In [81]:
args = ['-I -i -z -C']
form = {}
form['APIText'] = "A spinal tap was performed and oligoclonal bands were \
detected in the cerebrospinal fluid.\n"
form['KSOURCE'] = ksource
form['COMMAND_ARGS'] = args
serviceticket = ticket
params = {'ticket': serviceticket}
headers = {'Accept': 'application/json'}
s = requests.Session()
response = s.post(serviceurl, form, headers=headers, params=params, allow_redirects=False)


In [82]:
def get_redirect_target(resp):
        """Receives a Response. Returns a redirect URI or ``None``"""
        # Due to the nature of how requests processes redirects this method will
        # be called at least once upon the original response and at least twice
        # on each subsequent redirect response (if any).
        # If a custom mixin is used to handle this logic, it may be advantageous
        # to cache the redirect location onto the response object as a private
        # attribute.
        if resp.is_redirect:
            location = resp.headers["location"]
            # Currently the underlying http module on py3 decode headers
            # in latin1, but empirical evidence suggests that latin1 is very
            # rarely used with non-ASCII characters in HTTP headers.
            # It is more likely to get UTF8 header rather than latin1.
            # This causes incorrect handling of UTF8 encoded location headers.
            # To solve this, we re-encode the location in latin1.
#             print(location)
            location = location.encode("latin1")
#             print(location)
#             print(to_native_string(location, "utf8"))
            return to_native_string(location, "utf8")
        return None

if response.status_code == 302:
    newurl = s.get_redirect_target(response)
    response = s.post(newurl, form, headers=headers, params=params, allow_redirects=False)

In [98]:
# get concepts to map
conditions_unmapped.downcase_name.to_list()[:10]

['congenital adrenal hyperplasia',
 'cancer',
 'obesity',
 'diabetes',
 'obesity',
 'macular edema, cystoid',
 'myopia',
 'ischemic optic neuropathy',
 'ischemic optic neuropathy',
 'esophoria']

In [151]:
conditions_unmapped

Unnamed: 0,nct_id,name,downcase_name,mesh_term,downcase_mesh_term,mesh_type
25683,NCT00000102,Congenital Adrenal Hyperplasia,congenital adrenal hyperplasia,,,
228818,NCT00000105,Cancer,cancer,,,
249384,NCT00000110,Obesity,obesity,,,
203653,NCT00000112,Diabetes,diabetes,,,
203651,NCT00000112,Obesity,obesity,,,
...,...,...,...,...,...,...
472887,NCT05794061,Psychiatric Disorder,psychiatric disorder,,,
472886,NCT05794061,Dementia,dementia,,,
472885,NCT05794061,Cognitive Impairment,cognitive impairment,,,
472884,NCT05794074,Nutrition Deficiency (Xanth Deficiency) Due to...,nutrition deficiency (xanth deficiency) due to...,,,


In [153]:
len(conditions_unmapped['downcase_name'].to_list())

563678

In [154]:
len(set(conditions_unmapped['downcase_name'].to_list()))

99253

In [164]:
conditions_unmapped_list = list(set(conditions_unmapped.downcase_name.to_list())) # get unique unmapped terms
conditions_unmapped_list = [s + "\r\n" for s in conditions_unmapped_list] # you must include newline to process each term separately!
print(len(conditions_unmapped_list))

99253


In [171]:
def split_list_by_length(lst):
    result = []
    current_sublist = []
    current_length = 0
    for item in lst:
        item_length = len(item)
        if current_length + item_length > 9990: # max is 10,000 char allowed by MetaMap
            result.append(current_sublist)
            current_sublist = []
            current_length = 0
        current_sublist.append(item)
        current_length += item_length
    result.append(current_sublist)
    return result

In [172]:
chunked_conditions_unmapped = split_list_by_length(conditions_unmapped_list)
chunked_conditions_unmapped[:1]

[['ankle foot injury\r\n',
  'knowledge about condom use\r\n',
  'floor of mouth squamous cell carcinoma\r\n',
  'tendinoses, rotator cuff\r\n',
  'trauma patients in icu\r\n',
  'noninsulin-dependent diabetes mellitus\r\n',
  'rabies in animals\r\n',
  'stage iiia soft tissue sarcoma of the trunk and extremities ajcc v8\r\n',
  'mild persistent asthma\r\n',
  'bacteremia caused by gram-negative bacteria\r\n',
  'acquired bone marrow aplasia\r\n',
  'psychogenic non-epileptic seizures, motivational interviewing\r\n',
  'post-operative pulmonary complications\r\n',
  'acute noninfectious posterior, intermediate, or pan uveitis\r\n',
  'relapsing multiple sclerosis (rms)\r\n',
  'long-term outcome\r\n',
  'hemophilia a with inhibitors\r\n',
  'recurrent endometrioid adenocarcinoma\r\n',
  'covid 19 positive\r\n',
  'a known allergy to chlorohexidine alcohol or shellfis\r\n',
  'ocular surface\r\n',
  'antisocial behavior\r\n',
  'elopement\r\n',
  'perforation of the colon\r\n',
  'varus

In [340]:
"""
-I = return CUIs/identifiers
-i = ignore word order
-C = relaxed model
-z = term processing
-f = give numbers to the final mappings (adds 1. to the first mapped concept, 2. to the 2nd, and so on...)
-c = give numbers to the candidates (adds 1. to the first candidate, 2. to the 2nd, and so on...)
--sldi = read each term in list separately, do not lump into large phrase
-N = MMI formatted output

https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/ListOfTerms.pdf
https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/TermProcessing.pdf
https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MM_2016_Usage.pdf
"""
args = ['--sldi -i -I -C -z']
# args = ['-I --prune 5']
form = {}
form['APIText'] = chunk
form['KSOURCE'] = ksource
form['COMMAND_ARGS'] = args
headers = {'Accept': 'application/json'}

mm_conditions = {}
for chunk in chunked_conditions_unmapped[:20]:
# for chunk in chunked_conditions_unmapped:
#     print(chunk[:3])
#     chunk = chunk[:3]
    service_ticket = get_ticket(CAS_SERVERURL, apikey, serviceurl)
#     chunk = chunk[:5] # demo for testing
    form['APIText'] = chunk
    params = {'ticket': service_ticket}
    response = s.post(serviceurl, form, headers=headers, params=params, allow_redirects=False)
    if response.status_code == 302:
        newurl = s.get_redirect_target(response)
        response = s.post(newurl, form, headers=headers, params=params, allow_redirects=False)
        
#     print(response.text)
#     print(response.content)
    for line in response.text.splitlines():
#         for line.startswith('Phrase:'):
#             mm_input = line.split(":")[1].strip()
        if not any(s in line for s in ["Meta Mapping", "Processing", "/dmzfiler/"]):
            mm_dict = {}
            if "Phrase:" in line:
                mm_input = line.split(":")[1].strip()
#                 print(mm_input)
            else:
                cuis_per_input = []
                cui_line = [s.strip() for s in line.split('   ') if s]
                if cui_line:
#                     print(cui_line)
                    cui = cui_line[1].split(":")[0].strip()
#                     print(cui)
                    name_semtype = cui_line[1].split(":")[1].strip()
                    name = name_semtype.split("[")[0].strip()
#                     print(name)
                    semtype = re.findall(r'\[([^]]+)\]', name_semtype)
                    cui_info = [cui, name, semtype]
#                     print(mm_input)
#                     print(c_info)
                    cuis_per_input.append(cui_info)
                mm_dict[mm_input] = cuis_per_input
        
        

    


    
    
#     print(response.text.splitlines())

#     user_lines = [line for line in response.text.splitlines() if line.startswith('USER|')]
#     print(user_lines)
#     print("\n")
#     for i, unmapped_concept in enumerate(chunk):
#         print(unmapped_concept)
# #         print(user_lines[i])
# #         print(user_lines[i].count("|")) 
#         metamap_output = user_lines[i].split("|")
#         print(metamap_output)
# #         print(metamap_output[3])
# #         print(metamap_output[4])
# #         print(metamap_output[5])
#         print("\n")


    

IndexError: list index out of range

In [None]:
mm_dict

In [175]:
"""
-I = return CUIs/identifiers
-i = ignore word order
-C = relaxed model
-z = term processing
--sldi = read each term in list separately, do not lump into large phrase

https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/ListOfTerms.pdf
https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/TermProcessing.pdf
https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MM_2016_Usage.pdf
"""
args = ['--sldi -i -I -C -z -f']
# args = ['-I --prune 5']
form = {}
form['APIText'] = chunk
form['KSOURCE'] = ksource
form['COMMAND_ARGS'] = args
headers = {'Accept': 'application/json'}

for chunk in chunked_conditions_unmapped[:2]:
#     print(chunk[:10])
#     chunk = chunk[:10]
    service_ticket = get_ticket(CAS_SERVERURL, apikey, serviceurl)
#     chunk = chunk[:5] # demo for testing
    form['APIText'] = chunk
    params = {'ticket': service_ticket}
    response = s.post(serviceurl, form, headers=headers, params=params, allow_redirects=False)
    if response.status_code == 302:
        newurl = s.get_redirect_target(response)
        response = s.post(newurl, form, headers=headers, params=params, allow_redirects=False)
    print(response.text)

/dmzfiler/II_Group/MetaMap2020/public_mm/bin/SKRrun.20 /dmzfiler/II_Group/MetaMap2020/public_mm/bin/metamap20.BINARY.Linux --lexicon db -Z 2020AB --silent -i -I -C -z --sldi
Processing USER.tx.1: ankle foot injury

Phrase: ankle foot injury
Meta Mapping (1000):
  1000   C0348772:ankle foot injury (injuries to the ankle and foot) [Injury or Poisoning]
Processing USER.tx.1: knowledge about condom use

Phrase: knowledge about condom use
Meta Mapping (783):
   770   C0376554:Knowledge [Intellectual Product]
   666   C0679782:Condom use (condom use) [Individual Behavior]
Processing USER.tx.1: floor of mouth squamous cell carcinoma

Phrase: floor of mouth squamous cell carcinoma
Meta Mapping (1000):
  1000   C0280300:Floor of Mouth Squamous Cell Carcinoma (Squamous cell carcinoma of floor of mouth) [Neoplastic Process]
Processing USER.tx.1: tendinoses, rotator cuff

Phrase: tendinoses, rotator cuff
Meta Mapping (1000):
  1000   C4280057:Tendinoses, Rotator Cuff (Rotator Cuff Tendinosis) [Inj

KeyboardInterrupt: 

# TRY FUZZY MAPPING USING BOTH CANDIDATES FROM METAMAP AND MESH

In [24]:
conditions_unmapped_per_study

Unnamed: 0_level_0,downcase_name,condition_counts
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1
NCT00000102,[congenital adrenal hyperplasia],1
NCT00000105,[cancer],1
NCT00000110,[obesity],1
NCT00000112,"[diabetes, obesity]",2
NCT00000115,"[macular edema, cystoid]",1
...,...,...
NCT05794035,"[skin cancer, non-melanoma]",1
NCT05794048,"[pancreatic tumor, hepatocarcinoma]",2
NCT05794061,"[psychiatric disorder, dementia, cognitive imp...",3
NCT05794074,[nutrition deficiency (xanth deficiency) due t...,1


In [25]:
condition_mesh_leaves

Unnamed: 0,id,nct_id,mesh_term,downcase_mesh_term,mesh_type
0,113697939,NCT00185796,Syndrome,syndrome,mesh-list
2,113698185,NCT03516604,Depression,depression,mesh-list
5,113698730,NCT00461539,Depression,depression,mesh-list
6,113698920,NCT05324137,Polyps,polyps,mesh-list
11,113700140,NCT00176514,Mucositis,mucositis,mesh-list
...,...,...,...,...,...
2849547,111234990,NCT00733525,Bulimia Nervosa,bulimia nervosa,mesh-list
2849551,111234994,NCT02653131,Short Bowel Syndrome,short bowel syndrome,mesh-list
2849552,111234995,NCT02653131,Syndrome,syndrome,mesh-list
2849564,111204205,NCT04834908,Infections,infections,mesh-list


In [27]:
# merge all available MeSH terms for each condition (that is still unmapped)
condition_mesh_terms_per_study = pd.DataFrame(condition_mesh_leaves[["nct_id", "downcase_mesh_term"]].groupby("nct_id")["downcase_mesh_term"].apply(list))
condition_mesh_terms_per_study


Unnamed: 0_level_0,downcase_mesh_term
nct_id,Unnamed: 1_level_1
NCT00000102,"[adrenal hyperplasia, congenital, adrenogenita..."
NCT00000104,"[poisoning, lead poisoning]"
NCT00000105,[tetanus]
NCT00000106,"[rheumatic diseases, collagen diseases]"
NCT00000107,"[heart defects, congenital, congenital abnorma..."
...,...
NCT05793996,"[anemia, iron-deficiency]"
NCT05794022,"[myocardial infarction, infarction]"
NCT05794035,[skin neoplasms]
NCT05794048,"[pancreatic neoplasms, carcinoma, hepatocellular]"


In [28]:
unmapped_condition_candidates = pd.merge(conditions_unmapped_per_study, condition_mesh_terms_per_study, left_on=["nct_id"], right_on=["nct_id"])
unmapped_condition_candidates

Unnamed: 0_level_0,downcase_name,condition_counts,downcase_mesh_term
nct_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
NCT00000102,[congenital adrenal hyperplasia],1,"[adrenal hyperplasia, congenital, adrenogenita..."
NCT00000105,[cancer],1,[tetanus]
NCT00000112,"[diabetes, obesity]",2,"[acanthosis nigricans, glucose intolerance]"
NCT00000115,"[macular edema, cystoid]",1,"[macular edema, edema]"
NCT00000123,[myopia],1,[astigmatism]
...,...,...,...
NCT05793983,"[liver failure, acute on chronic, infections, ...",5,"[liver cirrhosis, liver diseases, liver failur..."
NCT05793996,"[chronic heart failure, iron deficiency, latent]",2,"[anemia, iron-deficiency]"
NCT05794035,"[skin cancer, non-melanoma]",1,[skin neoplasms]
NCT05794048,"[pancreatic tumor, hepatocarcinoma]",2,"[pancreatic neoplasms, carcinoma, hepatocellular]"


In [None]:
# FOLLOW THIS TUTORIAL:
https://www.datacamp.com/tutorial/fuzzy-string-python

In [15]:
single_mapped_conditions = condition_mesh_leaves[condition_mesh_leaves.nct_id.isin(unmapped_single_conditions.index)]
single_mapped_conditions

Unnamed: 0,id,nct_id,mesh_term,downcase_mesh_term,mesh_type
0,113697939,NCT00185796,Syndrome,syndrome,mesh-list
2,113698185,NCT03516604,Depression,depression,mesh-list
5,113698730,NCT00461539,Depression,depression,mesh-list
6,113698920,NCT05324137,Polyps,polyps,mesh-list
14,113700556,NCT00165893,Back Pain,back pain,mesh-list
...,...,...,...,...,...
2849543,111234985,NCT00733447,Heart Failure,heart failure,mesh-list
2849546,111234988,NCT00733525,Bulimia,bulimia,mesh-list
2849547,111234990,NCT00733525,Bulimia Nervosa,bulimia nervosa,mesh-list
2849564,111204205,NCT04834908,Infections,infections,mesh-list


In [13]:
# add the single MeSH term that exists for studies with only 1 condition listed
single_mapped_conditions = pd.merge(conditions_unmapped, unmapped_single_conditions, left_on=['nct_id'], right_on = ['nct_id'])
single_mapped_conditions

Unnamed: 0,nct_id,name,downcase_name_x,mesh_term,downcase_mesh_term,mesh_type,downcase_name_y,condition_counts
0,NCT00000102,Congenital Adrenal Hyperplasia,congenital adrenal hyperplasia,,,,[congenital adrenal hyperplasia],1
1,NCT00000105,Cancer,cancer,,,,[cancer],1
2,NCT00000110,Obesity,obesity,,,,[obesity],1
3,NCT00000115,"Macular Edema, Cystoid","macular edema, cystoid",,,,"[macular edema, cystoid]",1
4,NCT00000123,Myopia,myopia,,,,[myopia],1
...,...,...,...,...,...,...,...,...
241954,NCT05793944,Pregnancy,pregnancy,,,,[pregnancy],1
241955,NCT05794009,Exercise Therapy,exercise therapy,,,,[exercise therapy],1
241956,NCT05794035,"Skin Cancer, Non-Melanoma","skin cancer, non-melanoma",,,,"[skin cancer, non-melanoma]",1
241957,NCT05794074,Nutrition Deficiency (Xanth Deficiency) Due to...,nutrition deficiency (xanth deficiency) due to...,,,,[nutrition deficiency (xanth deficiency) due t...,1


In [None]:
# what are the MeSH terms for a study with exactly 1 condition listed. Are there any clinical trials with only 1 condition where there's only 1 MeSH term for it?
condition_mesh_leaves_per_study = pd.DataFrame(condition_mesh_leaves[["nct_id", "downcase_mesh_term"]].groupby('nct_id')['downcase_mesh_term'].apply(list))
condition_mesh_leaves_per_study['mesh_leaf_list_count'] = condition_mesh_leaves_per_study['downcase_mesh_term'].str.len()
singular_condition_mesh_leaves = condition_mesh_leaves_per_study.loc[condition_mesh_leaves_per_study["mesh_leaf_list_count"] == 1] 
single_conditions = singular_condition_mesh_leaves['downcase_mesh_term'].str.get(0)

single_conditions


In [None]:
# find the studies with only 1 condition listed, and merge the studies with only one term listed with the condition_mesh_leaves with only 1 
conditions_single_mapped = pd.merge(conditions_unmapped, single_conditions, left_on=['nct_id'], right_on = ['nct_id'])
conditions_single_mapped[:20]
