### This script uses MetaMap to map the bulk of the terms, and Name Resolver to pick up whatever MetaMap leaves unmapped

In [1]:
# Widen notebook cells (.container) and cell outputs (.output_result) to the full page width
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))

# Lets you print multiple outputs per cell (every top-level expression), not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
import re
import collections
import os
import json
import numpy as np
import pickle
from functools import reduce
import time
from time import sleep
# import concurrent
import concurrent.futures
import multiprocessing
import datetime as dt
from datetime import date
import pathlib
import configparser
import sys
import urllib
import zipfile
import csv
sys.path.insert(0, '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/pymetamap-master')
from pymetamap import MetaMap  # https://github.com/AnthonyMRios/pymetamap/blob/master/pymetamap/SubprocessBackend.py
from pandas import ExcelWriter
import ast
import glob
from tqdm import tqdm
import subprocess
import shlex
from collections import Counter

# %pip install thefuzz
# %pip install levenshtein
# %pip install xlsxwriter
# %pip install ratelimit

from thefuzz import fuzz # fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

In [3]:
def get_token_sort_ratio(str1, str2):
    """Return thefuzz's token-sort ratio for two strings, or None on failure.

    Fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The value produced by ``fuzz.token_sort_ratio``, or ``None`` when
        scoring raised an exception (e.g. a non-string input).
    """
    try:
        return fuzz.token_sort_ratio(str1, str2)
    except Exception:
        # Best-effort scoring: ordinary errors map to None, while
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        return None

def get_token_set_ratio(str1, str2):
    """Return thefuzz's token-set ratio for two strings, or None on failure.

    Fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The value produced by ``fuzz.token_set_ratio``, or ``None`` when
        scoring raised an exception (e.g. a non-string input).
    """
    try:
        return fuzz.token_set_ratio(str1, str2)
    except Exception:
        # Best-effort scoring: ordinary errors map to None, while
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        return None

def get_similarity_score(str1, str2):
    """Return thefuzz's plain similarity ratio for two strings, or None on failure.

    Fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The value produced by ``fuzz.ratio``, or ``None`` when scoring raised
        an exception (e.g. a non-string input).
    """
    try:
        return fuzz.ratio(str1, str2)
    except Exception:
        # Best-effort scoring: ordinary errors map to None, while
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        return None
    
def convert_seconds_to_hms(seconds):
    """Break an elapsed time in seconds into an (hours, minutes, seconds) tuple."""
    whole_hours, leftover = divmod(seconds, 3600)
    whole_minutes, leftover = divmod(leftover, 60)
    return whole_hours, whole_minutes, leftover

def de_ascii_er(text):
    """Return *text* with every non-ASCII character replaced by a single space."""
    return re.sub(r"[^\x00-\x7F]", ' ', text)

def start_metamap_servers(metamap_dirs):
    """Start MetaMap's part-of-speech tagger and word-sense-disambiguation servers.

    Args:
        metamap_dirs: Mapping that provides ``'metamap_base_dir'``, the
            directory the server control scripts are resolved against.

    Side effects:
        Publishes the relative script paths as the module globals
        ``metamap_pos_server_dir`` and ``metamap_wsd_server_dir``, runs both
        control scripts with the ``start`` argument, then sleeps 5 seconds.
    """
    global metamap_pos_server_dir
    global metamap_wsd_server_dir
    metamap_pos_server_dir = 'bin/skrmedpostctl'  # Part of speech tagger
    metamap_wsd_server_dir = 'bin/wsdserverctl'  # Word sense disambiguation

    base_dir = metamap_dirs['metamap_base_dir']
    start_commands = [
        [os.path.join(base_dir, metamap_pos_server_dir), 'start'],
        [os.path.join(base_dir, metamap_wsd_server_dir), 'start'],
    ]

    # The tagger is launched first, then WSD; whatever the control scripts
    # print is discarded so it does not clutter the notebook output.
    for command in start_commands:
        subprocess.call(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    sleep(5)

def stop_metamap_servers(metamap_dirs):
    """Stop MetaMap's part-of-speech tagger and word-sense-disambiguation servers.

    Args:
        metamap_dirs: Mapping that provides ``'metamap_base_dir'``, the
            directory the server control scripts are resolved against.

    Side effects:
        Runs both control scripts with the ``stop`` argument (their output is
        discarded), then sleeps 2 seconds.
    """
    # The relative script paths normally come from module globals assigned by
    # start_metamap_servers(). Fall back to the same default locations so the
    # servers can still be stopped when they were started in another session
    # (previously this raised NameError).
    pos_server_dir = globals().get('metamap_pos_server_dir', 'bin/skrmedpostctl')
    wsd_server_dir = globals().get('metamap_wsd_server_dir', 'bin/wsdserverctl')

    base_dir = metamap_dirs['metamap_base_dir']
    command_pos = [os.path.join(base_dir, pos_server_dir), 'stop']
    command_wsd = [os.path.join(base_dir, wsd_server_dir), 'stop']

    # Stop servers; the control scripts' console output is sent to the null device
    for command in (command_pos, command_wsd):
        subprocess.call(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    sleep(2)
    
def add_mappings_to_cache(flag_and_path):
    """Append this run's MetaMap output to the term cache, then de-duplicate the cache.

    Reads ``<date_string>_metamap_output.tsv`` from the current directory and
    appends its data rows to ``metamapped_terms_cache.tsv``. The output file's
    header row is copied only when the cache is new or empty, so the cache
    always starts with exactly one header line.

    Args:
        flag_and_path: Mapping that provides ``'date_string'``, the prefix of
            the MetaMap output file to merge.
    """
    cache_path = "metamapped_terms_cache.tsv"
    relevant_date = flag_and_path["date_string"]   # get date of bulk download of clinical trial data

    # A brand-new (or empty) cache needs the header too; without it the
    # read_csv(header=0) below would consume the first data row as column names.
    cache_needs_header = not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0

    with open(f"{relevant_date}_metamap_output.tsv", 'r', encoding="utf-8", errors='ignore') as new_metamapped_terms:
        with open(cache_path, 'a', encoding="utf-8") as cache:
            header_line = new_metamapped_terms.readline()
            if cache_needs_header:
                cache.write(header_line)
            # Everything after the header is data and is appended verbatim
            cache.writelines(new_metamapped_terms)

    # Remove duplicate rows from cache
    cache = pd.read_csv(cache_path, sep='\t', index_col=False, header=0, on_bad_lines='warn')
    cache = cache.drop_duplicates()
    cache.to_csv(cache_path, sep="\t", index=False, header=True)  # output deduplicated cache terms to TSV

def _append_manual_selections_to_cache(review_tag, modified_term_col, cols_to_drop, cache_path):
    """Fold one manual-review workbook into its TSV cache and de-duplicate the cache.

    Args:
        review_tag: Substring identifying the workbook among ``*.xlsx`` files
            in the current directory (the first match is used).
        modified_term_col: Workbook column that is renamed to
            ``modified_clin_trial_term``.
        cols_to_drop: Workbook columns that are not carried into the cache.
        cache_path: TSV cache file to append to.

    Raises:
        IndexError: If no matching workbook exists.
    """
    # Names starting with "~" are skipped (temporary/lock copies of a workbook)
    review_files = [path for path in glob.glob("*.xlsx")
                    if review_tag in path and not path.startswith("~")]
    selected = pd.read_excel(review_files[0])

    # Assign the forward-filled columns back explicitly: an in-place ffill on a
    # column selected from the frame is a chained assignment, which pandas
    # deprecates and which leaves the frame untouched under Copy-on-Write.
    selected["name"] = selected["name"].ffill()
    selected[modified_term_col] = selected[modified_term_col].ffill()

    # Keep only the rows where a reviewer actually picked a CURIE
    selected = selected[selected['manually_selected_CURIE'].notnull()]
    selected = selected.drop(columns=cols_to_drop)
    selected = selected.rename(columns={'name': 'original_clin_trial_term',
                                        modified_term_col: 'modified_clin_trial_term'})

    with open(cache_path, 'a') as output:
        # Write the header only when the cache file is still empty
        selected.to_csv(output, sep="\t", index=False, header=output.tell() == 0)

    # Remove duplicate rows from cache
    cache = pd.read_csv(cache_path, sep='\t', index_col=False, header=0, on_bad_lines='warn')
    cache = cache.drop_duplicates()
    cache.to_csv(cache_path, sep="\t", index=False, header=True)  # output deduplicated cache terms to TSV


def add_manually_selected_terms_to_cache():
    """Merge manually reviewed condition and intervention mappings into their caches.

    Looks in the current directory for the ``conditions_manual_review`` and
    ``interventions_manual_review`` workbooks and appends their manually
    selected rows to ``conditions_manually_selected_cache.tsv`` and
    ``interventions_manually_selected_cache.tsv`` respectively.

    This step is best-effort: any ordinary failure (missing workbook, missing
    column, unreadable file) is reported on stdout and the function returns
    without raising. As before, a failure while handling conditions also skips
    the interventions workbook.
    """
    # -----     ------     GENERATE MANUALLY SELECTED CACHE     -----     ------  #
    try:
        #  --- --- --   CONDITIONS     --- --- --   #
        _append_manual_selections_to_cache(
            review_tag="conditions_manual_review",
            modified_term_col="orig_con",
            cols_to_drop=["curie_info"],
            cache_path="conditions_manually_selected_cache.tsv",
        )

        #  --- --- --   INTERVENTIONS and Alternate INTERVENTIONS   --- --- --   #
        _append_manual_selections_to_cache(
            review_tag="interventions_manual_review",
            modified_term_col="orig_int",
            cols_to_drop=["curie_info", "description"],
            cache_path="interventions_manually_selected_cache.tsv",
        )
    except Exception as err:
        print("No terms in manual select column; either column is empty or bug. Proceeding without them")
        # Surface the cause so a genuine bug is distinguishable from "nothing to add"
        print(f"Reason: {err!r}")
        
def check_os():
    """Choose the MetaMap base directory and binary for the current platform.

    Returns:
        dict with ``metamap_base_dir`` and ``metamap_bin_dir``. On Linux the
        base directory is ``metamap/`` next to the current working directory's
        parent; on any other platform the hard-coded local install is used.
    """
    if "linux" not in sys.platform:
        # for running on local
        return {"metamap_base_dir": '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/',
                "metamap_bin_dir": 'bin/metamap18'}

    print("Linux platform detected")
    parent_of_cwd = pathlib.Path.cwd().parents[0]
    return {"metamap_base_dir": f"{parent_of_cwd}/metamap/",
            "metamap_bin_dir": 'bin/metamap20'}


In [21]:
def get_raw_ct_data():
    """Find the newest AACT pipe-delimited export and download/unzip it if it is not on disk yet.

    Scrapes https://aact.ctti-clinicaltrials.org/download for dated download
    links, picks the most recent one and stores it under ``<cwd>/data``. If the
    download attempt raises, the user is asked (via ``input``) to place the zip
    in ``<cwd>/data`` manually and the first ``*.zip`` found there is used.

    Side effects:
        Assigns the module globals ``data_dir`` and ``data_extracted``; may
        write the zip file and the extracted folder to disk; may block waiting
        for console input.

    Returns:
        dict with ``term_program_flag`` (still True when the latest zip already
        existed on disk), ``data_extracted_path`` and ``date_string``
        (formatted ``%m_%d_%Y``).
    """
    term_program_flag = True
    global data_dir
    global data_extracted
    
    try:
        # get all the links and associated dates of upload into a dict called date_link
        url_all = "https://aact.ctti-clinicaltrials.org/download"
        response = requests.get(url_all)
        soup = BeautifulSoup(response.text, features="lxml")
        body = soup.find_all('option') #Find all
        date_link = {}
        for el in body:
            tags = el.find('a')
            try:
                # first whitespace-separated token of the link text is the zip name;
                # its prefix before the first "_" is parsed as a YYYYMMDD date
                zip_name = tags.contents[0].split()[0]
                date = zip_name.split("_")[0]
                date = dt.datetime.strptime(date, '%Y%m%d').date()
                date_link[date] = tags.get('href')
            except:
                # any <option> that has no link or no parseable date is skipped silently
                pass
        latest_file_date = max(date_link.keys())   # get the date of the latest upload
        url = date_link[latest_file_date]   # get the corresponding download link of the latest upload so we can download the raw data
        date_string = latest_file_date.strftime("%m_%d_%Y")
        data_dir = "{}/data".format(pathlib.Path.cwd())
        data_extracted = data_dir + "/{}_extracted".format(date_string)
        data_path = "{}/{}_pipe-delimited-export.zip".format(data_dir, date_string)
    except:
        # NOTE(review): if anything above fails, data_path, url and date_string are
        # never assigned, so the os.path.exists(data_path) check below raises
        # UnboundLocalError instead of falling through to the manual-download path.
        print("continue")

    if not os.path.exists(data_path):   # if folder containing most recent data doesn't exist, download and extract it into data folder

        term_program_flag = False   # flag below for terminating program if latest download exists (KG is assumed up to date)
        print("Attempting download of Clinical Trial data as of {}\n".format(date_string))
        try:
            response = requests.get(url)
            # NOTE(review): a non-200 response is ignored without any message —
            # nothing is downloaded or extracted and the function still returns normally.
            if response.status_code == 200:
                with open(data_path, 'wb') as file:
                    file.write(response.content)
                print("Finished download of zip")
                with zipfile.ZipFile(data_path, 'r') as download:
                    print("Unzipping data")
                    download.extractall(data_extracted)
        except:
            # Manual fallback: ask the user to drop the zip into <cwd>/data
            print("Failed to scrape AACT for download. Please navigate to https://aact.ctti-clinicaltrials.org/download and manually download zip file.")
            print("Please store the downloaded zip in the /data directory. This should be the only item besides the cache file, condition manual review file, and intervention manual review file, in the directory at this time.")
            done = input("Type Done when done: ")
            # only the exact string "Done" continues; any other answer skips the manual path without a message
            if done == "Done":
                data_dir = "{}/data".format(pathlib.Path.cwd())
                # list_of_files = glob.glob(data_dir + "/*") # get all files in directory
                try:
                    # latest_file = max(list_of_files, key=os.path.getctime) # get the most recent file in the directory
                    pattern = os.path.join(data_dir, "*.zip")
                    zip_file = glob.glob(pattern) # look for file in directory that ends in ".zip"
                    zip_file = zip_file[0]   # first match is used; raises IndexError (handled below) when no zip is present
                    print("File found at: ")
                    print(zip_file)
                    # print(latest_file)
                    print("Please make sure this the correct zip file from AACT")
                    # data_extracted still holds the path computed from the scraped date at this point
                    if not os.path.exists(data_extracted):   # if folder of unzipped data does not exist, unzip
                        try:
                            with zipfile.ZipFile(zip_file, 'r') as download:
                                print("Unzipping data into")
                                # in the manual path the date label comes from the zip file's ctime, not from the AACT listing
                                cttime = os.path.getctime(zip_file)
                                date_string = dt.datetime.fromtimestamp(cttime).strftime('%m_%d_%Y')
                                data_extracted = data_dir + "/{}_extracted".format(date_string)
                                print(data_extracted)
                                download.extractall(data_extracted)
                        except:
                            # unzip failed: reuse the first existing "*_extracted" folder and take the date label from its name
                            pattern = os.path.join(data_dir, "*_extracted")
                            extracted_file = glob.glob(pattern) # look for file in directory that ends in "_extracted"
                            data_extracted = extracted_file[0]
                            extracted_name = os.path.basename(os.path.normpath(extracted_file[0]))
                            date_string = extracted_name.replace('_extracted', '')
                            print("Assuming data is already unzipped")
                        
                except:
                    print("Unable to download and extract Clincal Trial data.")
                    print("Cannot find pipe-delimited zip in /data folder.")
    else:
        print("KG is already up to date.")

    return {"term_program_flag": term_program_flag, "data_extracted_path": data_extracted, "date_string": date_string}


In [5]:
def read_raw_ct_data(flag_and_path, subset_size):
    """Load the AACT conditions / interventions tables from the extracted export.

    Args:
        flag_and_path: Mapping with ``'term_program_flag'`` (truthy means the
            program should stop because nothing new was downloaded) and
            ``'data_extracted_path'`` (folder holding the pipe-delimited files).
        subset_size: When truthy, each table is randomly sampled down to at
            most this many rows; when falsy the full tables are returned.

    Returns:
        dict with the DataFrames under ``'conditions'``, ``'interventions'``
        and ``'interventions_alts'``.

    Raises:
        SystemExit: When ``term_program_flag`` is truthy.
    """
    if flag_and_path["term_program_flag"]:
        print("Exiting program. Assuming KG has already been constructed from most recent data dump from AACT.")
        # sys.exit() rather than the interactive-only exit() helper, which the
        # site module injects and which is not meant for use in programs.
        sys.exit()

    data_extracted = flag_and_path["data_extracted_path"]
    source_files = {
        "conditions": "conditions.txt",
        "interventions": "interventions.txt",
        "interventions_alts": "intervention_other_names.txt",
    }

    df_dict = {}
    for table_key, file_name in source_files.items():
        # read in pipe-delimited files
        table = pd.read_csv(data_extracted + '/' + file_name, sep='|', index_col=False, header=0, on_bad_lines='warn')
        if subset_size:   # if a subset size is given, we are running this script on a small subset of the dataset
            # Cap at the table length: sampling more rows than exist raises ValueError
            table = table.sample(n=min(subset_size, len(table)))
        df_dict[table_key] = table
    return df_dict


# Check against cache, retrieve terms not already mapped

In [6]:
def _unique_lowercase_terms(values):
    """Return the distinct lower-cased string forms of *values* as a set."""
    return {str(value).lower() for value in values}


def check_against_cache(df_dict):
    """Work out which clinical-trial terms still need mapping.

    Every name in the three input tables is stringified and lower-cased, then
    compared against ``mapping_cache.tsv`` in the current directory. Cached
    terms are matched per term type (``condition``, ``intervention``,
    ``intervention_alternate``).

    Args:
        df_dict: Mapping with DataFrames under ``'conditions'``,
            ``'interventions'`` and ``'interventions_alts'``, each having a
            ``name`` column.

    Returns:
        dict with the same three keys, each holding the list of lower-cased
        terms that are not in the cache yet. When the cache is missing, empty
        or lacks the expected columns, every term is returned.
    """
    term_lists = {
        key: list(_unique_lowercase_terms(df_dict[key]["name"].unique()))
        for key in ("conditions", "interventions", "interventions_alts")
    }
    cache_term_types = {
        "conditions": "condition",
        "interventions": "intervention",
        "interventions_alts": "intervention_alternate",
    }

    try:
        cache_df = pd.read_csv("mapping_cache.tsv", sep="\t", index_col=False, header=0, on_bad_lines='warn')

        dict_new_terms = {}
        for key, term_type in cache_term_types.items():
            cached_values = cache_df.loc[cache_df["term_type"] == term_type, "clintrial_term"].unique()
            # Stringify before lower-casing so a blank (NaN) cache cell cannot
            # abort the comparison, and use a set so each lookup is O(1).
            cached_terms = _unique_lowercase_terms(cached_values)
            # find terms not in the cache (i.e. new terms to map), dropping empty strings
            dict_new_terms[key] = [term for term in term_lists[key]
                                   if term and term not in cached_terms]
    except (OSError, KeyError, pd.errors.EmptyDataError, pd.errors.ParserError):
        print("No cache of terms found. Proceeding to map entire KG from scratch")
        dict_new_terms = dict(term_lists)

    return dict_new_terms


# Map new terms using MetaMap

In [14]:
# def get_nr_response(term_type, description, ct_intervention_type, chunk, csv_writer):
# def run_metamap(term_pair, params, mm, term_type, csv_writer):

def get_nr_response(orig_term):
    """Look up *orig_term* with the Name Resolver service and return its parsed JSON reply.

    Unlike the MetaMap path, the term is sent as-is (no de-ASCII-ing or
    lower-casing is needed for Name Resolver).

    Args:
        orig_term: The clinical-trial term to resolve.

    Returns:
        The decoded JSON body when the service answers with HTTP 200;
        ``None`` for any other status code or once the retries are exhausted.
    """
    print("TRYING NAME RESOLVER")

    # Runs Name Resolver
    nr_url = 'https://name-resolution-sri.renci.org/lookup'
    max_retries = 3
    params = {'string': orig_term, 'limit': 1}  # limit -1 makes this return all available equivalent CURIEs name resolver can give (deprecated)

    retries = 0
    # The context manager closes the session's pooled connections on every exit path
    with requests.Session() as sess:
        while retries <= max_retries:
            try:
                r = sess.post(nr_url, params=params)
                if r.status_code == 200:
                    return r.json()  # process Name Resolver response
                # A non-200 answer is treated as "no mapping" and is not retried
                return None
            except (requests.RequestException, ConnectionResetError, OSError) as ex:
                # Report the term actually being resolved (the messages used to
                # reference an undefined name, which raised NameError here).
                print(f"\nName Resolver request failed for term: {orig_term}. Error: {ex}")
                retries += 1
                if retries < max_retries:
                    print(f"Retrying ({retries}/{max_retries}) after a delay.")
                    time.sleep(2 ** retries)  # Increase the delay between retries exponentially
                else:
                    print(f"Max retries (Name Resolver) reached for term: {orig_term}.")
                    return None
    return None
    
    # request_count +=1
    # if request_count % 50 == 0:  # if 50 requests to API have been made, sleep 10 secs
    #     time.sleep(10)  
    
            
        
            



    
    
    
    
    
    
    
    
    
    
    
#     for ct_term in chunk:
#         request_count +=1
#         if request_count % 50 == 0:  # if 50 requests to API have been made, sleep 10 secs
#             time.sleep(10)  

#         input_term = ct_term # in MetaMap, we have to potentially deascii the term and lower case it...for Name Resolver, we don't need to do that. To keep columns consist with MetaMap output, we just keep it and say the original term and the input term are the same. For MetaMap, they might be different
#         retries = 0
#         params = {'string':ct_term, 'limit':1} # limit -1 makes this return all available equivalent CURIEs name resolver can give (deprecated)
#         while retries <= max_retries:
#             try:
#                 r = sess.post(nr_url, params=params)
#                 if r.status_code == 200:
#                     mapping_tool_response = r.json()  # process Name Resolver response
#                     row_to_write = [mapping_tool, term_type, ct_term, input_term,  mapping_tool_response]
#                     csv_writer.writerow(row_to_write)
#                     # print(row_to_write)
#                     break
#             except (requests.RequestException, ConnectionResetError, OSError) as ex:
#                 print(f"\nName Resolver request failed for term: {term}. Error: {ex}")
#                 retries += 1
#                 if retries < max_retries:
#                     print(f"Retrying ({retries}/{max_retries}) after a delay.")
#                     time.sleep(2 ** retries)  # Increase the delay between retries exponentially
#                 else:
#                     print(f"Max retries (Name Resolver) reached for term: {term}. Moving to the next term.")
    

In [16]:
# # nr_response = get_nr_response(orig_term)
# # ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response', 'score']
# for i in ["cordiceps", "chocolate", "diabetes mellitus", "blipadbloo", "catheter", "humira®"]:
#     nr_result = get_nr_response(i)
#     if nr_result:
#         nr_result
#         nr_curie = nr_result[0]["curie"]
#         nr_name = nr_result[0]["label"]
#         nr_type = nr_result[0]["types"][0]
#         nr_score = nr_result[0]["score"]
#         new_concept_dict = {"nameresolver_preferred_name": nr_name,
#                              "nameresolver_cui": nr_curie,
#                              "nameresolver_score": nr_score,
#                              "nameresolver_semtypes": nr_type}
#     else:
#         print("nothing returned from NR")

In [None]:
# Scratch cell: show the label and first type of the top Name Resolver hit.
# NOTE(review): in the visible code nr_result is only assigned inside the
# commented-out test cell above, so this cell presumably raises NameError
# unless nr_result was defined earlier in the session — confirm or remove.
nr_name = nr_result[0]["label"]
nr_type = nr_result[0]["types"][0]
nr_name
nr_type

In [None]:
# def get_nr_response(term_type, description, ct_intervention_type, chunk, csv_writer):
#     # Format of cache output TSV: header = ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response']

#     def create_session():
#         s = requests.Session()
#         return s
    
#     sess = create_session()
    
#     """   Runs Name Resolver   """
#     nr_url = 'https://name-resolution-sri.renci.org/lookup'
#     max_retries = 3 
    
#     mapping_tool = "name_resolver"

#     request_count = 0 # track how many times I'm making requests or hitting API
#     for ct_term in chunk:
#         request_count +=1
#         if request_count % 50 == 0:  # if 50 requests to API have been made, sleep 10 secs
#             time.sleep(10)  

#         input_term = ct_term # in MetaMap, we have to potentially deascii the term and lower case it...for Name Resolver, we don't need to do that. To keep columns consist with MetaMap output, we just keep it and say the original term and the input term are the same. For MetaMap, they might be different
#         retries = 0
#         params = {'string':ct_term, 'limit':1} # limit -1 makes this return all available equivalent CURIEs name resolver can give (deprecated)
#         while retries <= max_retries:
#             try:
#                 r = sess.post(nr_url, params=params)
#                 if r.status_code == 200:
#                     mapping_tool_response = r.json()  # process Name Resolver response
#                     row_to_write = [mapping_tool, term_type, ct_term, input_term,  mapping_tool_response]
#                     csv_writer.writerow(row_to_write)
#                     # print(row_to_write)
#                     break
#             except (requests.RequestException, ConnectionResetError, OSError) as ex:
#                 print(f"\nName Resolver request failed for term: {term}. Error: {ex}")
#                 retries += 1
#                 if retries < max_retries:
#                     print(f"Retrying ({retries}/{max_retries}) after a delay.")
#                     time.sleep(2 ** retries)  # Increase the delay between retries exponentially
#                 else:
#                     print(f"Max retries (Name Resolver) reached for term: {term}. Moving to the next term.")
    

In [None]:
# def run_metamap(term_pair, params, mm, term_type, csv_writer):
#     mapping_tool = "metamap"
#     orig_term = term_pair[0]
#     input_term = term_pair[1]
#     from_metamap = []
#     # Format of output TSV: header = ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response', 'score']

#     if params.get("exclude_sts") is None: # exclude_sts is used for Interventions. restrict_to_sts is used for Conditions. So, the logic is, if we're mapping Conditions, execute "if" part of code. If we're mapping Interventions, execute "else" part of code
#         try:
#             concepts,error = mm.extract_concepts([input_term],
#                                                  restrict_to_sts = params["restrict_to_sts"],
#                                                  term_processing = params["term_processing"],
#                                                  ignore_word_order = params["ignore_word_order"],
#                                                  strict_model = params["strict_model"],
#                                                 )
#             for concept in concepts:
#                 concept_info = []
#                 concept_info.extend([mapping_tool, term_type, orig_term, input_term]) # Format of output TSV: header = ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response', 'score']
#                 concept = concept._asdict()
#                 new_concept_dict  = {"metamap_preferred_name": concept.get("preferred_name"),
#                                      "metamap_cui": concept.get("cui"),
#                                      "metamap_score": concept.get("score"),
#                                      "metamap_semtypes": concept.get("semtypes")}
#                 concept_info.append(new_concept_dict)
#                 concept_info.append(None) # this is for the score column, empty bc not scored yet
#                 from_metamap.append(concept_info)
                
#         except: # if no mapping tool response, append None for both the response and the score
#             from_metamap.extend([mapping_tool, term_type, orig_term, input_term, None, None])   # Format of output TSV: header = ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response', 'score']

#     else:
#         try:
#             concepts,error = mm.extract_concepts([input_term],
#                                                  exclude_sts = params["exclude_sts"],
#                                                  term_processing = params["term_processing"],
#                                                  ignore_word_order = params["ignore_word_order"],
#                                                  strict_model = params["strict_model"],
#                                                 )
#             for concept in concepts:
#                 concept_info = []
#                 concept_info.extend([mapping_tool, term_type, orig_term, input_term]) # Format of output TSV: header = ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response', 'score']
#                 concept = concept._asdict()
#                 new_concept_dict  = {"metamap_preferred_name": concept.get("preferred_name"),
#                                      "metamap_cui": concept.get("cui"),
#                                      "metamap_score": concept.get("score"),
#                                      "metamap_semtypes": concept.get("semtypes")}
#                 concept_info.append(new_concept_dict)
#                 concept_info.append(None) # for score column
#                 from_metamap.append(concept_info)

#         except: # if no mapping tool response, append None for both the response and the score
#             from_metamap.extend([mapping_tool, term_type, orig_term, input_term, None])   # Format of output TSV: header = ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response', 'score']
#     # print(from_metamap)

#     for result in from_metamap:
#         # print(result)
#         csv_writer.writerow(result)
#     # return from_metamap

In [19]:
def _metamap_rows(concepts, term_type, orig_term, input_term):
    """Build one output row per MetaMap concept.

    Each concept must expose ``_asdict()``; only the preferred name, CUI,
    score and semantic types are kept in the response dict.
    """
    rows = []
    for concept in concepts:
        concept = concept._asdict()
        response = {"metamap_preferred_name": concept.get("preferred_name"),
                    "metamap_cui": concept.get("cui"),
                    "metamap_score": concept.get("score"),
                    "metamap_semtypes": concept.get("semtypes")}
        # trailing None is the score column, empty because the mapping is not scored yet
        rows.append(["metamap", term_type, orig_term, input_term, response, None])
    return rows


def _nameresolver_row(nr_response, term_type, orig_term):
    """Build an output row from the first hit of a Name Resolver response."""
    top_hit = nr_response[0]
    response = {"nameresolver_preferred_name": top_hit["label"],
                "nameresolver_cui": top_hit["curie"],
                "nameresolver_score": top_hit["score"],
                "nameresolver_semtypes": top_hit["types"][0]}
    # Name Resolver receives the original term (no lowercasing / de-ascii-ing, unlike MetaMap),
    # so the original term is also recorded as the input term.
    return ["nameresolver", term_type, orig_term, orig_term, response, None]


def _unmapped_row(term_type, orig_term, input_term):
    """Build the placeholder row recording that a term was attempted but not mapped."""
    return ["metamap", term_type, orig_term, input_term, None, None]


def run_mappers(term_pair, params, mm, term_type, csv_writer):
    """Map one term with MetaMap, falling back to Name Resolver when MetaMap returns nothing.

    Row format (matches the output TSV header):
    ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response', 'score']

    Args:
        term_pair: sequence whose first item is the original clinical-trial term
            and whose second item is the term actually submitted to MetaMap.
        params: MetaMap options. Must hold "term_processing", "ignore_word_order"
            and "strict_model", plus "restrict_to_sts" (used for Conditions) or
            "exclude_sts" (used for Interventions).
        mm: object providing ``extract_concepts``.
        term_type: label stored in the 'term_type' column.
        csv_writer: accepted for interface compatibility; writing is currently
            disabled and every row is printed instead.

    Returns:
        None. If both mappers return nothing, or anything raises, a single
        placeholder row with ``None`` response and score is emitted.
    """
    orig_term = term_pair[0]
    input_term = term_pair[1]
    from_mapper = []

    # exclude_sts is used for Interventions, restrict_to_sts for Conditions.
    restrict_semtypes = params.get("exclude_sts") is None

    try:
        if restrict_semtypes:
            semtype_filter = {"restrict_to_sts": params["restrict_to_sts"]}
        else:
            semtype_filter = {"exclude_sts": params["exclude_sts"]}

        concepts, error = mm.extract_concepts([input_term],
                                              term_processing=params["term_processing"],
                                              ignore_word_order=params["ignore_word_order"],
                                              strict_model=params["strict_model"],
                                              **semtype_filter)

        if concepts:   # if MetaMap gives response, process response
            from_mapper.extend(_metamap_rows(concepts, term_type, orig_term, input_term))
        else:   # if MetaMap gives nothing, try using Name Resolver and process response
            print("ATTEMPTING NAME RESOLVER HERE")
            nr_response = get_nr_response(orig_term)
            if nr_response:
                from_mapper.append(_nameresolver_row(nr_response, term_type, orig_term))
            else:
                # Both tools came back empty: still record that the term was attempted.
                print("Nothing returned from NR or Metamap")
                from_mapper.append(_unmapped_row(term_type, orig_term, input_term))
    except Exception:
        # Any failure while querying or parsing is recorded as "unmapped" rather than
        # killing the worker thread without a trace.
        print("Nothing returned from NR or Metamap")
        from_mapper.append(_unmapped_row(term_type, orig_term, input_term))

    for result in from_mapper:
        print(result)
        # csv_writer.writerow(result)

In [9]:
def parallelize_mappers(term_pair_list, params, term_type, csv_writer):
    """Run ``run_mappers`` over every term pair on a thread pool, with a progress bar.

    The MetaMap servers are started before the pool is created and are always
    stopped afterwards, even if setting up or running the pool raises.

    Args:
        term_pair_list: list of term pairs, each handed to ``run_mappers``.
        params: MetaMap options forwarded unchanged to ``run_mappers``.
        term_type: label used in the progress-bar description and forwarded to ``run_mappers``.
        csv_writer: forwarded unchanged to ``run_mappers``.
    """
    LENGTH = len(term_pair_list)  # Number of iterations required to fill progress bar (pbar)
    worker_count = (multiprocessing.cpu_count() * 2) - 1

    with tqdm(total=LENGTH, desc="% {}s mapped".format(term_type), position=0, leave=True, mininterval=LENGTH/20, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}') as pbar:
        start_metamap_servers(metamap_dirs) # start the MetaMap servers
        try:
            mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
            with concurrent.futures.ThreadPoolExecutor(worker_count) as executor:
                futures = [executor.submit(run_mappers, term_pair, params, mm, term_type, csv_writer) for term_pair in term_pair_list]
                for future in concurrent.futures.as_completed(futures):
                    # A worker that raised would otherwise fail without any trace;
                    # report it but keep going so one bad term does not abort the batch.
                    worker_error = future.exception()
                    if worker_error is not None:
                        print("Mapping worker failed: {!r}".format(worker_error))
                    pbar.update(n=1)  # Increments counter
        finally:
            stop_metamap_servers(metamap_dirs) # stop the MetaMap servers
    

In [None]:
# def parallelize_metamap(term_pair_list, params, term_type, csv_writer):

#     LENGTH = len(term_pair_list)  # Number of iterations required to fill progress bar (pbar)
#     pbar = tqdm(total=LENGTH, desc="% {}s mapped".format(term_type), position=0, leave=True, mininterval = LENGTH/20, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')  # Init progress bar

#     start_metamap_servers(metamap_dirs) # start the MetaMap servers
#     mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
#     with concurrent.futures.ThreadPoolExecutor((multiprocessing.cpu_count()*2) - 1) as executor:
#         futures = [executor.submit(run_metamap, term_pair, params, mm, term_type, csv_writer) for term_pair in term_pair_list]
#         for _ in concurrent.futures.as_completed(futures):
#             pbar.update(n=1)  # Increments counter
#     stop_metamap_servers(metamap_dirs) # stop the MetaMap servers
    

In [12]:
def term_list_to_mappers(dict_new_terms):
    """Map new condition / intervention terms and append the results to the mapping cache.

    Reads the "conditions", "interventions" and "interventions_alts" entries of
    ``dict_new_terms``, sends each list through ``parallelize_mappers`` and then
    removes duplicate rows from ``mapping_cache.tsv``.

    For MetaMap versions before 2020 the terms are first passed through
    ``de_ascii_er``; the original term is always kept as the first item of each pair.

    Args:
        dict_new_terms: mapping holding the three term collections named above.
    """
    metamap_version = [int(s) for s in re.findall(r'\d+', metamap_dirs.get('metamap_bin_dir'))] # get MetaMap version being run
    needs_deascii = metamap_version[0] < 20

    # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
    condition_semantic_type_restriction = ['acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,clna,fndg']

    # strict_model and relaxed_model are presumably opposites? relaxed_model = True is what I want,
    # but that option appears to be broken in Pymetamap (returns no results when used).
    # Using strict_model = False instead...
    shared_options = {"term_processing": True, "ignore_word_order": True, "strict_model": False}

    # Conditions are restricted to the condition semantic types; interventions exclude them.
    condition_params = dict(shared_options, restrict_to_sts=condition_semantic_type_restriction)
    intervention_params = dict(shared_options, exclude_sts=condition_semantic_type_restriction)
    # Previously this name was never assigned, so the ">= 2020" branch raised NameError.
    intervention_alts_params = dict(intervention_params)

    # The same "intervention_alternate" label is now used whichever MetaMap version runs.
    mapping_jobs = [
        (dict_new_terms.get("conditions"), condition_params, "condition"),
        (dict_new_terms.get("interventions"), intervention_params, "intervention"),
        (dict_new_terms.get("interventions_alts"), intervention_alts_params, "intervention_alternate"),
    ]

    # open mapping cache to add mapped terms; write the header only when creating the file
    mapping_filename = "mapping_cache.tsv"
    cache_exists = os.path.exists(mapping_filename)
    with open(mapping_filename, 'a' if cache_exists else 'w+', newline='') as output:
        csv_writer = csv.writer(output, delimiter='\t')
        if not cache_exists:
            col_names = ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response', 'score']
            csv_writer.writerow(col_names)

        if needs_deascii:
            print("MetaMap version < 2020, conduct mapping on terms after removing ascii characters")
            deasciier = np.vectorize(de_ascii_er) # vectorize function
        else:
            print("MetaMap version >= 2020, conduct mapping on original terms")

        for terms, params, term_type in mapping_jobs:
            input_terms = deasciier(terms) if needs_deascii else terms
            parallelize_mappers(list(zip(terms, input_terms)), params, term_type, csv_writer)

    # Remove duplicate rows from cache
    cache = pd.read_csv(mapping_filename, sep='\t', index_col=False, header=0, on_bad_lines = 'warn')
    cache = cache.drop_duplicates()
    cache.to_csv(mapping_filename, sep="\t", index=False, header=True) # output deduplicated cache terms to TSV


In [None]:
# def score_metamap_mappings():


# NOTE(review): unfinished draft of the scoring step — this cell does not parse as written.
# It reads mapping_cache.tsv in 1000-row chunks and starts building a "scored" column,
# but the nested np.where(...) call opened below is never closed.
header = True
with pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, on_bad_lines = 'warn', chunksize=1000) as reader:
    for chunk in reader:
        # Keep an existing score; the inner np.where (metamap rows without a score) is incomplete.
        chunk["scored"] = np.where(~chunk["score"].isnull(), chunk["score"],
                                   np.where((chunk.score.isnull())&(chunk.mapping_tool == "metamap"),
        # NOTE(review): the lines below use `df`, which is not defined in this cell — they look
        # like a pasted reference example of nested np.where; TODO confirm and remove or adapt.
        df['d'] = np.where(df.a.isnull(),
         np.nan,
         np.where((df.b == "N")&(~df.c.isnull()),
                  df.a*df.c,
                  df.a))
        
        
#         for i, row in chunk.iterrows():
#             print(type(row["score"]))
#             if not row["score"].isnull():
#                 print(i)
# #                 break
#             elif pd.isnull(row["score"]) and row["mapping_tool"] == "metamap":
#                 mm_dict = ast.literal_eval(row["mapping_tool_response"])
#                 mapped_term = mm_dict['metamap_preferred_name']
#                 sort_ratio_score = get_token_sort_ratio(row["clintrial_term"], mapped_term)
#                 similarity_score = get_similarity_score(row["clintrial_term"], mapped_term)
#                 max_score = max(sort_ratio_score, similarity_score)
#                 chunk.at[i, "score"] = max_score
#             elif pd.isnull(row["score"]) and row["mapping_tool"] == "nameresolver":
#                 break
            
#             chunk.to_csv("mapping_cache_scored.tsv", header=header, sep="\t", index=False, mode='a+')
#             header = False

#       header = True
# for chunk in chunks:

#     chunk.to_csv(os.path.join(folder, new_folder, "new_file_" + filename),
#         header=header, cols=[['TIME','STUFF']], mode='a')

#     header = False          
                
            
# original_clintrial_term = row["clintrial_term"]


In [20]:
# Driver cell: download the raw data, load a subset, drop terms that are already cached, map the rest.
flag_and_path = get_raw_ct_data() # download raw data
# Module-level name read by parallelize_mappers and term_list_to_mappers.
# (A `global` statement is a no-op at module scope, so a plain assignment is enough.)
metamap_dirs = check_os()
subset_size = 20
df_dict = read_raw_ct_data(flag_and_path, subset_size) # read the clinical trial data
dict_new_terms = check_against_cache(df_dict) # use the existing cache of MetaMapped terms so that only new terms are mapped
term_list_to_mappers(dict_new_terms)

Attempting download of Clinical Trial data as of 02_21_2024


Failed to scrape AACT for download. Please navigate to https://aact.ctti-clinicaltrials.org/download and manually download zip file.
Please store the downloaded zip in the /data directory. This should be the only item besides the cache file, condition manual review file, and intervention manual review file, in the directory at this time.


Type Done when done:  Done


File found at: 
/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/8vstm2enpo0ocbo2z7oypqurhgmz.zip
Please make sure this the correct zip file from AACT
Unzipping data into
/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/02_20_2024_extracted
MetaMap version < 2020, conduct mapping on terms after removing ascii characters


% conditions mapped:   5%|█                   | 1/20 [00:11<03:35, 11.

ATTEMPTING NAME RESOLVER HERE
TRYING NAME RESOLVER
['metamap', 'condition', 'tetraplegia', 'tetraplegia', {'metamap_preferred_name': 'Quadriplegia', 'metamap_cui': 'C0034372', 'metamap_score': '16.26', 'metamap_semtypes': '[dsyn]'}, None]
['nameresolver', 'condition', 'thermogenesis', 'thermogenesis', {'nameresolver_preferred_name': 'Thermogenesis', 'nameresolver_cui': 'UMLS:C0018841', 'nameresolver_score': 99.5397, 'nameresolver_semtypes': 'biolink:PhysiologicalProcess'}, None]
['metamap', 'condition', 'cardiovascular risk factor', 'cardiovascular risk factor', {'metamap_preferred_name': 'cardiovascular risk factor', 'metamap_cui': 'C0850624', 'metamap_score': '3.77', 'metamap_semtypes': '[dsyn]'}, None]


% conditions mapped:  20%|████                | 4/20 [00:12<00:40,  2.

['metamap', 'condition', 'hepatocellular carcinoma', 'hepatocellular carcinoma', {'metamap_preferred_name': 'Liver carcinoma', 'metamap_cui': 'C2239176', 'metamap_score': '19.49', 'metamap_semtypes': '[neop]'}, None]
['metamap', 'condition', 'maturity-onset diabetes in the young (mody)', 'maturity-onset diabetes in the young (mody)', {'metamap_preferred_name': None, 'metamap_cui': None, 'metamap_score': None, 'metamap_semtypes': None}, None]
['metamap', 'condition', 'maturity-onset diabetes in the young (mody)', 'maturity-onset diabetes in the young (mody)', {'metamap_preferred_name': 'Diabetes Mellitus, Non-Insulin-Dependent', 'metamap_cui': 'C0011860', 'metamap_score': '16.23', 'metamap_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'maturity-onset diabetes in the young (mody)', 'maturity-onset diabetes in the young (mody)', {'metamap_preferred_name': 'Young-onset diabetes', 'metamap_cui': 'C4227728', 'metamap_score': '3.60', 'metamap_semtypes': '[fndg]'}, None]
['metamap', 'co

% conditions mapped:  35%|███████             | 7/20 [00:13<00:18,  1.

['metamap', 'condition', 'human papilloma virus infection', 'human papilloma virus infection', {'metamap_preferred_name': 'Human papilloma virus infection', 'metamap_cui': 'C0343641', 'metamap_score': '10.12', 'metamap_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'hypertrophic obstructive cardiomyopathy', 'hypertrophic obstructive cardiomyopathy', {'metamap_preferred_name': 'Hypertrophic obstructive cardiomyopathy', 'metamap_cui': 'C4551472', 'metamap_score': '3.77', 'metamap_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'prehypertension', 'prehypertension', {'metamap_preferred_name': 'Prehypertension', 'metamap_cui': 'C1696708', 'metamap_score': '9.95', 'metamap_semtypes': '[dsyn]'}, None]


% conditions mapped:  50%|██████████          | 10/20 [00:16<00:11,  1

['metamap', 'condition', 'squamous cell carcinoma of the head and neck', 'squamous cell carcinoma of the head and neck', {'metamap_preferred_name': 'Squamous cell carcinoma of the head and neck', 'metamap_cui': 'C1168401', 'metamap_score': '19.70', 'metamap_semtypes': '[neop]'}, None]


% conditions mapped:  60%|████████████        | 12/20 [00:18<00:09,  1

['metamap', 'condition', 'attention deficit hyperactivity disorder', 'attention deficit hyperactivity disorder', {'metamap_preferred_name': 'Attention deficit hyperactivity disorder', 'metamap_cui': 'C1263846', 'metamap_score': '13.28', 'metamap_semtypes': '[mobd]'}, None]
['metamap', 'condition', 'liver metastases', 'liver metastases', {'metamap_preferred_name': 'Secondary malignant neoplasm of liver', 'metamap_cui': 'C0494165', 'metamap_score': '3.72', 'metamap_semtypes': '[neop]'}, None]
ATTEMPTING NAME RESOLVER HERE
TRYING NAME RESOLVER
['nameresolver', 'condition', 'safer sex', 'safer sex', {'nameresolver_preferred_name': "sexually active and practicing 'safer sex'", 'nameresolver_cui': 'UMLS:C2229924', 'nameresolver_score': 36.026413, 'nameresolver_semtypes': 'biolink:PhenotypicFeature'}, None]


% conditions mapped:  70%|██████████████      | 14/20 [00:20<00:06,  1

ATTEMPTING NAME RESOLVER HERE
TRYING NAME RESOLVER
['metamap', 'condition', 'metabolic associated-dysfunction steatotic liver disease (masld)', 'metabolic associated-dysfunction steatotic liver disease (masld)', {'metamap_preferred_name': None, 'metamap_cui': None, 'metamap_score': None, 'metamap_semtypes': None}, None]
['metamap', 'condition', 'metabolic associated-dysfunction steatotic liver disease (masld)', 'metabolic associated-dysfunction steatotic liver disease (masld)', {'metamap_preferred_name': 'Steatohepatitis', 'metamap_cui': 'C2711227', 'metamap_score': '9.69', 'metamap_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'metabolic associated-dysfunction steatotic liver disease (masld)', 'metabolic associated-dysfunction steatotic liver disease (masld)', {'metamap_preferred_name': 'Hepatic Metabolic derangement', 'metamap_cui': 'C0851734', 'metamap_score': '3.58', 'metamap_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'metabolic associated-dysfunction steatotic liv

% conditions mapped:  90%|██████████████████  | 18/20 [00:22<00:01,  1

['metamap', 'condition', 'carcinoma, non-small-cell lung', 'carcinoma, non-small-cell lung', {'metamap_preferred_name': 'Non-Small Cell Lung Carcinoma', 'metamap_cui': 'C0007131', 'metamap_score': '25.86', 'metamap_semtypes': '[neop]'}, None]
['metamap', 'condition', 'idiopathic myelofibrosis', 'idiopathic myelofibrosis', {'metamap_preferred_name': 'Primary Myelofibrosis', 'metamap_cui': 'C0001815', 'metamap_score': '16.33', 'metamap_semtypes': '[neop]'}, None]
['metamap', 'condition', 'acute patella tendon rupture', 'acute patella tendon rupture', {'metamap_preferred_name': 'Traumatic rupture of patellar tendon', 'metamap_cui': 'C0263969', 'metamap_score': '3.65', 'metamap_semtypes': '[inpo]'}, None]


% conditions mapped: 100%|████████████████████| 20/20 [00:24<00:00,  1
% interventions mapped:   0%|                    | 0/19 [00:00<?, ?it/

ATTEMPTING NAME RESOLVER HERE
TRYING NAME RESOLVER


% interventions mapped:   5%|█                   | 1/19 [00:08<02:40, 

Nothing returned from NR or Metamap
Nothing returned from NR or Metamap
['metamap', 'intervention', 'propolis extract', 'propolis extract', {'metamap_preferred_name': 'Propolis', 'metamap_cui': 'C0033488', 'metamap_score': '16.21', 'metamap_semtypes': '[bacs,phsu]'}, None]
['metamap', 'intervention', 'propolis extract', 'propolis extract', {'metamap_preferred_name': 'Propolis (fungal genus)', 'metamap_cui': 'C3160730', 'metamap_score': '3.59', 'metamap_semtypes': '[fngs]'}, None]
['metamap', 'intervention', 'propolis extract', 'propolis extract', {'metamap_preferred_name': 'Extract (substance)', 'metamap_cui': 'C2828366', 'metamap_score': '3.45', 'metamap_semtypes': '[sbst]'}, None]


% interventions mapped:  16%|███▏                | 3/19 [00:10<00:48, 

['metamap', 'intervention', 'laboratory biomarker analysis', 'laboratory biomarker analysis', {'metamap_preferred_name': 'Laboratory Biomarker Analysis', 'metamap_cui': 'C1881352', 'metamap_score': '3.77', 'metamap_semtypes': '[lbpr]'}, None]
['metamap', 'intervention', 'mindfulness meditation', 'mindfulness meditation', {'metamap_preferred_name': 'Mindfulness', 'metamap_cui': 'C3542996', 'metamap_score': '16.21', 'metamap_semtypes': '[menp]'}, None]
['metamap', 'intervention', 'mindfulness meditation', 'mindfulness meditation', {'metamap_preferred_name': 'meditation', 'metamap_cui': 'C0150277', 'metamap_score': '16.21', 'metamap_semtypes': '[menp]'}, None]
['metamap', 'intervention', 'mindfulness meditation', 'mindfulness meditation', {'metamap_preferred_name': 'Meditation Therapy', 'metamap_cui': 'C0814263', 'metamap_score': '3.59', 'metamap_semtypes': '[topp]'}, None]
['metamap', 'intervention', 'mindfulness meditation', 'mindfulness meditation', {'metamap_preferred_name': 'Mental c

% interventions mapped:  26%|█████▎              | 5/19 [00:12<00:25, 

['metamap', 'intervention', 'part 2 focus group', 'part 2 focus group', {'metamap_preferred_name': 'Focus Groups', 'metamap_cui': 'C0016400', 'metamap_score': '16.22', 'metamap_semtypes': '[grup]'}, None]
['metamap', 'intervention', 'part 2 focus group', 'part 2 focus group', {'metamap_preferred_name': 'acireductone dioxygenase [iron(II)-requiring] activity', 'metamap_cui': 'C1817569', 'metamap_score': '3.46', 'metamap_semtypes': '[moft]'}, None]
['metamap', 'intervention', 'part 2 focus group', 'part 2 focus group', {'metamap_preferred_name': 'Part', 'metamap_cui': 'C0449719', 'metamap_score': '3.43', 'metamap_semtypes': '[spco]'}, None]
['metamap', 'intervention', 'part 2 focus group', 'part 2 focus group', {'metamap_preferred_name': 'Part Dosing Unit', 'metamap_cui': 'C1709471', 'metamap_score': '3.43', 'metamap_semtypes': '[qnco]'}, None]
['metamap', 'intervention', 'skin prick test', 'skin prick test', {'metamap_preferred_name': 'Skin prick test', 'metamap_cui': 'C0430561', 'metam

% interventions mapped:  53%|██████████▌         | 10/19 [00:13<00:07,

['nameresolver', 'intervention', 'pnf', 'pnf', {'nameresolver_preferred_name': 'PNF protein, Arabidopsis', 'nameresolver_cui': 'MESH:C561284', 'nameresolver_score': 21.310198, 'nameresolver_semtypes': 'biolink:ChemicalEntity'}, None]
['metamap', 'intervention', 'keeping adults physically active (kapa) intervention', 'keeping adults physically active (kapa) intervention', {'metamap_preferred_name': None, 'metamap_cui': None, 'metamap_score': None, 'metamap_semtypes': None}, None]
['metamap', 'intervention', 'keeping adults physically active (kapa) intervention', 'keeping adults physically active (kapa) intervention', {'metamap_preferred_name': 'Adult', 'metamap_cui': 'C0001675', 'metamap_score': '9.74', 'metamap_semtypes': '[aggp]'}, None]
['metamap', 'intervention', 'keeping adults physically active (kapa) intervention', 'keeping adults physically active (kapa) intervention', {'metamap_preferred_name': 'Intervention regimes', 'metamap_cui': 'C1273869', 'metamap_score': '3.55', 'metamap

% interventions mapped:  68%|█████████████▋      | 13/19 [00:16<00:05,

['metamap', 'intervention', 'project x 5.1ml', 'project x 5.1ml', {'metamap_preferred_name': 'Project', 'metamap_cui': 'C1709701', 'metamap_score': '3.43', 'metamap_semtypes': '[cnce]'}, None]
['metamap', 'intervention', 'messaging', 'messaging', {'metamap_preferred_name': 'message', 'metamap_cui': 'C0470166', 'metamap_score': '3.60', 'metamap_semtypes': '[inpr]'}, None]
['metamap', 'intervention', 'tumor biopsy', 'tumor biopsy', {'metamap_preferred_name': 'Biopsy', 'metamap_cui': 'C0005558', 'metamap_score': '19.36', 'metamap_semtypes': '[diap]'}, None]
['metamap', 'intervention', 'tumor biopsy', 'tumor biopsy', {'metamap_preferred_name': 'biopsy characteristics', 'metamap_cui': 'C0220797', 'metamap_score': '3.59', 'metamap_semtypes': '[ftcn]'}, None]
['metamap', 'intervention', 'tumor biopsy', 'tumor biopsy', {'metamap_preferred_name': 'Specimen Source Codes - tumor', 'metamap_cui': 'C1578706', 'metamap_score': '3.45', 'metamap_semtypes': '[inpr]'}, None]


% interventions mapped:  84%|████████████████▊   | 16/19 [00:19<00:02,

['metamap', 'intervention', 'autologous hematopoietic stem cell transplantation', 'autologous hematopoietic stem cell transplantation', {'metamap_preferred_name': 'Transplantation of autologous hematopoietic stem cell', 'metamap_cui': 'C1831743', 'metamap_score': '3.85', 'metamap_semtypes': '[topp]'}, None]
['metamap', 'intervention', 'dog training education', 'dog training education', {'metamap_preferred_name': 'Canis familiaris', 'metamap_cui': 'C0012984', 'metamap_score': '28.67', 'metamap_semtypes': '[mamm]'}, None]
['metamap', 'intervention', 'dog training education', 'dog training education', {'metamap_preferred_name': 'education and training', 'metamap_cui': 'C0582584', 'metamap_score': '3.64', 'metamap_semtypes': '[edac]'}, None]
['metamap', 'intervention', 'dog training education', 'dog training education', {'metamap_preferred_name': 'Dog family', 'metamap_cui': 'C1280551', 'metamap_score': '3.44', 'metamap_semtypes': '[mamm]'}, None]
['metamap', 'intervention', 'intravenous i

% interventions mapped: 100%|████████████████████| 19/19 [00:22<00:00,
% intervention_alternates mapped:   5%|█                   | 1/20 [00:

['metamap', 'intervention_alternate', 'dexinoral', 'dexinoral', {'metamap_preferred_name': 'Dexinoral', 'metamap_cui': 'C1511856', 'metamap_score': '3.64', 'metamap_semtypes': '[orch,phsu]'}, None]
['metamap', 'intervention_alternate', 'rt', 'rt', {'metamap_preferred_name': 'Structure of right thigh', 'metamap_cui': 'C0230425', 'metamap_score': '3.64', 'metamap_semtypes': '[bpoc]'}, None]
['metamap', 'intervention_alternate', 'mk-3475', 'mk-3475', {'metamap_preferred_name': 'MK-3475', 'metamap_cui': 'C3660977', 'metamap_score': '3.72', 'metamap_semtypes': '[aapp,imft,phsu]'}, None]
['metamap', 'intervention_alternate', 'meticorten', 'meticorten', {'metamap_preferred_name': 'Meticorten', 'metamap_cui': 'C0728770', 'metamap_score': '22.57', 'metamap_semtypes': '[orch,phsu]'}, None]
['metamap', 'intervention_alternate', '1-ohp', '1-ohp', {'metamap_preferred_name': '2,3-diketo-5-methylthiopentyl-1-phosphate enolase activity', 'metamap_cui': 'C2248614', 'metamap_score': '3.52', 'metamap_sem

% intervention_alternates mapped:  35%|███████             | 7/20 [00:

['metamap', 'intervention_alternate', 'in vivo exposure', 'in vivo exposure', {'metamap_preferred_name': 'Exposure to', 'metamap_cui': 'C0332157', 'metamap_score': '3.57', 'metamap_semtypes': '[qlco]'}, None]
['metamap', 'intervention_alternate', 'in vivo exposure', 'in vivo exposure', {'metamap_preferred_name': 'in vivo', 'metamap_cui': 'C1515655', 'metamap_score': '3.48', 'metamap_semtypes': '[spco]'}, None]
['metamap', 'intervention_alternate', 'azacytidine', 'azacytidine', {'metamap_preferred_name': 'Azacitidine', 'metamap_cui': 'C0004475', 'metamap_score': '19.41', 'metamap_semtypes': '[nnon,phsu]'}, None]
['metamap', 'intervention_alternate', 'hsct', 'hsct', {'metamap_preferred_name': 'Hemopoietic stem cell transplant', 'metamap_cui': 'C0472699', 'metamap_score': '19.41', 'metamap_semtypes': '[topp]'}, None]
['metamap', 'intervention_alternate', 'hsct', 'hsct', {'metamap_preferred_name': 'Allogeneic Hematopoietic Stem Cell Transplantation', 'metamap_cui': 'C1705576', 'metamap_sco

% intervention_alternates mapped:  55%|███████████         | 11/20 [00

['metamap', 'intervention_alternate', 'audiological assessment', 'audiological assessment', {'metamap_preferred_name': 'Audiological evaluation', 'metamap_cui': 'C0200297', 'metamap_score': '3.72', 'metamap_semtypes': '[diap]'}, None]
['metamap', 'intervention_alternate', 'hy209 gel', 'hy209 gel', {'metamap_preferred_name': 'Gel', 'metamap_cui': 'C0017243', 'metamap_score': '13.05', 'metamap_semtypes': '[bodm]'}, None]
['metamap', 'intervention_alternate', 'hy209 gel', 'hy209 gel', {'metamap_preferred_name': 'Gel physical state', 'metamap_cui': 'C1382104', 'metamap_score': '3.59', 'metamap_semtypes': '[sbst]'}, None]
['metamap', 'intervention_alternate', 'chondroitin', 'chondroitin', {'metamap_preferred_name': 'Chondroitin', 'metamap_cui': 'C0008454', 'metamap_score': '13.10', 'metamap_semtypes': '[bacs,orch,phsu]'}, None]
['metamap', 'intervention_alternate', 'leucovorin', 'leucovorin', {'metamap_preferred_name': 'Leucovorin', 'metamap_cui': 'C0023413', 'metamap_score': '28.87', 'meta

% intervention_alternates mapped:  80%|████████████████    | 16/20 [00

['metamap', 'intervention_alternate', '2h-1,3,2-oxazaphosphorine, 2-[bis(2-chloroethyl)amino]tetrahydro-, 2-oxide, monohydrate', '2h-1,3,2-oxazaphosphorine, 2-[bis(2-chloroethyl)amino]tetrahydro-, 2-oxide, monohydrate', {'metamap_preferred_name': 'Oxides', 'metamap_cui': 'C0030015', 'metamap_score': '16.15', 'metamap_semtypes': '[inch]'}, None]
['metamap', 'intervention_alternate', '2h-1,3,2-oxazaphosphorine, 2-[bis(2-chloroethyl)amino]tetrahydro-, 2-oxide, monohydrate', '2h-1,3,2-oxazaphosphorine, 2-[bis(2-chloroethyl)amino]tetrahydro-, 2-oxide, monohydrate', {'metamap_preferred_name': '2 Hours', 'metamap_cui': 'C1292425', 'metamap_score': '3.53', 'metamap_semtypes': '[tmco]'}, None]
['metamap', 'intervention_alternate', '2h-1,3,2-oxazaphosphorine, 2-[bis(2-chloroethyl)amino]tetrahydro-, 2-oxide, monohydrate', '2h-1,3,2-oxazaphosphorine, 2-[bis(2-chloroethyl)amino]tetrahydro-, 2-oxide, monohydrate', {'metamap_preferred_name': 'BAG3 wt Allele', 'metamap_cui': 'C3811385', 'metamap_score

% intervention_alternates mapped: 100%|████████████████████| 20/20 [00


In [None]:

# Pipeline driver cell: runs each stage of the term-mapping workflow in order.
# None of the helpers called below are defined in this cell; `flag_and_path`
# (the return value of the first step) is handed to every later stage.
flag_and_path = get_raw_ct_data() # download raw data

# NOTE(review): a `global` statement at module/cell level has no effect in
# Python. Presumably it is here to signal that helper functions read
# `metamap_dirs` as a global -- confirm against those helpers before removing.
global metamap_dirs
metamap_dirs = check_os()
# NOTE: `subset_size` is not assigned in this cell; it must already exist in
# the notebook namespace before this cell is run.
df_dict = read_raw_ct_data(flag_and_path, subset_size) # read the clinical trial data
dict_new_terms = check_against_cache(df_dict, flag_and_path) # use the existing cache of MetaMapped terms so that only new terms are mapped

term_list_to_mm(dict_new_terms, flag_and_path) # map new terms using MetaMap

# The remaining stages take only `flag_and_path` and their return values are
# not captured here.
map_to_trial(flag_and_path) # map MetaMap terms back to trial 
score_mappings(flag_and_path) # score the mappings
auto_select_curies(flag_and_path) # select CURIEs automatically that pass score threshold

# Disabled step. NOTE(review): the trailing comment below is identical to the
# one on auto_select_curies above and looks copy-pasted -- confirm what
# compile_curies_for_trials actually does before re-enabling it.
# compile_curies_for_trials(flag_and_path) # select CURIEs automatically that pass score threshold

