### THIS SCRIPT USES MetaMap to try and map the bulk of terms, and Name Resolver to pick up what's left

In [1]:
# Display notebook cells (and their output areas) at the maximum available width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))

# Lets you print multiple outputs per cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
import re
import collections
import os
import json
import numpy as np
import pickle
from functools import reduce
import time
from time import sleep
# import concurrent
import concurrent.futures
import multiprocessing
import datetime as dt
from datetime import date
import pathlib
import configparser
import sys
import urllib
import zipfile
import csv
sys.path.insert(0, '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/pymetamap-master')
from pymetamap import MetaMap  # https://github.com/AnthonyMRios/pymetamap/blob/master/pymetamap/SubprocessBackend.py
from pandas import ExcelWriter
import ast
import glob
from tqdm import tqdm
import subprocess
import shlex
from collections import Counter

# %pip install thefuzz
# %pip install levenshtein
# %pip install xlsxwriter
# %pip install ratelimit

from thefuzz import fuzz # fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

In [3]:
def get_token_sort_ratio(str1, str2):
    """Return ``fuzz.token_sort_ratio(str1, str2)``, or ``None`` if scoring fails.

    Fuzzy matching explained:
    https://www.datacamp.com/tutorial/fuzzy-string-python

    Scoring is best-effort: any ordinary error raised by the scorer (for
    example because an argument is not a string) yields ``None`` instead of
    propagating.
    """
    try:
        return fuzz.token_sort_ratio(str1, str2)
    except Exception:
        # Deliberately broad, but unlike a bare ``except`` this does not trap
        # KeyboardInterrupt / SystemExit.
        return None

def get_token_set_ratio(str1, str2):
    """Return ``fuzz.token_set_ratio(str1, str2)``, or ``None`` if scoring fails.

    Fuzzy matching explained:
    https://www.datacamp.com/tutorial/fuzzy-string-python

    Scoring is best-effort: any ordinary error raised by the scorer (for
    example because an argument is not a string) yields ``None`` instead of
    propagating.
    """
    try:
        return fuzz.token_set_ratio(str1, str2)
    except Exception:
        # Deliberately broad, but unlike a bare ``except`` this does not trap
        # KeyboardInterrupt / SystemExit.
        return None

def get_similarity_score(str1, str2):
    """Return ``fuzz.ratio(str1, str2)``, or ``None`` if scoring fails.

    Fuzzy matching explained:
    https://www.datacamp.com/tutorial/fuzzy-string-python

    Scoring is best-effort: any ordinary error raised by the scorer (for
    example because an argument is not a string) yields ``None`` instead of
    propagating.
    """
    try:
        return fuzz.ratio(str1, str2)
    except Exception:
        # Deliberately broad, but unlike a bare ``except`` this does not trap
        # KeyboardInterrupt / SystemExit.
        return None
    
def convert_seconds_to_hms(seconds):
    """Split an elapsed time or runtime given in seconds into (hours, minutes, seconds)."""
    hours, leftover = divmod(seconds, 3600)
    minutes, leftover = divmod(leftover, 60)
    return hours, minutes, leftover

def de_ascii_er(text):
    """Return ``text`` with every non-ASCII character replaced by one space.

    Each character outside the range ``\\x00``-``\\x7F`` becomes a single
    space, so the result has the same length as the input.
    """
    return re.sub(r"[^\x00-\x7F]", ' ', text)

def start_metamap_servers(metamap_dirs):
    """Launch the two MetaMap helper servers and wait five seconds.

    The relative script paths are published as module globals because
    stop_metamap_servers reads them later.

    metamap_dirs: dict providing the key 'metamap_base_dir'.
    """
    global metamap_pos_server_dir
    global metamap_wsd_server_dir
    metamap_pos_server_dir = 'bin/skrmedpostctl' # Part of speech tagger
    metamap_wsd_server_dir = 'bin/wsdserverctl' # Word sense disambiguation

    base_dir = metamap_dirs['metamap_base_dir']
    start_commands = [
        [os.path.join(base_dir, server_script), 'start']
        for server_script in (metamap_pos_server_dir, metamap_wsd_server_dir)
    ]

    # Everything the server scripts print is sent to the null device.
    with open(os.devnull, "w") as sink:
        for command in start_commands:
            subprocess.call(command, stdout=sink, stderr=sink)
    sleep(5)

def stop_metamap_servers(metamap_dirs):
    """Stop the two MetaMap helper servers and wait two seconds.

    The relative script paths are normally published as module globals by
    start_metamap_servers. They fall back to the same default values here so
    that calling this function first (for example to clean up servers left
    running by an earlier session) no longer fails with NameError.

    metamap_dirs: dict providing the key 'metamap_base_dir'.
    """
    pos_script = globals().get("metamap_pos_server_dir", 'bin/skrmedpostctl')  # Part of speech tagger
    wsd_script = globals().get("metamap_wsd_server_dir", 'bin/wsdserverctl')  # Word sense disambiguation
    base_dir = metamap_dirs['metamap_base_dir']

    # Stop servers; their console output is discarded.
    for server_script in (pos_script, wsd_script):
        command = [os.path.join(base_dir, server_script), 'stop']
        subprocess.call(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    sleep(2)
    
def add_mappings_to_cache(flag_and_path):
    """Append the latest MetaMap output to the term cache, then de-duplicate the cache.

    flag_and_path: dict whose "date_string" entry (date of the bulk download
    of clinical trial data) prefixes the name of the MetaMap output file.
    """
    cache_path = "metamapped_terms_cache.tsv"
    relevant_date = flag_and_path["date_string"]

    with open(cache_path, 'a+', encoding="utf-8") as cache:
        with open(f"{relevant_date}_metamap_output.tsv", 'r', encoding="utf-8", errors='ignore') as new_metamapped_terms:
            # Skip the first (header) line, then copy every remaining line.
            next(new_metamapped_terms, None)
            for row in new_metamapped_terms:
                cache.write(row)

    # Remove duplicate rows from cache
    deduplicated = pd.read_csv(cache_path, sep='\t', index_col=False, header=0, on_bad_lines = 'warn').drop_duplicates()
    deduplicated.to_csv('metamapped_terms_cache.tsv', sep="\t", index=False, header=True) # output deduplicated cache terms to TSV

def _append_manual_review_to_cache(review_tag, modified_col, drop_cols, cache_path):
    """Append the manually selected rows of one review workbook to ``cache_path`` and de-duplicate it."""
    workbooks = [f for f in glob.glob("*.xlsx") if review_tag in f and not f.startswith("~")]
    reviewed = pd.read_excel(workbooks[0])  # IndexError when no workbook matches

    # Assign the forward-filled columns back explicitly: calling
    # ``ffill(inplace=True)`` on a column pulled out of the frame does not
    # update the frame when pandas Copy-on-Write is active.
    reviewed["name"] = reviewed["name"].ffill()
    reviewed[modified_col] = reviewed[modified_col].ffill()

    # Keep only the rows where a CURIE was manually selected.
    reviewed = reviewed[reviewed['manually_selected_CURIE'].notnull()]
    reviewed = reviewed.drop(columns=drop_cols)
    reviewed = reviewed.rename(columns={'name': 'original_clin_trial_term', modified_col: 'modified_clin_trial_term'})

    with open(cache_path, 'a') as output:
        # Write the header only when the cache file is still empty.
        reviewed.to_csv(output, mode='a', sep="\t", index=False, header=output.tell() == 0)

    # Remove duplicate rows from cache
    cache = pd.read_csv(cache_path, sep='\t', index_col=False, header=0, on_bad_lines='warn')
    cache = cache.drop_duplicates()
    cache.to_csv(cache_path, sep="\t", index=False, header=True)  # output deduplicated cache terms to TSV


def add_manually_selected_terms_to_cache():
    """Fold the manually reviewed condition and intervention mappings into their caches.

    Looks in the current directory for an ``.xlsx`` file whose name contains
    "conditions_manual_review", then one containing
    "interventions_manual_review" (names starting with "~" are skipped).
    Rows with a value in ``manually_selected_CURIE`` are appended to
    'conditions_manually_selected_cache.tsv' and
    'interventions_manually_selected_cache.tsv' respectively, and each cache
    is de-duplicated.

    This is best-effort: on the first failure a message is printed and the
    remaining work is skipped, so interventions are not processed when the
    conditions step fails.
    """
    # -----     ------     GENERATE MANUALLY SELECTED CACHE     -----     ------  #
    try:
        #  --- --- --   CONDITIONS     --- --- --   #
        _append_manual_review_to_cache(
            "conditions_manual_review",
            "orig_con",
            ["curie_info"],
            'conditions_manually_selected_cache.tsv',
        )
        #  --- --- --   INTERVENTIONS and Alternate INTERVENTIONS   --- --- --   #
        _append_manual_review_to_cache(
            "interventions_manual_review",
            "orig_int",
            ["curie_info", "description"],
            'interventions_manually_selected_cache.tsv',
        )
    except Exception as ex:
        print("No terms in manual select column; either column is empty or bug. Proceeding without them")
        print(f"Reason: {ex!r}")
        
def check_os():
    """Return the MetaMap base directory and binary path for the current platform.

    On Linux the base directory is the "metamap" folder next to the current
    working directory's parent and the binary is 'bin/metamap20'; on any
    other platform the local install path and 'bin/metamap18' are used.
    """
    if "linux" not in sys.platform:
        # for running on local
        return {
            "metamap_base_dir": '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/',
            "metamap_bin_dir": 'bin/metamap18',
        }

    print("Linux platform detected")
    parent_dir = pathlib.Path.cwd().parents[0]
    return {
        "metamap_base_dir": f"{parent_dir}/metamap/",
        "metamap_bin_dir": 'bin/metamap20',
    }


In [5]:
def get_raw_ct_data():
    """Locate, and if needed download and unzip, the latest AACT pipe-delimited export.

    Steps:
      1. Scrape the AACT download page for the most recent export link.
      2. If that export's zip is not already in ./data, download and unzip it.
      3. If the scrape or the download fails, ask the user to place the zip
         in ./data manually and unzip that instead.

    Sets the module globals ``data_dir`` and ``data_extracted``.

    Returns a dict with:
      "term_program_flag": True when the latest zip was already present
          (the KG is assumed up to date), otherwise False.
      "data_extracted_path": directory holding the unzipped export, or None
          when no export could be located.
      "date_string": "mm_dd_YYYY" label of the export, or None when unknown.
    """
    global data_dir
    global data_extracted

    term_program_flag = True
    # Bind every name the later steps read up front, so that a failed scrape
    # falls through to the manual-download path instead of raising NameError.
    data_dir = "{}/data".format(pathlib.Path.cwd())
    data_extracted = None
    date_string = None
    data_path = None
    url = None

    try:
        # get all the links and associated dates of upload into a dict called date_link
        url_all = "https://aact.ctti-clinicaltrials.org/download"
        response = requests.get(url_all)
        soup = BeautifulSoup(response.text, features="lxml")
        date_link = {}
        for option in soup.find_all('option'):
            anchor = option.find('a')
            try:
                zip_name = anchor.contents[0].split()[0]
                upload_date = dt.datetime.strptime(zip_name.split("_")[0], '%Y%m%d').date()
                date_link[upload_date] = anchor.get('href')
            except (AttributeError, IndexError, TypeError, ValueError):
                # Option without a link, or a name that does not start with a date: skip it.
                continue
        latest_file_date = max(date_link.keys())   # get the date of the latest upload
        url = date_link[latest_file_date]   # get the corresponding download link of the latest upload so we can download the raw data
        date_string = latest_file_date.strftime("%m_%d_%Y")
        data_extracted = data_dir + "/{}_extracted".format(date_string)
        data_path = "{}/{}_pipe-delimited-export.zip".format(data_dir, date_string)
    except Exception as ex:
        print("continue")
        print("Could not determine the latest AACT export: {!r}".format(ex))

    if data_path is None or not os.path.exists(data_path):   # if the most recent zip doesn't exist, download and extract it into data folder

        term_program_flag = False   # flag below for terminating program if latest download exists (KG is assumed up to date)
        try:
            if url is None:
                raise RuntimeError("no AACT download link available")
            print("Attempting download of Clinical Trial data as of {}\n".format(date_string))
            response = requests.get(url)
            if response.status_code != 200:
                # Previously ignored silently; now it triggers the manual fallback.
                raise RuntimeError("unexpected HTTP status {}".format(response.status_code))
            os.makedirs(data_dir, exist_ok=True)
            with open(data_path, 'wb') as file:
                file.write(response.content)
            print("Finished download of zip")
            with zipfile.ZipFile(data_path, 'r') as download:
                print("Unzipping data")
                download.extractall(data_extracted)
        except Exception:
            print("Failed to scrape AACT for download. Please navigate to https://aact.ctti-clinicaltrials.org/download and manually download zip file.")
            print("Please store the downloaded zip in the /data directory. This should be the only item besides the cache file, condition manual review file, and intervention manual review file, in the directory at this time.")
            done = input("Type Done when done: ")
            if done == "Done":
                try:
                    pattern = os.path.join(data_dir, "*.zip")
                    zip_file = glob.glob(pattern)[0]  # look for file in directory that ends in ".zip"
                    print("File found at: ")
                    print(zip_file)
                    print("Please make sure this the correct zip file from AACT")
                    if data_extracted is None or not os.path.exists(data_extracted):   # if folder of unzipped data does not exist, unzip
                        try:
                            with zipfile.ZipFile(zip_file, 'r') as download:
                                print("Unzipping data into")
                                # The manual zip is labelled with its own creation date.
                                cttime = os.path.getctime(zip_file)
                                date_string = dt.datetime.fromtimestamp(cttime).strftime('%m_%d_%Y')
                                data_extracted = data_dir + "/{}_extracted".format(date_string)
                                print(data_extracted)
                                download.extractall(data_extracted)
                        except Exception:
                            pattern = os.path.join(data_dir, "*_extracted")
                            extracted_file = glob.glob(pattern) # look for file in directory that ends in "_extracted"
                            data_extracted = extracted_file[0]
                            extracted_name = os.path.basename(os.path.normpath(extracted_file[0]))
                            date_string = extracted_name.replace('_extracted', '')
                            print("Assuming data is already unzipped")
                except Exception:
                    print("Unable to download and extract Clincal Trial data.")
                    print("Cannot find pipe-delimited zip in /data folder.")
    else:
        print("KG is already up to date.")

    return {"term_program_flag": term_program_flag, "data_extracted_path": data_extracted, "date_string": date_string}


In [6]:
def read_raw_ct_data(flag_and_path, subset_size):
    """Load the AACT conditions / interventions / alternate-name tables.

    flag_and_path: dict with "term_program_flag" and, when that flag is
        falsy, "data_extracted_path" (directory of the unzipped export).
    subset_size: when truthy, each table is reduced to a random sample of
        at most this many rows (for trial runs on a small subset).

    Returns a dict of DataFrames keyed "conditions", "interventions" and
    "interventions_alts".

    Raises SystemExit when "term_program_flag" is truthy, because the KG is
    assumed to be built from the most recent export already.
    """
    if flag_and_path["term_program_flag"]:
        print("Exiting program. Assuming KG has already been constructed from most recent data dump from AACT.")
        # sys.exit() rather than the site-module helper exit(), which is meant
        # for interactive sessions and may not be present.
        sys.exit()

    data_extracted = flag_and_path["data_extracted_path"]
    table_files = {
        "conditions": '/conditions.txt',
        "interventions": '/interventions.txt',
        "interventions_alts": '/intervention_other_names.txt',
    }

    # read in pipe-delimited files
    df_dict = {
        key: pd.read_csv(data_extracted + file_name, sep='|', index_col=False, header=0, on_bad_lines='warn')
        for key, file_name in table_files.items()
    }

    if subset_size:   # if a subset size is given, we are running this script on a small subset of the dataset
        for key, frame in df_dict.items():
            # Cap the sample at the table size; sample() raises if n exceeds it.
            df_dict[key] = frame.sample(n=min(subset_size, len(frame)))

    return df_dict


# Check against cache, retrieve terms not already mapped

In [7]:
def _unique_lowercase_terms(values):
    """Return the distinct lower-cased string forms of ``values`` in first-seen order."""
    return list(dict.fromkeys(str(value).lower() for value in values))


def check_against_cache(df_dict):
    """Return the clinical-trial terms that are not yet present in the mapping cache.

    df_dict: dict of DataFrames keyed "conditions", "interventions" and
        "interventions_alts", each with a ``name`` column.

    Terms are compared case-insensitively against the ``clintrial_term``
    column of "mapping_cache.tsv", restricted to the matching ``term_type``
    ("condition", "intervention" or "intervention_alternate"). Missing
    values in the cache are ignored, so one blank cache row no longer
    discards the whole cache.

    Returns a dict with the same three keys, each a list of lower-cased
    terms that still need mapping. When the cache is missing or unreadable,
    every term is returned.
    """
    term_lists = {
        key: _unique_lowercase_terms(df_dict[key]['name'].unique())
        for key in ("conditions", "interventions", "interventions_alts")
    }
    cache_term_types = {
        "conditions": "condition",
        "interventions": "intervention",
        "interventions_alts": "intervention_alternate",
    }

    try:
        cache_df = pd.read_csv("mapping_cache.tsv", sep="\t", index_col=False, header=0, on_bad_lines='warn')

        dict_new_terms = {}
        for key, terms in term_lists.items():
            cached_column = cache_df.loc[cache_df["term_type"] == cache_term_types[key], 'clintrial_term']
            # A set gives O(1) membership tests; str() tolerates numeric-looking terms.
            cached_terms = {str(term).lower() for term in cached_column.dropna().unique()}
            # find terms not in the cache (i.e. new terms to map); empty strings are dropped
            dict_new_terms[key] = [term for term in terms if term and term not in cached_terms]
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print("No cache of terms found. Proceeding to map entire KG from scratch")
        dict_new_terms = term_lists
    except Exception as ex:
        # Still best-effort, but report a broken cache as such instead of
        # claiming that no cache exists.
        print(f"Cache of terms could not be used ({ex!r}). Proceeding to map entire KG from scratch")
        dict_new_terms = term_lists

    return dict_new_terms


# Map new terms using Mapper

In [17]:
def get_nr_response(orig_term):
    """Look up ``orig_term`` with Name Resolver and return the decoded JSON reply.

    The term is sent as-is: unlike MetaMap, Name Resolver needs no
    lower-casing or de-ASCII-ing of the input.

    Returns the parsed JSON body on HTTP 200. Returns ``None`` when the
    service answers with any other status (no retry), or when the request
    keeps failing with a connection-level error after the retry budget is
    used up.
    """
    nr_url = 'https://name-resolution-sri.renci.org/lookup'
    max_retries = 3
    params = {'string': orig_term, 'limit': 1}  # limit -1 makes this return all available equivalent CURIEs name resolver can give (deprecated)

    # The context manager closes the session's connections on every exit path.
    with requests.Session() as sess:
        retries = 0
        while retries <= max_retries:
            try:
                r = sess.post(nr_url, params=params)
                if r.status_code == 200:
                    return r.json()  # process Name Resolver response
                return None
            except (requests.RequestException, ConnectionResetError, OSError) as ex:
                print(f"\nName Resolver request failed for term: {orig_term}. Error: {ex}")
                retries += 1
                if retries < max_retries:
                    print(f"Retrying ({retries}/{max_retries}) after a delay.")
                    time.sleep(2 ** retries)  # Increase the delay between retries exponentially
                else:
                    print(f"Max retries (Name Resolver) reached for term: {orig_term}.")
                    return None
    return None
    

In [16]:
# # nr_response = get_nr_response(orig_term)
# # ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response', 'score']
# for i in ["cordiceps", "chocolate", "diabetes mellitus", "blipadbloo", "catheter", "humira®"]:
#     nr_result = get_nr_response(i)
#     if nr_result:
#         nr_result
#         nr_curie = nr_result[0]["curie"]
#         nr_name = nr_result[0]["label"]
#         nr_type = nr_result[0]["types"][0]
#         nr_score = nr_result[0]["score"]
#         new_concept_dict = {"nameresolver_preferred_name": nr_name,
#                              "nameresolver_cui": nr_curie,
#                              "nameresolver_score": nr_score,
#                              "nameresolver_semtypes": nr_type}
#     else:
#         print("nothing returned from NR")

In [44]:
# I only keep 1 concept from Name Resolver.
# Both MetaMap and Name Resolver return several candidates,
# but I only take the first one from Name Resolver because it is their preferred concept.
# MetaMap's 2nd or 3rd result is often the best one, so I collect all of them and try to score them.

def process_metamap_concept(concept):
    """Reduce a MetaMap concept to the four ``mapped_*`` fields used downstream.

    ``concept`` must provide ``_asdict()``; any field it lacks is reported
    as ``None``.
    """
    fields = concept._asdict()
    output_to_source = (
        ("mapped_name", "preferred_name"),
        ("mapped_curie", "cui"),
        ("mapped_score", "score"),
        ("mapped_semtypes", "semtypes"),
    )
    return {out_key: fields.get(src_key) for out_key, src_key in output_to_source}

def process_nameresolver_response(nr_response):
    """Build the ``mapped_*`` dict from the first entry of a Name Resolver reply.

    Only ``nr_response[0]`` is read, and of its "types" list only the first
    element is kept.
    """
    best_hit = nr_response[0]
    curie, label, first_type, score = (
        best_hit["curie"],
        best_hit["label"],
        best_hit["types"][0],
        best_hit["score"],
    )
    return {
        "mapped_name": label,
        "mapped_curie": curie,
        "mapped_score": score,
        "mapped_semtypes": first_type,
    }

In [42]:
def run_mappers(term_pair, params, mm, term_type, csv_writer):
    """Map one term with MetaMap, fall back to Name Resolver, and write the result rows.

    term_pair: ``(orig_term, input_term)``; ``input_term`` is the form
        submitted to MetaMap.
    params: MetaMap options. Needs "term_processing", "ignore_word_order"
        and "strict_model", plus "restrict_to_sts" (used for Conditions) or
        "exclude_sts" (used for Interventions).
    mm: object exposing ``extract_concepts``.
    term_type: label written to the ``term_type`` column.
    csv_writer: object exposing ``writerow``.

    Format of output TSV: header = ['mapping_tool', 'term_type',
    'clintrial_term', 'input_term', 'mapping_tool_response', 'score'].
    The score column is always written as None here (not scored yet).
    Every MetaMap concept produces one row. If MetaMap returns nothing, the
    Name Resolver hit produces one row. If both tools return nothing, or an
    error occurs, one "mapping_tools_failed" row is written.
    """
    orig_term = term_pair[0]
    input_term = term_pair[1]
    from_mapper = []

    # exclude_sts is used for Interventions, restrict_to_sts for Conditions;
    # that semantic-type option is the only difference between the two cases.
    sts_option = "restrict_to_sts" if params.get("exclude_sts") is None else "exclude_sts"

    try:
        concepts, error = mm.extract_concepts(
            [input_term],
            term_processing=params["term_processing"],
            ignore_word_order=params["ignore_word_order"],
            strict_model=params["strict_model"],
            **{sts_option: params[sts_option]},
        )

        if concepts:   # if MetaMap gives response, process response
            for concept in concepts:
                new_concept_dict = process_metamap_concept(concept)
                from_mapper.append(["metamap", term_type, orig_term, input_term, new_concept_dict, None])
        else:   # if MetaMap fails, try using Name Resolver and process response
            nr_response = get_nr_response(orig_term)
            if nr_response:
                input_term = orig_term  # no preprocessing (lowercasing or deascii-ing) necessary to submit terms to Name Resolver (unlike MetaMap)
                new_concept_dict = process_nameresolver_response(nr_response)
                from_mapper.append(["nameresolver", term_type, orig_term, input_term, new_concept_dict, None])
            else:
                print("Nothing returned from NR or Metamap")
                from_mapper.append(["mapping_tools_failed", term_type, orig_term, input_term, None, None])
    except Exception as ex:
        # Best-effort: one bad term must not abort the whole run, but report
        # the real cause instead of claiming nothing was returned.
        print(f"Mapping failed for term {orig_term!r}: {ex!r}")
        from_mapper.append(["mapping_tools_failed", term_type, orig_term, input_term, None, None])

    for result in from_mapper:
        csv_writer.writerow(result)

In [35]:
def parallelize_mappers(term_pair_list, params, term_type, csv_writer):
    """Run run_mappers over every term pair on a thread pool, with a progress bar.

    The MetaMap servers are started first and are always stopped again,
    even when mapping is interrupted or raises.

    term_pair_list: sequence of term pairs handed one by one to run_mappers.
    params, term_type, csv_writer: passed through to run_mappers unchanged.
    """
    LENGTH = len(term_pair_list)  # Number of iterations required to fill progress bar (pbar)
    worker_count = (multiprocessing.cpu_count() * 2) - 1

    start_metamap_servers(metamap_dirs)  # start the MetaMap servers
    try:
        mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
        # NOTE(review): all workers write through the same csv_writer; confirm
        # that concurrent writerow calls are acceptable for the output file.
        with tqdm(total=LENGTH, desc="% {}s mapped".format(term_type), position=0, leave=True,
                  mininterval=LENGTH/20, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}') as pbar, \
                concurrent.futures.ThreadPoolExecutor(worker_count) as executor:
            futures = [executor.submit(run_mappers, term_pair, params, mm, term_type, csv_writer)
                       for term_pair in term_pair_list]
            for future in concurrent.futures.as_completed(futures):
                worker_error = future.exception()
                if worker_error is not None:
                    # Report worker failures instead of dropping them silently.
                    print(f"Mapping task failed: {worker_error!r}")
                pbar.update(n=1)  # Increments counter
    finally:
        stop_metamap_servers(metamap_dirs)  # stop the MetaMap servers
    

In [36]:
def term_list_to_mappers(dict_new_terms):
    """Map the new condition / intervention terms and append the results to the mapping cache.

    For each of the three term groups, the terms are paired with the text that
    is actually submitted to the mappers (the original term for MetaMap >= 2020,
    the de-ASCII-ed term otherwise) and handed to parallelize_mappers. Results
    are appended to "mapping_cache.tsv", which is created with a header row if
    it does not exist yet. Afterwards the cache is re-read, de-duplicated and
    written back.

    Parameters
    ----------
    dict_new_terms : dict
        Terms that still need mapping, under the keys "conditions",
        "interventions" and "interventions_alts". A group that is missing or
        empty is skipped.

    Returns
    -------
    None
    """
    metamap_version = [int(s) for s in re.findall(r'\d+', metamap_dirs.get('metamap_bin_dir'))] # get MetaMap version being run
    use_original_terms = metamap_version[0] >= 20
    deasciier = np.vectorize(de_ascii_er) # vectorize function

    mapping_filename = "mapping_cache.tsv"
    col_names = ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response', 'score']

    # The same semantic-type list is used two ways: conditions are RESTRICTED to these
    # types, interventions EXCLUDE them.
    semantic_types = ['acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,clna,fndg']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")

    # strict_model and relaxed_model are presumably opposites? relaxed_model = True is what I want, but that option
    # appears to be broken in Pymetamap (returns no results when used). Using strict_model = False instead...
    condition_params = {"restrict_to_sts":semantic_types, "term_processing":True, "ignore_word_order":True, "strict_model":False}
    intervention_params = {"exclude_sts":semantic_types, "term_processing":True, "ignore_word_order":True, "strict_model":False}
    # alternate interventions use the same options as interventions; they previously had no
    # params variable of their own, which raised NameError on the MetaMap >= 2020 branch
    intervention_alts_params = dict(intervention_params)

    # (term_type label written to the cache, terms to map, MetaMap params)
    # the label for alternates is "intervention_alternate" for every MetaMap version so
    # the term_type column of the cache stays consistent
    term_groups = [
        ("condition", dict_new_terms.get("conditions"), condition_params),
        ("intervention", dict_new_terms.get("interventions"), intervention_params),
        ("intervention_alternate", dict_new_terms.get("interventions_alts"), intervention_alts_params),
    ]

    if use_original_terms:
        print("MetaMap version >= 2020, conduct mapping on original terms")
    else:
        print("MetaMap version < 2020, conduct mapping on terms after removing ascii characters")

    # open mapping cache to add mapped terms; append mode creates the file when it is
    # missing, in which case the header row has to be written first
    write_header = not os.path.exists(mapping_filename)
    with open(mapping_filename, 'a', newline='') as output:
        csv_writer = csv.writer(output, delimiter='\t')
        if write_header:
            csv_writer.writerow(col_names)

        for term_type, terms, params in term_groups:
            # nothing new to map for this group: skip it (np.vectorize also refuses
            # size-0 input unless otypes is set, so this avoids a crash on the < 2020 branch)
            if terms is None or len(terms) == 0:
                print("No new {} terms to map".format(term_type))
                continue
            input_terms = terms if use_original_terms else deasciier(terms)
            parallelize_mappers(list(zip(terms, input_terms)), params, term_type, csv_writer)

    # Remove duplicate rows from cache
    cache = pd.read_csv(mapping_filename, sep='\t', index_col=False, header=0, on_bad_lines = 'warn')
    cache = cache.drop_duplicates()
    cache.to_csv(mapping_filename, sep="\t", index=False, header=True) # output deduplicated cache terms to TSV


In [None]:
# # def score_metamap_mappings():


# header = True
# with pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, on_bad_lines = 'warn', chunksize=1000) as reader:
#     for chunk in reader:
#         chunk["scored"] = np.where(~chunk["score"].isnull(), chunk["score"],
#                                    np.where((chunk.score.isnull())&(chunk.mapping_tool == "metamap"),
#         df['d'] = np.where(df.a.isnull(),
#          np.nan,
#          np.where((df.b == "N")&(~df.c.isnull()),
#                   df.a*df.c,
#                   df.a))
        
        
#         for i, row in chunk.iterrows():
#             print(type(row["score"]))
#             if not row["score"].isnull():
#                 print(i)
# #                 break
#             elif pd.isnull(row["score"]) and row["mapping_tool"] == "metamap":
#                 mm_dict = ast.literal_eval(row["mapping_tool_response"])
#                 mapped_term = mm_dict['metamap_preferred_name']
#                 sort_ratio_score = get_token_sort_ratio(row["clintrial_term"], mapped_term)
#                 similarity_score = get_similarity_score(row["clintrial_term"], mapped_term)
#                 max_score = max(sort_ratio_score, similarity_score)
#                 chunk.at[i, "score"] = max_score
#             elif pd.isnull(row["score"]) and row["mapping_tool"] == "nameresolver":
#                 break
            
#             chunk.to_csv("mapping_cache_scored.tsv", header=header, sep="\t", index=False, mode='a+')
#             header = False

#       header = True
# for chunk in chunks:

#     chunk.to_csv(os.path.join(folder, new_folder, "new_file_" + filename),
#         header=header, cols=[['TIME','STUFF']], mode='a')

#     header = False          
                
            
# original_clintrial_term = row["clintrial_term"]


In [46]:
# Driver cell: fetch the raw data, read a subset of it, drop terms that are already
# in the mapping cache, and send the remaining terms through the mappers.
flag_and_path = get_raw_ct_data() # download raw data
# metamap_dirs is read as a module-level global by parallelize_mappers and
# term_list_to_mappers. A plain assignment at module scope already creates that
# global; a `global` statement here would be a no-op.
metamap_dirs = check_os()
subset_size = 30  # passed to read_raw_ct_data; presumably the number of rows sampled -- confirm
df_dict = read_raw_ct_data(flag_and_path, subset_size) # read the clinical trial data
dict_new_terms = check_against_cache(df_dict) # use the existing cache of MetaMapped terms so that only new terms are mapped
term_list_to_mappers(dict_new_terms)

Attempting download of Clinical Trial data as of 02_21_2024

Failed to scrape AACT for download. Please navigate to https://aact.ctti-clinicaltrials.org/download and manually download zip file.
Please store the downloaded zip in the /data directory. This should be the only item besides the cache file, condition manual review file, and intervention manual review file, in the directory at this time.


Type Done when done:  Done


File found at: 
/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/8vstm2enpo0ocbo2z7oypqurhgmz.zip
Please make sure this the correct zip file from AACT
Unzipping data into
/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/02_20_2024_extracted
MetaMap version < 2020, conduct mapping on terms after removing ascii characters


% conditions mapped:   3%|▋                   | 1/29 [00:13<06:29, 13.

['metamap', 'condition', 'poliomyelitis', 'poliomyelitis', {'mapped_name': 'Poliomyelitis', 'mapped_curie': 'C0032371', 'mapped_score': '16.26', 'mapped_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'gastroparesis', 'gastroparesis', {'mapped_name': 'Gastroparesis', 'mapped_curie': 'C0152020', 'mapped_score': '16.26', 'mapped_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'gastroparesis', 'gastroparesis', {'mapped_name': 'Gastroparesis, CTCAE', 'mapped_curie': 'C4552807', 'mapped_score': '3.64', 'mapped_semtypes': '[fndg]'}, None]
['metamap', 'condition', 'sars-cov2', 'sars-cov2', {'mapped_name': 'Severe Acute Respiratory Syndrome', 'mapped_curie': 'C1175175', 'mapped_score': '19.22', 'mapped_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'leukemia', 'leukemia', {'mapped_name': 'leukemia', 'mapped_curie': 'C0023418', 'mapped_score': '9.95', 'mapped_semtypes': '[neop]'}, None]
['metamap', 'condition', 'leukemia', 'leukemia', {'mapped_name': 'Childhood Leukemia', 'mappe

% conditions mapped:  24%|████▊               | 7/29 [00:15<00:38,  1.

['metamap', 'condition', 'autistic disorder of childhood onset with full syndrome', 'autistic disorder of childhood onset with full syndrome', {'mapped_name': 'Autistic Disorder', 'mapped_curie': 'C0004352', 'mapped_score': '16.28', 'mapped_semtypes': '[mobd]'}, None]
['metamap', 'condition', 'autistic disorder of childhood onset with full syndrome', 'autistic disorder of childhood onset with full syndrome', {'mapped_name': 'Syndrome', 'mapped_curie': 'C0039082', 'mapped_score': '12.88', 'mapped_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'irritable bowel syndrome', 'irritable bowel syndrome', {'mapped_name': 'Irritable Bowel Syndrome', 'mapped_curie': 'C0022104', 'mapped_score': '19.54', 'mapped_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'atrial fibrillation', 'atrial fibrillation', {'mapped_name': 'Atrial Fibrillation', 'mapped_curie': 'C0004238', 'mapped_score': '13.18', 'mapped_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'atrial fibrillation', 'atrial fi

% conditions mapped:  41%|████████▎           | 12/29 [00:22<00:25,  1

['metamap', 'condition', 'atopic dermatitis', 'atopic dermatitis', {'mapped_name': 'Dermatitis, Atopic', 'mapped_curie': 'C0011615', 'mapped_score': '13.18', 'mapped_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'atopic dermatitis', 'atopic dermatitis', {'mapped_name': 'Eczema', 'mapped_curie': 'C0013595', 'mapped_score': '13.18', 'mapped_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'primary dysmenorrhea', 'primary dysmenorrhea', {'mapped_name': 'Primary dysmenorrhea', 'mapped_curie': 'C0149875', 'mapped_score': '3.72', 'mapped_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'healthy diet', 'healthy diet', {'mapped_name': 'Healthy Diet', 'mapped_curie': 'C0452415', 'mapped_score': '16.33', 'mapped_semtypes': '[fndg]'}, None]


% conditions mapped:  52%|██████████▎         | 15/29 [00:26<00:20,  1

['metamap', 'condition', 'respiratory failure', 'respiratory failure', {'mapped_name': 'Respiratory Failure', 'mapped_curie': 'C1145670', 'mapped_score': '10.02', 'mapped_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'respiratory failure', 'respiratory failure', {'mapped_name': 'Respiratory Failure, CTCAE', 'mapped_curie': 'C4552651', 'mapped_score': '3.72', 'mapped_semtypes': '[fndg]'}, None]
['metamap', 'condition', 'cchs with neuroblastoma', 'cchs with neuroblastoma', {'mapped_name': 'Neuroblastoma', 'mapped_curie': 'C0027819', 'mapped_score': '25.52', 'mapped_semtypes': '[neop]'}, None]
['metamap', 'condition', 'cchs with neuroblastoma', 'cchs with neuroblastoma', {'mapped_name': 'Congenital central hypoventilation', 'mapped_curie': 'C1275808', 'mapped_score': '13.03', 'mapped_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'cchs with neuroblastoma', 'cchs with neuroblastoma', {'mapped_name': 'Central neuroblastoma', 'mapped_curie': 'C0700095', 'mapped_score': '3.44', '

% conditions mapped:  59%|███████████▋        | 17/29 [00:28<00:15,  1

['metamap', 'condition', 'neuroblastoma', 'neuroblastoma', {'mapped_name': 'Neuroblastoma', 'mapped_curie': 'C0027819', 'mapped_score': '25.72', 'mapped_semtypes': '[neop]'}, None]
['metamap', 'condition', 'neuroblastoma', 'neuroblastoma', {'mapped_name': 'Central neuroblastoma', 'mapped_curie': 'C0700095', 'mapped_score': '3.64', 'mapped_semtypes': '[neop]'}, None]
['metamap', 'condition', 'neuroblastoma', 'neuroblastoma', {'mapped_name': 'Childhood Neuroblastoma', 'mapped_curie': 'C4086165', 'mapped_score': '3.64', 'mapped_semtypes': '[neop]'}, None]
['metamap', 'condition', 'back pain', 'back pain', {'mapped_name': 'Back Pain, CTCAE 3.0', 'mapped_curie': 'C1963071', 'mapped_score': '3.72', 'mapped_semtypes': '[fndg]'}, None]
['metamap', 'condition', 'back pain', 'back pain', {'mapped_name': 'Back Pain, CTCAE 5.0', 'mapped_curie': 'C4553945', 'mapped_score': '3.72', 'mapped_semtypes': '[fndg]'}, None]
['nameresolver', 'condition', 'hiv prevention', 'hiv prevention', {'mapped_name': '

% conditions mapped:  69%|█████████████▊      | 20/29 [00:30<00:09,  1

['metamap', 'condition', 'chronic neck pain', 'chronic neck pain', {'mapped_name': 'Chronic neck pain', 'mapped_curie': 'C0746815', 'mapped_score': '3.77', 'mapped_semtypes': '[fndg]'}, None]
['metamap', 'condition', 'genetic diseases, x-linked', 'genetic diseases, x-linked', {'mapped_name': 'Genetic Diseases, X-Linked', 'mapped_curie': 'C1138434', 'mapped_score': '10.04', 'mapped_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'preoperative radiologic findings', 'preoperative radiologic findings', {'mapped_name': 'Radiologic finding', 'mapped_curie': 'C1290916', 'mapped_score': '3.62', 'mapped_semtypes': '[fndg]'}, None]


% conditions mapped:  79%|███████████████▊    | 23/29 [00:33<00:06,  1

['metamap', 'condition', 'cholelithiasis', 'cholelithiasis', {'mapped_name': 'Cholelithiasis', 'mapped_curie': 'C0008350', 'mapped_score': '9.95', 'mapped_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'stem cell transplant complications', 'stem cell transplant complications', {'mapped_name': 'Complications of stem cell transplant', 'mapped_curie': 'C3251587', 'mapped_score': '3.81', 'mapped_semtypes': '[patf]'}, None]


% conditions mapped:  86%|█████████████████▏  | 25/29 [00:35<00:04,  1

['metamap', 'condition', 'deep vein thrombosis (dvt)', 'deep vein thrombosis (dvt)', {'mapped_name': None, 'mapped_curie': None, 'mapped_score': None, 'mapped_semtypes': None}, None]
['metamap', 'condition', 'deep vein thrombosis (dvt)', 'deep vein thrombosis (dvt)', {'mapped_name': 'Deep Vein Thrombosis', 'mapped_curie': 'C0149871', 'mapped_score': '16.39', 'mapped_semtypes': '[dsyn]'}, None]
['nameresolver', 'condition', 'allergic rhinoconjunctivitis', 'allergic rhinoconjunctivitis', {'mapped_name': 'Allergic', 'mapped_curie': 'UMLS:C0700624', 'mapped_score': 100.71954, 'mapped_semtypes': 'biolink:InformationContentEntity'}, None]
['metamap', 'condition', 'sezary syndrome', 'sezary syndrome', {'mapped_name': 'Sezary Syndrome', 'mapped_curie': 'C0036920', 'mapped_score': '25.80', 'mapped_semtypes': '[neop]'}, None]
['nameresolver', 'condition', 'feeding behavior', 'feeding behavior', {'mapped_name': 'feeding behavior', 'mapped_curie': 'GO:0007631', 'mapped_score': 99.11896, 'mapped_se

% conditions mapped: 100%|████████████████████| 29/29 [00:37<00:00,  1

['metamap', 'condition', 'mild cognitive impairment', 'mild cognitive impairment', {'mapped_name': 'Mild cognitive disorder', 'mapped_curie': 'C1270972', 'mapped_score': '3.77', 'mapped_semtypes': '[mobd]'}, None]


% conditions mapped: 100%|████████████████████| 29/29 [00:39<00:00,  1
% interventions mapped:   3%|▋                   | 1/29 [00:11<05:08, 

Nothing returned from NR or Metamap
['mapping_tools_failed', 'intervention', 'eyepeace', 'eyepeace', None, None]
['metamap', 'intervention', 'dexrazoxane hydrochloride', 'dexrazoxane hydrochloride', {'mapped_name': 'Dexrazoxane hydrochloride', 'mapped_curie': 'C0982118', 'mapped_score': '3.72', 'mapped_semtypes': '[orch,phsu]'}, None]


% interventions mapped:  10%|██                  | 3/29 [00:12<01:30, 

['metamap', 'intervention', 'passive recovery', 'passive recovery', {'mapped_name': 'Recovery - action', 'mapped_curie': 'C0237820', 'mapped_score': '3.59', 'mapped_semtypes': '[acty]'}, None]
['metamap', 'intervention', 'passive recovery', 'passive recovery', {'mapped_name': 'Recovery - healing process', 'mapped_curie': 'C2004454', 'mapped_score': '3.59', 'mapped_semtypes': '[orgf]'}, None]
['metamap', 'intervention', 'passive recovery', 'passive recovery', {'mapped_name': 'recovery - adjustment', 'mapped_curie': 'C1555688', 'mapped_score': '3.59', 'mapped_semtypes': '[qnco]'}, None]
['metamap', 'intervention', 'passive recovery', 'passive recovery', {'mapped_name': 'Passive', 'mapped_curie': 'C3686820', 'mapped_score': '3.45', 'mapped_semtypes': '[qlco]'}, None]
['metamap', 'intervention', 'clinical scales', 'clinical scales', {'mapped_name': 'Integumentary scale', 'mapped_curie': 'C0222045', 'mapped_score': '3.59', 'mapped_semtypes': '[bpoc]'}, None]
['metamap', 'intervention', 'cli

% interventions mapped:  21%|████▏               | 6/29 [00:14<00:41, 

['metamap', 'intervention', 'orthokeratology with aspheric wide inversion zone design in the optical zone', 'orthokeratology with aspheric wide inversion zone design in the optical zone', {'mapped_name': 'Vision', 'mapped_curie': 'C0042789', 'mapped_score': '16.00', 'mapped_semtypes': '[orgf]'}, None]
['metamap', 'intervention', 'orthokeratology with aspheric wide inversion zone design in the optical zone', 'orthokeratology with aspheric wide inversion zone design in the optical zone', {'mapped_name': 'Orthokeratology', 'mapped_curie': 'C1533088', 'mapped_score': '3.54', 'mapped_semtypes': '[topp]'}, None]
['metamap', 'intervention', 'orthokeratology with aspheric wide inversion zone design in the optical zone', 'orthokeratology with aspheric wide inversion zone design in the optical zone', {'mapped_name': 'CDISC SEND Study Design Terminology', 'mapped_curie': 'C2983265', 'mapped_score': '3.42', 'mapped_semtypes': '[inpr]'}, None]
['metamap', 'intervention', 'orthokeratology with asphe

% interventions mapped:  28%|█████▌              | 8/29 [00:18<00:36, 

['metamap', 'intervention', 'comparative study between transdermal fentanyl and melatonin patches on postoperative pain relief after lumber laminectomy', 'comparative study between transdermal fentanyl and melatonin patches on postoperative pain relief after lumber laminectomy', {'mapped_name': 'Melatonin', 'mapped_curie': 'C0025219', 'mapped_score': '19.19', 'mapped_semtypes': '[horm,orch,phsu]'}, None]
['metamap', 'intervention', 'comparative study between transdermal fentanyl and melatonin patches on postoperative pain relief after lumber laminectomy', 'comparative study between transdermal fentanyl and melatonin patches on postoperative pain relief after lumber laminectomy', {'mapped_name': 'Fentanyl', 'mapped_curie': 'C0015846', 'mapped_score': '12.88', 'mapped_semtypes': '[orch,phsu]'}, None]
['metamap', 'intervention', 'comparative study between transdermal fentanyl and melatonin patches on postoperative pain relief after lumber laminectomy', 'comparative study between transderm

% interventions mapped:  41%|████████▎           | 12/29 [00:20<00:18,

['metamap', 'intervention', 'enteric coated mycophenolate sodium', 'enteric coated mycophenolate sodium', {'mapped_name': 'mycophenolate sodium', 'mapped_curie': 'C1337395', 'mapped_score': '3.61', 'mapped_semtypes': '[antb,orch]'}, None]
['metamap', 'intervention', 'enteric coated mycophenolate sodium', 'enteric coated mycophenolate sodium', {'mapped_name': 'Coating (film)', 'mapped_curie': 'C1522408', 'mapped_score': '3.43', 'mapped_semtypes': '[qlco]'}, None]
['metamap', 'intervention', 'enteric coated mycophenolate sodium', 'enteric coated mycophenolate sodium', {'mapped_name': 'Enteral', 'mapped_curie': 'C1304890', 'mapped_score': '3.43', 'mapped_semtypes': '[spco]'}, None]
['metamap', 'intervention', 'idarubicin', 'idarubicin', {'mapped_name': 'Idarubicin', 'mapped_curie': 'C0020789', 'mapped_score': '28.87', 'mapped_semtypes': '[orch,phsu]'}, None]
['metamap', 'intervention', 'quetiapine', 'quetiapine', {'mapped_name': 'quetiapine', 'mapped_curie': 'C0123091', 'mapped_score': '3

% interventions mapped:  52%|██████████▎         | 15/29 [00:24<00:16,

['metamap', 'intervention', 'rituximab [mabthera/rituxan]', 'rituximab [mabthera/rituxan]', {'mapped_name': 'rituximab', 'mapped_curie': 'C0393022', 'mapped_score': '28.80', 'mapped_semtypes': '[aapp,imft,phsu]'}, None]
['metamap', 'intervention', 'rituximab [mabthera/rituxan]', 'rituximab [mabthera/rituxan]', {'mapped_name': 'Mabthera', 'mapped_curie': 'C1314901', 'mapped_score': '28.67', 'mapped_semtypes': '[aapp,imft,phsu]'}, None]
['metamap', 'intervention', 'rituximab [mabthera/rituxan]', 'rituximab [mabthera/rituxan]', {'mapped_name': 'Rituxan', 'mapped_curie': 'C0732355', 'mapped_score': '28.67', 'mapped_semtypes': '[aapp,imft,phsu]'}, None]
['metamap', 'intervention', 'pneumovax', 'pneumovax', {'mapped_name': 'Pneumovax', 'mapped_curie': 'C0071315', 'mapped_score': '19.41', 'mapped_semtypes': '[imft,phsu]'}, None]


% interventions mapped:  59%|███████████▋        | 17/29 [00:26<00:14,

['metamap', 'intervention', 'form 3 of azd5718 tablets', 'form 3 of azd5718 tablets', {'mapped_name': 'Tablet Dosage Form', 'mapped_curie': 'C0039225', 'mapped_score': '9.86', 'mapped_semtypes': '[bodm]'}, None]
['metamap', 'intervention', 'form 3 of azd5718 tablets', 'form 3 of azd5718 tablets', {'mapped_name': 'Manufactured form', 'mapped_curie': 'C0376315', 'mapped_score': '6.71', 'mapped_semtypes': '[mnob]'}, None]
['metamap', 'intervention', 'form 3 of azd5718 tablets', 'form 3 of azd5718 tablets', {'mapped_name': 'Formation', 'mapped_curie': 'C1522492', 'mapped_score': '3.55', 'mapped_semtypes': '[ftcn]'}, None]
['metamap', 'intervention', 'form 3 of azd5718 tablets', 'form 3 of azd5718 tablets', {'mapped_name': 'Qualitative form', 'mapped_curie': 'C0348078', 'mapped_score': '3.55', 'mapped_semtypes': '[qlco]'}, None]
['metamap', 'intervention', 'aspirin enteric-coated tablets & atorvastatin calcium', 'aspirin enteric-coated tablets & atorvastatin calcium', {'mapped_name': 'Atorv

% interventions mapped:  79%|███████████████▊    | 23/29 [00:29<00:04,

['metamap', 'intervention', 'intraosseous', 'intraosseous', {'mapped_name': 'Intraosseous Route of Drug Administration', 'mapped_curie': 'C0595613', 'mapped_score': '3.48', 'mapped_semtypes': '[ftcn]'}, None]
['metamap', 'intervention', 'rapamycin', 'rapamycin', {'mapped_name': 'Sirolimus', 'mapped_curie': 'C0072980', 'mapped_score': '13.10', 'mapped_semtypes': '[antb,orch]'}, None]
['metamap', 'intervention', 'alendronate sodium', 'alendronate sodium', {'mapped_name': 'Alendronate Sodium', 'mapped_curie': 'C0700482', 'mapped_score': '16.33', 'mapped_semtypes': '[orch,phsu]'}, None]


% interventions mapped:  90%|█████████████████▉  | 26/29 [00:33<00:02,

['metamap', 'intervention', 'neuromuscular electrical stimulation', 'neuromuscular electrical stimulation', {'mapped_name': 'Neuromuscular Electrical Stimulation', 'mapped_curie': 'C2985393', 'mapped_score': '3.77', 'mapped_semtypes': '[topp]'}, None]
['metamap', 'intervention', 'forefoot bone surgery', 'forefoot bone surgery', {'mapped_name': 'Operation on bone', 'mapped_curie': 'C0185131', 'mapped_score': '3.64', 'mapped_semtypes': '[topp]'}, None]
['metamap', 'intervention', 'forefoot bone surgery', 'forefoot bone surgery', {'mapped_name': 'Forefoot', 'mapped_curie': 'C1510667', 'mapped_score': '3.44', 'mapped_semtypes': '[bpoc]'}, None]
['metamap', 'intervention', 'forefoot bone surgery', 'forefoot bone surgery', {'mapped_name': 'Forefoot of quadruped', 'mapped_curie': 'C1630649', 'mapped_score': '3.44', 'mapped_semtypes': '[bpoc]'}, None]
['metamap', 'intervention', 'forefoot bone surgery', 'forefoot bone surgery', {'mapped_name': 'Forepaw', 'mapped_curie': 'C1533572', 'mapped_sco

% interventions mapped: 100%|████████████████████| 29/29 [00:35<00:00,

['metamap', 'intervention', 'anti-human ccl24 monoclonal antibody (cm-101)', 'anti-human ccl24 monoclonal antibody (cm-101)', {'mapped_name': 'Homo sapiens', 'mapped_curie': 'C0086418', 'mapped_score': '28.66', 'mapped_semtypes': '[humn]'}, None]
['metamap', 'intervention', 'anti-human ccl24 monoclonal antibody (cm-101)', 'anti-human ccl24 monoclonal antibody (cm-101)', {'mapped_name': 'Monoclonal Antibodies', 'mapped_curie': 'C0003250', 'mapped_score': '22.49', 'mapped_semtypes': '[aapp,imft]'}, None]
['metamap', 'intervention', 'anti-human ccl24 monoclonal antibody (cm-101)', 'anti-human ccl24 monoclonal antibody (cm-101)', {'mapped_name': 'Chemokine CCL24', 'mapped_curie': 'C0538075', 'mapped_score': '22.35', 'mapped_semtypes': '[aapp,imft]'}, None]
['metamap', 'intervention', 'anti-human ccl24 monoclonal antibody (cm-101)', 'anti-human ccl24 monoclonal antibody (cm-101)', {'mapped_name': 'CCL24 protein, human', 'mapped_curie': 'C1701443', 'mapped_score': '12.88', 'mapped_semtypes':

% interventions mapped: 100%|████████████████████| 29/29 [00:37<00:00,
% intervention_alternates mapped:   3%|▋                   | 1/30 [00:

['metamap', 'intervention_alternate', 'cleaning', 'cleaning', {'mapped_name': 'Cleaning (activity)', 'mapped_curie': 'C1947930', 'mapped_score': '3.60', 'mapped_semtypes': '[acty]'}, None]
['metamap', 'intervention_alternate', 'pf-05280586', 'pf-05280586', {'mapped_name': 'Rituximab Biosimilar PF-05280586', 'mapped_curie': 'C4048258', 'mapped_score': '3.72', 'mapped_semtypes': '[aapp,imft,phsu]'}, None]
['metamap', 'intervention_alternate', 'specific immunotherapy', 'specific immunotherapy', {'mapped_name': 'Biological Response Modifiers', 'mapped_curie': 'C0005525', 'mapped_score': '13.05', 'mapped_semtypes': '[imft,phsu]'}, None]
['metamap', 'intervention_alternate', 'specific immunotherapy', 'specific immunotherapy', {'mapped_name': 'Immunotherapy', 'mapped_curie': 'C0021083', 'mapped_score': '13.05', 'mapped_semtypes': '[topp]'}, None]
['metamap', 'intervention_alternate', 'specific immunotherapy', 'specific immunotherapy', {'mapped_name': 'Entity Determiner - specific', 'mapped_cu

% intervention_alternates mapped:  13%|██▋                 | 4/30 [00:

['metamap', 'intervention_alternate', 'oral jak inhibitor', 'oral jak inhibitor', {'mapped_name': 'Janus kinase', 'mapped_curie': 'C0597721', 'mapped_score': '25.52', 'mapped_semtypes': '[aapp,enzy]'}, None]
['metamap', 'intervention_alternate', 'oral jak inhibitor', 'oral jak inhibitor', {'mapped_name': 'Inhibitor', 'mapped_curie': 'C1999216', 'mapped_score': '3.57', 'mapped_semtypes': '[qlco]'}, None]
['metamap', 'intervention_alternate', 'oral jak inhibitor', 'oral jak inhibitor', {'mapped_name': 'JAK activity', 'mapped_curie': 'C1150611', 'mapped_score': '3.44', 'mapped_semtypes': '[moft]'}, None]
['metamap', 'intervention_alternate', 'oral jak inhibitor', 'oral jak inhibitor', {'mapped_name': 'Oral', 'mapped_curie': 'C0442027', 'mapped_score': '3.44', 'mapped_semtypes': '[spco]'}, None]
['metamap', 'intervention_alternate', 'cat scan', 'cat scan', {'mapped_name': 'X-Ray Computed Tomography', 'mapped_curie': 'C0040405', 'mapped_score': '22.64', 'mapped_semtypes': '[diap]'}, None]
[

% intervention_alternates mapped:  20%|████                | 6/30 [00:

['nameresolver', 'intervention_alternate', 'prednicot', 'prednicot', {'mapped_name': 'Prednicot', 'mapped_curie': 'UMLS:C1170535', 'mapped_score': 97.21777, 'mapped_semtypes': 'biolink:ChemicalEntity'}, None]
['metamap', 'intervention_alternate', 'adinepar', 'adinepar', {'mapped_name': 'Adinepar', 'mapped_curie': 'C1517775', 'mapped_score': '3.64', 'mapped_semtypes': '[orch,phsu]'}, None]
['metamap', 'intervention_alternate', 'kd', 'kd', {'mapped_name': 'AR wt Allele', 'mapped_curie': 'C1705240', 'mapped_score': '3.64', 'mapped_semtypes': '[gngm]'}, None]


% intervention_alternates mapped:  30%|██████              | 9/30 [00:

['metamap', 'intervention_alternate', 'precedex', 'precedex', {'mapped_name': 'Precedex', 'mapped_curie': 'C0876757', 'mapped_score': '16.26', 'mapped_semtypes': '[orch,phsu]'}, None]
['nameresolver', 'intervention_alternate', 'smri', 'smri', {'mapped_name': 'Coralliophila fontanangioyae', 'mapped_curie': 'NCBITaxon:644099', 'mapped_score': 1.0, 'mapped_semtypes': 'biolink:OrganismTaxon'}, None]


% intervention_alternates mapped:  37%|███████▎            | 11/30 [00

['metamap', 'intervention_alternate', 'bi 655066', 'bi 655066', {'mapped_name': 'BI 655066', 'mapped_curie': 'C4078397', 'mapped_score': '13.18', 'mapped_semtypes': '[imft]'}, None]
['metamap', 'intervention_alternate', 'ac-1204', 'ac-1204', {'mapped_name': 'Actinium', 'mapped_curie': 'C0001246', 'mapped_score': '19.36', 'mapped_semtypes': '[elii]'}, None]
['metamap', 'intervention_alternate', 'ac-1204', 'ac-1204', {'mapped_name': 'Antigua and Barbuda', 'mapped_curie': 'C0003354', 'mapped_score': '16.21', 'mapped_semtypes': '[geoa]'}, None]
['metamap', 'intervention_alternate', 'ac-1204', 'ac-1204', {'mapped_name': 'ASAH1 protein, human', 'mapped_curie': 'C2000891', 'mapped_score': '13.05', 'mapped_semtypes': '[aapp,enzy]'}, None]
['metamap', 'intervention_alternate', 'ac-1204', 'ac-1204', {'mapped_name': 'ASAH1 gene', 'mapped_curie': 'C1412573', 'mapped_score': '3.59', 'mapped_semtypes': '[gngm]'}, None]
['metamap', 'intervention_alternate', 'ac-1204', 'ac-1204', {'mapped_name': 'ASAH

% intervention_alternates mapped:  47%|█████████▎          | 14/30 [00

['metamap', 'intervention_alternate', 'cis-platinum ii', 'cis-platinum ii', {'mapped_name': 'Cisplatin', 'mapped_curie': 'C0008838', 'mapped_score': '10.08', 'mapped_semtypes': '[inch,phsu]'}, None]
['metamap', 'intervention_alternate', 'platinol- aq', 'platinol- aq', {'mapped_name': 'Platinol- AQ', 'mapped_curie': 'C0722652', 'mapped_score': '3.72', 'mapped_semtypes': '[inch,phsu]'}, None]
['metamap', 'intervention_alternate', 'continuous infusion of remifentanil at 0.05 ~ 2 mcg/kg/min with sevoflurane concentration of 1 mac', 'continuous infusion of remifentanil at 0.05 ~ 2 mcg/kg/min with sevoflurane concentration of 1 mac', {'mapped_name': 'Macrophage-1 Antigen', 'mapped_curie': 'C0079785', 'mapped_score': '25.50', 'mapped_semtypes': '[aapp,imft]'}, None]
['metamap', 'intervention_alternate', 'continuous infusion of remifentanil at 0.05 ~ 2 mcg/kg/min with sevoflurane concentration of 1 mac', 'continuous infusion of remifentanil at 0.05 ~ 2 mcg/kg/min with sevoflurane concentration

% intervention_alternates mapped:  57%|███████████▎        | 17/30 [00

['metamap', 'intervention_alternate', 'dexmedetomidine hydrochloride', 'dexmedetomidine hydrochloride', {'mapped_name': 'Dexmedetomidine Hydrochloride', 'mapped_curie': 'C0752310', 'mapped_score': '16.33', 'mapped_semtypes': '[orch,phsu]'}, None]
['metamap', 'intervention_alternate', 'nutritional evaluation (nrs 2002)', 'nutritional evaluation (nrs 2002)', {'mapped_name': 'Nutrition Assessment', 'mapped_curie': 'C0028708', 'mapped_score': '16.22', 'mapped_semtypes': '[diap]'}, None]
['metamap', 'intervention_alternate', 'nutritional evaluation (nrs 2002)', 'nutritional evaluation (nrs 2002)', {'mapped_name': 'Diet education', 'mapped_curie': 'C0204932', 'mapped_score': '3.61', 'mapped_semtypes': '[edac]'}, None]
['metamap', 'intervention_alternate', 'nutritional evaluation (nrs 2002)', 'nutritional evaluation (nrs 2002)', {'mapped_name': 'Numeric Rating Scale', 'mapped_curie': 'C4050142', 'mapped_score': '3.43', 'mapped_semtypes': '[inpr]'}, None]
['metamap', 'intervention_alternate', 

% intervention_alternates mapped:  63%|████████████▋       | 19/30 [00

['metamap', 'intervention_alternate', 'mk-965', 'mk-965', {'mapped_name': 'MK-965', 'mapped_curie': 'C1522682', 'mapped_score': '3.72', 'mapped_semtypes': '[aapp,enzy,phsu]'}, None]


% intervention_alternates mapped:  70%|██████████████      | 21/30 [00

['metamap', 'intervention_alternate', 't-spot.tb test', 't-spot.tb test', {'mapped_name': 'Spot test', 'mapped_curie': 'C0201812', 'mapped_score': '3.60', 'mapped_semtypes': '[lbpr]'}, None]
['metamap', 'intervention_alternate', 'ocular mini insert (omi)', 'ocular mini insert (omi)', {'mapped_name': None, 'mapped_curie': None, 'mapped_score': None, 'mapped_semtypes': None}, None]
['metamap', 'intervention_alternate', 'ocular mini insert (omi)', 'ocular mini insert (omi)', {'mapped_name': 'Vision', 'mapped_curie': 'C0042789', 'mapped_score': '16.05', 'mapped_semtypes': '[orgf]'}, None]
['metamap', 'intervention_alternate', 'ocular mini insert (omi)', 'ocular mini insert (omi)', {'mapped_name': 'Eye', 'mapped_curie': 'C0015392', 'mapped_score': '12.90', 'mapped_semtypes': '[bpoc]'}, None]
['metamap', 'intervention_alternate', 'ocular mini insert (omi)', 'ocular mini insert (omi)', {'mapped_name': 'Mini', 'mapped_curie': 'C0445542', 'mapped_score': '3.57', 'mapped_semtypes': '[orga]'}, No

% intervention_alternates mapped:  80%|████████████████    | 24/30 [00

['metamap', 'intervention_alternate', 'nal-iri', 'nal-iri', {'mapped_name': 'irinotecan liposomal', 'mapped_curie': 'C4057931', 'mapped_score': '3.72', 'mapped_semtypes': '[orch,phsu]'}, None]
['metamap', 'intervention_alternate', 'gemzar', 'gemzar', {'mapped_name': 'Gemzar', 'mapped_curie': 'C0338133', 'mapped_score': '3.64', 'mapped_semtypes': '[nnon,phsu]'}, None]
['metamap', 'intervention_alternate', 'sirolimus', 'sirolimus', {'mapped_name': 'Sirolimus', 'mapped_curie': 'C0072980', 'mapped_score': '13.10', 'mapped_semtypes': '[antb,orch]'}, None]


% intervention_alternates mapped:  93%|██████████████████▋ | 28/30 [00

['metamap', 'intervention_alternate', 'mdx-ctla4', 'mdx-ctla4', {'mapped_name': 'MDX-CTLA-4', 'mapped_curie': 'C1691227', 'mapped_score': '28.95', 'mapped_semtypes': '[aapp,imft,phsu]'}, None]
['metamap', 'intervention_alternate', 'mdx-ctla4', 'mdx-ctla4', {'mapped_name': 'cytotoxic T-lymphocyte antigen 4', 'mapped_curie': 'C0111208', 'mapped_score': '22.64', 'mapped_semtypes': '[aapp,imft]'}, None]
['metamap', 'intervention_alternate', 'prevenar 13', 'prevenar 13', {'mapped_name': 'Prevnar 13', 'mapped_curie': 'C2756364', 'mapped_score': '3.72', 'mapped_semtypes': '[imft,orch,phsu]'}, None]
['metamap', 'intervention_alternate', 'bupivacaine liposome injectable suspension', 'bupivacaine liposome injectable suspension', {'mapped_name': 'Liposomal Bupivacaine', 'mapped_curie': 'C4086546', 'mapped_score': '3.81', 'mapped_semtypes': '[orch,phsu]'}, None]


% intervention_alternates mapped: 100%|████████████████████| 30/30 [02

['nameresolver', 'intervention_alternate', '4-demethoxydaunomycin', '4-demethoxydaunomycin', {'mapped_name': '6-Deoxy-4-demethoxydaunomycin', 'mapped_curie': 'PUBCHEM.COMPOUND:158476', 'mapped_score': 12.188611, 'mapped_semtypes': 'biolink:SmallMolecule'}, None]


% intervention_alternates mapped: 100%|████████████████████| 30/30 [02


In [None]:

# Pipeline driver: runs every stage in order. Each later stage is handed the
# same flag_and_path value returned by the download step.
flag_and_path = get_raw_ct_data()  # download raw data

# metamap_dirs is assigned at the top level of this cell, so it is already a
# module-level (global) name; a `global` declaration here would be a no-op.
# NOTE(review): presumably read as a global by the mapping functions -- confirm.
metamap_dirs = check_os()

# NOTE(review): subset_size is not assigned in this cell; it must already be
# defined (presumably in an earlier cell), otherwise this raises NameError.
df_dict = read_raw_ct_data(flag_and_path, subset_size)  # read the clinical trial data

# use the existing cache of MetaMapped terms so that only new terms are mapped
dict_new_terms = check_against_cache(df_dict, flag_and_path)

term_list_to_mm(dict_new_terms, flag_and_path)  # map new terms using MetaMap

map_to_trial(flag_and_path)  # map MetaMap terms back to trial
score_mappings(flag_and_path)  # score the mappings
auto_select_curies(flag_and_path)  # select CURIEs automatically that pass score threshold

# Disabled step. NOTE(review): its previous trailing description was a copy of
# the auto_select_curies one; judging by the name it compiles the selected
# CURIEs per trial -- confirm before re-enabling.
# compile_curies_for_trials(flag_and_path)

