### THIS SCRIPT USES MetaMap to try and map the bulk of terms, and Name Resolver to pick up what's left

In [1]:
# Widen notebook cells and their rendered outputs to the full browser width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))

# Lets you print multiple outputs per cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
import re
import collections
import os
import json
import numpy as np
import pickle
from functools import reduce
import time
from time import sleep
# import concurrent
import concurrent.futures
import multiprocessing
import datetime as dt
from datetime import date
import pathlib
import configparser
import sys
import urllib
import zipfile
import csv
sys.path.insert(0, '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/pymetamap-master')
from pymetamap import MetaMap  # https://github.com/AnthonyMRios/pymetamap/blob/master/pymetamap/SubprocessBackend.py
from pandas import ExcelWriter
import ast
import glob
from tqdm import tqdm
import subprocess
import shlex
from collections import Counter

# %pip install thefuzz
# %pip install levenshtein
# %pip install xlsxwriter
# %pip install ratelimit

from thefuzz import fuzz # fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

In [3]:
def get_token_sort_ratio(str1, str2):
    """Return thefuzz's token-sort ratio for two strings, or None on failure.

    Fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The value produced by ``fuzz.token_sort_ratio``, or None when the
        comparison raised (for example because an input is not a string).
    """
    try:
        return fuzz.token_sort_ratio(str1, str2)
    except Exception:
        # Best-effort scoring: an unscorable pair yields None, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return None

def get_token_set_ratio(str1, str2):
    """Return thefuzz's token-set ratio for two strings, or None on failure.

    Fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The value produced by ``fuzz.token_set_ratio``, or None when the
        comparison raised (for example because an input is not a string).
    """
    try:
        return fuzz.token_set_ratio(str1, str2)
    except Exception:
        # Best-effort scoring: an unscorable pair yields None, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return None

def get_similarity_score(str1, str2):
    """Return thefuzz's plain similarity ratio for two strings, or None on failure.

    Fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The value produced by ``fuzz.ratio``, or None when the comparison
        raised (for example because an input is not a string).
    """
    try:
        return fuzz.ratio(str1, str2)
    except Exception:
        # Best-effort scoring: an unscorable pair yields None, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return None
    
def convert_seconds_to_hms(seconds):
    """Split an elapsed time in seconds into (hours, minutes, seconds)."""
    hours, remainder = divmod(seconds, 3600)
    minutes, leftover_seconds = divmod(remainder, 60)
    return hours, minutes, leftover_seconds

def de_ascii_er(text):
    """Return *text* with every non-ASCII character replaced by a single space."""
    return re.sub(r"[^\x00-\x7F]", ' ', text)

def start_metamap_servers(metamap_dirs):
    """Start the MetaMap part-of-speech tagger and word-sense-disambiguation servers.

    Args:
        metamap_dirs: dict whose ``metamap_base_dir`` entry is the MetaMap
            installation root the control scripts live under.

    Side effects:
        Publishes the two control-script relative paths as module globals
        (``stop_metamap_servers`` reads them), runs both scripts with ``start``
        while discarding their output, then waits 5 seconds.
    """
    global metamap_pos_server_dir
    global metamap_wsd_server_dir
    metamap_pos_server_dir = 'bin/skrmedpostctl'  # Part of speech tagger
    metamap_wsd_server_dir = 'bin/wsdserverctl'  # Word sense disambiguation

    base_dir = metamap_dirs['metamap_base_dir']
    start_commands = [
        [os.path.join(base_dir, metamap_pos_server_dir), 'start'],
        [os.path.join(base_dir, metamap_wsd_server_dir), 'start'],
    ]

    # The server control scripts are chatty; send their output to the null device.
    for command in start_commands:
        subprocess.call(command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    sleep(5)

def stop_metamap_servers(metamap_dirs):
    """Stop the MetaMap part-of-speech tagger and word-sense-disambiguation servers.

    Args:
        metamap_dirs: dict whose ``metamap_base_dir`` entry is the MetaMap
            installation root the control scripts live under.

    Side effects:
        Runs both control scripts with ``stop`` while discarding their output,
        then waits 2 seconds.
    """
    # The relative script paths are normally published as globals by
    # start_metamap_servers(). Fall back to the same stock paths so the servers
    # can still be stopped when that function has not run in this session
    # (e.g. after a kernel restart), instead of failing with a NameError.
    pos_server_ctl = globals().get('metamap_pos_server_dir', 'bin/skrmedpostctl')
    wsd_server_ctl = globals().get('metamap_wsd_server_dir', 'bin/wsdserverctl')

    metamap_executable_path_pos = os.path.join(metamap_dirs['metamap_base_dir'], pos_server_ctl)
    metamap_executable_path_wsd = os.path.join(metamap_dirs['metamap_base_dir'], wsd_server_ctl)
    command_pos = [metamap_executable_path_pos, 'stop']
    command_wsd = [metamap_executable_path_wsd, 'stop']

    # Stop servers; their console output is redirected to the null device.
    with open(os.devnull, "w") as fnull:
        subprocess.call(command_pos, stdout=fnull, stderr=fnull)
        subprocess.call(command_wsd, stdout=fnull, stderr=fnull)
    sleep(2)
    
def add_mappings_to_cache(flag_and_path):
    """Append the newest MetaMap output to the persistent cache and de-duplicate it.

    Args:
        flag_and_path: dict whose ``date_string`` entry identifies the bulk
            download; the rows are read from ``<date_string>_metamap_output.tsv``.

    Side effects:
        Appends every data row of the new output file to
        ``metamapped_terms_cache.tsv`` and then rewrites the cache with
        duplicate rows removed.
    """
    relevant_date = flag_and_path["date_string"]   # get date of bulk download of clinical trial data
    cache_path = "metamapped_terms_cache.tsv"
    new_terms_path = f"{relevant_date}_metamap_output.tsv"

    # A brand-new (or empty) cache has no header row yet. Without one, the
    # read_csv(header=0) call below would consume the first data row as the
    # column names, so the header of the new output file is copied in that case.
    cache_needs_header = not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0

    with open(cache_path, 'a', encoding="utf-8") as cache:
        with open(new_terms_path, 'r', encoding="utf-8", errors='ignore') as new_metamapped_terms:
            header = new_metamapped_terms.readline()
            if cache_needs_header and header:
                cache.write(header if header.endswith("\n") else header + "\n")
            for line in new_metamapped_terms:
                # Guarantee newline termination so a later append cannot fuse two rows.
                cache.write(line if line.endswith("\n") else line + "\n")

    # Remove duplicate rows from cache
    cache = pd.read_csv(cache_path, sep='\t', index_col=False, header=0, on_bad_lines='warn')
    cache = cache.drop_duplicates()
    cache.to_csv(cache_path, sep="\t", index=False, header=True)  # output deduplicated cache terms to TSV

def _append_manual_review_to_cache(review_tag, modified_term_col, cols_to_drop, cache_path):
    """Append the curated rows of one manual-review workbook to its TSV cache, then de-duplicate it."""
    # Skip names starting with "~" (presumably spreadsheet lock/temp files).
    workbooks = [f for f in glob.glob("*.xlsx") if review_tag in f and not f.startswith("~")]
    manselected = pd.read_excel(workbooks[0])  # IndexError when no workbook is present

    # Assign the forward-filled columns back explicitly: a chained
    # `df.col.ffill(inplace=True)` does not update the frame under pandas
    # Copy-on-Write.
    manselected["name"] = manselected["name"].ffill()
    manselected[modified_term_col] = manselected[modified_term_col].ffill()

    # Keep only the rows where a reviewer actually picked a CURIE.
    manselected = manselected[manselected['manually_selected_CURIE'].notnull()]
    manselected = manselected.drop(columns=cols_to_drop)
    manselected = manselected.rename(columns={'name': 'original_clin_trial_term',
                                              modified_term_col: 'modified_clin_trial_term'})

    # Only write the header when the cache file is new or empty.
    write_header = not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0
    manselected.to_csv(cache_path, mode='a', sep="\t", index=False, header=write_header)

    # Remove duplicate rows from cache
    cache = pd.read_csv(cache_path, sep='\t', index_col=False, header=0, on_bad_lines='warn')
    cache = cache.drop_duplicates()
    cache.to_csv(cache_path, sep="\t", index=False, header=True)  # output deduplicated cache terms to TSV


def add_manually_selected_terms_to_cache():
    """Fold manually reviewed CURIE selections into the per-type caches.

    Looks in the current directory for ``*conditions_manual_review*.xlsx`` and
    ``*interventions_manual_review*.xlsx`` workbooks and appends their
    manually selected rows to ``conditions_manually_selected_cache.tsv`` and
    ``interventions_manually_selected_cache.tsv`` respectively.

    The step is best-effort: a missing workbook or missing column is reported
    and skipped. Each term type is handled independently, so a problem with
    the conditions workbook no longer prevents interventions from being cached.
    """
    # -----     ------     GENERATE MANUALLY SELECTED CACHE     -----     ------  #
    review_sources = (
        #  --- --- --   CONDITIONS     --- --- --   #
        ("conditions_manual_review", "orig_con", ["curie_info"],
         "conditions_manually_selected_cache.tsv"),
        #  --- --- --   INTERVENTIONS and Alternate INTERVENTIONS   --- --- --   #
        ("interventions_manual_review", "orig_int", ["curie_info", "description"],
         "interventions_manually_selected_cache.tsv"),
    )
    for review_tag, modified_term_col, cols_to_drop, cache_path in review_sources:
        try:
            _append_manual_review_to_cache(review_tag, modified_term_col, cols_to_drop, cache_path)
        except Exception as err:
            print("No terms in manual select column; either column is empty or bug. Proceeding without them")
            print(f"    ({review_tag}: {err!r})")
        
def check_os():
    """Return the MetaMap install locations appropriate for the current platform.

    On Linux the base directory is ``metamap/`` next to the parent of the
    current working directory; on any other platform a fixed local path is used.

    Returns:
        dict with keys ``metamap_base_dir`` and ``metamap_bin_dir``.
    """
    running_on_linux = "linux" in sys.platform
    if not running_on_linux:
        # for running on local
        return {"metamap_base_dir": '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/',
                "metamap_bin_dir": 'bin/metamap18'}

    print("Linux platform detected")
    parent_of_cwd = pathlib.Path.cwd().parents[0]
    return {"metamap_base_dir": f"{parent_of_cwd}/metamap/",
            "metamap_bin_dir": 'bin/metamap20'}


In [5]:
def get_raw_ct_data():
    """Locate, download and unzip the most recent AACT pipe-delimited export.

    Scrapes the AACT download page for dated export links, picks the newest
    one, and downloads/extracts it under ``<cwd>/data`` unless a zip named for
    that date is already there. If the download raises, the user is prompted
    to place the zip in the data directory by hand.

    Side effects:
        Sets the module-level globals ``data_dir`` and ``data_extracted``,
        performs HTTP requests, writes to disk, prints progress messages and
        may block on ``input()``.

    Returns:
        dict with keys ``term_program_flag`` (True when the newest zip was
        already on disk, so nothing was downloaded), ``data_extracted_path``
        and ``date_string`` (formatted ``MM_DD_YYYY``).

    NOTE(review): if the scraping ``try`` block below fails, ``data_path``
    (and possibly ``url`` / ``date_string``) is never assigned, so the
    ``os.path.exists(data_path)`` check raises rather than reaching the manual
    fallback -- confirm whether that is intended.
    """
    term_program_flag = True
    global data_dir
    global data_extracted
    
    try:
        # get all the links and associated dates of upload into a dict called date_link
        url_all = "https://aact.ctti-clinicaltrials.org/download"
        response = requests.get(url_all)
        soup = BeautifulSoup(response.text, features="lxml")
        body = soup.find_all('option') # Find all <option> elements
        date_link = {}
        for el in body:
            tags = el.find('a')
            try:
                # first whitespace-separated token of the link text; its prefix before "_" is parsed as YYYYMMDD
                zip_name = tags.contents[0].split()[0]
                date = zip_name.split("_")[0]
                date = dt.datetime.strptime(date, '%Y%m%d').date()
                date_link[date] = tags.get('href')
            except:
                # options without a link, or whose text does not start with a date, are skipped
                pass
        latest_file_date = max(date_link.keys())   # get the date of the latest upload
        url = date_link[latest_file_date]   # get the corresponding download link of the latest upload so we can download the raw data
        date_string = latest_file_date.strftime("%m_%d_%Y")
        data_dir = "{}/data".format(pathlib.Path.cwd())
        data_extracted = data_dir + "/{}_extracted".format(date_string)
        data_path = "{}/{}_pipe-delimited-export.zip".format(data_dir, date_string)
    except:
        print("continue")

    if not os.path.exists(data_path):   # if the zip for the most recent data doesn't exist, download and extract it into data folder

        term_program_flag = False   # flag below for terminating program if latest download exists (KG is assumed up to date)
        print("Attempting download of Clinical Trial data as of {}\n".format(date_string))
        try:
            response = requests.get(url)
            # NOTE(review): a non-200 response is ignored silently -- nothing is written or extracted, yet the paths are still returned
            if response.status_code == 200:
                with open(data_path, 'wb') as file:
                    file.write(response.content)
                print("Finished download of zip")
                with zipfile.ZipFile(data_path, 'r') as download:
                    print("Unzipping data")
                    download.extractall(data_extracted)
        except:
            # Manual fallback: ask the user to drop the zip into the data directory themselves
            print("Failed to scrape AACT for download. Please navigate to https://aact.ctti-clinicaltrials.org/download and manually download zip file.")
            print("Please store the downloaded zip in the /data directory. This should be the only item besides the cache file, condition manual review file, and intervention manual review file, in the directory at this time.")
            done = input("Type Done when done: ")
            if done == "Done":   # only the exact string "Done" continues; any other answer skips the manual steps
                data_dir = "{}/data".format(pathlib.Path.cwd())
                # list_of_files = glob.glob(data_dir + "/*") # get all files in directory
                try:
                    # latest_file = max(list_of_files, key=os.path.getctime) # get the most recent file in the directory
                    pattern = os.path.join(data_dir, "*.zip")
                    zip_file = glob.glob(pattern) # look for file in directory that ends in ".zip"
                    zip_file = zip_file[0]   # first match is used; raises IndexError (handled below) when there is none
                    print("File found at: ")
                    print(zip_file)
                    # print(latest_file)
                    print("Please make sure this the correct zip file from AACT")
                    if not os.path.exists(data_extracted):   # if folder of unzipped data does not exist, unzip
                        try:
                            with zipfile.ZipFile(zip_file, 'r') as download:
                                print("Unzipping data into")
                                # the extraction folder is named after the zip file's ctime rather than the scraped upload date
                                cttime = os.path.getctime(zip_file)
                                date_string = dt.datetime.fromtimestamp(cttime).strftime('%m_%d_%Y')
                                data_extracted = data_dir + "/{}_extracted".format(date_string)
                                print(data_extracted)
                                download.extractall(data_extracted)
                        except:
                            # unzip failed: fall back to the first existing "*_extracted" folder and take the date from its name
                            pattern = os.path.join(data_dir, "*_extracted")
                            extracted_file = glob.glob(pattern) # look for file in directory that ends in "_extracted"
                            data_extracted = extracted_file[0]
                            extracted_name = os.path.basename(os.path.normpath(extracted_file[0]))
                            date_string = extracted_name.replace('_extracted', '')
                            print("Assuming data is already unzipped")
                        
                except:
                    print("Unable to download and extract Clincal Trial data.")
                    print("Cannot find pipe-delimited zip in /data folder.")
    else:
        print("KG is already up to date.")

    return {"term_program_flag": term_program_flag, "data_extracted_path": data_extracted, "date_string": date_string}


In [6]:
def read_raw_ct_data(flag_and_path, subset_size):
    """Load the AACT condition and intervention tables from the extracted export.

    Args:
        flag_and_path: dict with ``term_program_flag`` and, when that flag is
            falsy, ``data_extracted_path`` (folder holding the pipe-delimited
            ``.txt`` files).
        subset_size: when truthy, the number of rows to randomly sample from
            each table (for quick runs on a small subset); a table with fewer
            rows than this is returned whole.

    Returns:
        dict with the DataFrames under ``conditions``, ``interventions`` and
        ``interventions_alts``.

    Raises:
        SystemExit: when ``term_program_flag`` is truthy (the newest data dump
            was already present, so there is nothing new to process).
    """
    if flag_and_path["term_program_flag"]:
        print("Exiting program. Assuming KG has already been constructed from most recent data dump from AACT.")
        # sys.exit() rather than the interactive-only exit() helper, which is
        # not guaranteed to exist or to stop execution inside a notebook kernel.
        sys.exit()

    data_extracted = flag_and_path["data_extracted_path"]
    # read in pipe-delimited files
    conditions_df = pd.read_csv(data_extracted + '/conditions.txt', sep='|', index_col=False, header=0, on_bad_lines='warn')
    interventions_df = pd.read_csv(data_extracted + '/interventions.txt', sep='|', index_col=False, header=0, on_bad_lines='warn')
    interventions_alts_df = pd.read_csv(data_extracted + '/intervention_other_names.txt', sep='|', index_col=False, header=0, on_bad_lines='warn')

    if subset_size:   # if a subset size is given, we are running this script on a small subset of the dataset
        # Cap at the table length: sample() raises when asked for more rows than exist.
        conditions_df = conditions_df.sample(n=min(subset_size, len(conditions_df)))
        interventions_df = interventions_df.sample(n=min(subset_size, len(interventions_df)))
        interventions_alts_df = interventions_alts_df.sample(n=min(subset_size, len(interventions_alts_df)))

    df_dict = {"conditions": conditions_df, "interventions": interventions_df, "interventions_alts": interventions_alts_df}
    return df_dict


# Check against cache, retrieve terms not already mapped

In [7]:
def _unique_lowercase_terms(values):
    """Return the distinct lower-cased string forms of *values* as a set."""
    return {str(value).lower() for value in values}


def check_against_cache(df_dict):
    """Return the clinical-trial terms that are not yet present in the mapping cache.

    Args:
        df_dict: dict with DataFrames under ``conditions``, ``interventions``
            and ``interventions_alts``; each must have a ``name`` column.

    Returns:
        dict with the same three keys, each holding a list of unique
        lower-cased terms that still need mapping (in no particular order).
        When ``mapping_cache.tsv`` cannot be read or lacks the expected
        columns, every term is returned.
    """
    # dict key in df_dict -> value used for that kind of term in the cache's term_type column
    cache_term_types = {"conditions": "condition",
                        "interventions": "intervention",
                        "interventions_alts": "intervention_alternate"}

    all_terms = {key: _unique_lowercase_terms(df_dict[key]["name"].unique().tolist())
                 for key in cache_term_types}

    try:
        cache_df = pd.read_csv("mapping_cache.tsv", sep="\t", index_col=False, header=0, on_bad_lines='warn')

        dict_new_terms = {}
        for key, term_type in cache_term_types.items():
            cached_rows = cache_df[cache_df["term_type"] == term_type]
            # Cached terms go through str() like the trial terms do, so a blank
            # (NaN) cache cell can no longer raise and trigger a full remap.
            # A set keeps each membership test O(1).
            cached_terms = _unique_lowercase_terms(cached_rows['clintrial_term'].unique().tolist())
            # find terms not in the cache (i.e. new terms to map)
            dict_new_terms[key] = [term for term in all_terms[key] if term and term not in cached_terms]

    except (OSError, KeyError, ValueError):
        # Missing/unreadable file, empty or unparsable file, or missing columns.
        print("No cache of terms found. Proceeding to map entire KG from scratch")
        dict_new_terms = {key: list(terms) for key, terms in all_terms.items()}

    return dict_new_terms


# Map new terms using Mapper

In [8]:
def get_nr_response(orig_term):
    """Look up a term with the Name Resolver service.

    The term is submitted as-is: unlike MetaMap, Name Resolver needs no
    lower-casing or de-ASCII-ing of the input.

    Args:
        orig_term: The clinical-trial term to resolve.

    Returns:
        The decoded JSON response when the service answers with HTTP 200;
        None on any other status code or once all attempts have failed.
    """
    nr_url = 'https://name-resolution-sri.renci.org/lookup'
    max_retries = 3
    request_timeout = 60  # seconds; without a timeout a stalled connection would block the worker forever

    # limit=1 asks for the single best match (limit -1, which returned every equivalent CURIE, is deprecated)
    params = {'string': orig_term, 'limit': 1}

    with requests.Session() as sess:
        for attempt in range(1, max_retries + 1):
            try:
                r = sess.post(nr_url, params=params, timeout=request_timeout)
                if r.status_code == 200:
                    return r.json()  # process Name Resolver response
                return None
            except (requests.RequestException, ConnectionResetError, OSError) as ex:
                print(f"\nName Resolver request failed for term: {orig_term}. Error: {ex}")
                if attempt < max_retries:
                    print(f"Retrying ({attempt}/{max_retries}) after a delay.")
                    time.sleep(2 ** attempt)  # Increase the delay between retries exponentially
                else:
                    print(f"Max retries (Name Resolver) reached for term: {orig_term}.")
                    return None
    return None
    
    # request_count +=1
    # if request_count % 50 == 0:  # if 50 requests to API have been made, sleep 10 secs
    #     time.sleep(10)  
    

In [16]:
# # nr_response = get_nr_response(orig_term)
# # ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response', 'score']
# for i in ["cordiceps", "chocolate", "diabetes mellitus", "blipadbloo", "catheter", "humira®"]:
#     nr_result = get_nr_response(i)
#     if nr_result:
#         nr_result
#         nr_curie = nr_result[0]["curie"]
#         nr_name = nr_result[0]["label"]
#         nr_type = nr_result[0]["types"][0]
#         nr_score = nr_result[0]["score"]
#         new_concept_dict = {"nameresolver_preferred_name": nr_name,
#                              "nameresolver_cui": nr_curie,
#                              "nameresolver_score": nr_score,
#                              "nameresolver_semtypes": nr_type}
#     else:
#         print("nothing returned from NR")

In [9]:
def _metamap_concept_summary(concept):
    """Reduce one MetaMap concept to the fields stored in the mapping cache."""
    concept = concept._asdict()
    return {"metamap_preferred_name": concept.get("preferred_name"),
            "metamap_cui": concept.get("cui"),
            "metamap_score": concept.get("score"),
            "metamap_semtypes": concept.get("semtypes")}


def _name_resolver_summary(nr_response):
    """Reduce the first Name Resolver hit to the fields stored in the mapping cache."""
    top_hit = nr_response[0]
    return {"nameresolver_preferred_name": top_hit["label"],
            "nameresolver_cui": top_hit["curie"],
            "nameresolver_score": top_hit["score"],
            "nameresolver_semtypes": top_hit["types"][0]}


def run_mappers(term_pair, params, mm, term_type, csv_writer):
    """Map one term with MetaMap, falling back to Name Resolver, and write the result rows.

    Format of output TSV:
    header = ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response', 'score']

    Args:
        term_pair: Sequence whose first item is the original clinical-trial
            term and whose second item is the pre-processed form sent to MetaMap.
        params: MetaMap options. ``exclude_sts`` is used for Interventions and
            ``restrict_to_sts`` for Conditions; ``term_processing``,
            ``ignore_word_order`` and ``strict_model`` are always passed.
        mm: MetaMap instance exposing ``extract_concepts``.
        term_type: Label written to the ``term_type`` column.
        csv_writer: Writer whose ``writerow`` receives each result row.

    One row is written per MetaMap concept. If MetaMap returns nothing, the
    first Name Resolver hit is written instead. If neither tool yields a
    result, or mapping raises, a single row with an empty response is written
    so the term is still recorded as attempted.
    """
    orig_term = term_pair[0]
    input_term = term_pair[1]
    unmapped_row = ["metamap", term_type, orig_term, input_term, None, None]

    # exclude_sts is used for Interventions, restrict_to_sts for Conditions.
    mapping_interventions = params.get("exclude_sts") is not None
    semtype_option = "exclude_sts" if mapping_interventions else "restrict_to_sts"

    from_mapper = []
    try:
        metamap_options = {semtype_option: params[semtype_option],
                           "term_processing": params["term_processing"],
                           "ignore_word_order": params["ignore_word_order"],
                           "strict_model": params["strict_model"]}
        concepts, error = mm.extract_concepts([input_term], **metamap_options)

        if concepts:   # if MetaMap gives response, process response
            for concept in concepts:
                # trailing None is the score column, empty because not scored yet
                from_mapper.append(["metamap", term_type, orig_term, input_term,
                                    _metamap_concept_summary(concept), None])
        else:   # if MetaMap fails, try using Name Resolver and process response
            if mapping_interventions:
                print("ATTEMPTING NAME RESOLVER HERE")
            nr_response = get_nr_response(orig_term)
            if nr_response:   # if Name Resolver gives response, process response
                # no preprocessing is needed for Name Resolver, so the input term is the original term
                from_mapper.append(["nameresolver", term_type, orig_term, orig_term,
                                    _name_resolver_summary(nr_response), None])
            else:
                print("Nothing returned from NR or Metamap")
                from_mapper.append(unmapped_row)
    except Exception as ex:
        # Record the term as unmapped instead of letting the error vanish
        # inside the worker thread.
        print("Nothing returned from NR or Metamap")
        print(f"Mapping raised for term: {orig_term}. Error: {ex!r}")
        from_mapper = [unmapped_row]

    for result in from_mapper:
        print(result)
        csv_writer.writerow(result)

In [11]:
def parallelize_mappers(term_pair_list, params, term_type, csv_writer):
    """Map every term pair concurrently while the MetaMap servers are running.

    Args:
        term_pair_list: Term pairs to hand to ``run_mappers`` one by one.
        params: MetaMap options forwarded to ``run_mappers``.
        term_type: Label forwarded to ``run_mappers`` and shown on the progress bar.
        csv_writer: Writer forwarded to ``run_mappers`` for the result rows.

    Side effects:
        Starts the MetaMap servers, runs the mappers in a thread pool, and
        stops the servers again even if mapping is interrupted by an error.
        Reads the module-level ``metamap_dirs``.
    """
    LENGTH = len(term_pair_list)  # Number of iterations required to fill progress bar (pbar)
    pbar = tqdm(total=LENGTH, desc="% {}s mapped".format(term_type), position=0, leave=True, mininterval = LENGTH/20, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')  # Init progress bar

    start_metamap_servers(metamap_dirs) # start the MetaMap servers
    try:
        mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
        with concurrent.futures.ThreadPoolExecutor((multiprocessing.cpu_count()*2) - 1) as executor:
            futures = [executor.submit(run_mappers, term_pair, params, mm, term_type, csv_writer) for term_pair in term_pair_list]
            for future in concurrent.futures.as_completed(futures):
                pbar.update(n=1)  # Increments counter
                # An exception raised inside a worker is otherwise dropped
                # silently; report it without aborting the remaining terms.
                worker_error = future.exception()
                if worker_error is not None:
                    print(f"Mapping task failed: {worker_error!r}")
    finally:
        pbar.close()
        stop_metamap_servers(metamap_dirs) # stop the MetaMap servers, even after a failure
    

In [12]:
def term_list_to_mappers(dict_new_terms):
    """Map the new conditions, interventions and alternate interventions and cache the results.

    Each group of terms is sent through parallelize_mappers, which appends
    one row per mapping to the tab-separated cache file "mapping_cache.tsv".
    When every group has been processed the cache is reloaded, exact
    duplicate rows are dropped, and the file is rewritten.

    Args:
        dict_new_terms: mapping that is read with the keys "conditions",
            "interventions" and "interventions_alts"; each value is an
            iterable of terms. A missing (None) entry is treated as empty.

    Raises:
        IndexError: if no digits are found in metamap_dirs["metamap_bin_dir"],
            so the MetaMap version cannot be determined.

    Reads the module-level ``metamap_dirs`` mapping.
    """
    # The digits embedded in the bin directory name give the MetaMap version being run.
    metamap_version = [int(s) for s in re.findall(r'\d+', metamap_dirs.get('metamap_bin_dir'))]
    use_original_terms = metamap_version[0] >= 20

    def _terms_for(key):
        # A missing group means "nothing new to map", not an error.
        terms = dict_new_terms.get(key)
        return [] if terms is None else terms

    conditions = _terms_for("conditions")
    interventions = _terms_for("interventions")
    interventions_alts = _terms_for("interventions_alts")

    # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
    condition_semantic_types = 'acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,clna,fndg'

    # strict_model and relaxed_model are presumably opposites? relaxed_model = True is what I want,
    # but that option appears to be broken in Pymetamap (returns no results when used).
    # Using strict_model = False instead...
    def _params(semantic_type_option):
        return {semantic_type_option: [condition_semantic_types], "term_processing": True, "ignore_word_order": True, "strict_model": False}

    # Conditions are restricted TO the condition semantic types; both kinds
    # of intervention EXCLUDE those same types.
    condition_params = _params("restrict_to_sts")
    intervention_params = _params("exclude_sts")
    intervention_alts_params = _params("exclude_sts")

    jobs = [
        ("condition", conditions, condition_params),
        ("intervention", interventions, intervention_params),
        ("intervention_alternate", interventions_alts, intervention_alts_params),
    ]

    if use_original_terms:
        print("MetaMap version >= 2020, conduct mapping on original terms")
    else:
        print("MetaMap version < 2020, conduct mapping on terms after removing ascii characters")

    # open mapping cache to add mapped terms
    mapping_filename = "mapping_cache.tsv"
    col_names = ['mapping_tool', 'term_type', 'clintrial_term', 'input_term', 'mapping_tool_response', 'score']
    # A brand-new or empty cache needs the header row that read_csv expects below.
    needs_header = (not os.path.exists(mapping_filename)) or os.path.getsize(mapping_filename) == 0

    # utf-8 matches what pandas uses by default to read and rewrite this file;
    # the with-block guarantees the handle is closed even if mapping fails.
    with open(mapping_filename, 'a', newline='', encoding='utf-8') as output:
        csv_writer = csv.writer(output, delimiter='\t')
        if needs_header:
            csv_writer.writerow(col_names)

        for term_type, terms, params in jobs:
            if use_original_terms:
                input_terms = terms
            else:
                # A plain loop (rather than np.vectorize) also works when there are no terms.
                input_terms = [de_ascii_er(term) for term in terms]
            # Each pair is (term as it appears in the trial data, term actually sent to the mappers).
            parallelize_mappers(list(zip(terms, input_terms)), params, term_type, csv_writer)

    # Remove duplicate rows from cache
    cache = pd.read_csv(mapping_filename, sep='\t', index_col=False, header=0, on_bad_lines = 'warn')
    cache = cache.drop_duplicates()
    cache.to_csv(mapping_filename, sep="\t", index=False, header=True) # output deduplicated cache terms to TSV


In [None]:
# # def score_metamap_mappings():


# header = True
# with pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, on_bad_lines = 'warn', chunksize=1000) as reader:
#     for chunk in reader:
#         chunk["scored"] = np.where(~chunk["score"].isnull(), chunk["score"],
#                                    np.where((chunk.score.isnull())&(chunk.mapping_tool == "metamap"),
#         df['d'] = np.where(df.a.isnull(),
#          np.nan,
#          np.where((df.b == "N")&(~df.c.isnull()),
#                   df.a*df.c,
#                   df.a))
        
        
#         for i, row in chunk.iterrows():
#             print(type(row["score"]))
#             if not row["score"].isnull():
#                 print(i)
# #                 break
#             elif pd.isnull(row["score"]) and row["mapping_tool"] == "metamap":
#                 mm_dict = ast.literal_eval(row["mapping_tool_response"])
#                 mapped_term = mm_dict['metamap_preferred_name']
#                 sort_ratio_score = get_token_sort_ratio(row["clintrial_term"], mapped_term)
#                 similarity_score = get_similarity_score(row["clintrial_term"], mapped_term)
#                 max_score = max(sort_ratio_score, similarity_score)
#                 chunk.at[i, "score"] = max_score
#             elif pd.isnull(row["score"]) and row["mapping_tool"] == "nameresolver":
#                 break
            
#             chunk.to_csv("mapping_cache_scored.tsv", header=header, sep="\t", index=False, mode='a+')
#             header = False

#       header = True
# for chunk in chunks:

#     chunk.to_csv(os.path.join(folder, new_folder, "new_file_" + filename),
#         header=header, cols=[['TIME','STUFF']], mode='a')

#     header = False          
                
            
# original_clintrial_term = row["clintrial_term"]


In [13]:
# Driver cell: fetch the raw clinical-trial data, locate MetaMap, and map any terms not yet in the cache.
flag_and_path = get_raw_ct_data() # download raw data
# metamap_dirs is a module-level name read by parallelize_mappers and term_list_to_mappers.
# (No `global` statement is needed here: a top-level assignment is already global.)
metamap_dirs = check_os()
subset_size = 20 # passed to read_raw_ct_data; presumably limits how much data is read -- confirm there
df_dict = read_raw_ct_data(flag_and_path, subset_size) # read the clinical trial data
dict_new_terms = check_against_cache(df_dict) # use the existing cache of MetaMapped terms so that only new terms are mapped
term_list_to_mappers(dict_new_terms)

Attempting download of Clinical Trial data as of 02_21_2024

Failed to scrape AACT for download. Please navigate to https://aact.ctti-clinicaltrials.org/download and manually download zip file.
Please store the downloaded zip in the /data directory. This should be the only item besides the cache file, condition manual review file, and intervention manual review file, in the directory at this time.


Type Done when done:  Done


File found at: 
/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/8vstm2enpo0ocbo2z7oypqurhgmz.zip
Please make sure this the correct zip file from AACT
Unzipping data into
/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/02_20_2024_extracted
MetaMap version < 2020, conduct mapping on terms after removing ascii characters


% conditions mapped:   5%|█                   | 1/19 [00:11<03:27, 11.

['metamap', 'condition', 'pneumoperitoneum [c06.844.670]', 'pneumoperitoneum [c06.844.670]', {'metamap_preferred_name': 'Pneumoperitoneum', 'metamap_cui': 'C0032320', 'metamap_score': '9.87', 'metamap_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'bacteremia', 'bacteremia', {'metamap_preferred_name': 'Bacteremia', 'metamap_cui': 'C0004610', 'metamap_score': '19.41', 'metamap_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'bacteremia', 'bacteremia', {'metamap_preferred_name': 'Bacteremia, CTCAE', 'metamap_cui': 'C4553946', 'metamap_score': '3.64', 'metamap_semtypes': '[fndg]'}, None]
['metamap', 'condition', 'cataract', 'cataract', {'metamap_preferred_name': 'Cataract', 'metamap_cui': 'C0086543', 'metamap_score': '9.95', 'metamap_semtypes': '[acab]'}, None]
['metamap', 'condition', 'cataract', 'cataract', {'metamap_preferred_name': 'Cataract, CTCAE 5.0', 'metamap_cui': 'C4555209', 'metamap_score': '3.64', 'metamap_semtypes': '[fndg]'}, None]
['metamap', 'condition', 'ulcera

% conditions mapped:  37%|███████▎            | 7/19 [00:13<00:17,  1.

['metamap', 'condition', 'carcinoma, renal cell', 'carcinoma, renal cell', {'metamap_preferred_name': 'Renal Cell Carcinoma', 'metamap_cui': 'C0007134', 'metamap_score': '19.49', 'metamap_semtypes': '[neop]'}, None]
['metamap', 'condition', 'cardiovascular diseases', 'cardiovascular diseases', {'metamap_preferred_name': 'Cardiovascular Diseases', 'metamap_cui': 'C0007222', 'metamap_score': '3.72', 'metamap_semtypes': '[dsyn]'}, None]
['nameresolver', 'condition', 'character', 'character', {'nameresolver_preferred_name': 'Character change', 'nameresolver_cui': 'UMLS:C0235181', 'nameresolver_score': 20.734428, 'nameresolver_semtypes': 'biolink:Disease'}, None]
['metamap', 'condition', 'hodgkin disease', 'hodgkin disease', {'metamap_preferred_name': 'Hodgkin Disease', 'metamap_cui': 'C0019829', 'metamap_score': '16.33', 'metamap_semtypes': '[neop]'}, None]


% conditions mapped:  58%|███████████▌        | 11/19 [00:17<00:10,  1

['nameresolver', 'condition', 'healthy volunteers', 'healthy volunteers', {'nameresolver_preferred_name': 'Healthy Volunteers', 'nameresolver_cui': 'UMLS:C1708335', 'nameresolver_score': 102.17507, 'nameresolver_semtypes': 'biolink:PopulationOfIndividualOrganisms'}, None]
['metamap', 'condition', 'electromagnetic interference', 'electromagnetic interference', {'metamap_preferred_name': 'Electromagnetic Interference Associated with Medical Device', 'metamap_cui': 'C1880489', 'metamap_score': '3.72', 'metamap_semtypes': '[fndg]'}, None]
['metamap', 'condition', 'electromagnetic interference', 'electromagnetic interference', {'metamap_preferred_name': 'Electromagnetic interference', 'metamap_cui': 'C1504600', 'metamap_score': '3.72', 'metamap_semtypes': '[fndg]'}, None]
['metamap', 'condition', 'polyps gallbladder', 'polyps gallbladder', {'metamap_preferred_name': 'Polyp of gallbladder', 'metamap_cui': 'C0262493', 'metamap_score': '3.72', 'metamap_semtypes': '[neop]'}, None]
['metamap', '

% conditions mapped:  79%|███████████████▊    | 15/19 [00:19<00:03,  1

['metamap', 'condition', 'high altitude cerebral edema', 'high altitude cerebral edema', {'metamap_preferred_name': 'High altitude cerebral edema', 'metamap_cui': 'C0472390', 'metamap_score': '3.81', 'metamap_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'hypoventilation', 'hypoventilation', {'metamap_preferred_name': 'Hypoventilation', 'metamap_cui': 'C3203358', 'metamap_score': '13.10', 'metamap_semtypes': '[patf]'}, None]
['metamap', 'condition', 'peripheral artery disease', 'peripheral artery disease', {'metamap_preferred_name': 'Peripheral Arterial Diseases', 'metamap_cui': 'C1704436', 'metamap_score': '19.54', 'metamap_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'peripheral artery disease', 'peripheral artery disease', {'metamap_preferred_name': 'Peripheral arterial stenosis', 'metamap_cui': 'C4025272', 'metamap_score': '3.77', 'metamap_semtypes': '[dsyn]'}, None]


% conditions mapped:  95%|██████████████████▉ | 18/19 [00:20<00:00,  1

['metamap', 'condition', 'microvascular angina', 'microvascular angina', {'metamap_preferred_name': 'Microvascular Angina', 'metamap_cui': 'C0206064', 'metamap_score': '16.33', 'metamap_semtypes': '[dsyn]'}, None]
['metamap', 'condition', 'breast cancer female', 'breast cancer female', {'metamap_preferred_name': 'Malignant neoplasm of female breast', 'metamap_cui': 'C0235653', 'metamap_score': '3.77', 'metamap_semtypes': '[neop]'}, None]


% conditions mapped: 100%|████████████████████| 19/19 [00:23<00:00,  1
% interventions mapped:   5%|█                   | 1/19 [00:09<02:53, 

['metamap', 'intervention', 'etoposide', 'etoposide', {'metamap_preferred_name': 'Etoposide', 'metamap_cui': 'C0015133', 'metamap_score': '28.87', 'metamap_semtypes': '[orch,phsu]'}, None]


% interventions mapped:  11%|██                  | 2/19 [00:10<01:18, 

['metamap', 'intervention', 'rotem-algorithm', 'rotem-algorithm', {'metamap_preferred_name': 'algorithm', 'metamap_cui': 'C0002045', 'metamap_score': '9.90', 'metamap_semtypes': '[inpr]'}, None]
['metamap', 'intervention', 'rotem-algorithm', 'rotem-algorithm', {'metamap_preferred_name': 'Observation Method - algorithm', 'metamap_cui': 'C1553907', 'metamap_score': '3.59', 'metamap_semtypes': '[ftcn]'}, None]
ATTEMPTING NAME RESOLVER HERE
['metamap', 'intervention', 'nasal glucagon (ng)', 'nasal glucagon (ng)', {'metamap_preferred_name': None, 'metamap_cui': None, 'metamap_score': None, 'metamap_semtypes': None}, None]
['metamap', 'intervention', 'nasal glucagon (ng)', 'nasal glucagon (ng)', {'metamap_preferred_name': 'Glucagon', 'metamap_cui': 'C0017687', 'metamap_score': '19.36', 'metamap_semtypes': '[aapp,horm,phsu]'}, None]
['metamap', 'intervention', 'nasal glucagon (ng)', 'nasal glucagon (ng)', {'metamap_preferred_name': 'Nose', 'metamap_cui': 'C0028429', 'metamap_score': '12.91', 

% interventions mapped:  32%|██████▎             | 6/19 [00:12<00:17, 

['metamap', 'intervention', 'ropivacaine 20ml 5mg/ml+ placebo', 'ropivacaine 20ml 5mg/ml+ placebo', {'metamap_preferred_name': 'ropivacaine', 'metamap_cui': 'C0073571', 'metamap_score': '16.04', 'metamap_semtypes': '[orch,phsu]'}, None]
['metamap', 'intervention', 'ropivacaine 20ml 5mg/ml+ placebo', 'ropivacaine 20ml 5mg/ml+ placebo', {'metamap_preferred_name': 'Placebos', 'metamap_cui': 'C0032042', 'metamap_score': '6.71', 'metamap_semtypes': '[topp]'}, None]
['metamap', 'intervention', 'ropivacaine 20ml 5mg/ml+ placebo', 'ropivacaine 20ml 5mg/ml+ placebo', {'metamap_preferred_name': 'Placebo Control', 'metamap_cui': 'C1706408', 'metamap_score': '3.55', 'metamap_semtypes': '[resa]'}, None]
['metamap', 'intervention', 'ropivacaine 20ml 5mg/ml+ placebo', 'ropivacaine 20ml 5mg/ml+ placebo', {'metamap_preferred_name': 'placebo', 'metamap_cui': 'C1696465', 'metamap_score': '3.55', 'metamap_semtypes': '[bodm]'}, None]
['metamap', 'intervention', 'ropivacaine 20ml 5mg/ml+ placebo', 'ropivaca

% interventions mapped:  42%|████████▍           | 8/19 [00:13<00:11, 

['metamap', 'intervention', 'telemedicine based healthcare programme', 'telemedicine based healthcare programme', {'metamap_preferred_name': 'Telemedicine', 'metamap_cui': 'C0162648', 'metamap_score': '12.89', 'metamap_semtypes': '[hlca]'}, None]
['metamap', 'intervention', 'telemedicine based healthcare programme', 'telemedicine based healthcare programme', {'metamap_preferred_name': 'health care program', 'metamap_cui': 'C0679897', 'metamap_score': '3.61', 'metamap_semtypes': '[hlca]'}, None]
['metamap', 'intervention', 'telemedicine based healthcare programme', 'telemedicine based healthcare programme', {'metamap_preferred_name': 'Base - General Qualifier', 'metamap_cui': 'C1705938', 'metamap_score': '3.43', 'metamap_semtypes': '[idcn]'}, None]
['metamap', 'intervention', 'telemedicine based healthcare programme', 'telemedicine based healthcare programme', {'metamap_preferred_name': 'Basis - conceptual entity', 'metamap_cui': 'C1527178', 'metamap_score': '3.43', 'metamap_semtypes': 

% interventions mapped:  53%|██████████▌         | 10/19 [00:16<00:10,

['metamap', 'intervention', 'surgery', 'surgery', {'metamap_preferred_name': 'Surgical aspects', 'metamap_cui': 'C0038895', 'metamap_score': '22.57', 'metamap_semtypes': '[ftcn]'}, None]
['metamap', 'intervention', 'surgery', 'surgery', {'metamap_preferred_name': 'General surgery specialty', 'metamap_cui': 'C1274039', 'metamap_score': '13.10', 'metamap_semtypes': '[bmod]'}, None]
['metamap', 'intervention', 'surgery', 'surgery', {'metamap_preferred_name': 'Operative Surgical Procedures', 'metamap_cui': 'C0543467', 'metamap_score': '3.64', 'metamap_semtypes': '[topp]'}, None]
['metamap', 'intervention', 'surgery', 'surgery', {'metamap_preferred_name': 'Surgery specialty', 'metamap_cui': 'C0038894', 'metamap_score': '3.64', 'metamap_semtypes': '[bmod]'}, None]
['metamap', 'intervention', 'multifunctional birthing ball', 'multifunctional birthing ball', {'metamap_preferred_name': 'Birthing ball', 'metamap_cui': 'C4068784', 'metamap_score': '3.64', 'metamap_semtypes': '[medd]'}, None]
ATTE

% interventions mapped:  63%|████████████▋       | 12/19 [00:17<00:07,

['nameresolver', 'intervention', 'atg', 'atg', {'nameresolver_preferred_name': 'ATG 016', 'nameresolver_cui': 'UMLS:C5557451', 'nameresolver_score': 19.83688, 'nameresolver_semtypes': 'biolink:ChemicalEntity'}, None]
['metamap', 'intervention', 'use of the device dbl-4pen', 'use of the device dbl-4pen', {'metamap_preferred_name': 'Use of', 'metamap_cui': 'C1524063', 'metamap_score': '3.58', 'metamap_semtypes': '[ftcn]'}, None]
['metamap', 'intervention', 'use of the device dbl-4pen', 'use of the device dbl-4pen', {'metamap_preferred_name': 'Devices', 'metamap_cui': 'C0699733', 'metamap_score': '3.42', 'metamap_semtypes': '[mnob]'}, None]
['metamap', 'intervention', 'use of the device dbl-4pen', 'use of the device dbl-4pen', {'metamap_preferred_name': 'Medical Devices', 'metamap_cui': 'C0025080', 'metamap_score': '3.42', 'metamap_semtypes': '[medd]'}, None]
['metamap', 'intervention', 'resistance training', 'resistance training', {'metamap_preferred_name': 'Resistance Training', 'metama

% interventions mapped:  89%|█████████████████▉  | 17/19 [00:19<00:01,

['metamap', 'intervention', 'emergency vital room', 'emergency vital room', {'metamap_preferred_name': 'Accident and Emergency department', 'metamap_cui': 'C0562508', 'metamap_score': '19.39', 'metamap_semtypes': '[hcro,mnob]'}, None]
['metamap', 'intervention', 'emergency vital room', 'emergency vital room', {'metamap_preferred_name': 'Vital (qualifier value)', 'metamap_cui': 'C0442732', 'metamap_score': '3.44', 'metamap_semtypes': '[qlco]'}, None]
['metamap', 'intervention', 'emergency vital room', 'emergency vital room', {'metamap_preferred_name': 'Vital High Nitrogen Enteral Nutrition', 'metamap_cui': 'C1875856', 'metamap_score': '3.44', 'metamap_semtypes': '[food]'}, None]
['metamap', 'intervention', 'physiotherapy combined with pain education', 'physiotherapy combined with pain education', {'metamap_preferred_name': 'Combined physical therapy', 'metamap_cui': 'C0700642', 'metamap_score': '3.58', 'metamap_semtypes': '[topp]'}, None]
['metamap', 'intervention', 'physiotherapy combi

% interventions mapped: 100%|████████████████████| 19/19 [00:20<00:00,

['metamap', 'intervention', 'indomethacin(kahira pharma&chem,ind,co.cairo-egypt)', 'indomethacin(kahira pharma&chem,ind,co.cairo-egypt)', {'metamap_preferred_name': 'Indomethacin', 'metamap_cui': 'C0021246', 'metamap_score': '16.16', 'metamap_semtypes': '[orch,phsu]'}, None]
['metamap', 'intervention', 'indomethacin(kahira pharma&chem,ind,co.cairo-egypt)', 'indomethacin(kahira pharma&chem,ind,co.cairo-egypt)', {'metamap_preferred_name': 'Colombia', 'metamap_cui': 'C3245499', 'metamap_score': '12.88', 'metamap_semtypes': '[geoa]'}, None]
['metamap', 'intervention', 'indomethacin(kahira pharma&chem,ind,co.cairo-egypt)', 'indomethacin(kahira pharma&chem,ind,co.cairo-egypt)', {'metamap_preferred_name': 'Egypt', 'metamap_cui': 'C0013715', 'metamap_score': '12.88', 'metamap_semtypes': '[geoa]'}, None]
['metamap', 'intervention', 'indomethacin(kahira pharma&chem,ind,co.cairo-egypt)', 'indomethacin(kahira pharma&chem,ind,co.cairo-egypt)', {'metamap_preferred_name': 'Chemical procedure', 'metam

% interventions mapped: 100%|████████████████████| 19/19 [00:22<00:00,
% intervention_alternates mapped:   5%|█                   | 1/19 [00:

['metamap', 'intervention_alternate', 'methylthioninium chloride', 'methylthioninium chloride', {'metamap_preferred_name': 'Methylene blue', 'metamap_cui': 'C0025746', 'metamap_score': '16.33', 'metamap_semtypes': '[irda,orch,phsu]'}, None]
['metamap', 'intervention_alternate', 'progesterone', 'progesterone', {'metamap_preferred_name': 'Progesterone', 'metamap_cui': 'C0033308', 'metamap_score': '22.57', 'metamap_semtypes': '[horm,orch,phsu]'}, None]
['metamap', 'intervention_alternate', 'progesterone', 'progesterone', {'metamap_preferred_name': 'Progesterone [EPC]', 'metamap_cui': 'C2757070', 'metamap_score': '3.64', 'metamap_semtypes': '[phsu]'}, None]
['metamap', 'intervention_alternate', 'erchonia evrl laser', 'erchonia evrl laser', {'metamap_preferred_name': 'Lasers', 'metamap_cui': 'C0023089', 'metamap_score': '9.88', 'metamap_semtypes': '[mnob]'}, None]
['metamap', 'intervention_alternate', 'erchonia evrl laser', 'erchonia evrl laser', {'metamap_preferred_name': 'Laser Device Com

% intervention_alternates mapped:  32%|██████▎             | 6/19 [00:

['metamap', 'intervention_alternate', 'budesonide via mucosal atomization device', 'budesonide via mucosal atomization device', {'metamap_preferred_name': 'Budesonide', 'metamap_cui': 'C0054201', 'metamap_score': '22.48', 'metamap_semtypes': '[orch,phsu]'}, None]
['metamap', 'intervention_alternate', 'budesonide via mucosal atomization device', 'budesonide via mucosal atomization device', {'metamap_preferred_name': 'Mucous Membrane', 'metamap_cui': 'C0026724', 'metamap_score': '9.74', 'metamap_semtypes': '[tisu]'}, None]
['metamap', 'intervention_alternate', 'budesonide via mucosal atomization device', 'budesonide via mucosal atomization device', {'metamap_preferred_name': 'Devices', 'metamap_cui': 'C0699733', 'metamap_score': '3.43', 'metamap_semtypes': '[mnob]'}, None]
['metamap', 'intervention_alternate', 'budesonide via mucosal atomization device', 'budesonide via mucosal atomization device', {'metamap_preferred_name': 'Medical Devices', 'metamap_cui': 'C0025080', 'metamap_score': 

% intervention_alternates mapped:  58%|███████████▌        | 11/19 [00

['metamap', 'intervention_alternate', 'matched placebo', 'matched placebo', {'metamap_preferred_name': 'Placebos', 'metamap_cui': 'C0032042', 'metamap_score': '6.74', 'metamap_semtypes': '[topp]'}, None]
['metamap', 'intervention_alternate', 'matched placebo', 'matched placebo', {'metamap_preferred_name': 'Placebo Control', 'metamap_cui': 'C1706408', 'metamap_score': '3.59', 'metamap_semtypes': '[resa]'}, None]
['metamap', 'intervention_alternate', 'matched placebo', 'matched placebo', {'metamap_preferred_name': 'placebo', 'metamap_cui': 'C1696465', 'metamap_score': '3.59', 'metamap_semtypes': '[bodm]'}, None]
['metamap', 'intervention_alternate', 'matched placebo', 'matched placebo', {'metamap_preferred_name': 'MATCHING', 'metamap_cui': 'C0150103', 'metamap_score': '3.45', 'metamap_semtypes': '[resa]'}, None]
['metamap', 'intervention_alternate', 'matched placebo', 'matched placebo', {'metamap_preferred_name': 'Match', 'metamap_cui': 'C1708943', 'metamap_score': '3.45', 'metamap_semty

% intervention_alternates mapped:  79%|███████████████▊    | 15/19 [00

['metamap', 'intervention_alternate', 'early mobilization', 'early mobilization', {'metamap_preferred_name': 'Early Mobilization', 'metamap_cui': 'C0013459', 'metamap_score': '19.49', 'metamap_semtypes': '[topp]'}, None]


% intervention_alternates mapped:  95%|██████████████████▉ | 18/19 [00

['metamap', 'intervention_alternate', 'bms-936558', 'bms-936558', {'metamap_preferred_name': 'BMS-936558', 'metamap_cui': 'C4552119', 'metamap_score': '28.95', 'metamap_semtypes': '[aapp,imft,phsu]'}, None]
['metamap', 'intervention_alternate', 'botox cosmetic', 'botox cosmetic', {'metamap_preferred_name': 'Cosmetics', 'metamap_cui': 'C0010164', 'metamap_score': '9.76', 'metamap_semtypes': '[mnob]'}, None]
['metamap', 'intervention_alternate', 'botox cosmetic', 'botox cosmetic', {'metamap_preferred_name': 'Botox', 'metamap_cui': 'C0700702', 'metamap_score': '3.59', 'metamap_semtypes': '[orch,phsu]'}, None]
['metamap', 'intervention_alternate', 'botox cosmetic', 'botox cosmetic', {'metamap_preferred_name': 'Cosmetic procedure', 'metamap_cui': 'C0442965', 'metamap_score': '3.45', 'metamap_semtypes': '[topp]'}, None]
['metamap', 'intervention_alternate', 'g-csf', 'g-csf', {'metamap_preferred_name': 'Granulocyte Colony-Stimulating Factor', 'metamap_cui': 'C0079459', 'metamap_score': '22.64

% intervention_alternates mapped: 100%|████████████████████| 19/19 [00


In [None]:

# Draft of the full pipeline (this cell has no execution count): download, read, map, then
# attach the mappings to trials, score them, and auto-select CURIEs.
flag_and_path = get_raw_ct_data() # download raw data

# NOTE(review): `global` at module level has no effect; the assignment below is already global.
global metamap_dirs
metamap_dirs = check_os()
# NOTE(review): subset_size is not assigned in this cell; it is set (to 20) in the executed cell above.
df_dict = read_raw_ct_data(flag_and_path, subset_size) # read the clinical trial data
# NOTE(review): the executed cell above calls check_against_cache(df_dict) with a single argument -- confirm which signature is current.
dict_new_terms = check_against_cache(df_dict, flag_and_path) # use the existing cache of MetaMapped terms so that only new terms are mapped

# NOTE(review): the mapping entry point defined in cell 12 is term_list_to_mappers(dict_new_terms); confirm term_list_to_mm still exists with this signature.
term_list_to_mm(dict_new_terms, flag_and_path) # map new terms using MetaMap

map_to_trial(flag_and_path) # map MetaMap terms back to trial 
score_mappings(flag_and_path) # score the mappings
auto_select_curies(flag_and_path) # select CURIEs automatically that pass score threshold

# NOTE(review): the description on the disabled call below repeats the one for auto_select_curies -- confirm what compile_curies_for_trials does.
# compile_curies_for_trials(flag_and_path) # select CURIEs automatically that pass score threshold

