In [45]:
# display cells to maximum width 
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))

# lets you preint multiple outputs per cell, not just last
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [212]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
import re
import collections
import os
import json
import numpy as np
import pickle
from functools import reduce
import time
from time import sleep
import concurrent
import multiprocessing
import datetime as dt
from datetime import date
import pathlib
import configparser
import sys
import urllib
import zipfile
import csv
sys.path.insert(0, '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/pymetamap-master')
from pymetamap import MetaMap  # https://github.com/AnthonyMRios/pymetamap/blob/master/pymetamap/SubprocessBackend.py

In [213]:
# %pip install thefuzz
# %pip install levenshtein

from thefuzz import fuzz # fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

In [214]:
global metamap_dirs
global metamap_pos_server_dir
global metamap_wsd_server_dir


In [215]:
# fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

def get_token_sort_ratio(str1, str2):
    try:
        return fuzz.token_sort_ratio(str1, str2)
    except:
        return None
    
sort_ratio = np.vectorize(get_token_sort_ratio)

def get_token_set_ratio(str1, str2):
    try:
        return fuzz.token_set_ratio(str1, str2)
    except:
        return None  
set_ratio = np.vectorize(get_token_set_ratio)

def get_similarity_score(str1, str2):
    try:
        return fuzz.ratio(str1, str2)
    except:
        return None
sim_score = np.vectorize(get_similarity_score)

In [216]:
def get_raw_ct_data():
    term_program_flag = True
    global data_dir
    global data_extracted
    
    # get all the links and associated dates of upload into a dict called date_link
    url_all = "https://aact.ctti-clinicaltrials.org/pipe_files"
    response = requests.get(url_all)
    soup = BeautifulSoup(response.text)
    body = soup.find_all('option') #Find all
    date_link = {}
    for el in body:
        tags = el.find('a')
        try:
            zip_name = tags.contents[0].split()[0]
            date = zip_name.split("_")[0]
            date = dt.datetime.strptime(date, '%Y%m%d').date()
            date_link[date] = tags.get('href')
        except:
            pass
    latest_file_date = max(date_link.keys())   # get the date of the latest upload
    url = date_link[latest_file_date]   # get the corresponding download link of the latest upload so we can download the raw data
    date_string = latest_file_date.strftime("%m_%d_%Y")
    data_dir = "{}/data".format(pathlib.Path.cwd())
    data_extracted = data_dir + "/{}_extracted".format(date_string)
    data_path = "{}/{}_pipe-delimited-export.zip".format(data_dir, date_string)
    
    if not os.path.exists(data_path):   # if folder containing most recent data doesn't exist, download and extract it into data folder
        
        term_program_flag = False   # flag below for terminating program if latest download exists (KG is assumed up to date)
        print("Downloading Clinical Trial data as of {}".format(date_string))
        response = requests.get(url)
        if response.status_code == 200:
            with open(data_path, 'wb') as file:
                file.write(response.content)
            print("Finished download of zip")
            with zipfile.ZipFile(data_path, 'r') as download:
                print("Unzipping data")
                download.extractall(data_extracted)
        else:
            print("KG is already up to date.")
    return {"term_program_flag": term_program_flag, "data_extracted_path": data_extracted, "date_string": date_string}



In [223]:
def read_raw_ct_data(flag_and_path):
    if flag_and_path["term_program_flag"]:
        print("Exiting program. Assuming KG has already been constructed from most recent data dump from AACT.")
#         exit()
#         pass
    else:
        data_extracted = flag_and_path["data_extracted_path"]
        # read in pipe-delimited files 
        conditions_df = pd.read_csv(data_extracted + '/conditions.txt', sep='|', index_col=False, header=0)
        interventions_df = pd.read_csv(data_extracted + '/interventions.txt', sep='|', index_col=False, header=0)
        interventions_alts = pd.read_csv(data_extracted + '/intervention_other_names.txt', sep='|', index_col=False, header=0)

#         browse_conditions_df = pd.read_csv(data_extracted + '/browse_conditions.txt', sep='|', index_col=False, header=0)
#         browse_interventions_df = pd.read_csv(data_extracted + '/browse_interventions.txt', sep='|', index_col=False, header=0)
        
    ### GET RID OF....CHEAT LINE FOR TESTING
        conditions_df = conditions_df.iloc[:500]
        interventions_df = interventions_df.iloc[:500]

    return {"conditions": conditions_df, "interventions": interventions_df, "interventions_alts": interventions_alts
#             "browse_conditions": browse_conditions_df, "browse_interventions": browse_interventions_df
           }



In [218]:
def de_ascii_er(text):
    non_ascii = "[^\x00-\x7F]"
    pattern = re.compile(r"[^\x00-\x7F]")
    non_ascii_text = re.sub(pattern, ' ', text)
    return non_ascii_text

In [219]:
def start_metamap_servers(metamap_dirs):
    metamap_pos_server_dir = 'bin/skrmedpostctl' # Part of speech tagger
    metamap_wsd_server_dir = 'bin/wsdserverctl' # Word sense disambiguation 
    
    # Start servers
    os.system(metamap_dirs['metamap_base_dir'] + metamap_pos_server_dir + ' start') # Part of speech tagger
    os.system(metamap_dirs['metamap_base_dir'] + metamap_wsd_server_dir + ' start') # Word sense disambiguation 
    # # Sleep a bit to give time for these servers to start up
    sleep(5)

def stop_metamap_servers(metamap_dirs):
    metamap_pos_server_dir = 'bin/skrmedpostctl' # Part of speech tagger
    metamap_wsd_server_dir = 'bin/wsdserverctl' # Word sense disambiguation 
    # Stop servers
    os.system(metamap_dirs['metamap_base_dir'] + metamap_pos_server_dir + ' stop') # Part of speech tagger
    os.system(metamap_dirs['metamap_base_dir'] + metamap_wsd_server_dir + ' stop') # Word sense disambiguation 
        

In [220]:
def check_os():
    if "linux" in sys.platform:
        print("Linux platform detected")
        metamap_base_dir = "{}/metamap/".format(pathlib.Path.cwd().parents[0])
        metamap_bin_dir = 'bin/metamap20'
    else:
        metamap_base_dir = '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/' # for running on local
        metamap_bin_dir = 'bin/metamap18'
        
    return {"metamap_base_dir":metamap_base_dir, "metamap_bin_dir":metamap_bin_dir}
        

In [235]:
# def run_metamap(input_term, params, mm, cond_or_inter, csv_writer):
#     from_metamap = []
#     try:
#         concepts,error = mm.extract_concepts([input_term],
# #                                              restrict_to_sts = params["restrict_to_sts"],
#                                              term_processing = params["term_processing"],
#                                              ignore_word_order = params["ignore_word_order"],
#                                              strict_model = params["strict_model"],
#                                              exclude_sts = params["exclude_sts"]
#                                             )

#         for concept in concepts:
#             concept_info = []
#             concept = concept._asdict()
#             concept_info.extend([cond_or_inter,input_term])
#             concept_info.extend([concept.get(k) for k in ['preferred_name', 'cui', 'score', 'semtypes']])
#             from_metamap.append(concept_info)
#     except:
#         from_metamap.extend([input_term, None, None, None, None, None, None])
#     for result in from_metamap:
#         print(result)
#         csv_writer.writerow(result)
#     return from_metamap

In [250]:
def run_metamap(input_term, params, mm, cond_or_inter, csv_writer):
    from_metamap = []
    if params.get("exclude_sts") is None: # exclude_sts is used for Interventions. restrict_to_sts is used for Conditions. So, the logic is, if we're mapping Conditions, execute "if" part of code. If we're mapping Interventions, execute "else" part of code
        try:
            concepts,error = mm.extract_concepts([input_term],
                                                 restrict_to_sts = params["restrict_to_sts"],
                                                 term_processing = params["term_processing"],
                                                 ignore_word_order = params["ignore_word_order"],
                                                 strict_model = params["strict_model"],
                                                )

            for concept in concepts:
                concept_info = []
                concept = concept._asdict()
                concept_info.extend([cond_or_inter,input_term])
                concept_info.extend([concept.get(k) for k in ['preferred_name', 'cui', 'score', 'semtypes']])
                from_metamap.append(concept_info)
        except:
            from_metamap.extend([input_term, None, None, None, None, None, None])
    else:
        try:
            concepts,error = mm.extract_concepts([input_term],
                                                 exclude_sts = params["exclude_sts"],
                                                 term_processing = params["term_processing"],
                                                 ignore_word_order = params["ignore_word_order"],
                                                 strict_model = params["strict_model"],
                                                )

            for concept in concepts:
                concept_info = []
                concept = concept._asdict()
                concept_info.extend([cond_or_inter,input_term])
                concept_info.extend([concept.get(k) for k in ['preferred_name', 'cui', 'score', 'semtypes']])
                from_metamap.append(concept_info)
        except:
            from_metamap.extend([input_term, None, None, None, None, None, None])
        
    for result in from_metamap:
#         print(result)
        csv_writer.writerow(result)
    return from_metamap

In [59]:
# this cell tests if MetaMap is working on the term list below
# terms = ['infarction, myocardial', 'aneurysm', 'diabetes', 'common cold', 'fracture', 'juice blend', "hormones"]

# condition_semantic_type_restriction = ['acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,clna,fndg']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
# params = {"restrict_to_sts":condition_semantic_type_restriction, "term_processing":True, "ignore_word_order":True, "strict_model":False}
# start_metamap_servers(metamap_dirs) # start the MetaMap servers
# mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
# cond_or_inter = "condition"

# # prep file that stores MetaMap output
# col_names = ['term_type', 'clin_trial_term','metamap_preferred_name', 'metamap_cui', 'metamap_score', 'metamap_semantic_type']
# # metamap_output = open("metamap_output().tsv".format(flag_and_path["date_string"]), 'w+', newline='') 
# metamap_output = open("metamap_output.tsv", 'w+', newline='') 
# csv_writer = csv.writer(metamap_output, delimiter='\t')
# csv_writer.writerow(col_names)

# for term in terms:
# #     test = run_metamap(term, params, mm, cond_or_inter)
#     run_metamap(term, params, mm, cond_or_inter, csv_writer)
# metamap_output.close()    
# stop_metamap_servers(metamap_dirs) # stop the MetaMap servers


In [248]:
def parallelize_metamap(term_list, params, cond_or_inter, flag_and_path, csv_writer):
    start_metamap_servers(metamap_dirs) # start the MetaMap servers
    mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
    with concurrent.futures.ThreadPoolExecutor((multiprocessing.cpu_count()*2) - 1) as executor:
        _ = [executor.submit(run_metamap, term, params, mm, cond_or_inter, csv_writer) for term in term_list]
    stop_metamap_servers(metamap_dirs) # stop the MetaMap servers
    


# USE METAMAP LOCAL TO MAP REMAINING TERMS

In [251]:
def term_list_to_mm(df_dict, flag_and_path):
    
    # -------    CONDITIONS    ------- #
    print("Using UMLS MetaMap to get mappings for CONDITIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.")
    unmapped_conditions = df_dict["conditions"].downcase_name
    unmapped_conditions = list(unmapped_conditions.unique())
    unmapped_conditions = list(filter(None, unmapped_conditions))

    deasciied_unmapped_conditions = list(map(de_ascii_er, unmapped_conditions)) # Metamap 2020 does not need de_asciier. Metamap 2018 and prior does. 

    conditions = pd.DataFrame({'original_unmapped_conditions': unmapped_conditions, 'de_asciied_unmapped_conditions': deasciied_unmapped_conditions})
    
    # some input terms have () with additional text, like an abbreviation, in them. split them out to facilitate better mapping 
    pattern_outisde = r'(?<=\().+?(?=\))|([^(]+)'
    pattern_inside = r'\(([^()]+)\)|([^(]+)'

    matches_outside = conditions['original_unmapped_conditions'].str.extract(pattern_outisde)
    conditions['original_condition_split_1'] = matches_outside[0].fillna('')
    matches_inside = conditions['original_unmapped_conditions'].str.extract(pattern_inside)
    conditions['original_condition_split_2'] = matches_inside[0].fillna('')

    matches_outside = conditions['de_asciied_unmapped_conditions'].str.extract(pattern_outisde)
    conditions['de_asciied_conditions_split_1'] = matches_outside[0].fillna('')
    matches_inside = conditions['de_asciied_unmapped_conditions'].str.extract(pattern_inside)
    conditions['de_asciied_conditions_split_2'] = matches_inside[0].fillna('')
    
    # see MetaMap Usage instructions: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MM_2016_Usage.pdf
#      condition_args = ['--sldi -I -C -J acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf -z -i -f']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
    condition_semantic_type_restriction = ['acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,clna,fndg']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
    params = {"restrict_to_sts":condition_semantic_type_restriction, "term_processing":True, "ignore_word_order":True, "strict_model":False} # strict_model and relaxed_model are presumably opposites? relaxed_model = True is what I want, but that option appears to be broken in Pymetamap (returns no results when used). Using strict_model = False instead...
    
    # prep output file of Metamap results
    date = flag_and_path["date_string"]
    filename = f"metamap_output_{date}.tsv"
    metamap_output = open(filename, 'w+', newline='')
    col_names = ['term_type', 'clin_trial_term','metamap_preferred_name', 'metamap_cui', 'metamap_score', 'metamap_semantic_type']
    csv_writer = csv.writer(metamap_output, delimiter='\t')
    csv_writer.writerow(col_names)
    
    metamap_version = [int(s) for s in re.findall(r'\d+', metamap_dirs.get('metamap_bin_dir'))]
    if metamap_version[0] >= 20:
        print("MetaMap version >= 2020, conduct mapping on original terms")
#         mm_conditions = run_parallel_threads_mm(conditions["original_condition_split_1"].tolist(), params)
        parallelize_metamap(conditions["original_condition_split_1"].tolist(), params, "condition", flag_and_path, csv_writer)
    else:
        print("MetaMap version < 2020, conduct mapping on terms after removing ascii characters")
#         mm_conditions = run_parallel_threads_mm(conditions["de_asciied_conditions_split_1"].tolist(), params)
        parallelize_metamap(conditions["de_asciied_conditions_split_1"].tolist(), params, "condition", flag_and_path, csv_writer)
    
    # -------    INTERVENTIONS    ------- #
    print("Using UMLS MetaMap to get mappings for INTERVENTIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.")
#     interventions_df['orig_downcase_name'] = interventions_df['name'].str.lower()
#     unmapped_interventions = df_dict["interventions"].orig_downcase_name
#     unmapped_interventions = list(unmapped_interventions.unique())
#     unmapped_interventions = list(filter(None, unmapped_interventions))
    
    """ Interventions requires unique handling. Another table gives possible alternate names for the interventions. 
        We may map on the alternate names column
        We take the interventions, take the ascii and deasciied versions of them,
        and split substrings in parentheses out of them. We perform MetaMapping on the
        original term or the deasciied term dependinging on what operating system we
        are on. If the mapped term passes the fuzzy scoring thesholds for any of the
        terms (original, deasciied, original inside the parentheses, deasciied inside
        the parentheses, original outside the parentheses, deasciied outside the
        parentheses), we keep that CURIE 
    """

    interventions_df = df_dict["interventions"]
    interventions_df['orig_downcase_name'] = interventions_df['name'].str.lower()
    interventions_alts = df_dict["interventions_alts"]
    interventions_alts['alt_downcase_name'] = interventions_alts['name'].str.lower()

    interventions_all = pd.merge(interventions_df[["id", "nct_id", "intervention_type", "orig_downcase_name", "description"]], interventions_alts[["nct_id", "intervention_id", "alt_downcase_name"]], how='left', left_on=['id'], right_on = ['intervention_id'])
    interventions_all = interventions_all.astype(str)
    interventions_all = interventions_all.drop('nct_id_y', axis=1) # drop the redundant column now
    interventions_all.rename(columns = {'nct_id_x':'nct_id'}, inplace = True)

    interventions_all = interventions_all.sort_values(by='nct_id', ascending=False, na_position='last')
    interventions_all = interventions_all.drop('intervention_id', axis=1) # drop the redundant column now
    interventions_all.rename(columns = {'id':'intervention_id'}, inplace = True)

    """ remove any placebo/saline/water rows """
    interventions_all = interventions_all[~interventions_all['orig_downcase_name'].str.contains('placebo')]
    # interventions_all = interventions_all[~interventions_all['description'].str.contains('placebo')]
    interventions_all = interventions_all[~interventions_all['alt_downcase_name'].str.contains('placebo')]

    interventions_all = interventions_all[~interventions_all['orig_downcase_name'].str.contains('saline')]
    # interventions_all = interventions_all[~interventions_all['description'].str.contains('saline')]
    interventions_all = interventions_all[~interventions_all['alt_downcase_name'].str.contains('saline')]

    interventions_all = interventions_all[~interventions_all['orig_downcase_name'].str.contains('water')]
    # interventions_all = interventions_all[~interventions_all['description'].str.contains('water')]
    interventions_all = interventions_all[~interventions_all['alt_downcase_name'].str.contains('water')]

    interventions_all = interventions_all[~interventions_all['orig_downcase_name'].str.contains('sham')]
    # interventions_all = interventions_all[~interventions_all['description'].str.contains('sham')]
    interventions_all = interventions_all[~interventions_all['alt_downcase_name'].str.contains('sham')]

    interventions_all.to_csv('all_interventions.tsv', sep="\t", index=False, header=False) # output interventions to TSV, avoid storing in memory

    # make df of de-asciied columns for the original_name_interventions and alternate_name_interventions
    deasciier = np.vectorize(de_ascii_er) # vectorize function

    original_int = pd.Series(interventions_all.orig_downcase_name.unique()).dropna() 
    deascii_orig_int = deasciier(original_int.values) # perform deascii-ing on original intervention names

    alternate_int = pd.Series(interventions_all.alt_downcase_name.unique()).dropna()
    deascii_alt_int = deasciier(alternate_int.values) # perform deascii-ing on alternate intervention names

    interventions_mapping = pd.DataFrame(list(zip(original_int, deascii_orig_int, alternate_int, deascii_alt_int)), columns =['orig_int', 'deascii_orig_int', 'alt_int', 'deascii_alt_int'])

    # split out substring if they are in parentheses
    pattern_outisde = r'(?<=\().+?(?=\))|([^(]+)'
    # pattern_inside = r'\(([^()]+)\)|([^(]+)'
    pattern_inside = r'\(([^)]+)\)'

    matches_outside = interventions_mapping['orig_int'].str.extract(pattern_outisde)
    interventions_mapping['orig_int_outside'] = matches_outside[0].fillna('')
    matches_inside = interventions_mapping['orig_int'].str.extract(pattern_inside)
    interventions_mapping['orig_int_inside'] = matches_inside[0].fillna('')

    matches_outside = interventions_mapping['deascii_orig_int'].str.extract(pattern_outisde)
    interventions_mapping['deascii_orig_int_outside'] = matches_outside[0].fillna('')
    matches_inside = interventions_mapping['deascii_orig_int'].str.extract(pattern_inside)
    interventions_mapping['deascii_orig_int_inside'] = matches_inside[0].fillna('')

    matches_outside = interventions_mapping['alt_int'].str.extract(pattern_outisde)
    interventions_mapping['alt_int_outside'] = matches_outside[0].fillna('')
    matches_inside = interventions_mapping['alt_int'].str.extract(pattern_inside)
    interventions_mapping['alt_int_inside'] = matches_inside[0].fillna('')

    matches_outside = interventions_mapping['deascii_alt_int'].str.extract(pattern_outisde)
    interventions_mapping['deascii_alt_int_outside'] = matches_outside[0].fillna('')
    matches_inside = interventions_mapping['deascii_alt_int'].str.extract(pattern_inside)
    interventions_mapping['deascii_alt_int_inside'] = matches_inside[0].fillna('')

    """ separate the columns that are deascii vs with ascii char"""
    deascii_cols = [col for col in interventions_mapping.columns if "deascii" in col] 
    deascii_ints = interventions_mapping[deascii_cols]
    ascii_cols = [col for col in interventions_mapping.columns if "deascii" not in col]
    ascii_ints = interventions_mapping[ascii_cols]

    """ I don't want to perform mapping on strings < 4 char in length; these are ambiguous and it's hard to make a call what that concept should be """
    """ Get the char counts of all the columns. """
    ascii_ints_counts = ascii_ints.copy() # make a copy of ascii char df
    for col in ascii_ints.columns: # get the char counts of each column
        char_count_col_name = col + '_char_count'
        ascii_ints_counts[char_count_col_name] = ascii_ints_counts[col].str.len()

    deascii_ints_counts = deascii_ints.copy() # make a copy of df
    for col in deascii_ints.columns: # get the char counts of each column
        char_count_col_name = col + '_char_count'
        deascii_ints_counts[char_count_col_name] = deascii_ints_counts[col].str.len()

    """ If char_count < 4, replace the string in the corresponding column with None so that we don't use it for comparison """    
    for col in ascii_ints_counts.columns[ascii_ints_counts.columns.str.contains("char_count")]:
        for index, value in ascii_ints_counts[col].items():
            if value < 4:
                # Find the column with the most similar name without "char_count" substring
                most_similar_col = ascii_ints_counts.columns[ascii_ints_counts.columns.str.replace("_char_count", "") == col.replace("_char_count", "")].values[0]
                # Update the value in the most similar column
                ascii_ints_counts.at[index, most_similar_col] = None
        ascii_ints_counts = ascii_ints_counts.drop(col, axis=1) # drop the count columns now

    for col in deascii_ints_counts.columns[deascii_ints_counts.columns.str.contains("char_count")]:
        for index, value in deascii_ints_counts[col].items():
            if value < 4:
                # Find the column with the most similar name without "char_count" substring
                most_similar_col = deascii_ints_counts.columns[deascii_ints_counts.columns.str.replace("_char_count", "") == col.replace("_char_count", "")].values[0]
                # Update the value in the most similar column
                deascii_ints_counts.at[index, most_similar_col] = None  
        deascii_ints_counts = deascii_ints_counts.drop(col, axis=1) # drop the count columns now

    """ If the substring that was either outside or inside the () is identical to the term from which it came from, put None in that column """    
    ascii_ints_counts.loc[ascii_ints_counts['orig_int'] == ascii_ints_counts['orig_int_outside'], 'orig_int_outside'] = None
    ascii_ints_counts.loc[ascii_ints_counts['alt_int'] == ascii_ints_counts['alt_int_outside'], 'alt_int_outside'] = None
    ascii_ints_counts.loc[ascii_ints_counts['orig_int'] == ascii_ints_counts['orig_int_inside'], 'orig_int_inside'] = None
    ascii_ints_counts.loc[ascii_ints_counts['alt_int'] == ascii_ints_counts['alt_int_inside'], 'alt_int_inside'] = None

    deascii_ints_counts.loc[deascii_ints_counts['deascii_orig_int'] == deascii_ints_counts['deascii_orig_int_outside'], 'deascii_orig_int_outside'] = None
    deascii_ints_counts.loc[deascii_ints_counts['deascii_alt_int'] == deascii_ints_counts['deascii_alt_int_outside'], 'deascii_alt_int_outside'] = None
    deascii_ints_counts.loc[deascii_ints_counts['deascii_orig_int'] == deascii_ints_counts['deascii_orig_int_inside'], 'deascii_orig_int_inside'] = None
    deascii_ints_counts.loc[deascii_ints_counts['deascii_alt_int'] == deascii_ints_counts['deascii_alt_int_inside'], 'deascii_alt_int_inside'] = None

    """ When it comes to parameters for MetaMap for conditions, we restricted the semantic types to those seeming relevant to diseases, etc.
        For interventions, we have chosen to EXCLUDE the semantic types specified for conditions,
        instead of specifying semantic types desired for interventions. Spot-testing setting restrictions on semantic type for interventions didn't show much promise.
        restrict_to_sts = restrict semantic type, exclude_sts = exclude semantic types """

    params = {"exclude_sts":condition_semantic_type_restriction, "term_processing":True, "ignore_word_order":True, "strict_model":False} # strict_model and relaxed_model are presumably opposites? relaxed_model = True is what I want, but that option appears to be broken in Pymetamap (returns no results when used). Using strict_model = False instead...

    # file to write out to is still open, so continuing to append the interventions to it now     
    if metamap_version[0] >= 20:
        print("MetaMap version >= 2020, conduct mapping on original terms")
        parallelize_metamap(ascii_ints_counts["orig_int"].tolist(), params, "intervention", flag_and_path, csv_writer)
        parallelize_metamap(ascii_ints_counts["alt_int"].tolist(), params, "alternate_intervention", flag_and_path, csv_writer)

    else:
        print("MetaMap version < 2020, conduct mapping on terms after removing ascii characters")
        parallelize_metamap(deascii_ints_counts["deascii_orig_int"].tolist(), params, "intervention", flag_and_path, csv_writer)
        parallelize_metamap(deascii_ints_counts["deascii_alt_int"].tolist(), params, "alternate_intervention", flag_and_path, csv_writer)





    # # print(len(test))
    # with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    #     display(ascii_ints)

    



    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    

          
#     # -------    INTERVENTIONS    ------- #
#     print("Using UMLS MetaMap to get more mappings for interventions. MetaMap returns mappings, CUIs, and semantic type of mapping.")
#     unmapped_interventions = ct_terms["unmapped_interventions"]
#     interventions_unmapped_chunked = split_list_by_char_lim(unmapped_interventions)
#     # see MetaMap Usage instructions: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MM_2016_Usage.pdf
#     # removing sosy semantic type (sign or symptom) - often get MetaMap matches to the sign or symptom instead of the full disease...for example, will get back "exercise-induced" instead of "immune dysfunction" for "exercise-induced immune dysfunction" bc it matches the descriptive quality "exercise-induced" is matched on 
#     intervention_args = ['--sldi -I -C -k acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,sosy -z -i -f']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.") (I used inverse of semantic terms picked for conditions here)
#     mm_interventions = run_parallel_threads_mm(interventions_unmapped_chunked, intervention_args)
#     flattened_mm_interventions = {key: [item for sublist in value for item in sublist] for key, value in mm_interventions.items()}
#     mm_interventions_df = pd.DataFrame({"intervention_input": list(flattened_mm_interventions.keys()),
#                                         "intervention_CURIE_id": [value[0] for value in flattened_mm_interventions.values()],
#                                         "intervention_CURIE_name": [value[1] for value in flattened_mm_interventions.values()],
#                                         "intervention_semantic_type": [value[-1] for value in flattened_mm_interventions.values()],
#                                         "source": "MetaMap via UMLS, term and CURIE"})

#     mm_interventions_df[['intervention_CURIE_name_1', 'intervention_CURIE_name_2']] = mm_interventions_df['intervention_CURIE_name'].str.extract(r'^(.*?)\s*\((.*?)\)$').fillna('NA') # 

#     sort_ratio = np.vectorize(get_token_sort_ratio)
#     set_ratio = np.vectorize(get_token_set_ratio)
#     sim_score = np.vectorize(get_similarity_score)

#     # many MetaMap terms are returned as "term (term)". For example, "Nonessential Amino Acid (Nonessential amino acid)". This repetition messes up the sort ratio and sim score, so we extract the substrings out of the parenthesis to conduct scoring on those
#     mm_interventions_scored = mm_interventions_df.copy()
#     mm_interventions_scored["sort_ratio"] = sort_ratio(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name"]].values) # generate fuzzy scores based between original and MeSH term
#     mm_interventions_scored["sim_score"] = sim_score(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name"]].values)

#     mm_interventions_scored["sort_ratio_1"] = sort_ratio(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name_1"]].values) # generate fuzzy scores based between original and MetaMap term
#     mm_interventions_scored["sim_score_1"] = sim_score(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name_1"]].values)

#     mm_interventions_scored["sort_ratio_2"] = sort_ratio(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name_2"]].values) # generate fuzzy scores based between original and MetaMap term
#     mm_interventions_scored["sim_score_2"] = sim_score(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name_2"]].values)

#     mm_interventions_scored_thresholded = mm_interventions_scored.copy() 
#     mm_interventions_scored_thresholded = mm_interventions_scored_thresholded[(mm_interventions_scored_thresholded['sim_score'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sort_ratio'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sim_score_1'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sort_ratio_1'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sim_score_2'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sort_ratio_2'] > 88)]
    
#     print("Number of unique interventions that are mapped after using MetaMap and similarity and ratio score thresholds of 88: {}".format(mm_interventions_scored_thresholded.shape[0]))

#     mm_interventions_scored_thresholded = mm_interventions_scored_thresholded.drop(['intervention_CURIE_name_1',
#                                                                                     'intervention_CURIE_name_2',
#                                                                                     'sort_ratio',
#                                                                                     'sim_score',
#                                                                                     'sort_ratio_1',
#                                                                                     'sim_score_1',
#                                                                                     'sort_ratio_2',
#                                                                                     'sim_score_2'], axis=1)
#     previously_mapped = ct_terms["mapped_interventions"]
#     combined_mapped_interventions = pd.concat([previously_mapped, mm_interventions_scored_thresholded], ignore_index=True) # get dataframe of combined previously mapped interventions and additional MetaMapped interventions that passed threshold scoring
#     interventions = df_dict["interventions"]
#     all_interventions_list = interventions["downcase_name"].values.tolist()
#     all_interventions_list = list(set(all_interventions_list))
#     unmapped_interventions = list(set(all_interventions_list)-set(list(combined_mapped_interventions.intervention_input.values)))
#     print("Number of unique interventions that are unmapped after using MetaMap and similarity and ratio score thresholds of 88: {}".format(len(unmapped_interventions)))
#     ct_terms = {'mapped_conditions': combined_mapped_conditions,
#                 'unmapped_conditions': unmapped_conditions,
#                 'mapped_interventions': combined_mapped_interventions,
#                 'unmapped_interventions': unmapped_interventions,
#                 'all_metamapped_conditions': mm_conditions_df,
#                 'all_metamapped_interventions': mm_interventions_df}


#     return ct_terms


In [210]:
metamap_version = [int(s) for s in re.findall(r'\d+', metamap_dirs.get('metamap_bin_dir'))]

# prep output file of Metamap results
date = flag_and_path["date_string"]
filename = f"metamap_output_{date}.tsv"
metamap_output = open(filename, 'w+', newline='')
col_names = ['term_type', 'clin_trial_term','metamap_preferred_name', 'metamap_cui', 'metamap_score', 'metamap_semantic_type']
csv_writer = csv.writer(metamap_output, delimiter='\t')
csv_writer.writerow(col_names)

98

In [244]:

"""
We take the interventions, take the ascii and deasciied versions of them,
and split substrings in parentheses out of them. We perform MetaMapping on the
original term or the deasciied term dependinging on what operating system we
are on. If the mapped term passes the fuzzy scoring thesholds for any of the
terms (original, deasciied, original inside the parentheses, deasciied inside
the parentheses, original outside the parentheses, deasciied outside the
parentheses), we keep that CURIE 
"""

interventions_df = df_dict["interventions"]
interventions_df['orig_downcase_name'] = interventions_df['name'].str.lower()
interventions_alts = df_dict["interventions_alts"]
interventions_alts['alt_downcase_name'] = interventions_alts['name'].str.lower()

interventions_all = pd.merge(interventions_df[["id", "nct_id", "intervention_type", "orig_downcase_name", "description"]], interventions_alts[["nct_id", "intervention_id", "alt_downcase_name"]], how='left', left_on=['id'], right_on = ['intervention_id'])
interventions_all = interventions_all.astype(str)
interventions_all = interventions_all.drop('nct_id_y', axis=1) # drop the redundant column now
interventions_all.rename(columns = {'nct_id_x':'nct_id'}, inplace = True)

interventions_all = interventions_all.sort_values(by='nct_id', ascending=False, na_position='last')
interventions_all = interventions_all.drop('intervention_id', axis=1) # drop the redundant column now
interventions_all.rename(columns = {'id':'intervention_id'}, inplace = True)

""" remove any placebo/saline/water rows """
interventions_all = interventions_all[~interventions_all['orig_downcase_name'].str.contains('placebo')]
# interventions_all = interventions_all[~interventions_all['description'].str.contains('placebo')]
interventions_all = interventions_all[~interventions_all['alt_downcase_name'].str.contains('placebo')]

interventions_all = interventions_all[~interventions_all['orig_downcase_name'].str.contains('saline')]
# interventions_all = interventions_all[~interventions_all['description'].str.contains('saline')]
interventions_all = interventions_all[~interventions_all['alt_downcase_name'].str.contains('saline')]

interventions_all = interventions_all[~interventions_all['orig_downcase_name'].str.contains('water')]
# interventions_all = interventions_all[~interventions_all['description'].str.contains('water')]
interventions_all = interventions_all[~interventions_all['alt_downcase_name'].str.contains('water')]

interventions_all = interventions_all[~interventions_all['orig_downcase_name'].str.contains('sham')]
# interventions_all = interventions_all[~interventions_all['description'].str.contains('sham')]
interventions_all = interventions_all[~interventions_all['alt_downcase_name'].str.contains('sham')]

interventions_all.to_csv('all_interventions.tsv', sep="\t", index=False, header=False) # output interventions to TSV, avoid storing in memory

# make df of de-asciied columns for the original_name_interventions and alternate_name_interventions
deasciier = np.vectorize(de_ascii_er) # vectorize function

original_int = pd.Series(interventions_all.orig_downcase_name.unique()).dropna() 
deascii_orig_int = deasciier(original_int.values) # perform deascii-ing on original intervention names

alternate_int = pd.Series(interventions_all.alt_downcase_name.unique()).dropna()
deascii_alt_int = deasciier(alternate_int.values) # perform deascii-ing on alternate intervention names

interventions_mapping = pd.DataFrame(list(zip(original_int, deascii_orig_int, alternate_int, deascii_alt_int)), columns =['orig_int', 'deascii_orig_int', 'alt_int', 'deascii_alt_int'])

# split out substring if they are in parentheses
pattern_outisde = r'(?<=\().+?(?=\))|([^(]+)'
# pattern_inside = r'\(([^()]+)\)|([^(]+)'
pattern_inside = r'\(([^)]+)\)'

matches_outside = interventions_mapping['orig_int'].str.extract(pattern_outisde)
interventions_mapping['orig_int_outside'] = matches_outside[0].fillna('')
matches_inside = interventions_mapping['orig_int'].str.extract(pattern_inside)
interventions_mapping['orig_int_inside'] = matches_inside[0].fillna('')

matches_outside = interventions_mapping['deascii_orig_int'].str.extract(pattern_outisde)
interventions_mapping['deascii_orig_int_outside'] = matches_outside[0].fillna('')
matches_inside = interventions_mapping['deascii_orig_int'].str.extract(pattern_inside)
interventions_mapping['deascii_orig_int_inside'] = matches_inside[0].fillna('')

matches_outside = interventions_mapping['alt_int'].str.extract(pattern_outisde)
interventions_mapping['alt_int_outside'] = matches_outside[0].fillna('')
matches_inside = interventions_mapping['alt_int'].str.extract(pattern_inside)
interventions_mapping['alt_int_inside'] = matches_inside[0].fillna('')

matches_outside = interventions_mapping['deascii_alt_int'].str.extract(pattern_outisde)
interventions_mapping['deascii_alt_int_outside'] = matches_outside[0].fillna('')
matches_inside = interventions_mapping['deascii_alt_int'].str.extract(pattern_inside)
interventions_mapping['deascii_alt_int_inside'] = matches_inside[0].fillna('')

""" separate the columns that are deascii vs with ascii char"""
deascii_cols = [col for col in interventions_mapping.columns if "deascii" in col] 
deascii_ints = interventions_mapping[deascii_cols]
ascii_cols = [col for col in interventions_mapping.columns if "deascii" not in col]
ascii_ints = interventions_mapping[ascii_cols]

""" I don't want to perform mapping on strings < 4 char in length; these are ambiguous and it's hard to make a call what that concept should be """
""" Get the char counts of all the columns. """
ascii_ints_counts = ascii_ints.copy() # make a copy of ascii char df
for col in ascii_ints.columns: # get the char counts of each column
    char_count_col_name = col + '_char_count'
    ascii_ints_counts[char_count_col_name] = ascii_ints_counts[col].str.len()

deascii_ints_counts = deascii_ints.copy() # make a copy of df
for col in deascii_ints.columns: # get the char counts of each column
    char_count_col_name = col + '_char_count'
    deascii_ints_counts[char_count_col_name] = deascii_ints_counts[col].str.len()

""" If char_count < 4, replace the string in the corresponding column with None so that we don't use it for comparison """    
for col in ascii_ints_counts.columns[ascii_ints_counts.columns.str.contains("char_count")]:
    for index, value in ascii_ints_counts[col].items():
        if value < 4:
            # Find the column with the most similar name without "char_count" substring
            most_similar_col = ascii_ints_counts.columns[ascii_ints_counts.columns.str.replace("_char_count", "") == col.replace("_char_count", "")].values[0]
            # Update the value in the most similar column
            ascii_ints_counts.at[index, most_similar_col] = None
    ascii_ints_counts = ascii_ints_counts.drop(col, axis=1) # drop the count columns now
    
for col in deascii_ints_counts.columns[deascii_ints_counts.columns.str.contains("char_count")]:
    for index, value in deascii_ints_counts[col].items():
        if value < 4:
            # Find the column with the most similar name without "char_count" substring
            most_similar_col = deascii_ints_counts.columns[deascii_ints_counts.columns.str.replace("_char_count", "") == col.replace("_char_count", "")].values[0]
            # Update the value in the most similar column
            deascii_ints_counts.at[index, most_similar_col] = None  
    deascii_ints_counts = deascii_ints_counts.drop(col, axis=1) # drop the count columns now

""" If the substring that was either outside or inside the () is identical to the term from which it came from, put None in that column """    
ascii_ints_counts.loc[ascii_ints_counts['orig_int'] == ascii_ints_counts['orig_int_outside'], 'orig_int_outside'] = None
ascii_ints_counts.loc[ascii_ints_counts['alt_int'] == ascii_ints_counts['alt_int_outside'], 'alt_int_outside'] = None
ascii_ints_counts.loc[ascii_ints_counts['orig_int'] == ascii_ints_counts['orig_int_inside'], 'orig_int_inside'] = None
ascii_ints_counts.loc[ascii_ints_counts['alt_int'] == ascii_ints_counts['alt_int_inside'], 'alt_int_inside'] = None

deascii_ints_counts.loc[deascii_ints_counts['deascii_orig_int'] == deascii_ints_counts['deascii_orig_int_outside'], 'deascii_orig_int_outside'] = None
deascii_ints_counts.loc[deascii_ints_counts['deascii_alt_int'] == deascii_ints_counts['deascii_alt_int_outside'], 'deascii_alt_int_outside'] = None
deascii_ints_counts.loc[deascii_ints_counts['deascii_orig_int'] == deascii_ints_counts['deascii_orig_int_inside'], 'deascii_orig_int_inside'] = None
deascii_ints_counts.loc[deascii_ints_counts['deascii_alt_int'] == deascii_ints_counts['deascii_alt_int_inside'], 'deascii_alt_int_inside'] = None
    
""" When it comes to parameters for MetaMap for conditions, we restricted the semantic types to those seeming relevant to diseases, etc.
    For interventions, we have chosen to EXCLUDE the semantic types specified for conditions,
    instead of specifying semantic types desired for interventions. Spot-testing setting restrictions on semantic type for interventions didn't show much promise.
    restrict_to_sts = restrict semantic type, exclude_sts = exclude semantic types """

params = {"exclude_sts":condition_semantic_type_restriction, "term_processing":True, "ignore_word_order":True, "strict_model":False} # strict_model and relaxed_model are presumably opposites? relaxed_model = True is what I want, but that option appears to be broken in Pymetamap (returns no results when used). Using strict_model = False instead...

# file to write out to is still open, so continuing to append the interventions to it now     
if metamap_version[0] >= 20:
    print("MetaMap version >= 2020, conduct mapping on original terms")
    parallelize_metamap(ascii_ints_counts["orig_int"].tolist(), params, "intervention", flag_and_path, csv_writer)

else:
    print("MetaMap version < 2020, conduct mapping on terms after removing ascii characters")
    parallelize_metamap(deascii_ints_counts["deascii_orig_int"].tolist(), params, "intervention", flag_and_path, csv_writer)
    



# # print(len(test))
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     display(ascii_ints)
    
# ascii_ints_counts
# deascii_ints_counts


'\nWe take the interventions, take the ascii and deasciied versions of them,\nand split substrings in parentheses out of them. We perform MetaMapping on the\noriginal term or the deasciied term dependinging on what operating system we\nare on. If the mapped term passes the fuzzy scoring thesholds for any of the\nterms (original, deasciied, original inside the parentheses, deasciied inside\nthe parentheses, original outside the parentheses, deasciied outside the\nparentheses), we keep that CURIE \n'

' remove any placebo/saline/water rows '

' separate the columns that are deascii vs with ascii char'

" I don't want to perform mapping on strings < 4 char in length; these are ambiguous and it's hard to make a call what that concept should be "

' Get the char counts of all the columns. '

" If char_count < 4, replace the string in the corresponding column with None so that we don't use it for comparison "

' If the substring that was either outside or inside the () is identical to the term from which it came from, put None in that column '

" When it comes to parameters for MetaMap for conditions, we restricted the semantic types to those seeming relevant to diseases, etc.\n    For interventions, we have chosen to EXCLUDE the semantic types specified for conditions,\n    instead of specifying semantic types desired for interventions. Spot-testing setting restrictions on semantic type for interventions didn't show much promise.\n    restrict_to_sts = restrict semantic type, exclude_sts = exclude semantic types "

MetaMap version < 2020, conduct mapping on terms after removing ascii characters
Starting skrmedpostctl: 
started.
Starting wsdserverctl: 
started.
loading properties file /Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/WSD_Server/config/disambServer.cfg
['intervention', 'sertraline', 'Sertraline', 'C0074393', '25.72', '[orch,phsu]']
['intervention', 'sertraline', 'Sertraline measurement', 'C0524265', '3.64', '[lbpr]']
['intervention', 'modified intubation process', 'Intubation', 'C0021925', '6.59', '[topp]']
['intervention', 'modified intubation process', 'Human Cells, Tissues, and Cellular and Tissue-Based Products Processing', 'C4521541', '3.57', '[hlca]']
['intervention', 'modified intubation process', 'Process', 'C1522240', '3.57', '[phpr]']
['intervention', 'modified intubation process', 'Process (qualifier value)', 'C4521054', '3.57', '[ftcn]']
['intervention', 'modified intubation process', 'bony process', 'C1184743', '3.57', '[bpoc]']
['intervention', 'modified intubation p

['intervention', 'quiet time care', 'Time', 'C0040223', '6.59', '[tmco]']
['intervention', 'quiet time care', 'care activity', 'C1947933', '3.57', '[acty]']
['intervention', 'quiet time care', 'Quiet', 'C0439654', '3.44', '[qlco]']
['intervention', 'quiet time care', 'Time (foundation metadata concept)', 'C3541383', '3.44', '[tmco]']
['intervention', 'conventional care', 'care activity', 'C1947933', '3.59', '[acty]']
['intervention', 'conventional care', 'classical example', 'C0439858', '3.45', '[qlco]']
['intervention', 'high power pain threshold ultrasound', 'Ultrasonic Shockwave', 'C0041621', '19.32', '[npop]']
['intervention', 'high power pain threshold ultrasound', 'Ultrasonic', 'C0220934', '13.01', '[ftcn]']
['intervention', 'high power pain threshold ultrasound', 'Ultrasonics (sound)', 'C1456803', '13.01', '[npop]']
['intervention', 'high power pain threshold ultrasound', 'Ultrasonography', 'C0041618', '13.01', '[diap]']
['intervention', 'high power pain threshold ultrasound', '

['intervention', 'bariatric surgery', 'Bariatric Surgery', 'C1456587', '13.18', '[topp]']
['intervention', 'all investigations; laboratory and radiological, needed for diagnosis', 'Needs', 'C0027552', '12.88', '[qlco]']
['intervention', 'all investigations; laboratory and radiological, needed for diagnosis', 'Radiographic imaging procedure', 'C1962945', '12.84', '[diap]']
['intervention', 'all investigations; laboratory and radiological, needed for diagnosis', 'Laboratory Diagnosis', 'C0011911', '9.74', '[diap]']
['intervention', 'all investigations; laboratory and radiological, needed for diagnosis', 'Laboratory', 'C0022877', '9.73', '[hcro,mnob]']
['intervention', 'all investigations; laboratory and radiological, needed for diagnosis', 'Radiology Specialty', 'C0034599', '9.69', '[bmod]']
['intervention', 'all investigations; laboratory and radiological, needed for diagnosis', 'Diagnostic radiologic examination', 'C0043299', '6.83', '[diap]']
['intervention', 'all investigations; labo

['intervention', 'sbrt', 'Stereotactic Body Radiation Therapy', 'C3896609', '3.64', '[topp]']
['intervention', 'high-intenstiy interval training', 'Training', 'C0220931', '3.56', '[edac]']
['intervention', 'high-intenstiy interval training', 'Training Programs', 'C0040607', '3.56', '[edac]']
['intervention', 'high-intenstiy interval training', 'training aspects', 'C2673163', '3.56', '[qlco]']
['intervention', 'high-intenstiy interval training', 'High Interval Data Type', 'C2987303', '3.46', '[qlco]']
['intervention', 'partosure test', 'E test (procedure)', 'C1275991', '3.68', '[diap]']
['intervention', 'partosure test', 'Epsilometry', 'C2827788', '3.68', '[lbpr]']
['intervention', 'dextrose 0', 'Glucose', 'C0017725', '16.21', '[bacs,orch,phsu]']
['intervention', 'dextrose 0', '0%', 'C3842591', '3.59', '[qnco]']
['intervention', 'aspiration 0', '0%', 'C3842591', '3.59', '[qnco]']
['intervention', 'aspiration 0', 'Aspiration-action', 'C0349707', '3.59', '[topp]']
['intervention', 'aspira

['intervention', 'standard vit d supplementation', 'Vitamin D', 'C0042866', '16.08', '[orch,phsu,vita]']
['intervention', 'standard vit d supplementation', 'Dietary Supplementation', 'C0242297', '3.56', '[topp]']
['intervention', 'standard vit d supplementation', 'D Vitamin', 'C3537249', '3.46', '[phsu]']
['intervention', 'standard vit d supplementation', 'Vitamin D measurement', 'C0919758', '3.46', '[lbpr]']
['intervention', 'standard vit d supplementation', 'Standard (document)', 'C2828392', '3.43', '[inpr]']
['intervention', 'standard vit d supplementation', 'Standard (qualifier)', 'C1442989', '3.43', '[qlco]']
['intervention', 'monitored vit d supplementation', 'Vitamin D', 'C0042866', '16.08', '[orch,phsu,vita]']
['intervention', 'monitored vit d supplementation', 'Patient Monitoring', 'C0030695', '9.72', '[hlca]']
['intervention', 'monitored vit d supplementation', 'Dietary Supplementation', 'C0242297', '3.56', '[topp]']
['intervention', 'monitored vit d supplementation', 'D Vita

['intervention', 'abatacept', 'abatacept', 'C1619966', '25.72', '[aapp,phsu]']
['intervention', 'proprioceptive neuromuscular facilitation', 'Proprioception awareness work', 'C0452254', '3.62', '[topp]']
['intervention', 'proprioceptive neuromuscular facilitation', 'Neuromuscular', 'C1979768', '3.57', '[qlco]']
['intervention', 'infant massage', 'Infant massage', 'C0695595', '3.72', '[topp]']
['intervention', 'thermal radio frequency, selective (unilateral s3, bilateral s4 and s5) saddle rhizotomy', 'Radio', 'C1304639', '16.15', '[mnob]']
['intervention', 'thermal radio frequency, selective (unilateral s3, bilateral s4 and s5) saddle rhizotomy', 'Radio communications', 'C0034546', '16.15', '[inpr]']
['intervention', 'thermal radio frequency, selective (unilateral s3, bilateral s4 and s5) saddle rhizotomy', 'Rhizotomy procedure', 'C0282615', '12.88', '[topp]']
['intervention', 'thermal radio frequency, selective (unilateral s3, bilateral s4 and s5) saddle rhizotomy', 'Frequency selectiv

['intervention', '12% sucrose', 'Sucrose', 'C0038636', '16.21', '[bacs,orch,phsu]']
['intervention', '12% sucrose', 'Saccharum officinale, sucrose, cane sugar, Homeopathic preparation', 'C1161331', '3.59', '[orch,phsu]']
['intervention', '24% sucrose', 'Sucrose', 'C0038636', '16.21', '[bacs,orch,phsu]']
['intervention', '24% sucrose', 'Saccharum officinale, sucrose, cane sugar, Homeopathic preparation', 'C1161331', '3.59', '[orch,phsu]']
['intervention', 'ram cannula', 'Cannula device', 'C0520453', '9.90', '[medd]']
['intervention', 'ram cannula', 'Cannula <eukaryote>', 'C3463845', '3.59', '[euka]']
['intervention', 'ram cannula', 'CCDC26 wt Allele', 'C4521503', '3.45', '[gngm]']
['intervention', 'ram cannula', 'RAB27A wt Allele', 'C3890009', '3.45', '[gngm]']
['intervention', 'short nasal prongs', 'Nasal prongs', 'C0445087', '3.64', '[medd]']
['intervention', 'short nasal prongs', 'Short', 'C1806781', '3.44', '[qnco]']
['intervention', 'short nasal prongs', 'Short Value', 'C2350002', 

['intervention', 'dental treatment', 'Dental Procedures', 'C0011331', '13.18', '[topp]']
['intervention', 'massage, exercises, relaxation and imagination therapies', 'Massage', 'C0024875', '19.32', '[topp]']
['intervention', 'massage, exercises, relaxation and imagination therapies', 'Massage Therapy', 'C3536731', '19.32', '[topp]']
['intervention', 'massage, exercises, relaxation and imagination therapies', 'Distance Counseling', 'C1510538', '19.21', '[topp]']
['intervention', 'massage, exercises, relaxation and imagination therapies', 'Imagination', 'C0020913', '12.89', '[menp]']
['intervention', 'massage, exercises, relaxation and imagination therapies', 'Relaxation exercise', 'C0203993', '3.43', '[topp]']
['intervention', 'massage and stretching exercises', 'Massage', 'C0024875', '19.33', '[topp]']
['intervention', 'massage and stretching exercises', 'Massage Therapy', 'C3536731', '19.33', '[topp]']
['intervention', 'massage and stretching exercises', 'Stretching exercises', 'C0600

['intervention', 'atripla', 'Atripla', 'C1724016', '25.72', '[orch,phsu]']
['intervention', 'radiofrequency ablation', 'Radiofrequency Ablation', 'C0850292', '10.02', '[topp]']
['intervention', 'ethanol ablation', 'Ethanol Ablation Therapy', 'C1880568', '3.72', '[topp]']
['intervention', 'laparoscopic ventral hernia repair', 'Laparoscopic repair of hernia of anterior abdominal wall', 'C2367746', '3.81', '[topp]']
['intervention', 'participant choice of counselor', 'Choice Behavior', 'C0008300', '16.18', '[inbe]']
['intervention', 'participant choice of counselor', 'Professional Counselor', 'C1571885', '9.74', '[prog]']
['intervention', 'participant choice of counselor', 'Choose (action)', 'C1707391', '3.56', '[acty]']
['intervention', 'participant choice of counselor', 'Participant', 'C0679646', '3.43', '[popg]']
['intervention', 'participant choice of counselor', 'Participant Object', 'C2698741', '3.43', '[clas]']
['intervention', 'participant choice of counselor', 'Study Participant'

/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/skrmedpostctl: line 50: kill: (27757) - No such process
/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/wsdserverctl: line 55: kill: (27759) - No such process


Stopping WSD Server process..
Process 27759 stopped


In [35]:
# output all results to TSVs
def compile_and_output(df_dict, ct_terms, remaining_unmapped_possible):
    print("\n")
    print("#   -------- -------- -------- --------  ")
    print("Final Tallies:")
    print("Total # of conditions mapped: {}".format(ct_terms["mapped_conditions"].shape[0]))
    print("Total # of interventions mapped: {}".format(ct_terms["mapped_interventions"].shape[0]))
    print("Total # of conditions unmapped or not mapped: {}".format(len(ct_terms["unmapped_conditions"])))
    print("Total # of interventions unmapped or not mapped: {}".format(len(ct_terms["unmapped_interventions"])))    
    # How many Clinical Trials are there? Well, it's different depending on the Conditions or Interventions dataframes...
    conditions_nctids = len(df_dict["conditions"].nct_id.unique())
    interventions_nctids = len(df_dict["interventions"].nct_id.unique())
    print("Number of Clinical Trials NCITs in Conditions table: {}".format(conditions_nctids))      
    print("Number of Clinical Trials NCITs in Interventions table: {}".format(interventions_nctids))
    print("#   -------- -------- -------- --------  ")

    """ create tables of unused MeSH and MetaMap CURIEs that could be used for unmapped Conditions and Interventions """
    # -------    CONDITIONS    ------- #
    all_conditions = df_dict["conditions"][["nct_id", "downcase_name"]]
    conditions_mesh = pd.merge(all_conditions, 
                               remaining_unmapped_possible["mesh_conditions_per_study"],
                               how='left',
                               left_on=['nct_id'],
                               right_on = ['nct_id'])
    
    metamap_possibilities = remaining_unmapped_possible["all_metamapped_conditions"][["condition_input", "condition_CURIE_id", "condition_CURIE_name", "condition_semantic_type"]]
    conditions_mesh_metamap = pd.merge(conditions_mesh, 
                                       metamap_possibilities,
                                       how='left',
                                       left_on=['downcase_name'],
                                       right_on = ['condition_input'])
    
    unmapped_conditions_possible_terms = conditions_mesh_metamap[conditions_mesh_metamap['downcase_name'].isin(ct_terms["unmapped_conditions"])]
    unmapped_conditions_possible_terms = unmapped_conditions_possible_terms.drop('condition_input', axis=1) # drop the redundant column now
    
    # -------    INTERVENTIONS    ------- #
    all_interventions = df_dict["interventions"][["nct_id", "downcase_name"]]
    interventions_mesh = pd.merge(all_interventions, 
                               remaining_unmapped_possible["mesh_interventions_per_study"],
                               how='left',
                               left_on=['nct_id'],
                               right_on = ['nct_id'])
    
    metamap_possibilities = remaining_unmapped_possible["all_metamapped_interventions"][["intervention_input", "intervention_CURIE_id", "intervention_CURIE_name", "intervention_semantic_type"]]
    interventions_mesh_metamap = pd.merge(interventions_mesh, 
                                       metamap_possibilities,
                                       how='left',
                                       left_on=['downcase_name'],
                                       right_on = ['intervention_input'])
    
    unmapped_interventions_possible_terms = interventions_mesh_metamap[interventions_mesh_metamap['downcase_name'].isin(ct_terms["unmapped_interventions"])]
    unmapped_interventions_possible_terms = unmapped_interventions_possible_terms.drop('intervention_input', axis=1) # drop the redundant column now
          
        
    """   Output all to TSVs   """    
    pd.Series(ct_terms["unmapped_conditions"]).to_csv('unmapped_conditions.tsv', sep="\t", index=False, header=False) # convert the list to a pandas series, then output to TSV
    pd.Series(ct_terms["unmapped_interventions"]).to_csv('unmapped_interventions.tsv', sep="\t", index=False, header=False) # convert the list to a pandas series, then output to TSV
    ct_terms["mapped_conditions"].to_csv('mapped_conditions.tsv', sep="\t", index=False)
    ct_terms["mapped_interventions"].to_csv('mapped_interventions.tsv', sep="\t", index=False)
    unmapped_conditions_possible_terms.to_csv('unmapped_conditions_possible_mappings.tsv', sep="\t", index=False)
    unmapped_interventions_possible_terms.to_csv('unmapped_interventions_possible_mappings.tsv', sep="\t", index=False)
    



In [None]:
# def test_or_prod():
#     print("The test run of this code performs the construction of the KG on a subset of 200 Conditions and 200 Interventions from Clinical Trials.\n")
#     test_or_prod = input("Is this a test run or the production of a new version of the KG? Write T for test, or P for production: ")
#     if test_or_prod == "T":
#         flag_and_path = get_raw_ct_data() # uncomment for production
#         flag_and_path["term_program_flag"] = False
#         run_ETL_mapping(flag_and_path)
#     elif test_or_prod == "P":
#         flag_and_path = get_raw_ct_data() 
#         run_ETL_mapping(flag_and_path)
#     else:
#         print("Bad input")
#         sys.exit(0)
        

        
        

In [None]:
# def run_ETL_mapping(flag_and_path):
#     df_dict = read_raw_ct_data(flag_and_path)
#     ct_terms = exact_match_mesh(df_dict)
#     ct_terms = inexact_match_mesh(df_dict, ct_terms)

#     # pull the available MeSH terms per study out of the returned ct_terms dict 
#     mesh_conditions_per_study = ct_terms["mesh_conditions_per_study"]
#     mesh_interventions_per_study = ct_terms["mesh_interventions_per_study"]

#     ct_terms = term_list_to_nr(df_dict, ct_terms)
#     ct_terms = term_list_to_mm(df_dict, ct_terms)

#     # pull the available UMLS terms per study out of the returned ct_terms dict 
#     all_metamapped_conditions = ct_terms["all_metamapped_conditions"]
#     all_metamapped_interventions = ct_terms["all_metamapped_interventions"]

#     remaining_unmapped_possible = {"mesh_conditions_per_study": mesh_conditions_per_study,
#                                    "mesh_interventions_per_study": mesh_interventions_per_study,
#                                    "all_metamapped_conditions": all_metamapped_conditions,
#                                    "all_metamapped_interventions": all_metamapped_interventions}
#     compile_and_output(df_dict, ct_terms, remaining_unmapped_possible)


    

In [35]:
interventions_alts = pd.read_csv('/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/08_21_2023_extracted' + '/intervention_other_names.txt', sep='|', index_col=False, header=0)
interventions_alts['alt_downcase_name'] = interventions_alts['name'].str.lower()

interventions_alts

Unnamed: 0,id,nct_id,intervention_id,name,alt_downcase_name
0,27584249,NCT01738191,54313664,Strattera,strattera
1,27584250,NCT01737879,54313666,Omontys,omontys
2,27428744,NCT04545502,54003364,Gelsoft Plus,gelsoft plus
3,27584251,NCT01737879,54313667,Epogen,epogen
4,27273339,NCT04571879,53672522,Nebulized Xylocaine,nebulized xylocaine
...,...,...,...,...,...
387960,27583600,NCT03192215,54313417,Eliquis,eliquis
387961,27583601,NCT03192215,54313418,Aspirin Tablet,aspirin tablet
387962,27583602,NCT03052608,54313422,PF-06463922,pf-06463922
387963,27583603,NCT03052608,54313423,Xalkori,xalkori


In [29]:
interventions_df = pd.read_csv('/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/08_21_2023_extracted' + '/interventions.txt', sep='|', index_col=False, header=0)
interventions_df['orig_downcase_name'] = interventions_df['name'].str.lower()

interventions_df

Unnamed: 0,id,nct_id,intervention_type,name,description,orig_downcase_name
0,54367098,NCT03454451,Drug,CPI-006 + ciforadenant,Selected dose of CPI-006 administered intraven...,cpi-006 + ciforadenant
1,54367099,NCT03454451,Drug,CPI-006 + pembrolizumab,Selected dose of CPI-006 in combination with p...,cpi-006 + pembrolizumab
2,54367100,NCT00089843,Drug,Testosterone,Testosterone patch 150mcg daily,testosterone
3,54367101,NCT00089843,Drug,Actonel (risedronate),Actonel (risedronate) 35mg PO one time weekly,actonel (risedronate)
4,54367102,NCT00089843,Drug,Placebo Actonel (risedronate),Placebo tablet identical in appearance to acti...,placebo actonel (risedronate)
...,...,...,...,...,...,...
786249,54367034,NCT01191411,Other,Mailed invitations for FIT test kits,Mailed invitations for the non-invasive immuno...,mailed invitations for fit test kits
786250,54367035,NCT01191411,Other,Mailed invitations for a colonoscopy,These patients will be mailed invitations to d...,mailed invitations for a colonoscopy
786251,54367036,NCT01191411,Other,Visit Based Care,Visit based standard care at John Peter Smith ...,visit based care
786252,54367037,NCT03707873,Other,Education,Education + Medication adherence monitoring + ...,education


In [30]:
interventions_all = pd.merge(interventions_df[["id", "nct_id", "intervention_type", "orig_downcase_name", "description"]], interventions_alts[["nct_id", "intervention_id", "alt_downcase_name"]], how='left', left_on=['id', 'nct_id'], right_on = ['intervention_id', 'nct_id'])
interventions_all = interventions_all.sort_values(by='nct_id', ascending=False, na_position='first')
interventions_all = interventions_all.drop('intervention_id', axis=1) # drop the redundant column now

print(len(interventions_all))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(interventions_all[2000:4000])
    

955979


Unnamed: 0,id,nct_id,intervention_type,orig_downcase_name,description,intervention_id,alt_downcase_name
119713,54238820,NCT05983913,Behavioral,cognitive-motor training,The cognitive-motor training program consists ...,,
119715,54238822,NCT05983900,Procedure,brix3000,chemo-mechanical caries removal agent,,
119716,54238823,NCT05983900,Procedure,papacarie,chemo-mechanical caries removal agent,,
119717,54238824,NCT05983900,Procedure,hand excavation,mechanical caries removal,,
119718,54238825,NCT05983887,Other,therapeutic climbing,The protocol consists of using an in-door clim...,,
119722,54238826,NCT05983874,Biological,"bg505 sosip.664 gp140 vaccine, adjuvanted (3m-...",100µg Month 0 and Month 3,,
119723,54238827,NCT05983861,Diagnostic Test,identification o multi drug resistant bacteria,Identification o Multi Drug Resistant Bacteria,,
119724,54238828,NCT05983848,Radiation,99mmaracticaltide imaging,7.3.1 99mTc-maraciclatide imaging This will be...,,
119725,54238829,NCT05983835,Procedure,patients receive subsegmentectomy,Patients receive lobectomy,,
119726,54238830,NCT05983822,Device,technological group,Specific rehabilitation for the recovery of ha...,54238830.0,"amadeo® (tyromotion, austria)"


In [237]:
metamap_dirs["metamap_bin_dir"]

'bin/metamap18'

In [None]:
start_metamap_servers(metamap_dirs)
mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
condition_semtype_restriction = ['acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,clna,fndg']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")

params = {"restrict_to_sts":condition_semtype_restriction, "term_processing":True, "ignore_word_order":True, "relaxed_model":True, "strict_model":False}

terms = ['infarction, myocardial', 'aneurysm', 'diabetes', 'common cold', 'fracture', 'juice blend']

concepts,error = mm.extract_concepts(terms,
                                     term_processing = params["term_processing"],
                                     ignore_word_order = params["ignore_word_order"],
#                                      relaxed_model = params["relaxed_model"]
#                                      restrict_to_sts=params["restrict_to_sts"]
                                    strict_model = params["strict_model"]
                                    )
for concept in concepts:
    print(concept)
    print("\n")
stop_metamap_servers(metamap_dirs)

In [252]:
# flag_and_path = get_raw_ct_data() # uncomment for production
flag_and_path = {'term_program_flag': False,
                 'data_extracted_path': '/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/08_21_2023_extracted',
                 'date_string':'08_21_2023'} # comment for production
metamap_dirs = check_os()
df_dict = read_raw_ct_data(flag_and_path)
term_list_to_mm(df_dict, flag_and_path)

# # pull the available UMLS terms per study out of the returned ct_terms dict 
# all_metamapped_conditions = ct_terms["all_metamapped_conditions"]
# all_metamapped_interventions = ct_terms["all_metamapped_interventions"]

# remaining_unmapped_possible = {"mesh_conditions_per_study": mesh_conditions_per_study,
#                                "mesh_interventions_per_study": mesh_interventions_per_study,
#                                "all_metamapped_conditions": all_metamapped_conditions,
#                                "all_metamapped_interventions": all_metamapped_interventions}
# compile_and_output(df_dict, ct_terms, remaining_unmapped_possible)


Using UMLS MetaMap to get mappings for CONDITIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.
MetaMap version < 2020, conduct mapping on terms after removing ascii characters
Starting skrmedpostctl: 
started.
Starting wsdserverctl: 
started.
loading properties file /Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/WSD_Server/config/disambServer.cfg
WSD Server initializing disambiguation methods.
WSD Server databases and disambiguation methods have been initialized.
Could not listen on port : 5554 : Address already in use
Stopping skrmedpostctl: 
Stopping Tagger Server process..
Process 52605 stopped
Stopping wsdserverctl: 
Stopping WSD Server process..
Process 52607 stopped
Using UMLS MetaMap to get mappings for INTERVENTIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.


/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/skrmedpostctl: line 50: kill: (52605) - No such process
/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/wsdserverctl: line 55: kill: (52607) - No such process


MetaMap version < 2020, conduct mapping on terms after removing ascii characters
Starting skrmedpostctl: 
started.
Starting wsdserverctl: 
started.
loading properties file /Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/WSD_Server/config/disambServer.cfg
WSD Server initializing disambiguation methods.
WSD Server databases and disambiguation methods have been initialized.
Could not listen on port : 5554 : Address already in use
Stopping skrmedpostctl: 
Stopping Tagger Server process..
Process 58492 stopped
Stopping wsdserverctl: 
Stopping WSD Server process..
Process 58494 stopped


/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/skrmedpostctl: line 50: kill: (58492) - No such process
/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/wsdserverctl: line 55: kill: (58494) - No such process


Starting skrmedpostctl: 
started.
Starting wsdserverctl: 
started.
loading properties file /Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/WSD_Server/config/disambServer.cfg
WSD Server initializing disambiguation methods.
WSD Server databases and disambiguation methods have been initialized.
Could not listen on port : 5554 : Address already in use
Stopping skrmedpostctl: 
Stopping Tagger Server process..
Process 62809 stopped
Stopping wsdserverctl: 
Stopping WSD Server process..
Process 62811 stopped


/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/skrmedpostctl: line 50: kill: (62809) - No such process
/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/wsdserverctl: line 55: kill: (62811) - No such process


In [None]:
def convert_seconds_to_hms(seconds):

    """ converts the elapsed or run_time to hours, min, sec """
    hours = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60
    return hours, minutes, seconds

current = dt.datetime.now()
ts = dt.datetime.timestamp(current)
d = dt.datetime.fromtimestamp(ts)
str_date_time = d.strftime("%d-%m-%Y, %H:%M:%S")
print("Timestamp of script start: {}".format(str_date_time))

start_time = time.time()
end_time = time.time()
elapsed_time = end_time - start_time
hours, minutes, seconds = convert_seconds_to_hms(elapsed_time)
print(f"Runtime: {hours} hours, {minutes} minutes, {seconds} seconds")

In [None]:
remaining_unmapped_possible