In [42]:
# Stretch notebook cells and their rendered outputs to the full browser width.
from IPython.display import display, HTML
for style_tag in (
    "<style>.container { width:100% !important; }</style>",
    "<style>.output_result { max-width:100% !important; }</style>",
):
    display(HTML(style_tag))

# Lets you print multiple outputs per cell (every expression statement), not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [43]:
# --- standard library ---
import collections
import concurrent
import concurrent.futures
import configparser
import csv
import datetime as dt
import json
import multiprocessing
import os
import pathlib
import pickle
import re
import sys
import threading
import time
import urllib
import zipfile
from datetime import date
from functools import reduce
from time import sleep

# --- third-party ---
import bs4
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import ExcelWriter

# --- vendored pymetamap (the path must be registered before the import) ---
sys.path.insert(0, '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/pymetamap-master')
from pymetamap import MetaMap  # https://github.com/AnthonyMRios/pymetamap/blob/master/pymetamap/SubprocessBackend.py


In [44]:
# %pip install thefuzz
# %pip install levenshtein

from thefuzz import fuzz # fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

In [45]:
# NOTE(review): `global` statements at module scope are no-ops in Python -- they
# neither create nor assign these names. `metamap_dirs` is read as a module-level
# name by functions further down in this notebook, so it still has to be assigned
# somewhere before those functions are called (presumably from check_os(); confirm
# against the driver cell, which is not part of this section).
global metamap_dirs
global metamap_pos_server_dir
global metamap_wsd_server_dir


In [46]:
# fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

def get_token_sort_ratio(str1, str2):
    """Best-effort wrapper around ``fuzz.token_sort_ratio``.

    Args:
        str1: first string to compare.
        str2: second string to compare.

    Returns:
        Whatever ``fuzz.token_sort_ratio(str1, str2)`` returns, or ``None``
        when the comparison raises (e.g. on inputs thefuzz cannot handle).
    """
    try:
        return fuzz.token_sort_ratio(str1, str2)
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit can still stop a long vectorised run.
        return None

# Element-wise version for use on arrays / DataFrame columns.
sort_ratio = np.vectorize(get_token_sort_ratio)

def get_token_set_ratio(str1, str2):
    """Best-effort wrapper around ``fuzz.token_set_ratio``.

    Args:
        str1: first string to compare.
        str2: second string to compare.

    Returns:
        Whatever ``fuzz.token_set_ratio(str1, str2)`` returns, or ``None``
        when the comparison raises (e.g. on inputs thefuzz cannot handle).
    """
    try:
        return fuzz.token_set_ratio(str1, str2)
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit can still stop a long vectorised run.
        return None

# Element-wise version for use on arrays / DataFrame columns.
set_ratio = np.vectorize(get_token_set_ratio)

def get_similarity_score(str1, str2):
    """Best-effort wrapper around ``fuzz.ratio``.

    Args:
        str1: first string to compare.
        str2: second string to compare.

    Returns:
        Whatever ``fuzz.ratio(str1, str2)`` returns, or ``None`` when the
        comparison raises (e.g. on inputs thefuzz cannot handle).
    """
    try:
        return fuzz.ratio(str1, str2)
    except Exception:
        # Deliberately best-effort, but no longer a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit can still stop a long vectorised run.
        return None

# Element-wise version for use on arrays / DataFrame columns.
sim_score = np.vectorize(get_similarity_score)

In [47]:
def get_raw_ct_data():
    """Download and unzip the newest AACT pipe-delimited export, if not already on disk.

    The AACT listing page is scraped for ``<option>`` elements wrapping a link
    whose text starts with ``YYYYMMDD_``; the newest date wins. The zip is
    stored under ``<cwd>/data`` and extracted next to it.

    Side effects:
        Sets the module-level names ``data_dir`` and ``data_extracted``.

    Returns:
        dict with keys
        ``term_program_flag`` (True when the newest zip was already present,
        i.e. nothing was downloaded), ``data_extracted_path`` and
        ``date_string`` (``MM_DD_YYYY`` of the newest export).

    Raises:
        ValueError: no dated download link could be found on the listing page.
        requests.HTTPError: the download of the zip archive failed.
    """
    global data_dir
    global data_extracted

    # get all the links and associated dates of upload into a dict called date_link
    url_all = "https://aact.ctti-clinicaltrials.org/pipe_files"
    response = requests.get(url_all)
    # NOTE(review): no parser is named, so bs4 picks whichever is installed;
    # pin one once it is confirmed which parser this scrape was developed with.
    soup = BeautifulSoup(response.text)
    date_link = {}
    for option in soup.find_all('option'):
        anchor = option.find('a')
        try:
            zip_name = anchor.contents[0].split()[0]
            upload_date = dt.datetime.strptime(zip_name.split("_")[0], '%Y%m%d').date()
        except (AttributeError, IndexError, TypeError, ValueError):
            # <option> without a link, or link text that does not start with a date
            continue
        date_link[upload_date] = anchor.get('href')

    if not date_link:
        raise ValueError(
            "No dated download links found at {} (HTTP status {})".format(url_all, response.status_code)
        )

    latest_file_date = max(date_link)   # get the date of the latest upload
    url = date_link[latest_file_date]   # corresponding download link of the latest upload
    date_string = latest_file_date.strftime("%m_%d_%Y")
    data_dir = "{}/data".format(pathlib.Path.cwd())
    data_extracted = data_dir + "/{}_extracted".format(date_string)
    data_path = "{}/{}_pipe-delimited-export.zip".format(data_dir, date_string)

    if os.path.exists(data_path):
        # latest download already exists, so the KG is assumed to be up to date
        term_program_flag = True
        print("KG is already up to date.")
    else:
        term_program_flag = False
        print("Downloading Clinical Trial data as of {}".format(date_string))
        os.makedirs(data_dir, exist_ok=True)
        response = requests.get(url)
        # Fail loudly instead of returning a path that was never extracted.
        response.raise_for_status()
        # Write to a temporary name first so an interrupted write is not
        # mistaken for a finished download on the next run.
        partial_path = data_path + ".part"
        with open(partial_path, 'wb') as file:
            file.write(response.content)
        os.replace(partial_path, data_path)
        print("Finished download of zip")
        with zipfile.ZipFile(data_path, 'r') as download:
            print("Unzipping data")
            download.extractall(data_extracted)

    return {"term_program_flag": term_program_flag, "data_extracted_path": data_extracted, "date_string": date_string}



In [49]:
def read_raw_ct_data(flag_and_path, n_conditions=300, n_interventions=300,
                     n_alt_interventions=600, random_state=None):
    """Load the extracted AACT tables needed for mapping.

    Args:
        flag_and_path: dict as returned by ``get_raw_ct_data``; reads
            ``term_program_flag`` and ``data_extracted_path``.
        n_conditions: number of rows to sample from ``conditions.txt``
            (``None`` keeps every row).
        n_interventions: number of rows to sample from ``interventions.txt``
            (``None`` keeps every row).
        n_alt_interventions: number of rows to sample from
            ``intervention_other_names.txt`` (``None`` keeps every row).
        random_state: forwarded to ``DataFrame.sample`` for reproducible sampling.

    Returns:
        dict with the DataFrames ``conditions``, ``interventions`` and
        ``interventions_alts``; or ``None`` when ``term_program_flag`` is set
        (nothing new was downloaded, so there is nothing to read).
    """
    if flag_and_path["term_program_flag"]:
        print("Exiting program. Assuming KG has already been constructed from most recent data dump from AACT.")
        # Previously execution fell through to the return statement and died
        # with UnboundLocalError; signal "nothing to do" explicitly instead.
        return None

    data_extracted = flag_and_path["data_extracted_path"]

    def _load(table_name):
        # read in a pipe-delimited AACT table
        return pd.read_csv(data_extracted + '/' + table_name + '.txt', sep='|', index_col=False, header=0)

    def _subsample(frame, n):
        # Sampling is a testing shortcut; never ask for more rows than exist.
        if n is None:
            return frame
        return frame.sample(n=min(n, len(frame)), random_state=random_state)

    conditions_df = _subsample(_load('conditions'), n_conditions)
    interventions_df = _subsample(_load('interventions'), n_interventions)
    alternate_interventions_df = _subsample(_load('intervention_other_names'), n_alt_interventions)

    return {
        "conditions": conditions_df,
        "interventions": interventions_df,
        "interventions_alts": alternate_interventions_df,
    }



In [50]:
def de_ascii_er(text):
    """Return ``text`` with every non-ASCII character replaced by a single space."""
    # Anything outside the 7-bit range \x00-\x7F counts as non-ASCII.
    return re.sub(r"[^\x00-\x7F]", ' ', text)

In [51]:
def start_metamap_servers(metamap_dirs):
    """Start the two MetaMap helper servers and wait 5 seconds for them to come up.

    Args:
        metamap_dirs: dict whose ``metamap_base_dir`` entry is prefixed to the
            control-script paths.
    """
    control_scripts = (
        'bin/skrmedpostctl',  # Part of speech tagger
        'bin/wsdserverctl',   # Word sense disambiguation
    )
    for script in control_scripts:
        os.system(metamap_dirs['metamap_base_dir'] + script + ' start')
    # Sleep a bit to give time for these servers to start up
    sleep(5)

def stop_metamap_servers(metamap_dirs):
    """Stop the two MetaMap helper servers.

    Args:
        metamap_dirs: dict whose ``metamap_base_dir`` entry is prefixed to the
            control-script paths.
    """
    control_scripts = (
        'bin/skrmedpostctl',  # Part of speech tagger
        'bin/wsdserverctl',   # Word sense disambiguation
    )
    for script in control_scripts:
        os.system(metamap_dirs['metamap_base_dir'] + script + ' stop')
        

In [52]:
def check_os():
    """Pick the MetaMap install location and binary for the current platform.

    Returns:
        dict with ``metamap_base_dir`` and ``metamap_bin_dir``. On Linux the
        base directory is ``<parent of cwd>/metamap/`` with the ``metamap20``
        binary; on any other platform a fixed local path with ``metamap18``.
    """
    if "linux" not in sys.platform:
        # for running on local
        return {
            "metamap_base_dir": '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/',
            "metamap_bin_dir": 'bin/metamap18',
        }

    print("Linux platform detected")
    parent_of_cwd = pathlib.Path.cwd().parents[0]
    return {
        "metamap_base_dir": "{}/metamap/".format(parent_of_cwd),
        "metamap_bin_dir": 'bin/metamap20',
    }
        

In [53]:
# run_metamap is submitted to a thread pool with one shared csv writer, so
# writes to that writer are serialised through this lock.
_CSV_WRITER_LOCK = threading.Lock()


def run_metamap(input_term, params, mm, cond_or_inter, csv_writer):
    """Map one term with MetaMap and write the resulting rows to ``csv_writer``.

    Args:
        input_term: the term to map.
        params: MetaMap options. Must contain ``term_processing``,
            ``ignore_word_order`` and ``strict_model``, plus either
            ``exclude_sts`` (used for interventions) or ``restrict_to_sts``
            (used for conditions).
        mm: object exposing ``extract_concepts`` (a pymetamap instance).
        cond_or_inter: label written to the first column of every row.
        csv_writer: ``csv.writer``-like object the rows are written to.

    Returns:
        List of rows ``[cond_or_inter, input_term, preferred_name, cui, score,
        semtypes]``, one per concept. If MetaMap fails for this term, a single
        placeholder row with ``None`` in the four MetaMap columns is returned
        (and written) instead.
    """
    from_metamap = []
    try:
        # exclude_sts is used for Interventions, restrict_to_sts for Conditions.
        if params.get("exclude_sts") is None:
            semantic_type_filter = {"restrict_to_sts": params["restrict_to_sts"]}
        else:
            semantic_type_filter = {"exclude_sts": params["exclude_sts"]}

        concepts, _error = mm.extract_concepts(
            [input_term],
            term_processing=params["term_processing"],
            ignore_word_order=params["ignore_word_order"],
            strict_model=params["strict_model"],
            **semantic_type_filter
        )
        for concept in concepts:
            fields = concept._asdict()
            from_metamap.append(
                [cond_or_inter, input_term]
                + [fields.get(k) for k in ('preferred_name', 'cui', 'score', 'semtypes')]
            )
    except Exception:
        # Best-effort: record the failed term as ONE well-formed row. The old
        # code extended the list with bare scalars, which made writerow() emit
        # the term one character per column and then raise on None.
        from_metamap = [[cond_or_inter, input_term, None, None, None, None]]

    with _CSV_WRITER_LOCK:
        csv_writer.writerows(from_metamap)
    return from_metamap

In [54]:
def parallelize_metamap(term_list, params, cond_or_inter, flag_and_path, csv_writer):
    """Run ``run_metamap`` over ``term_list`` on a thread pool.

    The MetaMap servers are started before the run and are always stopped
    afterwards, even if something in between raises.

    Args:
        term_list: iterable of terms to map.
        params: MetaMap options, forwarded to ``run_metamap``.
        cond_or_inter: label forwarded to ``run_metamap``.
        flag_and_path: unused here; kept so existing callers keep working.
        csv_writer: shared writer, forwarded to ``run_metamap``.

    Relies on the module-level ``metamap_dirs`` dict
    (``metamap_base_dir`` / ``metamap_bin_dir``).
    """
    start_metamap_servers(metamap_dirs) # start the MetaMap servers
    try:
        mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
        with concurrent.futures.ThreadPoolExecutor((multiprocessing.cpu_count()*2) - 1) as executor:
            futures = [executor.submit(run_metamap, term, params, mm, cond_or_inter, csv_writer) for term in term_list]
        # Leaving the executor block waits for every task. Worker exceptions
        # used to vanish silently; report them without aborting the whole run.
        errors = [future.exception() for future in futures]
        errors = [error for error in errors if error is not None]
        if errors:
            print("{} of {} {} terms raised inside MetaMap worker threads; first error: {!r}".format(
                len(errors), len(futures), cond_or_inter, errors[0]))
    finally:
        stop_metamap_servers(metamap_dirs) # stop the MetaMap servers
    

# USE METAMAP LOCAL TO MAP REMAINING TERMS

In [103]:
# Some input terms have () with additional text, like an abbreviation, in them.
# These patterns pull out the substrings outside and inside the ().
_PATTERN_OUTSIDE_PARENS = r'(?<=\().+?(?=\))|([^(]+)'
_PATTERN_INSIDE_PARENS = r'\(([^)]+)\)'


def _split_parenthetical(frame, source_col, outside_col, inside_col):
    """Add ``outside_col`` / ``inside_col`` to ``frame`` from the text around the first "(...)" in ``source_col``."""
    frame[outside_col] = frame[source_col].str.extract(_PATTERN_OUTSIDE_PARENS)[0].fillna('')
    frame[inside_col] = frame[source_col].str.extract(_PATTERN_INSIDE_PARENS)[0].fillna('')


def _unique_terms(series):
    """Return the distinct, non-missing, non-empty values of ``series`` as a list of ``str``."""
    return [str(term) for term in series.dropna().unique() if term]


def _blank_short_strings(frame, min_length=4):
    """Replace, in place, every string shorter than ``min_length`` characters with ``None``."""
    for col in frame.columns:
        too_short = (frame[col].str.len() < min_length).to_numpy()
        for index in frame.index[too_short]:
            frame.at[index, col] = None


def _blank_repeated_values(frame):
    """Blank cells that repeat another column's value in the same row, then drop duplicate columns."""
    for col1 in frame.columns:
        for col2 in frame.columns:
            # Skip comparing a column with itself
            if col1 != col2:
                # Keep the value in col2 only where it differs from col1
                frame[col2] = frame.apply(lambda row: row[col2] if row[col2] != row[col1] else None, axis=1)
    # Drop duplicate columns (keeping the first instance)
    return frame.T.drop_duplicates().T


def term_list_to_mm(df_dict, flag_and_path):
    """Send condition and intervention terms through MetaMap and write three TSVs.

    Written to the current working directory, prefixed with
    ``flag_and_path["date_string"]``:
      * ``_metamap_output.tsv`` -- one row per MetaMap concept per term,
      * ``_conditions.tsv``     -- conditions with their term variants,
      * ``_interventions.tsv``  -- interventions (plus alternate names) with
        their term variants.

    With MetaMap >= 20 the original terms are mapped; with older versions the
    terms are first passed through ``de_ascii_er``.

    Args:
        df_dict: dict with the DataFrames ``conditions``, ``interventions`` and
            ``interventions_alts``. The two intervention frames get an extra
            lower-cased name column added in place.
        flag_and_path: dict providing ``date_string``; also forwarded to
            ``parallelize_metamap``.

    Relies on the module-level ``metamap_dirs`` dict.
    """
    metamap_version = [int(s) for s in re.findall(r'\d+', metamap_dirs.get('metamap_bin_dir'))] # get MetaMap version being run
    use_original_terms = metamap_version[0] >= 20
    relevant_date = flag_and_path["date_string"]   # get date of bulk download of clinical trial data
    deasciier = np.vectorize(de_ascii_er) # vectorize function

    # -------    CONDITIONS    ------- #
    # rename() returns a new frame, so the columns added below never touch the caller's data
    conditions = df_dict["conditions"][['id', 'nct_id', 'downcase_name']].rename(columns={'downcase_name': 'orig_con'})

    if use_original_terms:
        _split_parenthetical(conditions, 'orig_con', 'orig_con_outside', 'orig_con_inside')
    else:
        conditions['deascii_con'] = deasciier(conditions['orig_con'])
        _split_parenthetical(conditions, 'deascii_con', 'deascii_con_outside', 'deascii_con_inside')

    # see MetaMap Usage instructions: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MM_2016_Usage.pdf
    # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
    condition_semantic_type_restriction = ['acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,clna,fndg']
    # strict_model and relaxed_model are presumably opposites? relaxed_model = True is what I want, but that option
    # appears to be broken in Pymetamap (returns no results when used). Using strict_model = False instead...
    params = {"restrict_to_sts":condition_semantic_type_restriction, "term_processing":True, "ignore_word_order":True, "strict_model":False}

    # prep output file of Metamap results
    filename = f"{relevant_date}_metamap_output.tsv"
    col_names = ['term_type', 'clin_trial_term','metamap_preferred_name', 'metamap_cui', 'metamap_score', 'metamap_semantic_type']

    # The context manager guarantees the MetaMap output is flushed and closed
    # before anything downstream reads it back from disk.
    with open(filename, 'w', newline='') as metamap_output:
        csv_writer = csv.writer(metamap_output, delimiter='\t')
        csv_writer.writerow(col_names)

        if use_original_terms:
            print("MetaMap version >= 2020, conduct mapping on original terms")
            orig_cons = _unique_terms(conditions['orig_con'])
            parallelize_metamap(orig_cons, params, "condition", flag_and_path, csv_writer)
        else:
            print("MetaMap version < 2020, conduct mapping on terms after removing ascii characters")
            deascii_cons = _unique_terms(conditions['deascii_con'])
            parallelize_metamap(deascii_cons, params, "condition", flag_and_path, csv_writer)

        # If the substring that was either outside or inside the () is identical to the term it came from
        # (or any two columns hold the same value), put None where that term is duplicated.
        conditions = _blank_repeated_values(conditions)
        conditions.to_csv('{}_conditions.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output conditions to TSV

        # -------    INTERVENTIONS    ------- #
        print("Using UMLS MetaMap to get mappings for INTERVENTIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.")

        # Interventions require unique handling: another table gives possible alternate names for the
        # interventions in addition to the "original" names, and both sets of names are mapped.
        interventions_df = df_dict["interventions"]
        interventions_df['orig_downcase_name'] = interventions_df['name'].str.lower()
        interventions_alts = df_dict["interventions_alts"]
        interventions_alts['alt_downcase_name'] = interventions_alts['name'].str.lower()

        orig_ints = _unique_terms(interventions_df["orig_downcase_name"])
        alt_ints = _unique_terms(interventions_alts["alt_downcase_name"])

        # For interventions the condition semantic types are EXCLUDED rather than required.
        params = {"exclude_sts":condition_semantic_type_restriction, "term_processing":True, "ignore_word_order":True, "strict_model":False}

        # If we are on OSX, we have to use MetaMap 2018, which requires deasciied terms.
        # If we are on Linux, we can use MetaMap 2020, which does not require such preprocessing.
        if not use_original_terms:
            #  -------   original interventions  -------- #
            orig_ints = [de_ascii_er(term) for term in orig_ints]
            print("MetaMap version < 2020, conduct mapping on original interventions after removing ascii characters")
            parallelize_metamap(orig_ints, params, "intervention", flag_and_path, csv_writer)
            #  ---------   alternate interventions ------- #
            alt_ints = [de_ascii_er(term) for term in alt_ints]
            parallelize_metamap(alt_ints, params, "alternate_intervention", flag_and_path, csv_writer)
        else:
            #  -------   original interventions  -------- #
            print("MetaMap version >= 2020, conduct mapping on original interventions")
            parallelize_metamap(orig_ints, params, "intervention", flag_and_path, csv_writer)
            #  ---------   alternate interventions ------- #
            print("MetaMap version >= 2020, conduct mapping on alternate interventions")
            parallelize_metamap(alt_ints, params, "alternate_intervention", flag_and_path, csv_writer)

    # Attach the alternate names to their interventions (one row per alternate name).
    interventions_all = pd.merge(
        interventions_df[["id", "nct_id", "intervention_type", "orig_downcase_name", "description"]],
        interventions_alts[["nct_id", "intervention_id", "alt_downcase_name"]],
        how='left', left_on=['id'], right_on=['intervention_id'],
    )
    interventions_all = interventions_all.astype(str)
    interventions_all = interventions_all.drop('nct_id_y', axis=1) # drop the redundant column now
    interventions_all = interventions_all.rename(columns={'nct_id_x': 'nct_id'})

    interventions_all = interventions_all.sort_values(by='nct_id', ascending=False, na_position='last')
    interventions_all = interventions_all.drop('intervention_id', axis=1) # drop the redundant column now
    interventions_all = interventions_all.rename(columns={'id': 'intervention_id', 'orig_downcase_name': 'orig_int', 'alt_downcase_name': 'alt_int'})

    if use_original_terms:
        _split_parenthetical(interventions_all, 'orig_int', 'orig_int_outside', 'orig_int_inside')
        # was reading the non-existent column 'alt_in' (KeyError)
        _split_parenthetical(interventions_all, 'alt_int', 'alt_int_outside', 'alt_int_inside')
    else:
        interventions_all['deascii_orig_int'] = deasciier(interventions_all['orig_int'])
        interventions_all['deascii_alt_int'] = deasciier(interventions_all['alt_int'])

        _split_parenthetical(interventions_all, 'deascii_orig_int', 'deascii_orig_int_outside', 'deascii_orig_int_inside')
        # NOTE(review): 'deascii_alt_name_inside' breaks the '*_int_*' naming used by its
        # siblings; kept as-is because it is part of the written TSV schema.
        _split_parenthetical(interventions_all, 'deascii_alt_int', 'deascii_alt_int_outside', 'deascii_alt_name_inside')

    # Don't keep strings < 4 char in length; these are ambiguous and it's hard to
    # make a call what that concept should be.
    _blank_short_strings(interventions_all, min_length=4)

    # Same de-duplication as for conditions.
    interventions_all = _blank_repeated_values(interventions_all)

    interventions_all.to_csv('{}_interventions.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output interventions to TSV



In [109]:
def _attach_curie_info(frame, mapper, skip_cols):
    """Add a ``curie_info`` column holding the MetaMap hits for each row's terms.

    Every column not listed in ``skip_cols`` is looked up in ``mapper``; when
    several columns of a row match, the last matching column wins.
    """
    cols_to_check = [col for col in frame.columns if col not in skip_cols]
    frame["curie_info"] = None
    for index, row in frame.iterrows():
        for col_name in cols_to_check:
            value = row[col_name]
            if value in mapper:
                frame.at[index, "curie_info"] = mapper[value]
    return frame


def map_to_trial(df_dict, flag_and_path):
    """Attach MetaMap results to the conditions / interventions TSVs.

    Reads ``<date>_metamap_output.tsv``, ``<date>_conditions.tsv``,
    ``<date>_interventions.tsv`` and ``MetaMap_SemanticTypes_2018AB.txt`` from
    the current working directory, groups the MetaMap hits
    ``[cui, preferred name, semantic type]`` per input term, and rewrites the
    two TSVs in place with an added ``curie_info`` column.

    Args:
        df_dict: unused here; kept so existing callers keep working.
        flag_and_path: dict providing ``date_string``.
    """
    # send mappings to interventions and conditions, group CUIs that correspond to input condition or intervention
    relevant_date = flag_and_path["date_string"]   # get date of bulk download of clinical trial data

    metamap_input = "{}_metamap_output.tsv".format(relevant_date)
    metamapped = pd.read_csv(metamap_input, sep='\t', index_col=False, header=0)

    # get the full names of the semantic types so we know what we're looking at
    sem_type_col_names = ["abbv", "group", "semantic_type_full"]
    metamap_semantic_types = pd.read_csv("MetaMap_SemanticTypes_2018AB.txt", sep="|", index_col=False, header=None, names=sem_type_col_names)
    sem_type_dict = dict(zip(metamap_semantic_types['abbv'], metamap_semantic_types['semantic_type_full'])) # make a dict of semantic type abbv and full name

    # strip the [] around the semantic type list
    metamapped['metamap_semantic_type'] = metamapped['metamap_semantic_type'].str.replace(r'\[|\]', '', regex=True)
    # Handle NaN (None) values in metamap_semantic_type column
    metamapped['metamap_semantic_type'] = metamapped['metamap_semantic_type'].apply(lambda x: x.split(',') if isinstance(x, str) else np.nan)
    # map semantic type abbreviations to the full name of the semantic type
    metamapped['metamap_semantic_type'] = metamapped['metamap_semantic_type'].apply(lambda x: '|'.join([sem_type_dict[term] if term in sem_type_dict else term for term in x]) if isinstance(x, list) else x)

    metamapped['metamap_preferred_name'] = metamapped['metamap_preferred_name'].str.lower()
    metamapped = metamapped.dropna(axis=0)
    # explicit copy: columns are added to / dropped from this selection below
    metamapped = metamapped[["term_type", "clin_trial_term", "metamap_cui","metamap_preferred_name", "metamap_semantic_type"]].copy()

    info_cols = ["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]
    metamapped["metamap_term_info"] = metamapped[info_cols].values.tolist()
    metamapped = metamapped.drop(info_cols, axis=1)
    metamapped = metamapped.groupby(['term_type', 'clin_trial_term'])['metamap_term_info'].agg(list).reset_index()

    conditions = pd.read_csv('{}_conditions.tsv'.format(relevant_date), sep='\t', index_col=False, header=0)
    interventions = pd.read_csv('{}_interventions.tsv'.format(relevant_date), sep='\t', index_col=False, header=0)

    metamapped_con = metamapped.loc[metamapped['term_type'] == "condition"]
    metamapped_int = metamapped.loc[(metamapped['term_type'] == "intervention") | (metamapped['term_type'] == "alternate_intervention")]

    mapper_con = dict(zip(metamapped_con['clin_trial_term'], metamapped_con['metamap_term_info'])) # make a dict to map conditions
    mapper_int = dict(zip(metamapped_int['clin_trial_term'], metamapped_int['metamap_term_info'])) # make a dict to map interventions

    conditions = _attach_curie_info(conditions, mapper_con, ['id', 'nct_id', 'condition_id'])
    conditions.to_csv('{}_conditions.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output conditions to TSV

    interventions = _attach_curie_info(interventions, mapper_int, ['id', 'nct_id', 'intervention_id', 'intervention_type', 'description'])
    interventions.to_csv('{}_interventions.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output interventions to TSV




In [105]:
# send mappings to interventions and conditions, group CUIs that correspond to input condition or intervention
# NOTE(review): this cell repeats the body of map_to_trial() inline (without writing the
# TSVs back) and reads `flag_and_path` and `metamap_dirs` from the notebook's global
# namespace, so it only runs after the cells that define them.
relevant_date = flag_and_path["date_string"]   # get date of bulk download of clinical trial data
metamap_version = [int(s) for s in re.findall(r'\d+', metamap_dirs.get('metamap_bin_dir'))] # get MetaMap version being run 

metamap_input = "{}_metamap_output.tsv".format(relevant_date)
metamapped = pd.read_csv(metamap_input, sep='\t', index_col=False, header=0)

# get the full names of the semantic types so we know what we're looking at
# (the file is read once, with the proper separator; an earlier default-separator read was discarded unused)
metamapped['metamap_semantic_type'] = metamapped['metamap_semantic_type'].str.replace(r'\[|\]', '', regex=True)
sem_type_col_names = ["abbv", "group", "semantic_type_full"]
metamap_semantic_types = pd.read_csv("MetaMap_SemanticTypes_2018AB.txt", sep="|", index_col=False, header=None, names=sem_type_col_names)
sem_type_dict = dict(zip(metamap_semantic_types['abbv'], metamap_semantic_types['semantic_type_full'])) # make a dict of semantic type abbv and full name
# Handle NaN (None) values in metamap_semantic_type column
metamapped['metamap_semantic_type'] = metamapped['metamap_semantic_type'].apply(lambda x: x.split(',') if isinstance(x, str) else np.nan)
# map semantic type abbreviations to the full name of the semantic type
metamapped['metamap_semantic_type'] = metamapped['metamap_semantic_type'].apply(lambda x: '|'.join([sem_type_dict[term] if term in sem_type_dict else term for term in x]) if isinstance(x, list) else x)

metamapped['metamap_preferred_name'] = metamapped['metamap_preferred_name'].str.lower()
metamapped = metamapped.dropna(axis=0)
# explicit copy: a column is added to this selection right below
metamapped = metamapped[["term_type", "clin_trial_term", "metamap_cui","metamap_preferred_name", "metamap_semantic_type"]].copy()

metamapped["metamap_term_info"] = metamapped[["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]].values.tolist() 
metamapped = metamapped.drop(["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"], axis = 1)
metamapped = metamapped.groupby(['term_type', 'clin_trial_term'])['metamap_term_info'].agg(list).reset_index()

conditions = '{}_conditions.tsv'.format(relevant_date)
conditions = pd.read_csv(conditions, sep='\t', index_col=False, header=0)
interventions = '{}_interventions.tsv'.format(relevant_date)
interventions = pd.read_csv(interventions, sep='\t', index_col=False, header=0)

metamapped_con = metamapped.loc[metamapped['term_type'] == "condition"]
metamapped_int = metamapped.loc[(metamapped['term_type'] == "intervention") | (metamapped['term_type'] == "alternate_intervention")]

mapper_con = dict(zip(metamapped_con['clin_trial_term'], metamapped_con['metamap_term_info'])) # make a dict to map conditions
mapper_int = dict(zip(metamapped_int['clin_trial_term'], metamapped_int['metamap_term_info'])) # make a dict to map interventions

# Look every term column of a condition up in the mapper; when several columns
# match, the last matching column's hits are kept.
cols_to_check = [ele for ele in conditions.columns if(ele not in ['id', 'nct_id', 'condition_id'])]
conditions["curie_info"] = None

for index, row in conditions.iterrows():
    for col_name in cols_to_check:
        value = row[col_name]
        if value in mapper_con:
            curie_info = mapper_con[value]
            conditions.at[index, "curie_info"] = curie_info    
conditions

# Same lookup for interventions.
cols_to_check = [ele for ele in interventions.columns if(ele not in ['id', 'nct_id', 'intervention_id', 'intervention_type', 'description'])]
interventions["curie_info"] = None

for index, row in interventions.iterrows():
    for col_name in cols_to_check:
        value = row[col_name]
        if value in mapper_int:
            curie_info = mapper_int[value]
            interventions.at[index, "curie_info"] = curie_info  
            
interventions

Unnamed: 0,id,nct_id,orig_con,deascii_con,deascii_con_outside,deascii_con_inside,curie_info
0,53981364,NCT03223025,idiopathic growth hormone deficiency,,,,"[[C0342381, idiopathic growth hormone deficien..."
1,53857255,NCT04429958,obesity,,,,"[[C0028754, obesity, Disease or Syndrome], [C1..."
2,53633124,NCT04049461,"ossification, heterotopic",,,,"[[C0029396, heterotopic ossification, Patholog..."
3,53916831,NCT02341196,stroke,,,,"[[C0038454, cerebrovascular accident, Disease ..."
4,53876066,NCT05041738,postoperative pain,,,,
...,...,...,...,...,...,...,...
295,53344455,NCT05679596,cognitive function,,,,
296,53455295,NCT01910350,breast cancer,,,,"[[C0678222, breast carcinoma, Neoplastic Proce..."
297,53693064,NCT04872582,metastatic nasopharyngeal carcinoma,,,,"[[C1377919, stage iv nasopharyngeal carcinoma,..."
298,53809101,NCT03865537,colonoscopy,,,,


Unnamed: 0,intervention_id,nct_id,intervention_type,orig_int,description,alt_int,deascii_orig_int,deascii_orig_int_outside,deascii_orig_int_inside,curie_info
0,54114376,NCT05997745,Dietary Supplement,study group,Patients will receive early enteral locally nu...,,,,,"[[C0441839, group e, Classification], [C055765..."
1,54086303,NCT05945823,Drug,leucovorin,400 mg/m^2 Q2W as part of investigator's choic...,,,,,"[[C0023413, leucovorin, Organic Chemical|Pharm..."
2,54057998,NCT05942651,Device,non-invasive brain stimulation protocol (corti...,The first stimulation will be applied to the I...,,,non-invasive brain stimulation protocol,cortico-cortical paired associative stimulatio...,"[[C0007776, cerebral cortex, Body Part, Organ,..."
3,54278172,NCT05915195,Other,postural and kinesthetic awarness,"Firstly, patient performed therapeutic activit...",,,,,"[[C0022701, kinesthesis, Organism Function], [..."
4,54278220,NCT05914584,Drug,baricitinib 4 mg,Reference drug,,,,,"[[C0024671, mammography, Diagnostic Procedure]..."
...,...,...,...,...,...,...,...,...,...,...
295,53673252,NCT00005081,Drug,o6-benzylguanine,,,,,,"[[C0083812, o(6)-benzylguanine, Nucleic Acid, ..."
296,53694951,NCT00004110,Drug,etoposide,,,,,,"[[C0015133, etoposide, Organic Chemical|Pharma..."
297,54403439,NCT00002572,Biological,therapeutic tumor infiltrating lymphocytes,,,,,,"[[C1515408, therapeutic tumor infiltrating lym..."
298,53693806,NCT00002400,Drug,delavirdine mesylate,,,,,,"[[C0543492, delavirdine mesylate, Organic Chem..."


{'  f-flt': [['C0016327',
   'fluorides',
   'Inorganic Chemical|Pharmacologic Substance'],
  ['C1568520',
   'flt1 protein, human',
   'Amino Acid, Peptide, or Protein|Enzyme|Receptor'],
  ['C0206795',
   'alovudine',
   'Nucleic Acid, Nucleoside, or Nucleotide|Pharmacologic Substance'],
  ['C0812298', 'flt1 gene', 'Gene or Genome'],
  ['C1705150', 'flt1 wt allele', 'Gene or Genome']],
 '(-)-cyclophosphamide': [['C0010583',
   'cyclophosphamide',
   'Organic Chemical|Pharmacologic Substance']],
 '(sp-4-2)-diamminedichloroplatinum': [['C0008838',
   'cisplatin',
   'Inorganic Chemical|Pharmacologic Substance'],
  ['C3641137', 'square planar 2 molecular geometry', 'Spatial Concept']],
 '0.9 ns': [['C0038944', 'suriname', 'Geographic Area'],
  ['C1541397', 'ns-9', 'Pharmacologic Substance'],
  ['C1705982', 'kras wt allele', 'Gene or Genome'],
  ['C4068881', '0.9', 'Quantitative Concept'],
  ['C3842591', '0%', 'Quantitative Concept']],
 '0.9% saline': [['C0445115',
   'normal saline',
   

In [99]:
# Render `metamapped` with pandas' row, column and column-width display limits
# lifted (all three options set to None) for the duration of this block only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(metamapped)

Unnamed: 0,term_type,clin_trial_term,metamap_term_info
0,alternate_intervention,f-flt,"[[C0016327, fluorides, Inorganic Chemical|Pharmacologic Substance], [C1568520, flt1 protein, human, Amino Acid, Peptide, or Protein|Enzyme|Receptor], [C0206795, alovudine, Nucleic Acid, Nucleoside, or Nucleotide|Pharmacologic Substance], [C0812298, flt1 gene, Gene or Genome], [C1705150, flt1 wt allele, Gene or Genome]]"
1,alternate_intervention,(-)-cyclophosphamide,"[[C0010583, cyclophosphamide, Organic Chemical|Pharmacologic Substance]]"
2,alternate_intervention,(sp-4-2)-diamminedichloroplatinum,"[[C0008838, cisplatin, Inorganic Chemical|Pharmacologic Substance], [C3641137, square planar 2 molecular geometry, Spatial Concept]]"
3,alternate_intervention,0.9 ns,"[[C0038944, suriname, Geographic Area], [C1541397, ns-9, Pharmacologic Substance], [C1705982, kras wt allele, Gene or Genome], [C4068881, 0.9, Quantitative Concept], [C3842591, 0%, Quantitative Concept]]"
4,alternate_intervention,0.9% saline,"[[C0445115, normal saline, Inorganic Chemical|Pharmacologic Substance]]"
5,alternate_intervention,1-.beta.-d-arabinofuranosyl-4-amino-2(1h)pyrimidinone,"[[C0034290, pyrimidinones, Organic Chemical], [C1538440, cfh gene, Gene or Genome]]"
6,alternate_intervention,1-day acuvue trueye,"[[C1442449, 1 day, Temporal Concept]]"
7,alternate_intervention,1. standard treatment + placebo,"[[C0032042, placebos, Therapeutic or Preventive Procedure], [C1706408, placebo control, Research Activity], [C1696465, placebo, Biomedical or Dental Material], [C2248614, 2,3-diketo-5-methylthiopentyl-1-phosphate enolase activity, Molecular Function], [C4684780, standard treatment, Health Care Activity], [C2266809, acireductone synthase activity, Molecular Function]]"
8,alternate_intervention,150 mg tariquidar through central venous catheter over 30 minutes on days 1 and 3.,"[[C0024671, mammography, Diagnostic Procedure], [C0879444, tariquidar, Organic Chemical|Pharmacologic Substance], [C1145640, central venous catheter, device, Medical Device], [C0026410, mongolia, Geographic Area], [C1442458, 30 minutes, Temporal Concept], [C0456693, per 30 minutes, Temporal Concept], [C2248614, 2,3-diketo-5-methylthiopentyl-1-phosphate enolase activity, Molecular Function], [C4028326, 30 days, Temporal Concept], [C2266809, acireductone synthase activity, Molecular Function], [C4321396, mg, Diagnostic Procedure], [C2346927, magnesium cation, Element, Ion, or Isotope|Pharmacologic Substance], [C1960952, milligram percent, Quantitative Concept], [C1282918, minute (diminutive), Quantitative Concept], [C2347166, minute unit of plane angle, Quantitative Concept], [C0439232, minute of time, Temporal Concept], [C0702093, per minute, Temporal Concept], [C0700321, small, Quantitative Concept], [C4521761, united states military commissioned officer o8, Classification], [C0439228, day, Temporal Concept], [C0439269, mg/dl, Quantitative Concept], [C0439422, milligram/day, Quantitative Concept]]"
9,alternate_intervention,"2(1h)-pyrimidinone, 4-amino-1.beta.-d-arabinofuranosyl-","[[C0034290, pyrimidinones, Organic Chemical], [C1538440, cfh gene, Gene or Genome]]"


In [94]:
# Inspect the current state of the `conditions` frame.
conditions

Unnamed: 0,id,nct_id,orig_con,deascii_con,deascii_con_outside,deascii_con_inside
0,53958561,NCT05943548,hiv infections,,,
1,53971468,NCT03327415,food consumption,,,
2,54035044,NCT02404402,posttraumatic stress disorder,,,
3,53671527,NCT02665793,peanut hypersensitivity,,,
4,53616935,NCT02615938,diffuse parenchymal lung disease,,,
...,...,...,...,...,...,...
295,53460225,NCT04166331,cardiomyopathies,,,
296,54024548,NCT02277119,eyes with retinal diseases,,,
297,53728279,NCT00908297,oxidative stress,,,
298,53613949,NCT02535442,common cold,,,


In [95]:
# Inspect the current state of the `metamapped` frame.
metamapped

Unnamed: 0,term_type,clin_trial_term,metamap_term_info
0,alternate_intervention,(bla) 103705,"[[C2347025, biologics license application, Int..."
1,alternate_intervention,(novolog ),"[[C0939412, novolog, Amino Acid, Peptide, or P..."
2,alternate_intervention,(stadium ),"[[C0442588, stadium, Manufactured Object]]"
3,alternate_intervention,0. 9% sodium chloride injection,"[[C0980221, sodium chloride 9 mg/ml injectable..."
4,alternate_intervention,1 % lidocaine,"[[C0023660, lidocaine, Organic Chemical|Pharma..."
...,...,...,...
996,intervention,xl228,"[[C2348842, xl228, Pharmacologic Substance]]"
997,intervention,yj001 for spray use,"[[C2003858, spray (action), Activity], [C45217..."
998,intervention,ziv-aflibercept,"[[C3485619, ziv-aflibercept, Organic Chemical|..."
999,intervention,zoledronic acid,"[[C0257685, zoledronic acid, Organic Chemical|..."


In [None]:
def score_mappings(flag_and_path, score_threshold=88):
    """Score MetaMap candidates against their clinical-trial terms and split them by confidence.

    Reads ``<date_string>_metamap_output.tsv`` and ``MetaMap_SemanticTypes_2018AB.txt``
    from the current working directory. Every candidate row is fuzzy-scored (token-sort
    ratio and plain similarity ratio) against three views of the clinical-trial term:
    the full term, the text captured by the "outside parentheses" pattern and the text
    captured by the "inside parentheses" pattern. The best of those six scores becomes
    the row's ``max_score``.

    Two files are written to the current working directory:

    * ``<date_string>_metamap_threshold_pass.tsv`` - for each term having at least one
      candidate whose ``max_score`` is strictly greater than ``score_threshold``, the
      single highest-scoring candidate.
    * ``<date_string>_CURIES_manual_review.xlsx`` - every candidate of all remaining
      terms, indexed by (term, candidate info), with an empty
      ``manually_selected_CURIE`` column to be filled in by hand.

    Parameters
    ----------
    flag_and_path : dict
        Must contain ``"date_string"``, the date tag used in the input and output file names.
    score_threshold : int or float, default 88
        A candidate passes when its best score is strictly greater than this value.

    Returns
    -------
    None
    """
    relevant_date = flag_and_path["date_string"]   # get date of bulk download of clinical trial data
    pattern_outside = r'(?<=\().+?(?=\))|([^(]+)'
    pattern_inside = r'\(([^)]+)\)'
    info_cols = ["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]
    output_cols = ["clin_trial_term"] + info_cols

    # otypes=[float] lets the scorers run on an empty frame and turns a None returned by a
    # failed comparison into NaN instead of breaking np.vectorize's output-type inference.
    sort_ratio = np.vectorize(get_token_sort_ratio, otypes=[float])
    sim_score = np.vectorize(get_similarity_score, otypes=[float])

    metamap_input = "{}_metamap_output.tsv".format(relevant_date)
    metamapped = pd.read_csv(metamap_input, sep='\t', index_col=False, header=0)

    # get the full names of the semantic types so we know what we're looking at
    sem_type_col_names = ["abbv", "group", "semantic_type_full"]
    metamap_semantic_types = pd.read_csv("MetaMap_SemanticTypes_2018AB.txt", sep="|", index_col=False, header=None, names=sem_type_col_names)
    sem_type_dict = dict(zip(metamap_semantic_types['abbv'], metamap_semantic_types['semantic_type_full'])) # make a dict of semantic type abbv and full name

    def _expand_semantic_types(raw):
        # Missing / non-string entries stay NaN so dropna() below removes those rows.
        if not isinstance(raw, str):
            return np.nan
        # Unknown abbreviations are passed through unchanged.
        return '|'.join(sem_type_dict.get(abbv, abbv) for abbv in raw.split(','))

    metamapped['metamap_semantic_type'] = (
        metamapped['metamap_semantic_type']
        .str.replace(r'\[|\]', '', regex=True)   # strip the surrounding square brackets
        .apply(_expand_semantic_types)
    )

    metamapped['metamap_preferred_name'] = metamapped['metamap_preferred_name'].str.lower()
    metamapped = metamapped.dropna(axis=0)
    # .copy() so the column assignments below act on an independent frame rather than a slice
    metamapped = metamapped[output_cols].copy()

    # str.extract returns one column per capture group; column 0 is the only group in each pattern
    metamapped['clin_trial_term_outside_par'] = metamapped['clin_trial_term'].str.extract(pattern_outside)[0].fillna('')
    metamapped['clin_trial_term_inside_par'] = metamapped['clin_trial_term'].str.extract(pattern_inside)[0].fillna('')

    metamapped = metamapped[['clin_trial_term', 'clin_trial_term_outside_par', 'clin_trial_term_inside_par'] + info_cols] # re-order columns of df

    # score each view of the clinical trial term against the MetaMap preferred name
    preferred_names = metamapped["metamap_preferred_name"].values
    term_views = (("orig", "clin_trial_term"),
                  ("outside", "clin_trial_term_outside_par"),
                  ("inside", "clin_trial_term_inside_par"))
    score_ratio_columns = []
    for suffix, term_col in term_views:
        terms = metamapped[term_col].values
        for prefix, scorer in (("sort_ratio", sort_ratio), ("sim_score", sim_score)):
            score_col = "{}_{}".format(prefix, suffix)
            metamapped[score_col] = scorer(terms, preferred_names)
            score_ratio_columns.append(score_col)

    # keep only the best of the six scores per candidate row
    metamapped['max_score'] = metamapped[score_ratio_columns].max(axis=1)
    metamapped = metamapped.drop(columns=score_ratio_columns)

    # Several MetaMap candidates may pass the threshold for one clinical trial term; keep exactly
    # one per term. Sorting DESCENDING makes keep='first' retain the highest-scoring candidate
    # (an ascending sort would retain the lowest-scoring one).
    metamapped_threshold_pass = (
        metamapped[metamapped['max_score'] > score_threshold]
        .sort_values('max_score', ascending=False)
        .drop_duplicates('clin_trial_term', keep='first')
    )

    # every candidate row of the terms for which nothing passed the threshold
    failed_rows = ~metamapped['clin_trial_term'].isin(metamapped_threshold_pass['clin_trial_term'])
    metamapped_threshold_fail = metamapped.loc[failed_rows, output_cols].copy()

    metamapped_threshold_pass = metamapped_threshold_pass[output_cols] # get only columns of relevance

    # prep output file of MetaMap terms that failed threshold for manual review;
    # the candidate info is flattened to one string because it becomes an index level
    metamapped_threshold_fail['metamap_term_info'] = [
        ', '.join(map(str, info)) for info in metamapped_threshold_fail[info_cols].values.tolist()
    ]
    metamapped_threshold_fail = (
        metamapped_threshold_fail
        .drop(columns=info_cols)
        .set_index(["clin_trial_term", "metamap_term_info"])
    )
    metamapped_threshold_fail['manually_selected_CURIE'] = None

    metamapped_threshold_fail.to_excel('{}_CURIES_manual_review.xlsx'.format(relevant_date), engine='xlsxwriter', index=True)
    metamapped_threshold_pass.to_csv('{}_metamap_threshold_pass.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output interventions to TSV, avoid storing in memory


    # # get all columns that have "score" in it so we can filter on the threshold score we want
    # score_ratio_columns = [col for col in metamapped_scored.columns if 'score' in col or 'ratio' in col]
    # metamapped_threshold_pass = metamapped_scored[(metamapped_scored[score_ratio_columns] > 88).any(axis=1)]
    # metamapped_threshold_fail = metamapped_scored.loc[~metamapped_scored['clin_trial_term'].isin(metamapped_threshold_pass['clin_trial_term'])].copy()

    # metamapped_threshold_pass.drop(score_ratio_columns, axis = 1, inplace = True) # drop the scoring columns now
    # metamapped_threshold_fail.drop(score_ratio_columns, axis = 1, inplace = True) # drop the scoring columns now


    # metamapped_manual_curation = metamapped_threshold_fail[["clin_trial_term", "metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]]
    # metamapped_manual_curation = metamapped_manual_curation.copy()
    # metamapped_manual_curation['metamap_term_info']= metamapped_manual_curation[["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]].values.tolist()
    # metamapped_manual_curation.drop(["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"], axis = 1, inplace = True)

    # metamapped_manual_curation['metamap_term_info'] = metamapped_manual_curation['metamap_term_info'].apply(lambda x: ','.join(map(str, x))) # remove the MetaMap info from their lists bc pandas Multi-indexing doesn't work on lists
    # metamapped_manual_curation['temp'] = "temp"
    # metamapped_manual_curation.set_index(["clin_trial_term", "metamap_term_info"],inplace=True)


    # metamapped_manual_curation = metamapped_manual_curation.drop('temp', axis=1) # drop the redundant column now






    # metamapped_manual_curation.drop(["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"], axis = 1, inplace = True)
    # metamapped_manual_curation = metamapped_manual_curation.groupby('clin_trial_term')['metamap_term_info'].agg(list).reset_index()

    # use Multiindexing to see lists of CURIEs available for single term
    # Explode the column of lists of lists
    # metamapped_manual_curation = metamapped_manual_curation.explode('metamap_term_info')

    # Reset the index if needed
    # metamapped_manual_curation.reset_index(drop=True, inplace=True)



    # metamapped_manual_curation = metamapped_manual_curation.groupby('clin_trial_term')['metamap_term_info'].agg(list).reset_index()

    # metamapped_con['max_score'] = metamapped_con[['sort_ratio', 'sim_score']].max(axis=1)
    # metamapped_con = metamapped_con.sort_values('max_score').drop_duplicates('clin_trial_term', keep='first')


    # metamapped_con["metamap_term_info"] = metamapped_con[["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]].values.tolist() 
    # metamapped_con.drop(["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"], axis = 1, inplace = True)
    # metamapped_con = metamapped_con.groupby('clin_trial_term')['metamap_term_info'].agg(list).reset_index()


# Render the manual-review frame with pandas' display limits lifted.
# NOTE(review): in the visible code `metamapped_threshold_fail` is only assigned inside
# score_mappings(), so this cell relies on a notebook-level variable of the same name
# already existing in the kernel - confirm it survives Restart & Run All.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(metamapped_threshold_fail)

In [14]:
def merge_mappings_to_trials(df_dict):
    """Pull the three clinical-trial frames out of ``df_dict``.

    Only the lookups exist so far; nothing is merged or returned yet.

    Parameters
    ----------
    df_dict : dict
        Must contain the keys ``"conditions"``, ``"interventions"`` and
        ``"interventions_alts"``.

    Raises
    ------
    KeyError
        If any of the three keys is missing.
    """
    conditions = df_dict["conditions"]
    interventions = df_dict["interventions"]
    # Bound to its own name: assigning this lookup to `conditions` replaced the conditions frame.
    interventions_alts = df_dict["interventions_alts"]
        

In [170]:
# Attach the high-confidence MetaMap info to each condition row. When several
# checked columns of a row are keys of hiconf_con_dict, the last one written wins.
for index, row in conditions.iterrows():
    for col_name in cols_to_check:
        term = row[col_name]
        if term not in hiconf_con_dict:
            continue
        conditions.at[index, "high_conf_curie_info"] = hiconf_con_dict[term]

#         curie_info = hiconf_con_dict[value]
#         print(curie_info)
# #         conditions.at[index, "high_conf_curie_info"] = curie_info

In [173]:
# High-confidence rows for original intervention names, as a term -> MetaMap info lookup.
hiconf_int = hiconfidence_curies.loc[hiconfidence_curies['term_type'] == "intervention"]
hiconf_int_dict = dict(
    zip(
        hiconf_int['clin_trial_term'].tolist(),
        hiconf_int['metamap_term_info'].tolist(),
    )
)
cols_to_check = [col for col in interventions.columns if col not in ('id', 'nct_id')]
interventions["high_conf_curie_info"] = None

# When several checked columns of a row are keys of the lookup, the last one written wins.
for index, row in interventions.iterrows():
    for col_name in cols_to_check:
        term = row[col_name]
        if term not in hiconf_int_dict:
            continue
        interventions.at[index, "high_conf_curie_info"] = hiconf_int_dict[term]

In [204]:
# Keep only the columns needed for mapping alternate intervention names.
# .copy() makes this an independent frame: adding a column to a bare column-subset of
# df_dict["interventions_alts"] triggers pandas' SettingWithCopyWarning.
interventions_alts = df_dict["interventions_alts"][["id", "nct_id", "intervention_id", "alt_downcase_name"]].copy()
interventions = df_dict["interventions"]
interventions
interventions_alts

Unnamed: 0,id,nct_id,intervention_type,name,description,orig_downcase_name
193316,54403505,NCT04160000,Drug,Rate or Rhythm control antiarrhythmic drugs fo...,Administration of antiarrhythmic drug to achie...,rate or rhythm control antiarrhythmic drugs fo...
335174,53644628,NCT03069131,Drug,Rifaximin,twice daily administration of 1 tablet contain...,rifaximin
599099,53998293,NCT00463840,Drug,Oxaliplatin,,oxaliplatin
556232,53712417,NCT01506284,Procedure,Forced oscillatory technique (FOT) and laser i...,FOT: The stimulating signal is generated by an...,forced oscillatory technique (fot) and laser i...
324723,53925544,NCT02810262,Other,Bone metastases biopsy,The vast majority of bone biopsy are performed...,bone metastases biopsy
...,...,...,...,...,...,...
583887,53987390,NCT00308113,Dietary Supplement,Coenzyme Q10,serum levels of greater or equal to 2.5 microg...,coenzyme q10
493635,53702684,NCT01422954,Drug,Chloroquine prophylaxis,Standard prophylactic regime: a loading dose o...,chloroquine prophylaxis
356939,53658216,NCT02519036,Other,Placebo,Placebo was administered by intrathecal inject...,placebo
196727,54405081,NCT05774873,Drug,IBI334,Subjects will receive IBI334 once a week durin...,ibi334


Unnamed: 0,id,nct_id,intervention_id,alt_downcase_name
0,27584249,NCT01738191,54313664,strattera
1,27584250,NCT01737879,54313666,omontys
2,27428744,NCT04545502,54003364,gelsoft plus
3,27584251,NCT01737879,54313667,epogen
4,27273339,NCT04571879,53672522,nebulized xylocaine
...,...,...,...,...
387960,27583600,NCT03192215,54313417,eliquis
387961,27583601,NCT03192215,54313418,aspirin tablet
387962,27583602,NCT03052608,54313422,pf-06463922
387963,27583603,NCT03052608,54313423,xalkori


In [183]:
# High-confidence rows for alternate intervention names, as a term -> MetaMap info lookup.
hiconf_altint = hiconfidence_curies.loc[hiconfidence_curies['term_type'] == "alternate_intervention"]
hiconf_altint_dict = dict(
    zip(
        hiconf_altint['clin_trial_term'].tolist(),
        hiconf_altint['metamap_term_info'].tolist(),
    )
)
cols_to_check = [col for col in interventions_alts.columns if col not in ('id', 'nct_id', 'intervention_id')]
interventions_alts["high_conf_curie_info"] = None

# When several checked columns of a row are keys of the lookup, the last one written wins.
for index, row in interventions_alts.iterrows():
    for col_name in cols_to_check:
        term = row[col_name]
        if term not in hiconf_altint_dict:
            continue
        interventions_alts.at[index, "high_conf_curie_info"] = hiconf_altint_dict[term]

In [100]:
# Show the first 100 rows of `interventions_all` with pandas' row and column
# display limits lifted for the duration of this block only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(interventions_all[:100])

Unnamed: 0,intervention_id,nct_id,intervention_type,orig_int,description,alt_int,orig_int_outside,orig_int_inside,alt_int_outside,alt_int_inside
316,53647000,NCT05986136,Drug,dapagliflozin 10mg tab,Dapagliflozin has emerged as a selective SGLT2...,,dapagliflozin 10mg tab,,,
170,54383515,NCT05965375,Other,observational; no interventions were given.,Observational; No Interventions were given.,,observational; no interventions were given.,,,
58,54002989,NCT05912244,Drug,io102/io103,IO102/IO103 will be administered subcutaneousl...,,io102/io103,,,
5,54036407,NCT05864040,Device,bitrack system-assisted laparoscopic radical/s...,Robot-Assisted Laparoscopic Transperitoneal Ra...,,bitrack system-assisted laparoscopic radical/s...,,,
70,53743983,NCT05853731,Device,white light emitting diode,Filtered white light via light emitting diode ...,wled,white light emitting diode,,wled,
228,53727180,NCT05835037,Drug,placebo,Participants will take one placebo tablet per ...,,placebo,,,
81,53639435,NCT05806385,Device,cryoablation,Tumor ablation before neoadjuvant chemotherapy,,cryoablation,,,
276,53682687,NCT05766514,Drug,azacitadine or decitabine,Subjects will be given either 20 mg/m2 decitab...,,azacitadine or decitabine,,,
48,54201879,NCT05748665,Drug,etomidate,Using etomidate for induction of general anest...,,etomidate,,,
283,53977051,NCT05740826,Device,algometer,The algometer head was applied to the trigger ...,,algometer,,,


In [35]:
# # output all results to TSVs
# def compile_and_output(df_dict, ct_terms, remaining_unmapped_possible):
#     print("\n")
#     print("#   -------- -------- -------- --------  ")
#     print("Final Tallies:")
#     print("Total # of conditions mapped: {}".format(ct_terms["mapped_conditions"].shape[0]))
#     print("Total # of interventions mapped: {}".format(ct_terms["mapped_interventions"].shape[0]))
#     print("Total # of conditions unmapped or not mapped: {}".format(len(ct_terms["unmapped_conditions"])))
#     print("Total # of interventions unmapped or not mapped: {}".format(len(ct_terms["unmapped_interventions"])))    
#     # How many Clinical Trials are there? Well, it's different depending on the Conditions or Interventions dataframes...
#     conditions_nctids = len(df_dict["conditions"].nct_id.unique())
#     interventions_nctids = len(df_dict["interventions"].nct_id.unique())
#     print("Number of Clinical Trials NCITs in Conditions table: {}".format(conditions_nctids))      
#     print("Number of Clinical Trials NCITs in Interventions table: {}".format(interventions_nctids))
#     print("#   -------- -------- -------- --------  ")

#     """ create tables of unused MeSH and MetaMap CURIEs that could be used for unmapped Conditions and Interventions """
#     # -------    CONDITIONS    ------- #
#     all_conditions = df_dict["conditions"][["nct_id", "downcase_name"]]
#     conditions_mesh = pd.merge(all_conditions, 
#                                remaining_unmapped_possible["mesh_conditions_per_study"],
#                                how='left',
#                                left_on=['nct_id'],
#                                right_on = ['nct_id'])
    
#     metamap_possibilities = remaining_unmapped_possible["all_metamapped_conditions"][["condition_input", "condition_CURIE_id", "condition_CURIE_name", "condition_semantic_type"]]
#     conditions_mesh_metamap = pd.merge(conditions_mesh, 
#                                        metamap_possibilities,
#                                        how='left',
#                                        left_on=['downcase_name'],
#                                        right_on = ['condition_input'])
    
#     unmapped_conditions_possible_terms = conditions_mesh_metamap[conditions_mesh_metamap['downcase_name'].isin(ct_terms["unmapped_conditions"])]
#     unmapped_conditions_possible_terms = unmapped_conditions_possible_terms.drop('condition_input', axis=1) # drop the redundant column now
    
#     # -------    INTERVENTIONS    ------- #
#     all_interventions = df_dict["interventions"][["nct_id", "downcase_name"]]
#     interventions_mesh = pd.merge(all_interventions, 
#                                remaining_unmapped_possible["mesh_interventions_per_study"],
#                                how='left',
#                                left_on=['nct_id'],
#                                right_on = ['nct_id'])
    
#     metamap_possibilities = remaining_unmapped_possible["all_metamapped_interventions"][["intervention_input", "intervention_CURIE_id", "intervention_CURIE_name", "intervention_semantic_type"]]
#     interventions_mesh_metamap = pd.merge(interventions_mesh, 
#                                        metamap_possibilities,
#                                        how='left',
#                                        left_on=['downcase_name'],
#                                        right_on = ['intervention_input'])
    
#     unmapped_interventions_possible_terms = interventions_mesh_metamap[interventions_mesh_metamap['downcase_name'].isin(ct_terms["unmapped_interventions"])]
#     unmapped_interventions_possible_terms = unmapped_interventions_possible_terms.drop('intervention_input', axis=1) # drop the redundant column now
          
        
#     """   Output all to TSVs   """    
#     pd.Series(ct_terms["unmapped_conditions"]).to_csv('unmapped_conditions.tsv', sep="\t", index=False, header=False) # convert the list to a pandas series, then output to TSV
#     pd.Series(ct_terms["unmapped_interventions"]).to_csv('unmapped_interventions.tsv', sep="\t", index=False, header=False) # convert the list to a pandas series, then output to TSV
#     ct_terms["mapped_conditions"].to_csv('mapped_conditions.tsv', sep="\t", index=False)
#     ct_terms["mapped_interventions"].to_csv('mapped_interventions.tsv', sep="\t", index=False)
#     unmapped_conditions_possible_terms.to_csv('unmapped_conditions_possible_mappings.tsv', sep="\t", index=False)
#     unmapped_interventions_possible_terms.to_csv('unmapped_interventions_possible_mappings.tsv', sep="\t", index=False)
    



In [None]:
# def test_or_prod():
#     print("The test run of this code performs the construction of the KG on a subset of 200 Conditions and 200 Interventions from Clinical Trials.\n")
#     test_or_prod = input("Is this a test run or the production of a new version of the KG? Write T for test, or P for production: ")
#     if test_or_prod == "T":
#         flag_and_path = get_raw_ct_data() # uncomment for production
#         flag_and_path["term_program_flag"] = False
#         run_ETL_mapping(flag_and_path)
#     elif test_or_prod == "P":
#         flag_and_path = get_raw_ct_data() 
#         run_ETL_mapping(flag_and_path)
#     else:
#         print("Bad input")
#         sys.exit(0)
        

        
        

In [None]:
# def run_ETL_mapping(flag_and_path):
#     df_dict = read_raw_ct_data(flag_and_path)
#     ct_terms = exact_match_mesh(df_dict)
#     ct_terms = inexact_match_mesh(df_dict, ct_terms)

#     # pull the available MeSH terms per study out of the returned ct_terms dict 
#     mesh_conditions_per_study = ct_terms["mesh_conditions_per_study"]
#     mesh_interventions_per_study = ct_terms["mesh_interventions_per_study"]

#     ct_terms = term_list_to_nr(df_dict, ct_terms)
#     ct_terms = term_list_to_mm(df_dict, ct_terms)

#     # pull the available UMLS terms per study out of the returned ct_terms dict 
#     all_metamapped_conditions = ct_terms["all_metamapped_conditions"]
#     all_metamapped_interventions = ct_terms["all_metamapped_interventions"]

#     remaining_unmapped_possible = {"mesh_conditions_per_study": mesh_conditions_per_study,
#                                    "mesh_interventions_per_study": mesh_interventions_per_study,
#                                    "all_metamapped_conditions": all_metamapped_conditions,
#                                    "all_metamapped_interventions": all_metamapped_interventions}
#     compile_and_output(df_dict, ct_terms, remaining_unmapped_possible)


    

In [111]:
# Driver cell: build the run configuration, then call the pipeline stages in order.
# flag_and_path = get_raw_ct_data() # uncomment for production
# Hard-coded stand-in for the dict that the commented-out get_raw_ct_data() call would supply.
# NOTE(review): 'data_extracted_path' is an absolute, machine-specific path, so this cell
# cannot run on another machine without editing it.
flag_and_path = {'term_program_flag': False,
                 'data_extracted_path': '/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/08_21_2023_extracted',
                 'date_string':'08_21_2023'} # comment for production
# check_os() is defined elsewhere; presumably it resolves the MetaMap install
# directories for this machine - verify against its definition.
metamap_dirs = check_os()
# The next three calls are defined elsewhere in the notebook and run in this order;
# term_list_to_mm and map_to_trial are called for their side effects only (return values unused).
df_dict = read_raw_ct_data(flag_and_path)
term_list_to_mm(df_dict, flag_and_path)
map_to_trial(df_dict, flag_and_path)
# Later stages, currently disabled:
# score_mappings(flag_and_path)
# merge_mappings_to_trials()

# # pull the available UMLS terms per study out of the returned ct_terms dict 
# all_metamapped_conditions = ct_terms["all_metamapped_conditions"]
# all_metamapped_interventions = ct_terms["all_metamapped_interventions"]

# remaining_unmapped_possible = {"mesh_conditions_per_study": mesh_conditions_per_study,
#                                "mesh_interventions_per_study": mesh_interventions_per_study,
#                                "all_metamapped_conditions": all_metamapped_conditions,
#                                "all_metamapped_interventions": all_metamapped_interventions}
# compile_and_output(df_dict, ct_terms, remaining_unmapped_possible)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


MetaMap version < 2020, conduct mapping on terms after removing ascii characters
Starting skrmedpostctl: 
started.
Starting wsdserverctl: 
started.
loading properties file /Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/WSD_Server/config/disambServer.cfg
WSD Server initializing disambiguation methods.
WSD Server databases and disambiguation methods have been initialized.
Could not listen on port : 5554 : Address already in use
Stopping skrmedpostctl: 
Stopping Tagger Server process..
Process 69250 stopped
Stopping wsdserverctl: 
Stopping WSD Server process..
Process 69267 stopped


/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/skrmedpostctl: line 50: kill: (69250) - No such process
/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/wsdserverctl: line 55: kill: (69267) - No such process


Using UMLS MetaMap to get mappings for INTERVENTIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.
MetaMap version < 2020, conduct mapping on original interventions after removing ascii characters
Starting skrmedpostctl: 
started.
Starting wsdserverctl: 
started.
loading properties file /Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/WSD_Server/config/disambServer.cfg
WSD Server initializing disambiguation methods.
WSD Server databases and disambiguation methods have been initialized.
Could not listen on port : 5554 : Address already in use
Stopping skrmedpostctl: 
Stopping Tagger Server process..
Process 75006 stopped
Stopping wsdserverctl: 


/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/skrmedpostctl: line 50: kill: (75006) - No such process
/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/wsdserverctl: line 55: kill: (75008) - No such process


Stopping WSD Server process..
Process 75008 stopped
Starting skrmedpostctl: 
started.
Starting wsdserverctl: 
started.
loading properties file /Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/WSD_Server/config/disambServer.cfg
WSD Server initializing disambiguation methods.
WSD Server databases and disambiguation methods have been initialized.
Could not listen on port : 5554 : Address already in use
Stopping skrmedpostctl: 
Stopping Tagger Server process..
Process 79911 stopped
Stopping wsdserverctl: 
Stopping WSD Server process..
Process 79913 stopped


/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/skrmedpostctl: line 50: kill: (79911) - No such process
/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/wsdserverctl: line 55: kill: (79913) - No such process


In [None]:
def convert_seconds_to_hms(seconds):
    """Break a duration given in seconds into hours, minutes and leftover seconds.

    Hours and minutes are floor-division quotients, so their numeric type follows
    the input: an int in gives ints out, a float in gives floats out.

    Returns
    -------
    tuple
        ``(hours, minutes, seconds)``, where the last item is the remainder below 60.
    """
    hours, remainder = divmod(seconds, 3600)
    minutes, remainder = divmod(remainder, 60)
    return hours, minutes, remainder

# Print the wall-clock time at which this cell runs, formatted as day-month-year, H:M:S.
# The datetime is converted to a POSIX timestamp and back before formatting.
current = dt.datetime.now()
ts = dt.datetime.timestamp(current)
d = dt.datetime.fromtimestamp(ts)
str_date_time = d.strftime("%d-%m-%Y, %H:%M:%S")
print("Timestamp of script start: {}".format(str_date_time))

# NOTE(review): start_time and end_time are captured back-to-back with no work in
# between, so the reported runtime is always ~0 seconds. To time the pipeline,
# start_time has to be taken before it runs and end_time after it finishes.
start_time = time.time()
end_time = time.time()
elapsed_time = end_time - start_time
# elapsed_time is a float, so hours/minutes print as floats (e.g. 0.0) here.
hours, minutes, seconds = convert_seconds_to_hms(elapsed_time)
print(f"Runtime: {hours} hours, {minutes} minutes, {seconds} seconds")

In [None]:
def report_stats(df_dict, flag_and_path):
    """Print how many distinct condition and intervention terms the trial data contains.

    Parameters
    ----------
    df_dict : dict
        Must contain the frames ``"conditions"`` (column ``downcase_name``),
        ``"interventions"`` (column ``name``, lower-cased here before counting) and
        ``"interventions_alts"`` (column ``alt_downcase_name``).
    flag_and_path : dict
        Must contain ``"date_string"``, the date tag of the clinical-trial download.

    Returns
    -------
    None
        The three summary lines are printed to stdout.
    """
    relevant_date = flag_and_path["date_string"] # get date

    def _unique_terms(series):
        # dropna() is required: NaN is truthy, so a plain truthiness filter would keep it
        # and count "missing" as a term. The truthiness test still removes empty strings.
        return [term for term in series.dropna().unique() if term]

    total_conditions = _unique_terms(df_dict["conditions"]["downcase_name"])
    orig_interventions = _unique_terms(df_dict["interventions"]["name"].str.lower())
    alt_interventions = _unique_terms(df_dict["interventions_alts"]["alt_downcase_name"])

    print("Clinical Trial Data from: {}".format(relevant_date))
    print("Total # of unique conditions : {}".format(len(total_conditions)))
    # NOTE(review): original and alternate names are de-duplicated separately and then summed,
    # so a name present in both lists is counted twice - confirm this is intended.
    print("Total # of unique interventions : {}".format(len(orig_interventions) + len(alt_interventions)))
    

    
