In [5]:
# display cells to maximum width 
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))

# lets you preint multiple outputs per cell, not just last
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [6]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
import re
import collections
import os
import json
import numpy as np
import pickle
from functools import reduce
import time
from time import sleep
import concurrent
import multiprocessing
import datetime as dt
from datetime import date
import pathlib
import configparser
import sys
import urllib
import zipfile
import csv
sys.path.insert(0, '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/pymetamap-master')
from pymetamap import MetaMap  # https://github.com/AnthonyMRios/pymetamap/blob/master/pymetamap/SubprocessBackend.py
from pandas import ExcelWriter
import ast



In [7]:
# %pip install thefuzz
# %pip install levenshtein

from thefuzz import fuzz # fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

In [8]:
global metamap_dirs
global metamap_pos_server_dir
global metamap_wsd_server_dir


In [9]:
# fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

def get_token_sort_ratio(str1, str2):
    try:
        return fuzz.token_sort_ratio(str1, str2)
    except:
        return None
    
sort_ratio = np.vectorize(get_token_sort_ratio)

def get_token_set_ratio(str1, str2):
    try:
        return fuzz.token_set_ratio(str1, str2)
    except:
        return None  
set_ratio = np.vectorize(get_token_set_ratio)

def get_similarity_score(str1, str2):
    try:
        return fuzz.ratio(str1, str2)
    except:
        return None
sim_score = np.vectorize(get_similarity_score)

In [10]:
def get_raw_ct_data():
    term_program_flag = True
    global data_dir
    global data_extracted
    
    # get all the links and associated dates of upload into a dict called date_link
    url_all = "https://aact.ctti-clinicaltrials.org/pipe_files"
    response = requests.get(url_all)
    soup = BeautifulSoup(response.text)
    body = soup.find_all('option') #Find all
    date_link = {}
    for el in body:
        tags = el.find('a')
        try:
            zip_name = tags.contents[0].split()[0]
            date = zip_name.split("_")[0]
            date = dt.datetime.strptime(date, '%Y%m%d').date()
            date_link[date] = tags.get('href')
        except:
            pass
    latest_file_date = max(date_link.keys())   # get the date of the latest upload
    url = date_link[latest_file_date]   # get the corresponding download link of the latest upload so we can download the raw data
    date_string = latest_file_date.strftime("%m_%d_%Y")
    data_dir = "{}/data".format(pathlib.Path.cwd())
    data_extracted = data_dir + "/{}_extracted".format(date_string)
    data_path = "{}/{}_pipe-delimited-export.zip".format(data_dir, date_string)
    
    if not os.path.exists(data_path):   # if folder containing most recent data doesn't exist, download and extract it into data folder
        
        term_program_flag = False   # flag below for terminating program if latest download exists (KG is assumed up to date)
        print("Downloading Clinical Trial data as of {}".format(date_string))
        response = requests.get(url)
        if response.status_code == 200:
            with open(data_path, 'wb') as file:
                file.write(response.content)
            print("Finished download of zip")
            with zipfile.ZipFile(data_path, 'r') as download:
                print("Unzipping data")
                download.extractall(data_extracted)
        else:
            print("KG is already up to date.")
    return {"term_program_flag": term_program_flag, "data_extracted_path": data_extracted, "date_string": date_string}



In [11]:
def read_raw_ct_data(flag_and_path):
    if flag_and_path["term_program_flag"]:
        print("Exiting program. Assuming KG has already been constructed from most recent data dump from AACT.")
#         exit()
#         pass
    else:
        data_extracted = flag_and_path["data_extracted_path"]
        # read in pipe-delimited files 
        conditions_df = pd.read_csv(data_extracted + '/conditions.txt', sep='|', index_col=False, header=0)
        interventions_df = pd.read_csv(data_extracted + '/interventions.txt', sep='|', index_col=False, header=0)
        interventions_alts = pd.read_csv(data_extracted + '/intervention_other_names.txt', sep='|', index_col=False, header=0)

#         browse_conditions_df = pd.read_csv(data_extracted + '/browse_conditions.txt', sep='|', index_col=False, header=0)
#         browse_interventions_df = pd.read_csv(data_extracted + '/browse_interventions.txt', sep='|', index_col=False, header=0)
        
    ### GET RID OF....CHEAT LINE FOR TESTING
#         conditions_df = conditions_df.iloc[:300]
#         interventions_df = interventions_df.iloc[:300]
        conditions_df = conditions_df.sample(n=300)
        interventions_df = interventions_df.sample(n=300)
        alternate_interventions_df = interventions_alts.sample(n=600)

    return {"conditions": conditions_df, "interventions": interventions_df, "interventions_alts": alternate_interventions_df
#             "browse_conditions": browse_conditions_df, "browse_interventions": browse_interventions_df
           }



In [12]:
def de_ascii_er(text):
    non_ascii = "[^\x00-\x7F]"
    pattern = re.compile(r"[^\x00-\x7F]")
    non_ascii_text = re.sub(pattern, ' ', text)
    return non_ascii_text

In [13]:
def start_metamap_servers(metamap_dirs):
    metamap_pos_server_dir = 'bin/skrmedpostctl' # Part of speech tagger
    metamap_wsd_server_dir = 'bin/wsdserverctl' # Word sense disambiguation 
    
    # Start servers
    os.system(metamap_dirs['metamap_base_dir'] + metamap_pos_server_dir + ' start') # Part of speech tagger
    os.system(metamap_dirs['metamap_base_dir'] + metamap_wsd_server_dir + ' start') # Word sense disambiguation 
    # # Sleep a bit to give time for these servers to start up
    sleep(5)

def stop_metamap_servers(metamap_dirs):
    metamap_pos_server_dir = 'bin/skrmedpostctl' # Part of speech tagger
    metamap_wsd_server_dir = 'bin/wsdserverctl' # Word sense disambiguation 
    # Stop servers
    os.system(metamap_dirs['metamap_base_dir'] + metamap_pos_server_dir + ' stop') # Part of speech tagger
    os.system(metamap_dirs['metamap_base_dir'] + metamap_wsd_server_dir + ' stop') # Word sense disambiguation 
        

In [14]:
def check_os():
    if "linux" in sys.platform:
        print("Linux platform detected")
        metamap_base_dir = "{}/metamap/".format(pathlib.Path.cwd().parents[0])
        metamap_bin_dir = 'bin/metamap20'
    else:
        metamap_base_dir = '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/' # for running on local
        metamap_bin_dir = 'bin/metamap18'
        
    return {"metamap_base_dir":metamap_base_dir, "metamap_bin_dir":metamap_bin_dir}
        

In [15]:
def run_metamap(input_term, params, mm, cond_or_inter, csv_writer):
    from_metamap = []
    if params.get("exclude_sts") is None: # exclude_sts is used for Interventions. restrict_to_sts is used for Conditions. So, the logic is, if we're mapping Conditions, execute "if" part of code. If we're mapping Interventions, execute "else" part of code
        try:
            concepts,error = mm.extract_concepts([input_term],
                                                 restrict_to_sts = params["restrict_to_sts"],
                                                 term_processing = params["term_processing"],
                                                 ignore_word_order = params["ignore_word_order"],
                                                 strict_model = params["strict_model"],
                                                )

            for concept in concepts:
                concept_info = []
                concept = concept._asdict()
                concept_info.extend([cond_or_inter,input_term])
                concept_info.extend([concept.get(k) for k in ['preferred_name', 'cui', 'score', 'semtypes']])
                from_metamap.append(concept_info)
        except:
            from_metamap.extend([input_term, None, None, None, None, None, None])
    else:
        try:
            concepts,error = mm.extract_concepts([input_term],
                                                 exclude_sts = params["exclude_sts"],
                                                 term_processing = params["term_processing"],
                                                 ignore_word_order = params["ignore_word_order"],
                                                 strict_model = params["strict_model"],
                                                )

            for concept in concepts:
                concept_info = []
                concept = concept._asdict()
                concept_info.extend([cond_or_inter,input_term])
                concept_info.extend([concept.get(k) for k in ['preferred_name', 'cui', 'score', 'semtypes']])
                from_metamap.append(concept_info)
        except:
            from_metamap.extend([input_term, None, None, None, None, None, None])
        
    for result in from_metamap:
#         print(result)
        csv_writer.writerow(result)
    return from_metamap

In [16]:
def parallelize_metamap(term_list, params, cond_or_inter, flag_and_path, csv_writer):
    start_metamap_servers(metamap_dirs) # start the MetaMap servers
    mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
    with concurrent.futures.ThreadPoolExecutor((multiprocessing.cpu_count()*2) - 1) as executor:
        _ = [executor.submit(run_metamap, term, params, mm, cond_or_inter, csv_writer) for term in term_list]
    stop_metamap_servers(metamap_dirs) # stop the MetaMap servers
    

# USE METAMAP LOCAL TO MAP REMAINING TERMS

In [17]:
def term_list_to_mm(df_dict, flag_and_path):
        
    metamap_version = [int(s) for s in re.findall(r'\d+', metamap_dirs.get('metamap_bin_dir'))] # get MetaMap version being run 
    # some input terms have () with additional text, like an abbreviation, in them. split them out to facilitate better mapping using these regex patterns that we use to find substrings inside and outside ()
    pattern_outside = r'(?<=\().+?(?=\))|([^(]+)'
    pattern_inside = r'\(([^)]+)\)'
    relevant_date = flag_and_path["date_string"]   # get date of bulk download of clinical trial data
    deasciier = np.vectorize(de_ascii_er) # vectorize function

    # -------    CONDITIONS    ------- #
    conditions = df_dict["conditions"][['id', 'nct_id', 'downcase_name']]
    conditions.rename(columns = {'downcase_name':'orig_con'}, inplace = True)

    if metamap_version[0] >= 20:
        matches_outside = conditions['orig_con'].str.extract(pattern_outside)
        conditions['orig_con_outside'] = matches_outside[0].fillna('')
        matches_inside = conditions['orig_con'].str.extract(pattern_inside)
        conditions['orig_con_inside'] = matches_inside[0].fillna('')

    else:
        conditions['deascii_con'] = deasciier(conditions['orig_con'])
        matches_outside = conditions['deascii_con'].str.extract(pattern_outside)
        conditions['deascii_con_outside'] = matches_outside[0].fillna('')
        matches_inside = conditions['deascii_con'].str.extract(pattern_inside)
        conditions['deascii_con_inside'] = matches_inside[0].fillna('')
    
#     see MetaMap Usage instructions: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MM_2016_Usage.pdf
#     condition_args = ['--sldi -I -C -J acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf -z -i -f']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
    condition_semantic_type_restriction = ['acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,clna,fndg']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
    params = {"restrict_to_sts":condition_semantic_type_restriction, "term_processing":True, "ignore_word_order":True, "strict_model":False} # strict_model and relaxed_model are presumably opposites? relaxed_model = True is what I want, but that option appears to be broken in Pymetamap (returns no results when used). Using strict_model = False instead...
    
    # prep output file of Metamap results
    filename = f"{relevant_date}_metamap_output.tsv"
    metamap_output = open(filename, 'w+', newline='')
    col_names = ['term_type', 'clin_trial_term','metamap_preferred_name', 'metamap_cui', 'metamap_score', 'metamap_semantic_type']
    csv_writer = csv.writer(metamap_output, delimiter='\t')
    csv_writer.writerow(col_names)
    
    if metamap_version[0] >= 20:
        print("MetaMap version >= 2020, conduct mapping on original terms")
        orig_cons = conditions.orig_con.unique().tolist()
        orig_cons = list(filter(None, orig_cons))
        orig_cons = [str(i) for i in orig_cons]
        parallelize_metamap(orig_con, params, "condition", flag_and_path, csv_writer)
    else:
        print("MetaMap version < 2020, conduct mapping on terms after removing ascii characters")
        deascii_cons = conditions.deascii_con.unique().tolist()
        deascii_cons = list(filter(None, deascii_cons))
        deascii_cons = [str(i) for i in deascii_cons]
        parallelize_metamap(deascii_cons, params, "condition", flag_and_path, csv_writer)
        
        """ If the substring that was either outside or inside the () is identical to the term from which it came from, or actually any of the columns have the same value, put None in that cell/put None where that term is duplicated """    
    # Iterate through each column in the DataFrame
    for col1 in conditions.columns:
        for col2 in conditions.columns:
            # Skip comparing a column with itself
            if col1 != col2:
                # Check if the values in col2 are duplicates of col1
                conditions[col2] = conditions.apply(lambda row: row[col2] if row[col2] != row[col1] else None, axis=1)
    # Drop duplicate columns (keeping the first instance)
    conditions = conditions.T.drop_duplicates().T

    conditions.to_csv('{}_conditions.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output interventions to TSV
    
    # -------    INTERVENTIONS    ------- #
    print("Using UMLS MetaMap to get mappings for INTERVENTIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.")
    
    """ Interventions requires unique handling. Another table gives possible alternate names for the interventions in addition to the "original" names. 
        We may map on the alternate names column
        We take the interventions, take the ascii and deasciied versions of them,
        and split substrings in parentheses out of them. We perform MetaMapping on the
        original term or the deasciied term dependinging on what operating system we
        are on. If the mapped term passes the fuzzy scoring thesholds for any of the
        terms (original, deasciied, original inside the parentheses, deasciied inside
        the parentheses, original outside the parentheses, deasciied outside the
        parentheses""" 

    interventions_df = df_dict["interventions"]
    interventions_df['orig_downcase_name'] = interventions_df['name'].str.lower()
    interventions_alts = df_dict["interventions_alts"]
    interventions_alts['alt_downcase_name'] = interventions_alts['name'].str.lower()

    orig_ints = interventions_df["orig_downcase_name"]
    orig_ints = list(orig_ints.unique())
    orig_ints = list(filter(None, orig_ints))
    alt_ints = interventions_alts["alt_downcase_name"]
    alt_ints = list(alt_ints.unique())
    alt_ints = list(filter(None, alt_ints))

    params = {"exclude_sts":condition_semantic_type_restriction, "term_processing":True, "ignore_word_order":True, "strict_model":False} # strict_model and relaxed_model are presumably opposites? relaxed_model = True is what I want, but that option appears to be broken in Pymetamap (returns no results when used). Using strict_model = False instead...
    """ Send the prepared interventions to MetaMap now. If we are on OSX, we have to use MetaMap 2018, which requires deasciied terms. If we are on Linux, we can use MetaMap 2020, which does not require such preprocessing """
    if metamap_version[0] < 20:
        deasciier = np.vectorize(de_ascii_er) # vectorize function
        #  -------   original interventions  -------- #
        orig_ints = [str(i) for i in orig_ints]
        orig_ints = deasciier(orig_ints) # perform deascii-ing on original intervention names
        orig_ints = list(orig_ints)
        print("MetaMap version < 2020, conduct mapping on original interventions after removing ascii characters")
        parallelize_metamap(orig_ints, params, "intervention", flag_and_path, csv_writer)
        #  ---------   alternate interventions ------- #
        alt_ints = [str(i) for i in alt_ints]
        alt_ints = deasciier(alt_ints) # perform deascii-ing on alternate intervention names
        alt_ints = list(alt_ints)
        parallelize_metamap(alt_ints, params, "alternate_intervention", flag_and_path, csv_writer)

    else:
        #  -------   original interventions  -------- #
        print("MetaMap version >= 2020, conduct mapping on original interventions")
        parallelize_metamap(orig_ints, params, "intervention", flag_and_path, csv_writer)
        #  ---------   alternate interventions ------- #
        print("MetaMap version >= 2020, conduct mapping on alternate interventions")
        parallelize_metamap(alt_ints, params, "alternate_intervention", flag_and_path, csv_writer)

    interventions_all = pd.merge(interventions_df[["id", "nct_id", "intervention_type", "orig_downcase_name", "description"]], interventions_alts[["nct_id", "intervention_id", "alt_downcase_name"]], how='left', left_on=['id'], right_on = ['intervention_id'])
    interventions_all = interventions_all.astype(str)
    interventions_all = interventions_all.drop('nct_id_y', axis=1) # drop the redundant column now
    interventions_all.rename(columns = {'nct_id_x':'nct_id'}, inplace = True)

    interventions_all = interventions_all.sort_values(by='nct_id', ascending=False, na_position='last')
    interventions_all = interventions_all.drop('intervention_id', axis=1) # drop the redundant column now
    interventions_all.rename(columns = {'id':'intervention_id', 'orig_downcase_name':'orig_int', 'alt_downcase_name':'alt_int'}, inplace = True)

    if metamap_version[0] >= 20:
        matches_outside = interventions_all['orig_int'].str.extract(pattern_outside)
        interventions_all['orig_int_outside'] = matches_outside[0].fillna('')
        matches_inside = interventions_all['orig_int'].str.extract(pattern_inside)
        interventions_all['orig_int_inside'] = matches_inside[0].fillna('')

        matches_outside = interventions_all['alt_int'].str.extract(pattern_outside)
        interventions_all['alt_int_outside'] = matches_outside[0].fillna('')
        matches_inside = interventions_all['alt_in'].str.extract(pattern_inside)
        interventions_all['alt_int_inside'] = matches_inside[0].fillna('')
    else:
        interventions_all['deascii_orig_int'] = deasciier(interventions_all['orig_int'])
        interventions_all['deascii_alt_int'] = deasciier(interventions_all['alt_int'])

        matches_outside = interventions_all['deascii_orig_int'].str.extract(pattern_outside)
        interventions_all['deascii_orig_int_outside'] = matches_outside[0].fillna('')
        matches_inside = interventions_all['deascii_orig_int'].str.extract(pattern_inside)
        interventions_all['deascii_orig_int_inside'] = matches_inside[0].fillna('')

        matches_outside = interventions_all['deascii_alt_int'].str.extract(pattern_outside)
        interventions_all['deascii_alt_int_outside'] = matches_outside[0].fillna('')
        matches_inside = interventions_all['deascii_alt_int'].str.extract(pattern_inside)
        interventions_all['deascii_alt_name_inside'] = matches_inside[0].fillna('')

    """ I don't want to perform mapping on strings < 4 char in length; these are ambiguous and it's hard to make a call what that concept should be """
    """ Get character counts of all the columns to evaluate """    
    for col in interventions_all.columns: # get the char counts of each column
        char_count_col_name = col + '_char_count'
        interventions_all[char_count_col_name] = interventions_all[col].str.len()

    """ If char_count < 4, replace the string in the corresponding column with None so that we don't use it for comparison """    
    for col in interventions_all.columns[interventions_all.columns.str.contains("char_count")]:
        for index, value in interventions_all[col].items():
            if value < 4:
                # Find the column with the most similar name without "char_count" substring
                most_similar_col = interventions_all.columns[interventions_all.columns.str.replace("_char_count", "") == col.replace("_char_count", "")].values[0]
                # Update the value in the most similar column
                interventions_all.at[index, most_similar_col] = None
        interventions_all = interventions_all.drop(col, axis=1) # drop the count columns now  
        
    """ If the substring that was either outside or inside the () is identical to the term from which it came from, or actually any of the columns have the same value, put None in that cell/put None where that term is duplicated """    
    # Iterate through each column in the DataFrame
    for col1 in interventions_all.columns:
        for col2 in interventions_all.columns:
            # Skip comparing a column with itself
            if col1 != col2:
                # Check if the values in col2 are duplicates of col1
                interventions_all[col2] = interventions_all.apply(lambda row: row[col2] if row[col2] != row[col1] else None, axis=1)
    # Drop duplicate columns (keeping the first instance)
    interventions_all = interventions_all.T.drop_duplicates().T


    interventions_all.to_csv('{}_interventions.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output interventions to TSV



In [18]:
def map_to_trial(df_dict, flag_and_path):
    # send mappings to interventions and conditions, group CUIs that correspond to input condition or intervention
    relevant_date = flag_and_path["date_string"]   # get date of bulk download of clinical trial data
    metamap_version = [int(s) for s in re.findall(r'\d+', metamap_dirs.get('metamap_bin_dir'))] # get MetaMap version being run 

    metamap_input = "{}_metamap_output.tsv".format(relevant_date)
    metamapped = pd.read_csv(metamap_input, sep='\t', index_col=False, header=0)

    # get the full names of the semantic types so we know what we're looking at
    metamap_semantic_types = pd.read_csv("MetaMap_SemanticTypes_2018AB.txt")
    metamapped['metamap_semantic_type'] = metamapped['metamap_semantic_type'].str.replace(r'\[|\]', '', regex=True)
    sem_type_col_names = ["abbv", "group", "semantic_type_full"]
    metamap_semantic_types = pd.read_csv("MetaMap_SemanticTypes_2018AB.txt", sep="|", index_col=False, header=None, names=sem_type_col_names)
    sem_type_dict = dict(zip(metamap_semantic_types['abbv'], metamap_semantic_types['semantic_type_full'])) # make a dict of semantic type abbv and full name
    # Handle NaN (None) values in metamap_semantic_type column
    metamapped['metamap_semantic_type'] = metamapped['metamap_semantic_type'].apply(lambda x: x.split(',') if isinstance(x, str) else np.nan)
    # map semantic type abbreviations to the full name of the semantic type
    metamapped['metamap_semantic_type'] = metamapped['metamap_semantic_type'].apply(lambda x: '|'.join([sem_type_dict[term] if term in sem_type_dict else term for term in x]) if isinstance(x, list) else x)

    metamapped['metamap_preferred_name'] = metamapped['metamap_preferred_name'].str.lower()
    metamapped = metamapped.dropna(axis=0)
    metamapped = metamapped[["term_type", "clin_trial_term", "metamap_cui","metamap_preferred_name", "metamap_semantic_type"]]

    metamapped["metamap_term_info"] = metamapped[["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]].values.tolist() 
    metamapped.drop(["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"], axis = 1, inplace = True)
    metamapped = metamapped.groupby(['term_type', 'clin_trial_term'])['metamap_term_info'].agg(list).reset_index()

    conditions = '{}_conditions.tsv'.format(relevant_date)
    conditions = pd.read_csv(conditions, sep='\t', index_col=False, header=0)
    interventions = '{}_interventions.tsv'.format(relevant_date)
    interventions = pd.read_csv(interventions, sep='\t', index_col=False, header=0)

    metamapped_con = metamapped.loc[metamapped['term_type'] == "condition"]
    metamapped_int = metamapped.loc[(metamapped['term_type'] == "intervention") | (metamapped['term_type'] == "alternate_intervention")]

    mapper_con = dict(zip(metamapped_con['clin_trial_term'], metamapped_con['metamap_term_info'])) # make a dict to map conditions
    mapper_int = dict(zip(metamapped_int['clin_trial_term'], metamapped_int['metamap_term_info'])) # make a dict to map interventions

#     cols_to_check = [ele for ele in conditions.columns if(ele not in ['id', 'nct_id', 'condition_id'])]
    cols_to_check = [ele for ele in conditions.columns if any([substr in ele for substr in ['_con']])]

    conditions["curie_info"] = None

    for index, row in conditions.iterrows():
        for col_name in cols_to_check:
            value = row[col_name]
            if value in mapper_con:
                curie_info = mapper_con[value]
                conditions.at[index, "curie_info"] = curie_info    
                
    conditions.to_csv('{}_conditions.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output conditions to TSV

#     cols_to_check = [ele for ele in interventions.columns if(ele not in ['id', 'nct_id', 'intervention_id', 'intervention_type', 'description'])]
    cols_to_check = [ele for ele in interventions.columns if any([substr in ele for substr in ['_int']])]

    interventions["curie_info"] = None

    for index, row in interventions.iterrows():
        for col_name in cols_to_check:
            value = row[col_name]
            if value in mapper_int:
                curie_info = mapper_int[value]
                interventions.at[index, "curie_info"] = curie_info
    
    interventions.to_csv('{}_interventions.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output interventions to TSV




In [133]:
def score_mappings(flag_and_path):
    
    relevant_date = flag_and_path["date_string"]   # get date of bulk download of clinical trial data
    
    #   -- --- --   CONDITIONS   -- --- -- #
    conditions = "{}_conditions.tsv".format(relevant_date)
    conditions = pd.read_csv(conditions, sep='\t', index_col=False, header=0)
    cols_to_check = [ele for ele in conditions.columns if any([substr in ele for substr in ['_con']])]
    conditions = conditions.where(pd.notnull(conditions), None)

    for index, row in conditions.iterrows():
        curies_sublists_scored = []
        for col_name in cols_to_check:
            value = row[col_name]
            curie_info = row["curie_info"]
            if None not in [value, curie_info]:
#                 print(value)
#                 print(curie_info)
#                 print()
                curie_sublists = ast.literal_eval(curie_info)
                for sublist in curie_sublists:
                    sublist.append(f'sort_ratio: {get_token_sort_ratio(value, sublist[1])}')
                    sublist.append(f'similarity_score: {get_similarity_score(value, sublist[1])}')
                    curies_sublists_scored.append(sublist)
        conditions.at[index, "curie_info"] = curies_sublists_scored
    conditions.to_csv('{}_conditions.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output to TSV

    #   -- --- --   INTERVENTIONS   -- --- -- #
    
    interventions = "{}_interventions.tsv".format(relevant_date)
    interventions = pd.read_csv(interventions, sep='\t', index_col=False, header=0)
    cols_to_check = [ele for ele in interventions.columns if any([substr in ele for substr in ['_int']])]
    interventions = interventions.where(pd.notnull(interventions), None)

    for index, row in interventions.iterrows():
        curies_sublists_scored = []
        for col_name in cols_to_check:
            value = row[col_name]
            curie_info = row["curie_info"]
            if None not in [value, curie_info]:
#                 print(value)
#                 print(curie_info)
#                 print()
                curie_sublists = ast.literal_eval(curie_info)
                for sublist in curie_sublists:
                    sublist.append(f'sort_ratio: {get_token_sort_ratio(value, sublist[1])}')
                    sublist.append(f'similarity_score: {get_similarity_score(value, sublist[1])}')
                    curies_sublists_scored.append(sublist)

        interventions.at[index, "curie_info"] = curies_sublists_scored
    interventions.to_csv('{}_interventions.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output interventions to TSV


In [176]:
relevant_date = flag_and_path["date_string"]   # get date of bulk download of clinical trial data
conditions = "{}_conditions.tsv".format(relevant_date)
conditions = pd.read_csv(conditions, sep='\t', index_col=False, header=0)
conditions

interventions = "{}_interventions.tsv".format(relevant_date)
interventions = pd.read_csv(interventions, sep='\t', index_col=False, header=0)
interventions

Unnamed: 0,id,nct_id,orig_con,deascii_con,deascii_con_outside,deascii_con_inside,curie_info
0,59157052,NCT05113303,"bone; deformity, congenital",,,,"[['C0000768', 'congenital abnormality', 'Conge..."
1,59881198,NCT01504126,stage iib fallopian tube cancer ajcc v6 and v7,,,,"[['C1336183', 'stage iib fallopian tube cancer..."
2,59401394,NCT00367484,relapsing remitting multiple sclerosis,,,,"[['C0751967', 'multiple sclerosis, relapsing-r..."
3,59685708,NCT02266706,proven or suspected gram-negative bacterial in...,,,,"[['C0085423', 'gram-negative bacterial infecti..."
4,59241341,NCT01511588,hypogonadism,,,,"[['C0020619', 'hypogonadism', 'Disease or Synd..."
...,...,...,...,...,...,...,...
295,59430595,NCT01137942,persistence of infection with helicobacter pylori,,,,"[['C0850666', 'infection caused by helicobacte..."
296,59699048,NCT00074308,stage iii melanoma,,,,"[['C0278882', 'stage iii cutaneous melanoma aj..."
297,59706585,NCT05744050,obesity,,,,"[['C0028754', 'obesity', 'Disease or Syndrome'..."
298,59291139,NCT02576665,sarcoma,,,,"[['C1261473', 'sarcoma', 'Neoplastic Process',..."


Unnamed: 0,intervention_id,nct_id,intervention_type,orig_int,description,alt_int,deascii_orig_int,deascii_orig_int_outside,deascii_orig_int_inside,curie_info
0,59397996,NCT06012656,Device,minimax,Total or Partial Hip Arthroplasty,,,,,[]
1,59568706,NCT06008353,Other,observational study,This is an observational study; patients will ...,,,,,"[['C1518527', 'observational study', 'Research..."
2,59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with ...,Mode of usage: Two times a day Route of admini...,,,,,"[['C0005575', 'biotin', 'Organic Chemical|Phar..."
3,59940409,NCT05966350,Other,comparison of data with parametric test (t-test),Clinical data from positive Argonaute (AGO) pa...,,,comparison of data with parametric test,t-test,"[['C1707455', 'comparison', 'Activity', 'sort_..."
4,59968815,NCT05945160,Drug,alpha lipoic acid 300mg,A naturally occuring mitochondrial antioxidant,,,,,"[['C0023791', 'thioctic acid', 'Organic Chemic..."
...,...,...,...,...,...,...,...,...,...,...
295,59310654,NCT00006367,Biological,filgrastim,,,,,,"[['C0210630', 'filgrastim', 'Amino Acid, Pepti..."
296,60008271,NCT00004904,Drug,etoposide,,,,,,"[['C0015133', 'etoposide', 'Organic Chemical|P..."
297,60012657,NCT00004199,Drug,gemcitabine hydrochloride,,,,,,"[['C0771488', 'gemcitabine hydrochloride', 'Nu..."
298,60034595,NCT00002790,Drug,cyclosporine,,,,,,"[['C0010592', 'cyclosporine', 'Amino Acid, Pep..."


In [291]:
def auto_select_curies(flag_and_path):

    relevant_date = flag_and_path["date_string"]   # get date of bulk download of clinical trial data
    
    def filter_and_select_sublist(sublists):
        if sublists is None or len(sublists) == 0:
            return None

        high_score = -1
        selected_sublist = None

        sublists = ast.literal_eval(sublists)
        for sublist in sublists:

            if len(sublist) >= 4:
                sort_ratio = int(sublist[3].split(": ")[1])
                sim_score = int(sublist[4].split(": ")[1])
                max_score = max(sort_ratio, sim_score)
                if max_score > 88: 
                    if max_score > high_score:
                        high_score = max_score
                        selected_sublist = sublist
        return selected_sublist

    #   -----   -----    -----   -----   CONDITIONS   -----   -----    -----   -----  #

    conditions = "{}_conditions.tsv".format(relevant_date)
    conditions = pd.read_csv(conditions, sep='\t', index_col=False, header=0)
    """  Create an output TSV of CURIEs that are auto-selected based on passing the threshold of scoring > 88  """
    conditions['auto_selected_curie'] = conditions['curie_info'].apply(filter_and_select_sublist)  # select CURIE that scores highest using filter_and_select_sublist function = auto-select
    auto_selected_conditions = conditions[conditions[['auto_selected_curie']].notnull().all(1)]   # get the rows where a CURIE has been auto-selected
    auto_selected_conditions = auto_selected_conditions[["id", "nct_id", "orig_con", "curie_info", "auto_selected_curie"]]  # subset dataframe
    auto_selected_conditions.to_csv('{}_conditions_auto_selected.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output to TSV

    conditions_manual_review = conditions[conditions["auto_selected_curie"].isna()]   # select rows where no CURIE was auto-selected
    conditions_manual_review = conditions_manual_review[["id", "nct_id", "orig_con", "curie_info"]]  # subset

    """  Create an output TSV of possible CURIEs available for each term that was not auto-selected  """
    conditions_manual_review['curie_info'] = conditions_manual_review['curie_info'].apply(ast.literal_eval)   # in order to multi-index, we have to group-by the original input term. To do this, first convert the column to list of lists
    conditions_manual_review = conditions_manual_review.explode('curie_info')  # explode that column so every sublist is on a separate row
    conditions_manual_review['curie_info'] = conditions_manual_review['curie_info'].apply(lambda x: x[:3] if isinstance(x, list) else None)   # remove the scores (sort_ratio and similarity score) from the list, don't need them and they compromise readability of manual outputs 
    conditions_manual_review['curie_info'] = conditions_manual_review['curie_info'].apply(lambda x: ', '.join(x) if isinstance(x, list) else None)  # Multindexing does not work on lists, so remove the CURIE information out of the list to enable this

    conditions_manual_review['temp'] = "temp"   # create a temp column to facilitate multi-indexing
    conditions_manual_review.set_index(["id", "nct_id", "orig_con", 'curie_info'], inplace=True)   # create index
    conditions_manual_review.drop(["temp"], axis = 1, inplace = True)   # drop the temp column
    conditions_manual_review['manually_selected_CURIE'] = None # make a column 

    conditions_manual_review.to_excel('{}_conditions_manual_review.xlsx'.format(relevant_date), engine='xlsxwriter', index=True)

    #   -----   -----    -----   -----   INTERVENTIONS   -----   -----    -----   -----  #
    interventions = "{}_interventions.tsv".format(relevant_date)
    interventions = pd.read_csv(interventions, sep='\t', index_col=False, header=0)
    """  Create an output TSV of CURIEs that are auto-selected based on passing the threshold of scoring > 88  """
    interventions['auto_selected_curie'] = interventions['curie_info'].apply(filter_and_select_sublist)
    auto_selected_interventions = interventions[interventions[['auto_selected_curie']].notnull().all(1)]
    auto_selected_interventions = auto_selected_interventions[["intervention_id", "nct_id", "intervention_type", "orig_int", "description", "curie_info", "auto_selected_curie"]]
    auto_selected_interventions.to_csv('{}_interventions_auto_selected.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output interventions to TSV, avoid storing in memory

    interventions_manual_review = interventions[interventions["auto_selected_curie"].isna()]
    interventions_manual_review = interventions_manual_review[["intervention_id", "nct_id", "intervention_type", "orig_int", "description", "curie_info"]]

    """  Create an output TSV of possible CURIEs available for each term that was not auto-selected  """
    interventions_manual_review['curie_info'] = interventions_manual_review['curie_info'].apply(ast.literal_eval)
    interventions_manual_review = interventions_manual_review.explode('curie_info')
    interventions_manual_review['curie_info'] = interventions_manual_review['curie_info'].apply(lambda x: x[:3] if isinstance(x, list) else None)   # remove the scores (sort_ratio and similarity score) from the list, don't need them and they compromise readability of manual outputs 
    interventions_manual_review['curie_info'] = interventions_manual_review['curie_info'].apply(lambda x: ', '.join(x) if isinstance(x, list) else None)

    interventions_manual_review['temp'] = "temp"
    interventions_manual_review.set_index(["intervention_id", "nct_id", "intervention_type", "orig_int", "description", 'curie_info'], inplace=True)
    interventions_manual_review.drop(["temp"], axis = 1, inplace = True)
    interventions_manual_review['manually_selected_CURIE'] = None

    interventions_manual_review.to_excel('{}_interventions_manual_review.xlsx'.format(relevant_date), engine='xlsxwriter', index=True)

In [185]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(interventions[1:100])

Unnamed: 0,intervention_id,nct_id,intervention_type,orig_int,description,alt_int,deascii_orig_int,deascii_orig_int_outside,deascii_orig_int_inside,curie_info,auto_selected_curie
1,59568706,NCT06008353,Other,observational study,This is an observational study; patients will not be exposed to clinical interventions different from those belonging to the standard of care.,,,,,"[['C1518527', 'observational study', 'Research Activity', 'sort_ratio: 100', 'similarity_score: 100']]","[C1518527, observational study, Research Activity, sort_ratio: 100, similarity_score: 100]"
2,59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,,,,,"[['C0005575', 'biotin', 'Organic Chemical|Pharmacologic Substance|Vitamin', 'sort_ratio: 21', 'similarity_score: 21'], ['C0037098', 'silicon dioxide', 'Biomedical or Dental Material|Inorganic Chemical', 'sort_ratio: 33', 'similarity_score: 27'], ['C2828366', 'extract (substance)', 'Substance', 'sort_ratio: 41', 'similarity_score: 37'], ['C1456557', 'botanical', 'Organic Chemical|Pharmacologic Substance', 'sort_ratio: 30', 'similarity_score: 30'], ['C2700256', 'vitamin b7 measurement', 'Laboratory Procedure', 'sort_ratio: 27', 'similarity_score: 30']]",
3,59940409,NCT05966350,Other,comparison of data with parametric test (t-test),"Clinical data from positive Argonaute (AGO) patients (age, sex, type of neuropathy, clinical manifestations of neuropathy, electroneuromyography, evolution of neuropathy, presence of associated autoimmune disease, response to possible immunomodulatory treatment) will be compared with those of negative patients using a parametric test (T-test).",,,comparison of data with parametric test,t-test,"[['C1707455', 'comparison', 'Activity', 'sort_ratio: 36', 'similarity_score: 34'], ['C1711157', 'parametric test', 'Intellectual Product', 'sort_ratio: 49', 'similarity_score: 48'], ['C0871472', 't test', 'Intellectual Product', 'sort_ratio: 23', 'similarity_score: 22'], ['C1511726', 'data', 'Idea or Concept', 'sort_ratio: 16', 'similarity_score: 15'], ['C3714741', 'data (eukaryote)', 'Eukaryote', 'sort_ratio: 30', 'similarity_score: 31'], ['C3245479', 'data call receiving device', 'Medical Device', 'sort_ratio: 42', 'similarity_score: 32'], ['C1707455', 'comparison', 'Activity', 'sort_ratio: 41', 'similarity_score: 40'], ['C1711157', 'parametric test', 'Intellectual Product', 'sort_ratio: 56', 'similarity_score: 55'], ['C0871472', 't test', 'Intellectual Product', 'sort_ratio: 27', 'similarity_score: 26'], ['C1511726', 'data', 'Idea or Concept', 'sort_ratio: 19', 'similarity_score: 18'], ['C3714741', 'data (eukaryote)', 'Eukaryote', 'sort_ratio: 34', 'similarity_score: 32'], ['C3245479', 'data call receiving device', 'Medical Device', 'sort_ratio: 43', 'similarity_score: 33'], ['C1707455', 'comparison', 'Activity', 'sort_ratio: 12', 'similarity_score: 12'], ['C1711157', 'parametric test', 'Intellectual Product', 'sort_ratio: 57', 'similarity_score: 48'], ['C0871472', 't test', 'Intellectual Product', 'sort_ratio: 100', 'similarity_score: 83'], ['C1511726', 'data', 'Idea or Concept', 'sort_ratio: 20', 'similarity_score: 20'], ['C3714741', 'data (eukaryote)', 'Eukaryote', 'sort_ratio: 40', 'similarity_score: 27'], ['C3245479', 'data call receiving device', 'Medical Device', 'sort_ratio: 19', 'similarity_score: 12']]","[C0871472, t test, Intellectual Product, sort_ratio: 100, similarity_score: 83]"
4,59968815,NCT05945160,Drug,alpha lipoic acid 300mg,A naturally occuring mitochondrial antioxidant,,,,,"[['C0023791', 'thioctic acid', 'Organic Chemical|Pharmacologic Substance|Vitamin', 'sort_ratio: 56', 'similarity_score: 56']]",
5,59428875,NCT05939661,Drug,chemotherapy,"CAPOX (Oxaliplatin 130mg/m2, Capecitabine2000mg/m2/day, d1-14, 3week)x6cycles",,,,,"[['C0013217', 'pharmacotherapeutic', 'Functional Concept', 'sort_ratio: 58', 'similarity_score: 58'], ['C3665472', 'chemotherapy', 'Therapeutic or Preventive Procedure', 'sort_ratio: 100', 'similarity_score: 100'], ['C0013216', 'pharmacotherapy', 'Therapeutic or Preventive Procedure', 'sort_ratio: 74', 'similarity_score: 74'], ['C0392920', 'chemotherapy regimen', 'Therapeutic or Preventive Procedure', 'sort_ratio: 75', 'similarity_score: 75']]","[C3665472, chemotherapy, Therapeutic or Preventive Procedure, sort_ratio: 100, similarity_score: 100]"
6,59672340,NCT05929768,Drug,carboplatin,Given IV,,,,,"[['C0079083', 'carboplatin', 'Organic Chemical|Pharmacologic Substance', 'sort_ratio: 100', 'similarity_score: 100']]","[C0079083, carboplatin, Organic Chemical|Pharmacologic Substance, sort_ratio: 100, similarity_score: 100]"
7,59834170,NCT05907785,Behavioral,educational intervention,"The intervention will consist of recommendations to improve physical activity (PA) and dietary habits. It is based on the PAIDO Programme (www.programapaido.es), an outpatient family-based multidisciplinary programme that combines PA, education on nutrition, and behavioural therapy. It will last 6 months and improvement of metabolic health and body weight loss is expected, particularly in the groups with overweight and obesity. The dietary intervention will be focused on the promotion of the Mediterranean diet, with follow-up and advice from a dietitian-nutritionist. Participants will be encouraged in reducing sedentary behaviour (watching television, playing computer games, playing board games). Performance of aerobic and strength physical exercises will be scheduled, progressively increasing the intensity. Volunteers and their parents will participate in monthly sessions to follow-up on adherence to the PA and dietary recommendations and give personalized advice.",,,,,"[['C0281163', 'educational intervention', 'Therapeutic or Preventive Procedure', 'sort_ratio: 100', 'similarity_score: 100']]","[C0281163, educational intervention, Therapeutic or Preventive Procedure, sort_ratio: 100, similarity_score: 100]"
8,59611757,NCT05835180,Drug,tvb-2640 - 50 mg,TVB-2640 -50 mg administered orally once daily,,,,,"[['C0024671', 'mammography', 'Diagnostic Procedure', 'sort_ratio: 16', 'similarity_score: 15'], ['C0026410', 'mongolia', 'Geographic Area', 'sort_ratio: 18', 'similarity_score: 17'], ['C4321396', 'mg', 'Diagnostic Procedure', 'sort_ratio: 25', 'similarity_score: 22'], ['C2346927', 'magnesium cation', 'Element, Ion, or Isotope|Pharmacologic Substance', 'sort_ratio: 20', 'similarity_score: 12'], ['C1960952', 'milligram percent', 'Quantitative Concept', 'sort_ratio: 26', 'similarity_score: 12'], ['C4521761', 'united states military commissioned officer o8', 'Classification', 'sort_ratio: 13', 'similarity_score: 16'], ['C0439269', 'mg/dl', 'Quantitative Concept', 'sort_ratio: 32', 'similarity_score: 19'], ['C3896754', 'fasn inhibitor tvb-2640', 'Organic Chemical|Pharmacologic Substance', 'sort_ratio: 54', 'similarity_score: 41'], ['C3842586', '50%', 'Quantitative Concept', 'sort_ratio: 25', 'similarity_score: 21'], ['C4084918', 'psa level greater than fifty', 'Conceptual Entity', 'sort_ratio: 19', 'similarity_score: 18']]",
9,59758694,NCT05827055,Drug,nab paclitaxel,"125 mg/m2 given days 1, 8, and 15 every 28 days (1 cycle)",,,,,"[['C1527223', '130-nm albumin-bound paclitaxel', 'Organic Chemical|Pharmacologic Substance', 'sort_ratio: 58', 'similarity_score: 62']]",
10,59790341,NCT05790564,Other,crackers,Daily consumption of non-whole grain crackers for 12 weeks (caloric equivalent to 2 ounces of dry roasted almonds),,,,,"[['C0452505', 'cracker', 'Food', 'sort_ratio: 93', 'similarity_score: 93']]","[C0452505, cracker, Food, sort_ratio: 93, similarity_score: 93]"


In [174]:
interventions

Unnamed: 0,intervention_id,nct_id,intervention_type,orig_int,description,alt_int,deascii_orig_int,deascii_alt_int,deascii_orig_int_outside,deascii_orig_int_inside,curie_info
0,60047523,NCT06038071,Other,scheduled anthropometric screening,anthropometric screening by health care worker...,,,,,,"[[C1710032, screening, Health Care Activity, s..."
1,59389265,NCT06023667,Behavioral,smart-ibd,Daily medication and diary completion reminder...,,,,,,[]
2,59735138,NCT06021418,Device,dkb-119,Experimental and Acitve Comparator are applied...,,,,,,"[[C0012024, dibekacin, Antibiotic|Organic Chem..."
3,59912969,NCT06010901,Drug,tqb2618 injection,TQB2618 is a monoclonal antibody targeting T c...,,,,,,"[[C1272883, injection, Biomedical or Dental Ma..."
4,59989680,NCT05972083,Drug,analgesia management; group s and c,Patients will be administered ibuprofen 400 mg...,,,,,,"[[C1257890, population group, Population Group..."
...,...,...,...,...,...,...,...,...,...,...,...
295,59316311,NCT00019552,Drug,irofulven,,,,,,,"[[C0532362, irofulven, Organic Chemical|Pharma..."
296,59317853,NCT00015873,Drug,prednisolone,,,,,,,"[[C0032950, prednisolone, Organic Chemical|Pha..."
297,59960467,NCT00003556,Biological,canarypox-hil-12 melanoma vaccine,,,,,,,"[[C0677735, canarypox-hil-12 melanoma vaccine,..."
298,60007614,NCT00003428,Drug,arzoxifene hydrochloride,,,,,,,"[[C1527089, arzoxifene hydrochloride, Organic ..."


In [289]:
relevant_date = flag_and_path["date_string"]   # get date of bulk download of clinical trial data
conditions = "{}_conditions.tsv".format(relevant_date)
conditions = pd.read_csv(conditions, sep='\t', index_col=False, header=0)

interventions = "{}_interventions.tsv".format(relevant_date)
interventions = pd.read_csv(interventions, sep='\t', index_col=False, header=0)

def filter_and_select_sublist(sublists):
    if sublists is None or len(sublists) == 0:
        return None

    high_score = -1
    selected_sublist = None

    sublists = ast.literal_eval(sublists)
    for sublist in sublists:

        if len(sublist) >= 4:
            sort_ratio = int(sublist[3].split(": ")[1])
            sim_score = int(sublist[4].split(": ")[1])
            max_score = max(sort_ratio, sim_score)
            if max_score > 88: 
                if max_score > high_score:
                    high_score = max_score
                    selected_sublist = sublist
    return selected_sublist

# # Apply the custom function to each row and create the 'selected_sublist' column
# conditions['auto_selected_curie'] = conditions['curie_info'].apply(filter_and_select_sublist)
# interventions['auto_selected_curie'] = interventions['curie_info'].apply(filter_and_select_sublist)

# auto_selected_conditions = conditions[conditions[['auto_selected_curie']].notnull().all(1)]
# auto_selected_conditions = auto_selected_conditions[["id", "nct_id", "orig_con", "curie_info", "auto_selected_curie"]]

# auto_selected_interventions = interventions[interventions[['auto_selected_curie']].notnull().all(1)]
# auto_selected_interventions = auto_selected_interventions[["intervention_id", "nct_id", "intervention_type", "orig_int", "description", "curie_info", "auto_selected_curie"]]

# conditions_manual_review = conditions[conditions["auto_selected_curie"].isna()]
# conditions_manual_review = conditions_manual_review[["id", "nct_id", "orig_con", "curie_info"]]
# interventions_manual_review = interventions[interventions["auto_selected_curie"].isna()]
# interventions_manual_review = interventions_manual_review[["intervention_id", "nct_id", "intervention_type", "orig_int", "description", "curie_info"]]


#   -----   -----    -----   -----   CONDITIONS   -----   -----    -----   -----  #

"""  Create an output TSV of CURIEs that are auto-selected based on passing the threshold of scoring > 88  """
conditions['auto_selected_curie'] = conditions['curie_info'].apply(filter_and_select_sublist)  # select CURIE that scores highest using filter_and_select_sublist function = auto-select
auto_selected_conditions = conditions[conditions[['auto_selected_curie']].notnull().all(1)]   # get the rows where a CURIE has been auto-selected
auto_selected_conditions = auto_selected_conditions[["id", "nct_id", "orig_con", "curie_info", "auto_selected_curie"]]  # subset dataframe
auto_selected_conditions.to_csv('{}_conditions_auto_selected.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output to TSV

conditions_manual_review = conditions[conditions["auto_selected_curie"].isna()]   # select rows where no CURIE was auto-selected
conditions_manual_review = conditions_manual_review[["id", "nct_id", "orig_con", "curie_info"]]  # subset

"""  Create an output TSV of possible CURIEs available for each term that was not auto-selected  """
conditions_manual_review['curie_info'] = conditions_manual_review['curie_info'].apply(ast.literal_eval)   # in order to multi-index, we have to group-by the original input term. To do this, first convert the column to list of lists
conditions_manual_review = conditions_manual_review.explode('curie_info')  # explode that column so every sublist is on a separate row
conditions_manual_review['curie_info'] = conditions_manual_review['curie_info'].apply(lambda x: x[:3] if isinstance(x, list) else None)
conditions_manual_review['curie_info'] = conditions_manual_review['curie_info'].apply(lambda x: ', '.join(x) if isinstance(x, list) else None)  # Multindexing does not work on lists, so remove the CURIE information out of the list to enable this

conditions_manual_review['temp'] = "temp"   # create a temp column to facilitate multi-indexing
conditions_manual_review.set_index(["id", "nct_id", "orig_con", 'curie_info'], inplace=True)   # create index
conditions_manual_review.drop(["temp"], axis = 1, inplace = True)   # drop the temp column
conditions_manual_review['manually_selected_CURIE'] = None # make a column 

conditions_manual_review.to_excel('{}_conditions_manual_review.xlsx'.format(relevant_date), engine='xlsxwriter', index=True)

#   -----   -----    -----   -----   INTERVENTIONS   -----   -----    -----   -----  #
"""  Create an output TSV of CURIEs that are auto-selected based on passing the threshold of scoring > 88  """
interventions['auto_selected_curie'] = interventions['curie_info'].apply(filter_and_select_sublist)
auto_selected_interventions = interventions[interventions[['auto_selected_curie']].notnull().all(1)]
auto_selected_interventions = auto_selected_interventions[["intervention_id", "nct_id", "intervention_type", "orig_int", "description", "curie_info", "auto_selected_curie"]]
auto_selected_interventions.to_csv('{}_interventions_auto_selected.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output interventions to TSV, avoid storing in memory

interventions_manual_review = interventions[interventions["auto_selected_curie"].isna()]
interventions_manual_review = interventions_manual_review[["intervention_id", "nct_id", "intervention_type", "orig_int", "description", "curie_info"]]

"""  Create an output TSV of possible CURIEs available for each term that was not auto-selected  """
interventions_manual_review['curie_info'] = interventions_manual_review['curie_info'].apply(ast.literal_eval)
interventions_manual_review = interventions_manual_review.explode('curie_info')
interventions_manual_review['curie_info'] = interventions_manual_review['curie_info'].apply(lambda x: ', '.join(x) if isinstance(x, list) else None)

interventions_manual_review['temp'] = "temp"
interventions_manual_review.set_index(["intervention_id", "nct_id", "intervention_type", "orig_int", "description", 'curie_info'], inplace=True)
interventions_manual_review.drop(["temp"], axis = 1, inplace = True)
interventions_manual_review['manually_selected_CURIE'] = None

interventions_manual_review.to_excel('{}_interventions_manual_review.xlsx'.format(relevant_date), engine='xlsxwriter', index=True)

auto_selected_conditions
conditions_manual_review

auto_selected_interventions
interventions_manual_review

'  Create an output TSV of CURIEs that are auto-selected based on passing the threshold of scoring > 88  '

'  Create an output TSV of possible CURIEs available for each term that was not auto-selected  '

'  Create an output TSV of CURIEs that are auto-selected based on passing the threshold of scoring > 88  '

'  Create an output TSV of possible CURIEs available for each term that was not auto-selected  '

Unnamed: 0,id,nct_id,orig_con,curie_info,auto_selected_curie
1,59881198,NCT01504126,stage iib fallopian tube cancer ajcc v6 and v7,"[['C1336183', 'stage iib fallopian tube cancer...","[C1336183, stage iib fallopian tube cancer ajc..."
2,59401394,NCT00367484,relapsing remitting multiple sclerosis,"[['C0751967', 'multiple sclerosis, relapsing-r...","[C0751967, multiple sclerosis, relapsing-remit..."
4,59241341,NCT01511588,hypogonadism,"[['C0020619', 'hypogonadism', 'Disease or Synd...","[C0020619, hypogonadism, Disease or Syndrome, ..."
5,59185335,NCT00206921,erythema,"[['C0041834', 'erythema', 'Disease or Syndrome...","[C0041834, erythema, Disease or Syndrome, sort..."
7,59294882,NCT00628433,ulcerative colitis,"[['C0009324', 'ulcerative colitis', 'Disease o...","[C0009324, ulcerative colitis, Disease or Synd..."
...,...,...,...,...,...
293,59561432,NCT02224781,metastatic melanoma,"[['C0278883', 'metastatic melanoma', 'Neoplast...","[C0278883, metastatic melanoma, Neoplastic Pro..."
294,59857603,NCT02204124,pancreatic neoplasms,"[['C0030297', 'pancreatic neoplasm', 'Neoplast...","[C0030297, pancreatic neoplasm, Neoplastic Pro..."
297,59706585,NCT05744050,obesity,"[['C0028754', 'obesity', 'Disease or Syndrome'...","[C0028754, obesity, Disease or Syndrome, sort_..."
298,59291139,NCT02576665,sarcoma,"[['C1261473', 'sarcoma', 'Neoplastic Process',...","[C1261473, sarcoma, Neoplastic Process, sort_r..."


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,manually_selected_CURIE
id,nct_id,orig_con,curie_info,Unnamed: 4_level_1
59157052,NCT05113303,"bone; deformity, congenital","C0000768, congenital abnormality, Congenital Abnormality",
59157052,NCT05113303,"bone; deformity, congenital","C0410719, deformity of bone, Anatomical Abnormality",
59157052,NCT05113303,"bone; deformity, congenital","C1705254, neonatal deformity, Congenital Abnormality",
59685708,NCT02266706,proven or suspected gram-negative bacterial infection,"C0085423, gram-negative bacterial infections, Disease or Syndrome",
59164855,NCT00436267,pancreatic cancer,"C0235974, pancreatic carcinoma, Neoplastic Process",
...,...,...,...,...
59298493,NCT04262804,breast cancer metastatic,"C0278488, carcinoma breast stage iv, Neoplastic Process",
59298493,NCT04262804,breast cancer metastatic,"C4520898, stage iv breast cancer ajcc v6 and v7, Neoplastic Process",
59334597,NCT00648557,healthy,,
59430595,NCT01137942,persistence of infection with helicobacter pylori,"C0850666, infection caused by helicobacter pylori, Disease or Syndrome",


Unnamed: 0,intervention_id,nct_id,intervention_type,orig_int,description,curie_info,auto_selected_curie
1,59568706,NCT06008353,Other,observational study,This is an observational study; patients will ...,"[['C1518527', 'observational study', 'Research...","[C1518527, observational study, Research Activ..."
3,59940409,NCT05966350,Other,comparison of data with parametric test (t-test),Clinical data from positive Argonaute (AGO) pa...,"[['C1707455', 'comparison', 'Activity', 'sort_...","[C0871472, t test, Intellectual Product, sort_..."
5,59428875,NCT05939661,Drug,chemotherapy,"CAPOX (Oxaliplatin 130mg/m2, Capecitabine2000m...","[['C0013217', 'pharmacotherapeutic', 'Function...","[C3665472, chemotherapy, Therapeutic or Preven..."
6,59672340,NCT05929768,Drug,carboplatin,Given IV,"[['C0079083', 'carboplatin', 'Organic Chemical...","[C0079083, carboplatin, Organic Chemical|Pharm..."
7,59834170,NCT05907785,Behavioral,educational intervention,The intervention will consist of recommendatio...,"[['C0281163', 'educational intervention', 'The...","[C0281163, educational intervention, Therapeut..."
...,...,...,...,...,...,...,...
295,59310654,NCT00006367,Biological,filgrastim,,"[['C0210630', 'filgrastim', 'Amino Acid, Pepti...","[C0210630, filgrastim, Amino Acid, Peptide, or..."
296,60008271,NCT00004904,Drug,etoposide,,"[['C0015133', 'etoposide', 'Organic Chemical|P...","[C0015133, etoposide, Organic Chemical|Pharmac..."
297,60012657,NCT00004199,Drug,gemcitabine hydrochloride,,"[['C0771488', 'gemcitabine hydrochloride', 'Nu...","[C0771488, gemcitabine hydrochloride, Nucleic ..."
298,60034595,NCT00002790,Drug,cyclosporine,,"[['C0010592', 'cyclosporine', 'Amino Acid, Pep...","[C0010592, cyclosporine, Amino Acid, Peptide, ..."


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,manually_selected_CURIE
intervention_id,nct_id,intervention_type,orig_int,description,curie_info,Unnamed: 6_level_1
59397996,NCT06012656,Device,minimax,Total or Partial Hip Arthroplasty,,
59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"C0005575, biotin, Organic Chemical|Pharmacologic Substance|Vitamin, sort_ratio: 21, similarity_score: 21",
59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"C0037098, silicon dioxide, Biomedical or Dental Material|Inorganic Chemical, sort_ratio: 33, similarity_score: 27",
59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"C2828366, extract (substance), Substance, sort_ratio: 41, similarity_score: 37",
59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"C1456557, botanical, Organic Chemical|Pharmacologic Substance, sort_ratio: 30, similarity_score: 30",
...,...,...,...,...,...,...
59332558,NCT00200629,Procedure,combined adenosine / exercise spect myocardial perfusion imaging,,"C1522704, exercise pain management, Therapeutic or Preventive Procedure, sort_ratio: 42, similarity_score: 41",
59331414,NCT00177112,Procedure,clean patient preparation for cystoscopy,,"C0010702, cystoscopy, Diagnostic Procedure, sort_ratio: 40, similarity_score: 40",
59331414,NCT00177112,Procedure,clean patient preparation for cystoscopy,,"C0582431, preparation of patient, Therapeutic or Preventive Procedure, sort_ratio: 71, similarity_score: 48",
59331414,NCT00177112,Procedure,clean patient preparation for cystoscopy,,"C1947930, cleaning (activity), Activity, sort_ratio: 35, similarity_score: 44",


In [282]:
conditions_manual_review

Unnamed: 0,id,nct_id,orig_con,curie_info
0,59157052,NCT05113303,"bone; deformity, congenital","[['C0000768', 'congenital abnormality', 'Conge..."
3,59685708,NCT02266706,proven or suspected gram-negative bacterial in...,"[['C0085423', 'gram-negative bacterial infecti..."
6,59164855,NCT00436267,pancreatic cancer,"[['C0235974', 'pancreatic carcinoma', 'Neoplas..."
9,59131044,NCT02377830,intensive care unit acquired weakness,[]
11,59422337,NCT03910712,her2-positive breast cancer,"[['C1960398', 'her2-positive carcinoma of brea..."
...,...,...,...,...
288,59382573,NCT00760253,bispectral index,[]
291,59298493,NCT04262804,breast cancer metastatic,"[['C0278488', 'carcinoma breast stage iv', 'Ne..."
292,59334597,NCT00648557,healthy,[]
295,59430595,NCT01137942,persistence of infection with helicobacter pylori,"[['C0850666', 'infection caused by helicobacte..."


In [276]:
interventions_manual_review['curie_info'] = interventions_manual_review['curie_info'].apply(ast.literal_eval)
interventions_manual_review = interventions_manual_review.explode('curie_info')
interventions_manual_review['curie_info'] = interventions_manual_review['curie_info'].apply(lambda x: ', '.join(x) if isinstance(x, list) else None)


interventions_manual_review['temp'] = "temp"
interventions_manual_review.set_index(["intervention_id", "nct_id", "intervention_type", "orig_int", "description", 'curie_info'], inplace=True)
interventions_manual_review.drop(["temp"], axis = 1, inplace = True)
interventions_manual_review['manually_selected_CURIE'] = None

interventions_manual_review

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,manually_selected_CURIE
intervention_id,nct_id,intervention_type,orig_int,description,curie_info,Unnamed: 6_level_1
59397996,NCT06012656,Device,minimax,Total or Partial Hip Arthroplasty,,
59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"C0005575, biotin, Organic Chemical|Pharmacologic Substance|Vitamin, sort_ratio: 21, similarity_score: 21",
59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"C0037098, silicon dioxide, Biomedical or Dental Material|Inorganic Chemical, sort_ratio: 33, similarity_score: 27",
59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"C2828366, extract (substance), Substance, sort_ratio: 41, similarity_score: 37",
59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"C1456557, botanical, Organic Chemical|Pharmacologic Substance, sort_ratio: 30, similarity_score: 30",
...,...,...,...,...,...,...
59332558,NCT00200629,Procedure,combined adenosine / exercise spect myocardial perfusion imaging,,"C1522704, exercise pain management, Therapeutic or Preventive Procedure, sort_ratio: 42, similarity_score: 41",
59331414,NCT00177112,Procedure,clean patient preparation for cystoscopy,,"C0010702, cystoscopy, Diagnostic Procedure, sort_ratio: 40, similarity_score: 40",
59331414,NCT00177112,Procedure,clean patient preparation for cystoscopy,,"C0582431, preparation of patient, Therapeutic or Preventive Procedure, sort_ratio: 71, similarity_score: 48",
59331414,NCT00177112,Procedure,clean patient preparation for cystoscopy,,"C1947930, cleaning (activity), Activity, sort_ratio: 35, similarity_score: 44",


Unnamed: 0_level_0,Unnamed: 1_level_0,manually_selected_CURIE
orig_int,curie_info,Unnamed: 2_level_1
minimax,,
botanical extract of standardised biotin with silica,"C0005575, biotin, Organic Chemical|Pharmacologic Substance|Vitamin, sort_ratio: 21, similarity_score: 21",
botanical extract of standardised biotin with silica,"C0037098, silicon dioxide, Biomedical or Dental Material|Inorganic Chemical, sort_ratio: 33, similarity_score: 27",
botanical extract of standardised biotin with silica,"C2828366, extract (substance), Substance, sort_ratio: 41, similarity_score: 37",
botanical extract of standardised biotin with silica,"C1456557, botanical, Organic Chemical|Pharmacologic Substance, sort_ratio: 30, similarity_score: 30",
...,...,...
combined adenosine / exercise spect myocardial perfusion imaging,"C1522704, exercise pain management, Therapeutic or Preventive Procedure, sort_ratio: 42, similarity_score: 41",
clean patient preparation for cystoscopy,"C0010702, cystoscopy, Diagnostic Procedure, sort_ratio: 40, similarity_score: 40",
clean patient preparation for cystoscopy,"C0582431, preparation of patient, Therapeutic or Preventive Procedure, sort_ratio: 71, similarity_score: 48",
clean patient preparation for cystoscopy,"C1947930, cleaning (activity), Activity, sort_ratio: 35, similarity_score: 44",


In [211]:
interventions_manual_review['curie_info'] = interventions_manual_review['curie_info'].apply(ast.literal_eval)

interventions_manual_review_ex = interventions_manual_review.explode('curie_info')
    metamapped_threshold_fail['metamap_term_info'] = metamapped_threshold_fail['metamap_term_info'].apply(lambda x: ', '.join(map(str, x))) # remove the MetaMap info from their lists bc pandas Multi-indexing doesn't work on lists

interventions_manual_review_ex = interventions_manual_review_ex.groupby(['intervention_type', 'orig_int', 'description'])['curie_info'].agg(list).reset_index()
interventions_manual_review_ex

Unnamed: 0,intervention_id,nct_id,intervention_type,orig_int,description,curie_info
0,59265019,NCT04959461,Behavioral,webchat,"A self-guided, single session web intervention...",[nan]
1,59269145,NCT03959982,Other,st. george's respiratory questionnaire (sgrq),SGRQ questionnaire,"[[C0034394, questionnaires, Intellectual Produ..."
2,59282314,NCT01749917,Other,exercise program,Exercise program An brief exercise session inc...,"[[C0015259, exercise, Daily or Recreational Ac..."
3,59283867,NCT01768221,Behavioral,caregiver intervention,Participants will speak with a chaplain for th...,"[[C0085537, caregiver, Professional or Occupat..."
4,59301349,NCT01977261,Procedure,opening-wedge hto,Opening-wedge high tibial osteotomy fixated wi...,"[[C0445153, opening wedge, Spatial Concept, so..."
...,...,...,...,...,...,...
159,60030489,NCT02267850,Device,non-functional orthopulse™,Patients carry out daily sham-OrthoPulse™ trea...,"[[C0205300, non-functional, Functional Concept..."
160,60031740,NCT04227366,Biological,bcd-089,BCD-089 162 mg SC,"[[C0053048, bcd protocol, Therapeutic or Preve..."
161,60032084,NCT02673138,Other,basal interruption without canagliflozin,basal interruption,"[[C2974540, canagliflozin, Organic Chemical|Ph..."
162,60036537,NCT03809130,Other,internet-based intervention,Use Untire application after 3 months up to 6 ...,"[[C0282111, internet, Manufactured Object, sor..."


In [241]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(interventions_manual_review[:20])

Unnamed: 0,intervention_id,nct_id,intervention_type,orig_int,description,curie_info
0,59397996,NCT06012656,Device,minimax,Total or Partial Hip Arthroplasty,
2,59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"[C0005575, biotin, Organic Chemical|Pharmacologic Substance|Vitamin, sort_ratio: 21, similarity_score: 21]"
2,59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"[C0037098, silicon dioxide, Biomedical or Dental Material|Inorganic Chemical, sort_ratio: 33, similarity_score: 27]"
2,59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"[C2828366, extract (substance), Substance, sort_ratio: 41, similarity_score: 37]"
2,59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"[C1456557, botanical, Organic Chemical|Pharmacologic Substance, sort_ratio: 30, similarity_score: 30]"
2,59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"[C2700256, vitamin b7 measurement, Laboratory Procedure, sort_ratio: 27, similarity_score: 30]"
4,59968815,NCT05945160,Drug,alpha lipoic acid 300mg,A naturally occuring mitochondrial antioxidant,"[C0023791, thioctic acid, Organic Chemical|Pharmacologic Substance|Vitamin, sort_ratio: 56, similarity_score: 56]"
8,59611757,NCT05835180,Drug,tvb-2640 - 50 mg,TVB-2640 -50 mg administered orally once daily,"[C0024671, mammography, Diagnostic Procedure, sort_ratio: 16, similarity_score: 15]"
8,59611757,NCT05835180,Drug,tvb-2640 - 50 mg,TVB-2640 -50 mg administered orally once daily,"[C0026410, mongolia, Geographic Area, sort_ratio: 18, similarity_score: 17]"
8,59611757,NCT05835180,Drug,tvb-2640 - 50 mg,TVB-2640 -50 mg administered orally once daily,"[C4321396, mg, Diagnostic Procedure, sort_ratio: 25, similarity_score: 22]"


In [267]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(interventions_manual_review[:20])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,manually_selected_CURIE
intervention_id,nct_id,intervention_type,orig_int,description,curie_info,Unnamed: 6_level_1
59397996,NCT06012656,Device,minimax,Total or Partial Hip Arthroplasty,,
59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"C0005575, biotin, Organic Chemical|Pharmacologic Substance|Vitamin, sort_ratio: 21, similarity_score: 21",
59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"C0037098, silicon dioxide, Biomedical or Dental Material|Inorganic Chemical, sort_ratio: 33, similarity_score: 27",
59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"C2828366, extract (substance), Substance, sort_ratio: 41, similarity_score: 37",
59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"C1456557, botanical, Organic Chemical|Pharmacologic Substance, sort_ratio: 30, similarity_score: 30",
59470477,NCT05972512,Dietary Supplement,botanical extract of standardised biotin with silica,Mode of usage: Two times a day Route of administration: Oral,"C2700256, vitamin b7 measurement, Laboratory Procedure, sort_ratio: 27, similarity_score: 30",
59968815,NCT05945160,Drug,alpha lipoic acid 300mg,A naturally occuring mitochondrial antioxidant,"C0023791, thioctic acid, Organic Chemical|Pharmacologic Substance|Vitamin, sort_ratio: 56, similarity_score: 56",
59611757,NCT05835180,Drug,tvb-2640 - 50 mg,TVB-2640 -50 mg administered orally once daily,"C0024671, mammography, Diagnostic Procedure, sort_ratio: 16, similarity_score: 15",
59611757,NCT05835180,Drug,tvb-2640 - 50 mg,TVB-2640 -50 mg administered orally once daily,"C0026410, mongolia, Geographic Area, sort_ratio: 18, similarity_score: 17",
59611757,NCT05835180,Drug,tvb-2640 - 50 mg,TVB-2640 -50 mg administered orally once daily,"C4321396, mg, Diagnostic Procedure, sort_ratio: 25, similarity_score: 22",


In [None]:
def score_mappings(flag_and_path):
    relevant_date = flag_and_path["date_string"]   # get date of bulk download of clinical trial data
    pattern_outside = r'(?<=\().+?(?=\))|([^(]+)'
    pattern_inside = r'\(([^)]+)\)'

    sort_ratio = np.vectorize(get_token_sort_ratio)
    set_ratio = np.vectorize(get_token_set_ratio)
    sim_score = np.vectorize(get_similarity_score)

    metamap_input = "{}_metamap_output.tsv".format(relevant_date)
    metamap_semantic_types = pd.read_csv("MetaMap_SemanticTypes_2018AB.txt")
    metamapped = pd.read_csv(metamap_input, sep='\t', index_col=False, header=0)

    # get the full names of the semantic types so we know what we're looking at
    metamapped['metamap_semantic_type'] = metamapped['metamap_semantic_type'].str.replace(r'\[|\]', '', regex=True)
    sem_type_col_names = ["abbv", "group", "semantic_type_full"]
    metamap_semantic_types = pd.read_csv("MetaMap_SemanticTypes_2018AB.txt", sep="|", index_col=False, header=None, names=sem_type_col_names)
    sem_type_dict = dict(zip(metamap_semantic_types['abbv'], metamap_semantic_types['semantic_type_full'])) # make a dict of semantic type abbv and full name
    # Handle NaN (None) values in metamap_semantic_type column
    metamapped['metamap_semantic_type'] = metamapped['metamap_semantic_type'].apply(lambda x: x.split(',') if isinstance(x, str) else np.nan)
    # map semantic type abbreviations to the full name of the semantic type
    metamapped['metamap_semantic_type'] = metamapped['metamap_semantic_type'].apply(lambda x: '|'.join([sem_type_dict[term] if term in sem_type_dict else term for term in x]) if isinstance(x, list) else x)

    metamapped['metamap_preferred_name'] = metamapped['metamap_preferred_name'].str.lower()
    metamapped = metamapped.dropna(axis=0)
    metamapped = metamapped[["clin_trial_term", "metamap_cui","metamap_preferred_name", "metamap_semantic_type"]]

    matches_outside = metamapped['clin_trial_term'].str.extract(pattern_outside)
    metamapped['clin_trial_term_outside_par'] = matches_outside[0].fillna('')
    matches_inside = metamapped['clin_trial_term'].str.extract(pattern_inside)
    metamapped['clin_trial_term_inside_par'] = matches_inside[0].fillna('')

    metamapped = metamapped[['clin_trial_term', 'clin_trial_term_outside_par', 'clin_trial_term_inside_par', 'metamap_cui', 'metamap_preferred_name', 'metamap_semantic_type']] # re-order columns of df

    # score on clin_trial_term term vs MetaMap term
    metamapped["sort_ratio_orig"] = sort_ratio(metamapped[["clin_trial_term"]].values, metamapped[["metamap_preferred_name"]].values) 
    metamapped["sim_score_orig"] = sim_score(metamapped[["clin_trial_term"]].values, metamapped[["metamap_preferred_name"]].values)
    # score on term outside parentheses (clin_trial_term_outside_par) vs MetaMap term
    metamapped["sort_ratio_outside"] = sort_ratio(metamapped[["clin_trial_term_outside_par"]].values, metamapped[["metamap_preferred_name"]].values) 
    metamapped["sim_score_outside"] = sim_score(metamapped[["clin_trial_term_outside_par"]].values, metamapped[["metamap_preferred_name"]].values)
    # score on term inside parentheses (clin_trial_term_inside_par) vs MetaMap term
    metamapped["sort_ratio_inside"] = sort_ratio(metamapped[["clin_trial_term_inside_par"]].values, metamapped[["metamap_preferred_name"]].values) 
    metamapped["sim_score_inside"] = sim_score(metamapped[["clin_trial_term_inside_par"]].values, metamapped[["metamap_preferred_name"]].values)

    # get all columns that have "score" in it so we can filter on the threshold score we want
    score_ratio_columns = [col for col in metamapped.columns if 'score' in col or 'ratio' in col]
    metamapped['max_score'] = metamapped[score_ratio_columns].max(axis=1)
    metamapped.drop(score_ratio_columns, axis = 1, inplace = True) # drop all scoring col except max score columns now

    # filter for rows that have scores > 88 in max_score col
    metamapped_threshold_pass = metamapped[metamapped['max_score'] > 88]
    # multiple MetaMap terms may have passed scoring threshold for a single input Clinical trial term. Choose only 1 MetaMap term for each input Clinical Trial term. Do this by picking row with max score of all score columns
    metamapped_threshold_pass = metamapped_threshold_pass.sort_values('max_score').drop_duplicates('clin_trial_term', keep='first')

    # filter for clin trial terms that didn't pass threshold of 88
    metamapped_threshold_fail = metamapped.loc[~metamapped['clin_trial_term'].isin(metamapped_threshold_pass['clin_trial_term'])].copy()

    metamapped_threshold_pass = metamapped_threshold_pass[["clin_trial_term", "metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]] # get only columns of relevance
    metamapped_threshold_fail = metamapped_threshold_fail[["clin_trial_term", "metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]] # get only columns of relevance

    # prep output file of MetaMap terms that failed threshold for manual review
    metamapped_threshold_fail['metamap_term_info']= metamapped_threshold_fail[["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]].values.tolist()
    metamapped_threshold_fail.drop(["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"], axis = 1, inplace = True)
    metamapped_threshold_fail['metamap_term_info'] = metamapped_threshold_fail['metamap_term_info'].apply(lambda x: ', '.join(map(str, x))) # remove the MetaMap info from their lists bc pandas Multi-indexing doesn't work on lists
    metamapped_threshold_fail['temp'] = "temp"
    metamapped_threshold_fail.set_index(["clin_trial_term", "metamap_term_info"], inplace=True)
    metamapped_threshold_fail.drop(["temp"], axis = 1, inplace = True)
    metamapped_threshold_fail['manually_selected_CURIE'] = None

    metamapped_threshold_fail.to_excel('{}_CURIES_manual_review.xlsx'.format(relevant_date), engine='xlsxwriter', index=True)
    metamapped_threshold_pass.to_csv('{}_metamap_threshold_pass.tsv'.format(relevant_date), sep="\t", index=False, header=True) # output interventions to TSV, avoid storing in memory


    # # get all columns that have "score" in it so we can filter on the threshold score we want
    # score_ratio_columns = [col for col in metamapped_scored.columns if 'score' in col or 'ratio' in col]
    # metamapped_threshold_pass = metamapped_scored[(metamapped_scored[score_ratio_columns] > 88).any(axis=1)]
    # metamapped_threshold_fail = metamapped_scored.loc[~metamapped_scored['clin_trial_term'].isin(metamapped_threshold_pass['clin_trial_term'])].copy()

    # metamapped_threshold_pass.drop(score_ratio_columns, axis = 1, inplace = True) # drop the scoring columns now
    # metamapped_threshold_fail.drop(score_ratio_columns, axis = 1, inplace = True) # drop the scoring columns now


    # metamapped_manual_curation = metamapped_threshold_fail[["clin_trial_term", "metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]]
    # metamapped_manual_curation = metamapped_manual_curation.copy()
    # metamapped_manual_curation['metamap_term_info']= metamapped_manual_curation[["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]].values.tolist()
    # metamapped_manual_curation.drop(["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"], axis = 1, inplace = True)

    # metamapped_manual_curation['metamap_term_info'] = metamapped_manual_curation['metamap_term_info'].apply(lambda x: ','.join(map(str, x))) # remove the MetaMap info from their lists bc pandas Multi-indexing doesn't work on lists
    # metamapped_manual_curation['temp'] = "temp"
    # metamapped_manual_curation.set_index(["clin_trial_term", "metamap_term_info"],inplace=True)


    # metamapped_manual_curation = metamapped_manual_curation.drop('temp', axis=1) # drop the redundant column now






    # metamapped_manual_curation.drop(["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"], axis = 1, inplace = True)
    # metamapped_manual_curation = metamapped_manual_curation.groupby('clin_trial_term')['metamap_term_info'].agg(list).reset_index()

    # use Multiindexing to see lists of CURIEs available for single term
    # Explode the column of lists of lists
    # metamapped_manual_curation = metamapped_manual_curation.explode('metamap_term_info')

    # Reset the index if needed
    # metamapped_manual_curation.reset_index(drop=True, inplace=True)



    # metamapped_manual_curation = metamapped_manual_curation.groupby('clin_trial_term')['metamap_term_info'].agg(list).reset_index()

    # metamapped_con['max_score'] = metamapped_con[['sort_ratio', 'sim_score']].max(axis=1)
    # metamapped_con = metamapped_con.sort_values('max_score').drop_duplicates('clin_trial_term', keep='first')


    # metamapped_con["metamap_term_info"] = metamapped_con[["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]].values.tolist() 
    # metamapped_con.drop(["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"], axis = 1, inplace = True)
    # metamapped_con = metamapped_con.groupby('clin_trial_term')['metamap_term_info'].agg(list).reset_index()


with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(metamapped_threshold_fail)

In [14]:
def merge_mappings_to_trials(df_dict):
    conditions = df_dict["conditions"]

    interventions = df_dict["interventions"]
    conditions = df_dict["interventions_alts"]
        

In [170]:
for index, row in conditions.iterrows():
    for col_name in cols_to_check:
        value = row[col_name]
        if value in hiconf_con_dict:
            curie_info = hiconf_con_dict[value]
            conditions.at[index, "high_conf_curie_info"] = curie_info

#         curie_info = hiconf_con_dict[value]
#         print(curie_info)
# #         conditions.at[index, "high_conf_curie_info"] = curie_info

In [173]:
hiconf_int = hiconfidence_curies.loc[hiconfidence_curies['term_type'] == "intervention"]
hiconf_int_dict = dict(zip(hiconf_int['clin_trial_term'].tolist(), hiconf_int['metamap_term_info'].tolist())) # make a dict of clinical trial term and corresponding metamap info
cols_to_check = [ele for ele in interventions.columns if(ele not in ['id', 'nct_id'])]
interventions["high_conf_curie_info"] = None

for index, row in interventions.iterrows():
    for col_name in cols_to_check:
        value = row[col_name]
        if value in hiconf_int_dict:
            curie_info = hiconf_int_dict[value]
            interventions.at[index, "high_conf_curie_info"] = curie_info

In [204]:
interventions_alts = df_dict["interventions_alts"][["id", "nct_id", "intervention_id", "alt_downcase_name"]]
interventions = df_dict["interventions"]
interventions
interventions_alts

Unnamed: 0,id,nct_id,intervention_type,name,description,orig_downcase_name
193316,54403505,NCT04160000,Drug,Rate or Rhythm control antiarrhythmic drugs fo...,Administration of antiarrhythmic drug to achie...,rate or rhythm control antiarrhythmic drugs fo...
335174,53644628,NCT03069131,Drug,Rifaximin,twice daily administration of 1 tablet contain...,rifaximin
599099,53998293,NCT00463840,Drug,Oxaliplatin,,oxaliplatin
556232,53712417,NCT01506284,Procedure,Forced oscillatory technique (FOT) and laser i...,FOT: The stimulating signal is generated by an...,forced oscillatory technique (fot) and laser i...
324723,53925544,NCT02810262,Other,Bone metastases biopsy,The vast majority of bone biopsy are performed...,bone metastases biopsy
...,...,...,...,...,...,...
583887,53987390,NCT00308113,Dietary Supplement,Coenzyme Q10,serum levels of greater or equal to 2.5 microg...,coenzyme q10
493635,53702684,NCT01422954,Drug,Chloroquine prophylaxis,Standard prophylactic regime: a loading dose o...,chloroquine prophylaxis
356939,53658216,NCT02519036,Other,Placebo,Placebo was administered by intrathecal inject...,placebo
196727,54405081,NCT05774873,Drug,IBI334,Subjects will receive IBI334 once a week durin...,ibi334


Unnamed: 0,id,nct_id,intervention_id,alt_downcase_name
0,27584249,NCT01738191,54313664,strattera
1,27584250,NCT01737879,54313666,omontys
2,27428744,NCT04545502,54003364,gelsoft plus
3,27584251,NCT01737879,54313667,epogen
4,27273339,NCT04571879,53672522,nebulized xylocaine
...,...,...,...,...
387960,27583600,NCT03192215,54313417,eliquis
387961,27583601,NCT03192215,54313418,aspirin tablet
387962,27583602,NCT03052608,54313422,pf-06463922
387963,27583603,NCT03052608,54313423,xalkori


In [183]:
hiconf_altint = hiconfidence_curies.loc[hiconfidence_curies['term_type'] == "alternate_intervention"]
hiconf_altint_dict = dict(zip(hiconf_altint['clin_trial_term'].tolist(), hiconf_altint['metamap_term_info'].tolist())) # make a dict of clinical trial term and corresponding metamap info
cols_to_check = [ele for ele in interventions_alts.columns if(ele not in ['id', 'nct_id', 'intervention_id'])]
interventions_alts["high_conf_curie_info"] = None

for index, row in interventions_alts.iterrows():
    for col_name in cols_to_check:
        value = row[col_name]
        if value in hiconf_altint_dict:
            curie_info = hiconf_altint_dict[value]
            interventions_alts.at[index, "high_conf_curie_info"] = curie_info

In [100]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(interventions_all[:100])

Unnamed: 0,intervention_id,nct_id,intervention_type,orig_int,description,alt_int,orig_int_outside,orig_int_inside,alt_int_outside,alt_int_inside
316,53647000,NCT05986136,Drug,dapagliflozin 10mg tab,Dapagliflozin has emerged as a selective SGLT2...,,dapagliflozin 10mg tab,,,
170,54383515,NCT05965375,Other,observational; no interventions were given.,Observational; No Interventions were given.,,observational; no interventions were given.,,,
58,54002989,NCT05912244,Drug,io102/io103,IO102/IO103 will be administered subcutaneousl...,,io102/io103,,,
5,54036407,NCT05864040,Device,bitrack system-assisted laparoscopic radical/s...,Robot-Assisted Laparoscopic Transperitoneal Ra...,,bitrack system-assisted laparoscopic radical/s...,,,
70,53743983,NCT05853731,Device,white light emitting diode,Filtered white light via light emitting diode ...,wled,white light emitting diode,,wled,
228,53727180,NCT05835037,Drug,placebo,Participants will take one placebo tablet per ...,,placebo,,,
81,53639435,NCT05806385,Device,cryoablation,Tumor ablation before neoadjuvant chemotherapy,,cryoablation,,,
276,53682687,NCT05766514,Drug,azacitadine or decitabine,Subjects will be given either 20 mg/m2 decitab...,,azacitadine or decitabine,,,
48,54201879,NCT05748665,Drug,etomidate,Using etomidate for induction of general anest...,,etomidate,,,
283,53977051,NCT05740826,Device,algometer,The algometer head was applied to the trigger ...,,algometer,,,


In [35]:
# # output all results to TSVs
# def compile_and_output(df_dict, ct_terms, remaining_unmapped_possible):
#     print("\n")
#     print("#   -------- -------- -------- --------  ")
#     print("Final Tallies:")
#     print("Total # of conditions mapped: {}".format(ct_terms["mapped_conditions"].shape[0]))
#     print("Total # of interventions mapped: {}".format(ct_terms["mapped_interventions"].shape[0]))
#     print("Total # of conditions unmapped or not mapped: {}".format(len(ct_terms["unmapped_conditions"])))
#     print("Total # of interventions unmapped or not mapped: {}".format(len(ct_terms["unmapped_interventions"])))    
#     # How many Clinical Trials are there? Well, it's different depending on the Conditions or Interventions dataframes...
#     conditions_nctids = len(df_dict["conditions"].nct_id.unique())
#     interventions_nctids = len(df_dict["interventions"].nct_id.unique())
#     print("Number of Clinical Trials NCITs in Conditions table: {}".format(conditions_nctids))      
#     print("Number of Clinical Trials NCITs in Interventions table: {}".format(interventions_nctids))
#     print("#   -------- -------- -------- --------  ")

#     """ create tables of unused MeSH and MetaMap CURIEs that could be used for unmapped Conditions and Interventions """
#     # -------    CONDITIONS    ------- #
#     all_conditions = df_dict["conditions"][["nct_id", "downcase_name"]]
#     conditions_mesh = pd.merge(all_conditions, 
#                                remaining_unmapped_possible["mesh_conditions_per_study"],
#                                how='left',
#                                left_on=['nct_id'],
#                                right_on = ['nct_id'])
    
#     metamap_possibilities = remaining_unmapped_possible["all_metamapped_conditions"][["condition_input", "condition_CURIE_id", "condition_CURIE_name", "condition_semantic_type"]]
#     conditions_mesh_metamap = pd.merge(conditions_mesh, 
#                                        metamap_possibilities,
#                                        how='left',
#                                        left_on=['downcase_name'],
#                                        right_on = ['condition_input'])
    
#     unmapped_conditions_possible_terms = conditions_mesh_metamap[conditions_mesh_metamap['downcase_name'].isin(ct_terms["unmapped_conditions"])]
#     unmapped_conditions_possible_terms = unmapped_conditions_possible_terms.drop('condition_input', axis=1) # drop the redundant column now
    
#     # -------    INTERVENTIONS    ------- #
#     all_interventions = df_dict["interventions"][["nct_id", "downcase_name"]]
#     interventions_mesh = pd.merge(all_interventions, 
#                                remaining_unmapped_possible["mesh_interventions_per_study"],
#                                how='left',
#                                left_on=['nct_id'],
#                                right_on = ['nct_id'])
    
#     metamap_possibilities = remaining_unmapped_possible["all_metamapped_interventions"][["intervention_input", "intervention_CURIE_id", "intervention_CURIE_name", "intervention_semantic_type"]]
#     interventions_mesh_metamap = pd.merge(interventions_mesh, 
#                                        metamap_possibilities,
#                                        how='left',
#                                        left_on=['downcase_name'],
#                                        right_on = ['intervention_input'])
    
#     unmapped_interventions_possible_terms = interventions_mesh_metamap[interventions_mesh_metamap['downcase_name'].isin(ct_terms["unmapped_interventions"])]
#     unmapped_interventions_possible_terms = unmapped_interventions_possible_terms.drop('intervention_input', axis=1) # drop the redundant column now
          
        
#     """   Output all to TSVs   """    
#     pd.Series(ct_terms["unmapped_conditions"]).to_csv('unmapped_conditions.tsv', sep="\t", index=False, header=False) # convert the list to a pandas series, then output to TSV
#     pd.Series(ct_terms["unmapped_interventions"]).to_csv('unmapped_interventions.tsv', sep="\t", index=False, header=False) # convert the list to a pandas series, then output to TSV
#     ct_terms["mapped_conditions"].to_csv('mapped_conditions.tsv', sep="\t", index=False)
#     ct_terms["mapped_interventions"].to_csv('mapped_interventions.tsv', sep="\t", index=False)
#     unmapped_conditions_possible_terms.to_csv('unmapped_conditions_possible_mappings.tsv', sep="\t", index=False)
#     unmapped_interventions_possible_terms.to_csv('unmapped_interventions_possible_mappings.tsv', sep="\t", index=False)
    



In [None]:
# def test_or_prod():
#     print("The test run of this code performs the construction of the KG on a subset of 200 Conditions and 200 Interventions from Clinical Trials.\n")
#     test_or_prod = input("Is this a test run or the production of a new version of the KG? Write T for test, or P for production: ")
#     if test_or_prod == "T":
#         flag_and_path = get_raw_ct_data() # uncomment for production
#         flag_and_path["term_program_flag"] = False
#         run_ETL_mapping(flag_and_path)
#     elif test_or_prod == "P":
#         flag_and_path = get_raw_ct_data() 
#         run_ETL_mapping(flag_and_path)
#     else:
#         print("Bad input")
#         sys.exit(0)
        

        
        

In [None]:
# def run_ETL_mapping(flag_and_path):
#     df_dict = read_raw_ct_data(flag_and_path)
#     ct_terms = exact_match_mesh(df_dict)
#     ct_terms = inexact_match_mesh(df_dict, ct_terms)

#     # pull the available MeSH terms per study out of the returned ct_terms dict 
#     mesh_conditions_per_study = ct_terms["mesh_conditions_per_study"]
#     mesh_interventions_per_study = ct_terms["mesh_interventions_per_study"]

#     ct_terms = term_list_to_nr(df_dict, ct_terms)
#     ct_terms = term_list_to_mm(df_dict, ct_terms)

#     # pull the available UMLS terms per study out of the returned ct_terms dict 
#     all_metamapped_conditions = ct_terms["all_metamapped_conditions"]
#     all_metamapped_interventions = ct_terms["all_metamapped_interventions"]

#     remaining_unmapped_possible = {"mesh_conditions_per_study": mesh_conditions_per_study,
#                                    "mesh_interventions_per_study": mesh_interventions_per_study,
#                                    "all_metamapped_conditions": all_metamapped_conditions,
#                                    "all_metamapped_interventions": all_metamapped_interventions}
#     compile_and_output(df_dict, ct_terms, remaining_unmapped_possible)


    

In [292]:
# flag_and_path = get_raw_ct_data() # uncomment for production
flag_and_path = {'term_program_flag': False,
                 'data_extracted_path': '/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/09_26_2023_extracted',
                 'date_string':'09_26_2023'} # comment for production
metamap_dirs = check_os()
df_dict = read_raw_ct_data(flag_and_path)
term_list_to_mm(df_dict, flag_and_path)
map_to_trial(df_dict, flag_and_path)
score_mappings(flag_and_path)
auto_select_curies(flag_and_path)
# merge_mappings_to_trials()

# # pull the available UMLS terms per study out of the returned ct_terms dict 
# all_metamapped_conditions = ct_terms["all_metamapped_conditions"]
# all_metamapped_interventions = ct_terms["all_metamapped_interventions"]

# remaining_unmapped_possible = {"mesh_conditions_per_study": mesh_conditions_per_study,
#                                "mesh_interventions_per_study": mesh_interventions_per_study,
#                                "all_metamapped_conditions": all_metamapped_conditions,
#                                "all_metamapped_interventions": all_metamapped_interventions}
# compile_and_output(df_dict, ct_terms, remaining_unmapped_possible)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


MetaMap version < 2020, conduct mapping on terms after removing ascii characters
Starting skrmedpostctl: 
started.
Starting wsdserverctl: 
started.
loading properties file /Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/WSD_Server/config/disambServer.cfg
WSD Server initializing disambiguation methods.
WSD Server databases and disambiguation methods have been initialized.
Could not listen on port : 5554 : Address already in use
Stopping skrmedpostctl: 
Stopping Tagger Server process..
Process 29401 stopped
Stopping wsdserverctl: 
Stopping WSD Server process..
Process 29409 stopped


/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/skrmedpostctl: line 50: kill: (29401) - No such process
/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/wsdserverctl: line 55: kill: (29409) - No such process


Using UMLS MetaMap to get mappings for INTERVENTIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.
MetaMap version < 2020, conduct mapping on original interventions after removing ascii characters
Starting skrmedpostctl: 
started.
Starting wsdserverctl: 
started.
loading properties file /Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/WSD_Server/config/disambServer.cfg
WSD Server initializing disambiguation methods.
WSD Server databases and disambiguation methods have been initialized.
Could not listen on port : 5554 : Address already in use
Stopping skrmedpostctl: 
Stopping Tagger Server process..
Process 34826 stopped
Stopping wsdserverctl: 
Stopping WSD Server process..
Process 34828 stopped


/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/skrmedpostctl: line 50: kill: (34826) - No such process
/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/wsdserverctl: line 55: kill: (34828) - No such process


Starting skrmedpostctl: 
started.
Starting wsdserverctl: 
started.
loading properties file /Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/WSD_Server/config/disambServer.cfg
WSD Server initializing disambiguation methods.
WSD Server databases and disambiguation methods have been initialized.
Could not listen on port : 5554 : Address already in use
Stopping skrmedpostctl: 
Stopping Tagger Server process..
Process 39876 stopped
Stopping wsdserverctl: 
Stopping WSD Server process..
Process 39878 stopped


/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/skrmedpostctl: line 50: kill: (39876) - No such process
/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/wsdserverctl: line 55: kill: (39878) - No such process


In [None]:
def convert_seconds_to_hms(seconds):

    """ converts the elapsed or run_time to hours, min, sec """
    hours = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60
    return hours, minutes, seconds

current = dt.datetime.now()
ts = dt.datetime.timestamp(current)
d = dt.datetime.fromtimestamp(ts)
str_date_time = d.strftime("%d-%m-%Y, %H:%M:%S")
print("Timestamp of script start: {}".format(str_date_time))

start_time = time.time()
end_time = time.time()
elapsed_time = end_time - start_time
hours, minutes, seconds = convert_seconds_to_hms(elapsed_time)
print(f"Runtime: {hours} hours, {minutes} minutes, {seconds} seconds")

In [None]:
def report_stats(df_dict, flag_and_path):
    """ Report counts of conditions, interventions"""
    relevant_date = flag_and_path["date_string"] # get date
    
    total_conditions = df_dict["conditions"].downcase_name
    total_conditions = list(total_conditions.unique())
    total_conditions = list(filter(None, total_conditions))
    
    orig_interventions = df_dict["interventions"]
    orig_interventions = orig_interventions['name'].str.lower()
    orig_interventions = list(orig_interventions.unique())
    orig_interventions = list(filter(None, orig_interventions))
    
    alt_interventions = df_dict["interventions_alts"].alt_downcase_name
    alt_interventions = list(alt_interventions.unique())
    alt_interventions = list(filter(None, alt_interventions))
    
#     metamap_input = "{}_metamap_output.tsv".format(relevant_date)
    
#     """ Get the full names of the semantic types and replace the abbreviations with the full names """
#     metamapped = pd.read_csv(metamap_input, sep='\t', index_col=False, header=0)

    print("Clinical Trial Data from: {}".format(relevant_date))
    print("Total # of unique conditions : {}".format(len(total_conditions)))
    print("Total # of unique interventions : {}".format(len(orig_interventions) + len(alt_interventions)))
    

    
