In [324]:
# Notebook display setup: stretch the notebook container and output areas to the full browser width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))

# Lets you print multiple outputs per cell, not just the last expression.
# NOTE: this is a kernel-wide setting and affects every cell that runs after this one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [421]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
import re
import collections
import os
import json
import numpy as np
import pickle
from functools import reduce
import time
from time import sleep
import concurrent
import multiprocessing
import datetime as dt
from datetime import date
import pathlib
import configparser
import sys
import urllib
import zipfile
import csv
sys.path.insert(0, '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/pymetamap-master')
from pymetamap import MetaMap  # https://github.com/AnthonyMRios/pymetamap/blob/master/pymetamap/SubprocessBackend.py
from pandas import ExcelWriter


In [326]:
# %pip install thefuzz
# %pip install levenshtein

from thefuzz import fuzz # fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

In [327]:
# NOTE: `global` statements at module (cell) level have no effect in Python; these
# lines only signal that the names below are meant to be shared notebook-wide.
# `metamap_dirs` is read as a global by parallelize_metamap() and term_list_to_mm()
# and must be assigned before those functions are called.
# NOTE(review): no assignment is visible in this part of the notebook; presumably
# it is `metamap_dirs = check_os()` in a later cell - confirm.
global metamap_dirs
global metamap_pos_server_dir
global metamap_wsd_server_dir


In [328]:
# fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

def get_token_sort_ratio(str1, str2):
    """Return thefuzz's token-sort similarity between two strings.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        Whatever ``fuzz.token_sort_ratio`` returns for the pair, or ``None``
        if the scorer raises for these inputs.
    """
    try:
        return fuzz.token_sort_ratio(str1, str2)
    except Exception:
        # Best-effort scoring: an unscorable pair yields None instead of aborting
        # the batch. Only Exception is caught so that KeyboardInterrupt/SystemExit
        # still stop a long-running cell.
        return None


# Element-wise version for arrays / DataFrame columns.
# NOTE: np.vectorize infers the output dtype from the first result unless
# `otypes` is given, so a mix of numeric and None results may not convert cleanly.
sort_ratio = np.vectorize(get_token_sort_ratio)

def get_token_set_ratio(str1, str2):
    """Return thefuzz's token-set similarity between two strings.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        Whatever ``fuzz.token_set_ratio`` returns for the pair, or ``None``
        if the scorer raises for these inputs.
    """
    try:
        return fuzz.token_set_ratio(str1, str2)
    except Exception:
        # Best-effort scoring; KeyboardInterrupt/SystemExit are deliberately not caught.
        return None


# Element-wise version for arrays / DataFrame columns (output dtype is inferred
# from the first result because no `otypes` is given).
set_ratio = np.vectorize(get_token_set_ratio)

def get_similarity_score(str1, str2):
    """Return thefuzz's plain similarity ratio between two strings.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        Whatever ``fuzz.ratio`` returns for the pair, or ``None`` if the
        scorer raises for these inputs.
    """
    try:
        return fuzz.ratio(str1, str2)
    except Exception:
        # Best-effort scoring; KeyboardInterrupt/SystemExit are deliberately not caught.
        return None


# Element-wise version for arrays / DataFrame columns (output dtype is inferred
# from the first result because no `otypes` is given).
sim_score = np.vectorize(get_similarity_score)

In [329]:
def get_raw_ct_data():
    """Download and unzip the newest AACT pipe-delimited export if it is not already on disk.

    The AACT "pipe_files" page is scraped for dated zip links; the newest one is
    downloaded into ``<cwd>/data`` and extracted into ``<cwd>/data/<mm_dd_YYYY>_extracted``.

    Side effects:
        Sets the module-level globals ``data_dir`` and ``data_extracted``, may create
        ``<cwd>/data``, and may write the zip plus its extracted contents there.

    Returns:
        dict with keys
        ``term_program_flag`` (True when the newest zip was already present, i.e.
        nothing was downloaded), ``data_extracted_path`` and ``date_string``
        (upload date formatted ``mm_dd_YYYY``).

    Raises:
        requests.HTTPError: if the index page or the zip download returns an error status.
        ValueError: if no dated download link can be found on the index page.
    """
    global data_dir
    global data_extracted

    # Collect {upload date: download link} for every dated link on the index page.
    url_all = "https://aact.ctti-clinicaltrials.org/pipe_files"
    response = requests.get(url_all)
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which optional
    # parsers happen to be installed (and to avoid bs4's "no parser specified" warning).
    soup = BeautifulSoup(response.text, "html.parser")
    date_link = {}
    for option in soup.find_all('option'):
        anchor = option.find('a')
        if anchor is None:
            continue  # option without a link
        try:
            # Link text starts with "<YYYYMMDD>_..."; the leading token carries the upload date.
            zip_name = anchor.contents[0].split()[0]
            upload_date = dt.datetime.strptime(zip_name.split("_")[0], '%Y%m%d').date()
        except (AttributeError, IndexError, TypeError, ValueError):
            continue  # link text is not a dated export name
        date_link[upload_date] = anchor.get('href')

    if not date_link:
        raise ValueError("No dated pipe-file download links found at {}".format(url_all))

    latest_file_date = max(date_link)   # date of the latest upload
    url = date_link[latest_file_date]   # download link of the latest upload
    date_string = latest_file_date.strftime("%m_%d_%Y")
    data_dir = "{}/data".format(pathlib.Path.cwd())
    data_extracted = data_dir + "/{}_extracted".format(date_string)
    data_path = "{}/{}_pipe-delimited-export.zip".format(data_dir, date_string)

    if os.path.exists(data_path):
        # Newest export is already on disk: signal the caller that the KG is assumed current.
        term_program_flag = True
        print("KG is already up to date.")
    else:
        term_program_flag = False
        os.makedirs(data_dir, exist_ok=True)  # first run: the data folder may not exist yet
        print("Downloading Clinical Trial data as of {}".format(date_string))
        response = requests.get(url)
        # Fail loudly on a bad download instead of continuing with no data on disk.
        response.raise_for_status()
        with open(data_path, 'wb') as file:
            file.write(response.content)
        print("Finished download of zip")
        with zipfile.ZipFile(data_path, 'r') as download:
            print("Unzipping data")
            download.extractall(data_extracted)

    return {"term_program_flag": term_program_flag, "data_extracted_path": data_extracted, "date_string": date_string}



In [330]:
def read_raw_ct_data(flag_and_path, sample_size=300):
    """Load the AACT conditions / interventions tables from the extracted export.

    Args:
        flag_and_path: dict as returned by ``get_raw_ct_data``; the keys
            ``term_program_flag`` and ``data_extracted_path`` are read.
        sample_size: number of random rows kept from the conditions and the
            interventions tables (testing shortcut; the alternate-names table is
            never sampled). Tables with fewer rows are kept whole. Pass ``None``
            to use the full tables.

    Returns:
        ``None`` when ``term_program_flag`` is truthy (the KG is assumed to be up
        to date, so nothing is loaded); otherwise a dict with the DataFrames
        ``conditions``, ``interventions`` and ``interventions_alts``.
    """
    if flag_and_path["term_program_flag"]:
        print("Exiting program. Assuming KG has already been constructed from most recent data dump from AACT.")
        # Nothing was loaded; returning here avoids referencing unassigned locals below.
        return None

    data_extracted = flag_and_path["data_extracted_path"]

    def _read_table(file_name):
        # All AACT export files are pipe-delimited with a header row.
        return pd.read_csv(data_extracted + '/' + file_name, sep='|', index_col=False, header=0)

    conditions_df = _read_table('conditions.txt')
    interventions_df = _read_table('interventions.txt')
    interventions_alts = _read_table('intervention_other_names.txt')

    # Testing shortcut: work on a random subset. min() keeps small tables from
    # raising "Cannot take a larger sample than population".
    if sample_size is not None:
        conditions_df = conditions_df.sample(n=min(sample_size, len(conditions_df)))
        interventions_df = interventions_df.sample(n=min(sample_size, len(interventions_df)))

    return {"conditions": conditions_df, "interventions": interventions_df, "interventions_alts": interventions_alts}



In [331]:
def de_ascii_er(text):
    """Replace every non-ASCII character in ``text`` with a single space.

    Args:
        text: String to clean. Non-string values (e.g. ``None`` or a NaN cell
            from a DataFrame) are returned unchanged instead of raising.

    Returns:
        A string of the same length containing only ASCII characters, or the
        original value when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return text
    # [^\x00-\x7F] matches any character outside the 7-bit ASCII range.
    return re.sub(r"[^\x00-\x7F]", ' ', text)

In [332]:
def start_metamap_servers(metamap_dirs):
    """Start MetaMap's two helper servers and wait for them to come up.

    Args:
        metamap_dirs: dict whose ``metamap_base_dir`` entry is the MetaMap
            install directory (with trailing slash); the control-script paths
            are appended to it directly.
    """
    control_scripts = (
        'bin/skrmedpostctl',  # Part of speech tagger
        'bin/wsdserverctl',   # Word sense disambiguation
    )
    for script in control_scripts:
        os.system(metamap_dirs['metamap_base_dir'] + script + ' start')
    # Sleep a bit to give time for these servers to start up
    sleep(5)

def stop_metamap_servers(metamap_dirs):
    """Stop MetaMap's two helper servers.

    Args:
        metamap_dirs: dict whose ``metamap_base_dir`` entry is the MetaMap
            install directory (with trailing slash); the control-script paths
            are appended to it directly.
    """
    control_scripts = (
        'bin/skrmedpostctl',  # Part of speech tagger
        'bin/wsdserverctl',   # Word sense disambiguation
    )
    for script in control_scripts:
        os.system(metamap_dirs['metamap_base_dir'] + script + ' stop')
        

In [333]:
def check_os():
    """Pick the MetaMap install location and binary for the current platform.

    Returns:
        dict with ``metamap_base_dir`` (install directory, trailing slash) and
        ``metamap_bin_dir`` (path of the MetaMap binary relative to it). On Linux
        the install is expected in a ``metamap`` folder next to the current
        working directory; anywhere else a fixed local path is used.
    """
    if "linux" not in sys.platform:
        # for running on local
        return {
            "metamap_base_dir": '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/',
            "metamap_bin_dir": 'bin/metamap18',
        }

    print("Linux platform detected")
    parent_of_cwd = pathlib.Path.cwd().parents[0]
    return {
        "metamap_base_dir": "{}/metamap/".format(parent_of_cwd),
        "metamap_bin_dir": 'bin/metamap20',
    }
        

In [235]:
# def run_metamap(input_term, params, mm, cond_or_inter, csv_writer):
#     from_metamap = []
#     try:
#         concepts,error = mm.extract_concepts([input_term],
# #                                              restrict_to_sts = params["restrict_to_sts"],
#                                              term_processing = params["term_processing"],
#                                              ignore_word_order = params["ignore_word_order"],
#                                              strict_model = params["strict_model"],
#                                              exclude_sts = params["exclude_sts"]
#                                             )

#         for concept in concepts:
#             concept_info = []
#             concept = concept._asdict()
#             concept_info.extend([cond_or_inter,input_term])
#             concept_info.extend([concept.get(k) for k in ['preferred_name', 'cui', 'score', 'semtypes']])
#             from_metamap.append(concept_info)
#     except:
#         from_metamap.extend([input_term, None, None, None, None, None, None])
#     for result in from_metamap:
#         print(result)
#         csv_writer.writerow(result)
#     return from_metamap

In [334]:
def run_metamap(input_term, params, mm, cond_or_inter, csv_writer):
    """Map one term with MetaMap and write one TSV row per returned concept.

    Args:
        input_term: The term sent to MetaMap.
        params: MetaMap options. ``term_processing``, ``ignore_word_order`` and
            ``strict_model`` are always passed on. If ``exclude_sts`` is present
            and not None it is passed on (interventions); otherwise
            ``restrict_to_sts`` is passed on (conditions).
        mm: Object exposing ``extract_concepts(sentences, **options)`` that
            returns ``(concepts, error)``.
        cond_or_inter: Label stored in the first column of every row.
        csv_writer: Object with a ``writerow`` method receiving each row.

    Returns:
        List of rows ``[cond_or_inter, input_term, preferred_name, cui, score,
        semtypes]``. If the MetaMap call or result handling raises, a single
        placeholder row with ``None`` in the four concept fields is added so the
        term still shows up in the output.
    """
    concept_fields = ['preferred_name', 'cui', 'score', 'semtypes']
    from_metamap = []
    try:
        mm_options = {
            "term_processing": params["term_processing"],
            "ignore_word_order": params["ignore_word_order"],
            "strict_model": params["strict_model"],
        }
        # exclude_sts is used for Interventions, restrict_to_sts for Conditions.
        if params.get("exclude_sts") is None:
            mm_options["restrict_to_sts"] = params["restrict_to_sts"]
        else:
            mm_options["exclude_sts"] = params["exclude_sts"]

        concepts, _error = mm.extract_concepts([input_term], **mm_options)
        for concept in concepts:
            concept = concept._asdict()
            from_metamap.append([cond_or_inter, input_term] + [concept.get(field) for field in concept_fields])
    except Exception:
        # Best effort: record the failed term as ONE well-formed row. (Appending a
        # row - not extending with scalars - keeps every entry a list, so writerow
        # below never receives a bare string or None.)
        from_metamap.append([cond_or_inter, input_term] + [None] * len(concept_fields))

    for result in from_metamap:
        csv_writer.writerow(result)
    return from_metamap

In [59]:
# this cell tests if MetaMap is working on the term list below
# terms = ['infarction, myocardial', 'aneurysm', 'diabetes', 'common cold', 'fracture', 'juice blend', "hormones"]

# condition_semantic_type_restriction = ['acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,clna,fndg']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
# params = {"restrict_to_sts":condition_semantic_type_restriction, "term_processing":True, "ignore_word_order":True, "strict_model":False}
# start_metamap_servers(metamap_dirs) # start the MetaMap servers
# mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
# cond_or_inter = "condition"

# # prep file that stores MetaMap output
# col_names = ['term_type', 'clin_trial_term','metamap_preferred_name', 'metamap_cui', 'metamap_score', 'metamap_semantic_type']
# # metamap_output = open("metamap_output().tsv".format(flag_and_path["date_string"]), 'w+', newline='') 
# metamap_output = open("metamap_output.tsv", 'w+', newline='') 
# csv_writer = csv.writer(metamap_output, delimiter='\t')
# csv_writer.writerow(col_names)

# for term in terms:
# #     test = run_metamap(term, params, mm, cond_or_inter)
#     run_metamap(term, params, mm, cond_or_inter, csv_writer)
# metamap_output.close()    
# stop_metamap_servers(metamap_dirs) # stop the MetaMap servers


In [335]:
def parallelize_metamap(term_list, params, cond_or_inter, flag_and_path, csv_writer):
    """Run ``run_metamap`` over every term in a thread pool, with the MetaMap servers up.

    The MetaMap helper servers are started before the work and are always
    stopped afterwards, even if something raises in between.

    Args:
        term_list: Terms to map.
        params: MetaMap options, passed through to ``run_metamap``.
        cond_or_inter: Label passed through to ``run_metamap``.
        flag_and_path: Unused here; kept so existing callers keep working.
        csv_writer: Writer shared by all worker threads; each worker writes its
            own rows through it.
    """
    # The submodule must be imported explicitly; a bare `import concurrent`
    # is not guaranteed to make `concurrent.futures` available.
    import concurrent.futures

    start_metamap_servers(metamap_dirs) # start the MetaMap servers
    try:
        mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
        max_workers = (multiprocessing.cpu_count() * 2) - 1
        with concurrent.futures.ThreadPoolExecutor(max_workers) as executor:
            futures = [executor.submit(run_metamap, term, params, mm, cond_or_inter, csv_writer) for term in term_list]
        # Leaving the `with` block waits for all tasks. Exceptions raised inside a
        # worker are stored on its future, so report them instead of losing them.
        errors = [future.exception() for future in futures if future.exception() is not None]
        if errors:
            print("{} of {} MetaMap tasks for '{}' raised an exception; first error: {!r}".format(
                len(errors), len(futures), cond_or_inter, errors[0]))
    finally:
        stop_metamap_servers(metamap_dirs) # stop the MetaMap servers
    


# USE METAMAP LOCAL TO MAP REMAINING TERMS

In [336]:
def _split_parenthetical(frame, source_col):
    """Add ``<source_col>_outside`` / ``<source_col>_inside`` columns to ``frame`` in place.

    Some input terms carry extra text in parentheses (e.g. an abbreviation).
    The ``_inside`` column holds the text of the first parenthesised group; the
    ``_outside`` column holds the first capture of the "outside" pattern (the text
    before the first "("). Rows without a capture get ''.
    """
    pattern_outside = r'(?<=\().+?(?=\))|([^(]+)'
    pattern_inside = r'\(([^)]+)\)'
    frame[source_col + '_outside'] = frame[source_col].str.extract(pattern_outside)[0].fillna('')
    frame[source_col + '_inside'] = frame[source_col].str.extract(pattern_inside)[0].fillna('')


def _blank_cross_column_duplicates(frame):
    """Blank cells that repeat another column's value in the same row, then drop duplicate columns.

    For every ordered pair of distinct columns (col1, col2), a cell of col2 that
    equals the col1 cell of the same row is replaced with None. Afterwards columns
    whose values are identical across all rows are dropped, keeping the first.
    ``frame`` is modified in place; the de-duplicated frame is returned.
    """
    for col1 in frame.columns:
        for col2 in frame.columns:
            # Skip comparing a column with itself
            if col1 != col2:
                frame[col2] = [kept if kept != reference else None
                               for reference, kept in zip(frame[col1], frame[col2])]
    # Drop duplicate columns (keeping the first instance)
    return frame.T.drop_duplicates().T


def _prepare_condition_terms(conditions_df, use_original_terms):
    """Build the frame of unique condition terms and their parenthetical splits.

    Missing and empty names are dropped. For MetaMap < 2020 a ``deasciied_con``
    column (non-ASCII characters replaced by spaces) is added and split instead
    of the original term.
    """
    unmapped_conditions = [term for term in conditions_df.downcase_name.dropna().unique() if term]
    conditions = pd.DataFrame({'orig_con': unmapped_conditions})
    if use_original_terms:
        _split_parenthetical(conditions, 'orig_con')
    else:
        # Metamap 2020 does not need de_asciier. Metamap 2018 and prior does.
        conditions['deasciied_con'] = [de_ascii_er(term) for term in unmapped_conditions]
        _split_parenthetical(conditions, 'deasciied_con')
    return conditions


def _prepare_intervention_terms(df_dict, use_original_terms):
    """Build the frame of intervention terms (original and alternate names) to map.

    Interventions need special handling: a second table gives alternate names for
    each intervention. Both name sets are lower-cased, control/non-specific
    interventions are removed, parenthetical substrings are split out, and every
    string shorter than 4 characters is blanked (too ambiguous to map).

    NOTE: adds the columns ``orig_downcase_name`` / ``alt_downcase_name`` to the
    input frames in ``df_dict`` (existing behaviour, kept for compatibility).
    NOTE(review): ``orig_int`` and ``alt_int`` are built from independently
    de-duplicated lists, so values in the same row are not guaranteed to belong
    to the same intervention - confirm whether row alignment is required downstream.
    """
    from itertools import zip_longest

    interventions_df = df_dict["interventions"]
    interventions_df['orig_downcase_name'] = interventions_df['name'].str.lower()
    interventions_alts = df_dict["interventions_alts"]
    interventions_alts['alt_downcase_name'] = interventions_alts['name'].str.lower()

    interventions_all = (
        pd.merge(interventions_df[["id", "nct_id", "intervention_type", "orig_downcase_name", "description"]],
                 interventions_alts[["nct_id", "intervention_id", "alt_downcase_name"]],
                 how='left', left_on=['id'], right_on=['intervention_id'])
        .astype(str)
        .drop('nct_id_y', axis=1)                      # drop the redundant column
        .rename(columns={'nct_id_x': 'nct_id'})
        .sort_values(by='nct_id', ascending=False, na_position='last')
        .drop('intervention_id', axis=1)               # drop the redundant column
        .rename(columns={'id': 'intervention_id'})
    )

    # Remove any placebo/saline/water/etc. rows (patterns are regular expressions).
    excluded_patterns = ['placebo', 'saline', 'water', 'sham', 'standard of care',
                         'pharmacological study', 'laboratory biomarker analysis', r'^survey$']
    for pattern in excluded_patterns:
        for name_col in ('orig_downcase_name', 'alt_downcase_name'):
            interventions_all = interventions_all[~interventions_all[name_col].str.contains(pattern)]

    original_int = pd.Series(interventions_all.orig_downcase_name.unique()).dropna().tolist()
    alternate_int = pd.Series(interventions_all.alt_downcase_name.unique()).dropna().tolist()

    if use_original_terms:
        column_names = ['orig_int', 'alt_int']
        column_values = [original_int, alternate_int]
        split_cols = ['orig_int', 'alt_int']
    else:
        column_names = ['orig_int', 'deascii_orig_int', 'alt_int', 'deascii_alt_int']
        column_values = [original_int, [de_ascii_er(term) for term in original_int],
                         alternate_int, [de_ascii_er(term) for term in alternate_int]]
        split_cols = ['deascii_orig_int', 'deascii_alt_int']

    # zip_longest (not zip): the two name lists usually differ in length and zip
    # would silently drop every term beyond the shorter list. Padding is None.
    interventions_mapping = pd.DataFrame(list(zip_longest(*column_values)), columns=column_names)
    for col in split_cols:
        _split_parenthetical(interventions_mapping, col)

    # Strings < 4 characters are ambiguous; replace them with None so they are
    # neither mapped nor used for comparison. Non-string cells are left as they are.
    for col in interventions_mapping.columns:
        interventions_mapping[col] = [None if isinstance(value, str) and len(value) < 4 else value
                                      for value in interventions_mapping[col]]
    return interventions_mapping


def term_list_to_mm(df_dict, flag_and_path):
    """Map condition and intervention terms with a local MetaMap and write the results to TSV.

    Args:
        df_dict: dict with the DataFrames ``conditions`` (needs ``downcase_name``),
            ``interventions`` and ``interventions_alts`` as produced by
            ``read_raw_ct_data``.
        flag_and_path: dict with ``date_string`` (used in output file names); it
            is also passed through to ``parallelize_metamap``.

    Side effects:
        Writes ``metamap_output_<date>.tsv`` (one row per MetaMap concept),
        ``conditions_<date>.tsv`` and ``interventions_<date>.tsv`` into the
        current working directory. Reads the global ``metamap_dirs``.

    Returns:
        File name of the MetaMap output TSV.
    """
    # MetaMap release parsed from the binary name, e.g. 'bin/metamap20' -> 20
    metamap_version = [int(s) for s in re.findall(r'\d+', metamap_dirs.get('metamap_bin_dir'))]
    # MetaMap 2020+ accepts the original terms; older releases need non-ASCII characters removed.
    use_original_terms = metamap_version[0] >= 20
    date_string = flag_and_path["date_string"]   # date of bulk download of clinical trial data

    # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
    condition_semantic_type_restriction = ['acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,clna,fndg']
    # strict_model=False is used because relaxed_model=True appeared to return no results in pymetamap.
    condition_params = {"restrict_to_sts": condition_semantic_type_restriction, "term_processing": True, "ignore_word_order": True, "strict_model": False}
    # Interventions use the inverse: the condition semantic types are excluded.
    intervention_params = {"exclude_sts": condition_semantic_type_restriction, "term_processing": True, "ignore_word_order": True, "strict_model": False}

    # -------    CONDITIONS    ------- #
    print("Using UMLS MetaMap to get mappings for CONDITIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.")
    conditions = _prepare_condition_terms(df_dict["conditions"], use_original_terms)

    filename = f"metamap_output_{date_string}.tsv"
    col_names = ['term_type', 'clin_trial_term', 'metamap_preferred_name', 'metamap_cui', 'metamap_score', 'metamap_semantic_type']

    # The context manager guarantees the MetaMap output is flushed and closed,
    # also when a mapping step raises.
    with open(filename, 'w+', newline='') as metamap_output:
        csv_writer = csv.writer(metamap_output, delimiter='\t')
        csv_writer.writerow(col_names)

        if use_original_terms:
            print("MetaMap version >= 2020, conduct mapping on original terms")
            condition_term_col = "orig_con"
            condition_out_cols = ['orig_con', 'orig_con_outside', 'orig_con_inside']
        else:
            print("MetaMap version < 2020, conduct mapping on terms after removing ascii characters")
            condition_term_col = "deasciied_con"
            condition_out_cols = ['orig_con', 'deasciied_con', 'deasciied_con_outside', 'deasciied_con_inside']
        parallelize_metamap(conditions[condition_term_col].tolist(), condition_params, "condition", flag_and_path, csv_writer)

        # Blank substrings that merely repeat the term they came from, then write
        # the conditions to TSV. reindex keeps the output schema stable even if a
        # column was dropped as an exact duplicate of another.
        conditions = _blank_cross_column_duplicates(conditions)
        conditions.reindex(columns=condition_out_cols).to_csv('conditions_{}.tsv'.format(date_string), sep="\t", index=False, header=True)

        # -------    INTERVENTIONS    ------- #
        print("Using UMLS MetaMap to get mappings for INTERVENTIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.")
        interventions = _prepare_intervention_terms(df_dict, use_original_terms)

        # The output file is still open, so intervention rows are appended to it.
        if use_original_terms:
            print("MetaMap version >= 2020, conduct mapping on original terms")
            original_col, alternate_col = "orig_int", "alt_int"
        else:
            print("MetaMap version < 2020, conduct mapping on terms after removing ascii characters")
            original_col, alternate_col = "deascii_orig_int", "deascii_alt_int"
        for term_col, term_type in ((original_col, "intervention"), (alternate_col, "alternate_intervention")):
            # Only real strings are sent to MetaMap; None marks blanked/padded cells.
            terms = [term for term in interventions[term_col].tolist() if isinstance(term, str)]
            parallelize_metamap(terms, intervention_params, term_type, flag_and_path, csv_writer)

    interventions = _blank_cross_column_duplicates(interventions)
    interventions.to_csv('interventions_{}.tsv'.format(date_string), sep="\t", index=False, header=True)

    return filename
        
        
        
        
  








    # # print(len(test))
    # with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    #     display(ascii_ints)

          
#     # -------    INTERVENTIONS    ------- #
#     print("Using UMLS MetaMap to get more mappings for interventions. MetaMap returns mappings, CUIs, and semantic type of mapping.")
#     unmapped_interventions = ct_terms["unmapped_interventions"]
#     interventions_unmapped_chunked = split_list_by_char_lim(unmapped_interventions)
#     # see MetaMap Usage instructions: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MM_2016_Usage.pdf
#     # removing sosy semantic type (sign or symptom) - often get MetaMap matches to the sign or symptom instead of the full disease...for example, will get back "exercise-induced" instead of "immune dysfunction" for "exercise-induced immune dysfunction" bc it matches the descriptive quality "exercise-induced" is matched on 
#     intervention_args = ['--sldi -I -C -k acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,sosy -z -i -f']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.") (I used inverse of semantic terms picked for conditions here)
#     mm_interventions = run_parallel_threads_mm(interventions_unmapped_chunked, intervention_args)
#     flattened_mm_interventions = {key: [item for sublist in value for item in sublist] for key, value in mm_interventions.items()}
#     mm_interventions_df = pd.DataFrame({"intervention_input": list(flattened_mm_interventions.keys()),
#                                         "intervention_CURIE_id": [value[0] for value in flattened_mm_interventions.values()],
#                                         "intervention_CURIE_name": [value[1] for value in flattened_mm_interventions.values()],
#                                         "intervention_semantic_type": [value[-1] for value in flattened_mm_interventions.values()],
#                                         "source": "MetaMap via UMLS, term and CURIE"})

#     mm_interventions_df[['intervention_CURIE_name_1', 'intervention_CURIE_name_2']] = mm_interventions_df['intervention_CURIE_name'].str.extract(r'^(.*?)\s*\((.*?)\)$').fillna('NA') # 

#     sort_ratio = np.vectorize(get_token_sort_ratio)
#     set_ratio = np.vectorize(get_token_set_ratio)
#     sim_score = np.vectorize(get_similarity_score)

#     # many MetaMap terms are returned as "term (term)". For example, "Nonessential Amino Acid (Nonessential amino acid)". This repetition messes up the sort ratio and sim score, so we extract the substrings out of the parenthesis to conduct scoring on those
#     mm_interventions_scored = mm_interventions_df.copy()
#     mm_interventions_scored["sort_ratio"] = sort_ratio(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name"]].values) # generate fuzzy scores based between original and MeSH term
#     mm_interventions_scored["sim_score"] = sim_score(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name"]].values)

#     mm_interventions_scored["sort_ratio_1"] = sort_ratio(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name_1"]].values) # generate fuzzy scores based between original and MetaMap term
#     mm_interventions_scored["sim_score_1"] = sim_score(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name_1"]].values)

#     mm_interventions_scored["sort_ratio_2"] = sort_ratio(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name_2"]].values) # generate fuzzy scores based between original and MetaMap term
#     mm_interventions_scored["sim_score_2"] = sim_score(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name_2"]].values)

#     mm_interventions_scored_thresholded = mm_interventions_scored.copy() 
#     mm_interventions_scored_thresholded = mm_interventions_scored_thresholded[(mm_interventions_scored_thresholded['sim_score'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sort_ratio'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sim_score_1'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sort_ratio_1'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sim_score_2'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sort_ratio_2'] > 88)]
    
#     print("Number of unique interventions that are mapped after using MetaMap and similarity and ratio score thresholds of 88: {}".format(mm_interventions_scored_thresholded.shape[0]))

#     mm_interventions_scored_thresholded = mm_interventions_scored_thresholded.drop(['intervention_CURIE_name_1',
#                                                                                     'intervention_CURIE_name_2',
#                                                                                     'sort_ratio',
#                                                                                     'sim_score',
#                                                                                     'sort_ratio_1',
#                                                                                     'sim_score_1',
#                                                                                     'sort_ratio_2',
#                                                                                     'sim_score_2'], axis=1)
#     previously_mapped = ct_terms["mapped_interventions"]
#     combined_mapped_interventions = pd.concat([previously_mapped, mm_interventions_scored_thresholded], ignore_index=True) # get dataframe of combined previously mapped interventions and additional MetaMapped interventions that passed threshold scoring
#     interventions = df_dict["interventions"]
#     all_interventions_list = interventions["downcase_name"].values.tolist()
#     all_interventions_list = list(set(all_interventions_list))
#     unmapped_interventions = list(set(all_interventions_list)-set(list(combined_mapped_interventions.intervention_input.values)))
#     print("Number of unique interventions that are unmapped after using MetaMap and similarity and ratio score thresholds of 88: {}".format(len(unmapped_interventions)))
#     ct_terms = {'mapped_conditions': combined_mapped_conditions,
#                 'unmapped_conditions': unmapped_conditions,
#                 'mapped_interventions': combined_mapped_interventions,
#                 'unmapped_interventions': unmapped_interventions,
#                 'all_metamapped_conditions': mm_conditions_df,
#                 'all_metamapped_interventions': mm_interventions_df}


#     return ct_terms


In [258]:
def merge_and_score_mappings(flag_and_path):
    """Load the MetaMap output table for one run date.

    Only the loading step is implemented so far; no merging or scoring
    happens inside this function yet.

    Parameters
    ----------
    flag_and_path : dict
        Must contain the key "date_string". Its value is used as the prefix
        of the file that is read: "<date_string>_metamap_output.tsv",
        resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The tab-separated MetaMap output, first line used as the header.

    Raises
    ------
    KeyError
        If "date_string" is missing from ``flag_and_path``.
    FileNotFoundError
        If the MetaMap output file for that date does not exist.
    """
    relevant_date = flag_and_path["date_string"]
    metamap_input = "{}_metamap_output.tsv".format(relevant_date)

    # index_col=False keeps pandas from promoting the first column to the index
    metamapped = pd.read_csv(metamap_input, sep='\t', index_col=False, header=0)
    return metamapped

#     with open(metamap_output, "r") as input_file, open(output_file_name, "w") as output_file:
#     with open(metamap_output, "r+") as metamap_results:
#         metamap_results = metamap_results.readlines()[1:]
#         for line in metamap_results:
#             mm_info = line.strip().split("\t")
            

# merge_and_score_mappings("metamap_output_08_21_2023.tsv")
# merge_and_score_mappings looks up flag_and_path["date_string"], so it needs a dict;
# passing the bare filename string raised "string indices must be integers".
# This dict resolves to the same file: 08_21_2023_metamap_output.tsv
merge_and_score_mappings({"date_string": "08_21_2023"})

['condition', 'stress reaction', 'Stress Fractures', 'C0016664', '10.02', '[inpo]']
['condition', 'anxiety', 'Anxiety', 'C0003467', '9.95', '[mobd]']
['condition', 'anxiety', 'Anxiety Disorders', 'C0003469', '6.79', '[mobd]']
['condition', 'anxiety', 'Anxiety, CTCAE 5.0', 'C4552735', '3.64', '[fndg]']
['condition', 'stroke', 'Cerebrovascular accident', 'C0038454', '16.10', '[dsyn]']
['condition', 'stroke', 'Stroke, CTCAE', 'C4554100', '3.48', '[fndg]']
['condition', 'incisional hernia', 'Incisional hernia', 'C0267716', '13.18', '[patf]']
['condition', 'hepatobiliary neoplasm', 'Hepatobiliary Neoplasm', 'C0854196', '3.72', '[neop]']
['condition', 'prediabetes', 'Prediabetes syndrome', 'C0362046', '16.26', '[dsyn]']
['condition', "burkitt's lymphoma", 'Burkitt Lymphoma', 'C0006413', '22.64', '[neop]']
['condition', "burkitt's lymphoma", 'Adult Burkitt Lymphoma', 'C0278764', '3.72', '[neop]']
['condition', "burkitt's lymphoma", 'Childhood Burkitt Lymphoma', 'C0278879', '3.72', '[neop]']
[

In [443]:
relevant_date = "08_21_2023"
# Regexes applied to clin_trial_term further down.
# pattern_outside: its single capture group is only filled by the second alternative
# (a run of characters containing no "("), so a term that starts with "(" yields no capture.
# pattern_inside: captures the text between the first "(" ... ")" pair.
pattern_outside = r'(?<=\().+?(?=\))|([^(]+)'
pattern_inside = r'\(([^)]+)\)'

# sort_ratio / sim_score are the np.vectorize wrappers already created in the cell that
# defines get_token_sort_ratio / get_similarity_score, so they are not rebuilt here.

metamap_input = "{}_metamap_output.tsv".format(relevant_date)
metamapped = pd.read_csv(metamap_input, sep='\t', index_col=False, header=0)

# Semantic type lookup table: pipe-delimited, no header row -> abbreviation | group | full name
sem_type_col_names = ["abbv", "group", "semantic_type_full"]
metamap_semantic_types = pd.read_csv("MetaMap_SemanticTypes_2018AB.txt", sep="|", index_col=False, header=None, names=sem_type_col_names)
sem_type_dict = dict(zip(metamap_semantic_types['abbv'], metamap_semantic_types['semantic_type_full'])) # make a dict of semantic type abbv and full name

# get the full names of the semantic types so we know what we're looking at:
# strip the surrounding [ ], split the comma-separated abbreviations, swap each for its
# full name (unknown abbreviations are kept as-is) and re-join with "|".
# Non-string cells (NaN / None) become NaN so the dropna below removes those rows.
metamapped['metamap_semantic_type'] = (
    metamapped['metamap_semantic_type']
    .str.replace(r'\[|\]', '', regex=True)
    .apply(lambda raw: '|'.join(sem_type_dict.get(abbv, abbv) for abbv in raw.split(',')) if isinstance(raw, str) else np.nan)
)

metamapped['metamap_preferred_name'] = metamapped['metamap_preferred_name'].str.lower()
# dropna runs before the column selection, so a missing value in ANY column of the TSV drops the row
metamapped = metamapped.dropna(axis=0)
# .copy() so the column assignments below write to an independent frame rather than a slice
metamapped = metamapped[["clin_trial_term", "metamap_cui","metamap_preferred_name", "metamap_semantic_type"]].copy()

matches_outside = metamapped['clin_trial_term'].str.extract(pattern_outside)
metamapped['clin_trial_term_outside_par'] = matches_outside[0].fillna('')
matches_inside = metamapped['clin_trial_term'].str.extract(pattern_inside)
metamapped['clin_trial_term_inside_par'] = matches_inside[0].fillna('')

metamapped = metamapped[['clin_trial_term', 'clin_trial_term_outside_par', 'clin_trial_term_inside_par', 'metamap_cui', 'metamap_preferred_name', 'metamap_semantic_type']].copy() # re-order columns of df

# Score three variants of the clinical-trial term against the MetaMap preferred name:
#   orig    -> the full clin_trial_term
#   outside -> clin_trial_term_outside_par
#   inside  -> clin_trial_term_inside_par
# Each variant gets a sort_ratio_<suffix> and a sim_score_<suffix> column, in that order.
score_inputs = {"orig": "clin_trial_term",
                "outside": "clin_trial_term_outside_par",
                "inside": "clin_trial_term_inside_par"}
preferred_names = metamapped["metamap_preferred_name"].values
for suffix, term_col in score_inputs.items():
    terms = metamapped[term_col].values
    metamapped["sort_ratio_" + suffix] = sort_ratio(terms, preferred_names)
    metamapped["sim_score_" + suffix] = sim_score(terms, preferred_names)













# Manual-curation view: one row per (clinical-trial term, MetaMap candidate).
# The CUI, preferred name and semantic type are flattened into a single comma-joined
# string because list values cannot be used as a level of a pandas MultiIndex.
# Both fields end up in the index, leaving a frame with no data columns.
metamapped_manual_curation = (
    metamapped[["clin_trial_term"]]
    .assign(metamap_term_info=[
        ','.join(map(str, info))
        for info in metamapped[["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]].values.tolist()
    ])
    .set_index(["clin_trial_term", "metamap_term_info"])
)






# metamapped_manual_curation.drop(["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"], axis = 1, inplace = True)
# metamapped_manual_curation = metamapped_manual_curation.groupby('clin_trial_term')['metamap_term_info'].agg(list).reset_index()

# use Multiindexing to see lists of CURIEs available for single term
# Explode the column of lists of lists
# metamapped_manual_curation = metamapped_manual_curation.explode('metamap_term_info')

# Reset the index if needed
# metamapped_manual_curation.reset_index(drop=True, inplace=True)








# metamapped_manual_curation = metamapped_manual_curation.groupby('clin_trial_term')['metamap_term_info'].agg(list).reset_index()

# metamapped_con['max_score'] = metamapped_con[['sort_ratio', 'sim_score']].max(axis=1)
# metamapped_con = metamapped_con.sort_values('max_score').drop_duplicates('clin_trial_term', keep='first')


# metamapped_con["metamap_term_info"] = metamapped_con[["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"]].values.tolist() 
# metamapped_con.drop(["metamap_cui", "metamap_preferred_name", "metamap_semantic_type"], axis = 1, inplace = True)
# metamapped_con = metamapped_con.groupby('clin_trial_term')['metamap_term_info'].agg(list).reset_index()


# Lift the row, column and cell-width display limits for this one display call only,
# so every candidate mapping is visible for manual review.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', None,
):
    display(metamapped_manual_curation)

clin_trial_term,metamap_term_info
coronary disease,"C1956346,coronary artery disease,Disease or Syndrome"
coronary disease,"C0010068,coronary heart disease,Disease or Syndrome"
spermatocele,"C0037859,spermatocele,Disease or Syndrome"
parkinson disease,"C0030567,parkinson disease,Disease or Syndrome"
atrial fibrillation,"C0004238,atrial fibrillation,Disease or Syndrome"
atrial fibrillation,"C1963067,atrial fibrillation, ctcae,Finding"
recurrent t acute lymphoblastic leukemia,"C1961099,precursor t-cell lymphoblastic leukemia-lymphoma,Neoplastic Process"
recurrent t acute lymphoblastic leukemia,"C0279592,adult t acute lymphoblastic leukemia,Neoplastic Process"
recurrent t acute lymphoblastic leukemia,"C0279583,childhood t acute lymphoblastic leukemia,Neoplastic Process"
alcoholic liver disease,"C0023896,alcoholic liver diseases,Disease or Syndrome"


In [424]:
# Custom function to format the lists of lists
def format_lists_of_lists(cell):
    """Render a list of lists as multi-line text for a spreadsheet cell.

    Each inner list becomes one line with its items joined by ", "; the lines
    are joined by newlines. Items are converted with ``str`` first, so numeric
    or missing values inside an inner list do not raise TypeError.

    Parameters
    ----------
    cell : object
        The cell value. Only ``list`` instances are reformatted.

    Returns
    -------
    object
        The formatted string when ``cell`` is a list ('' for an empty list);
        otherwise ``cell`` itself, unchanged.
    """
    if not isinstance(cell, list):
        return cell
    return '\n'.join(', '.join(map(str, sublist)) for sublist in cell)

# Work on a copy so re-running this cell never changes metamapped_manual_curation itself.
# When clin_trial_term / metamap_term_info live in the index (as produced by set_index),
# move them back to columns; otherwise the column lookup below fails and index=False
# would leave them out of the spreadsheet.
curation_export = metamapped_manual_curation.copy()
if 'metamap_term_info' not in curation_export.columns:
    curation_export = curation_export.reset_index()

# Apply the custom function to the MetaMap info column
curation_export['metamap_term_info'] = curation_export['metamap_term_info'].apply(format_lists_of_lists)

# The context manager writes and closes output.xlsx on exit
# (ExcelWriter.save() no longer exists in pandas >= 2.0).
with ExcelWriter('output.xlsx', engine='xlsxwriter') as excel_writer:
    # Write the DataFrame to Excel
    curation_export.to_excel(excel_writer, index=False, sheet_name='Sheet1')

    # Get the xlsxwriter worksheet object
    worksheet = excel_writer.sheets['Sheet1']

    # Set each column width to the longest cell text or the header, whichever is longer
    for i, col in enumerate(curation_export.columns):
        cell_lengths = curation_export[col].astype(str).str.len()
        longest_cell = int(cell_lengths.max()) if len(cell_lengths) else 0  # empty frame -> header width only
        max_len = max(longest_cell, len(str(col)))
        worksheet.set_column(i, i, max_len)






0

0

In [35]:
# # output all results to TSVs
# def compile_and_output(df_dict, ct_terms, remaining_unmapped_possible):
#     print("\n")
#     print("#   -------- -------- -------- --------  ")
#     print("Final Tallies:")
#     print("Total # of conditions mapped: {}".format(ct_terms["mapped_conditions"].shape[0]))
#     print("Total # of interventions mapped: {}".format(ct_terms["mapped_interventions"].shape[0]))
#     print("Total # of conditions unmapped or not mapped: {}".format(len(ct_terms["unmapped_conditions"])))
#     print("Total # of interventions unmapped or not mapped: {}".format(len(ct_terms["unmapped_interventions"])))    
#     # How many Clinical Trials are there? Well, it's different depending on the Conditions or Interventions dataframes...
#     conditions_nctids = len(df_dict["conditions"].nct_id.unique())
#     interventions_nctids = len(df_dict["interventions"].nct_id.unique())
#     print("Number of Clinical Trials NCITs in Conditions table: {}".format(conditions_nctids))      
#     print("Number of Clinical Trials NCITs in Interventions table: {}".format(interventions_nctids))
#     print("#   -------- -------- -------- --------  ")

#     """ create tables of unused MeSH and MetaMap CURIEs that could be used for unmapped Conditions and Interventions """
#     # -------    CONDITIONS    ------- #
#     all_conditions = df_dict["conditions"][["nct_id", "downcase_name"]]
#     conditions_mesh = pd.merge(all_conditions, 
#                                remaining_unmapped_possible["mesh_conditions_per_study"],
#                                how='left',
#                                left_on=['nct_id'],
#                                right_on = ['nct_id'])
    
#     metamap_possibilities = remaining_unmapped_possible["all_metamapped_conditions"][["condition_input", "condition_CURIE_id", "condition_CURIE_name", "condition_semantic_type"]]
#     conditions_mesh_metamap = pd.merge(conditions_mesh, 
#                                        metamap_possibilities,
#                                        how='left',
#                                        left_on=['downcase_name'],
#                                        right_on = ['condition_input'])
    
#     unmapped_conditions_possible_terms = conditions_mesh_metamap[conditions_mesh_metamap['downcase_name'].isin(ct_terms["unmapped_conditions"])]
#     unmapped_conditions_possible_terms = unmapped_conditions_possible_terms.drop('condition_input', axis=1) # drop the redundant column now
    
#     # -------    INTERVENTIONS    ------- #
#     all_interventions = df_dict["interventions"][["nct_id", "downcase_name"]]
#     interventions_mesh = pd.merge(all_interventions, 
#                                remaining_unmapped_possible["mesh_interventions_per_study"],
#                                how='left',
#                                left_on=['nct_id'],
#                                right_on = ['nct_id'])
    
#     metamap_possibilities = remaining_unmapped_possible["all_metamapped_interventions"][["intervention_input", "intervention_CURIE_id", "intervention_CURIE_name", "intervention_semantic_type"]]
#     interventions_mesh_metamap = pd.merge(interventions_mesh, 
#                                        metamap_possibilities,
#                                        how='left',
#                                        left_on=['downcase_name'],
#                                        right_on = ['intervention_input'])
    
#     unmapped_interventions_possible_terms = interventions_mesh_metamap[interventions_mesh_metamap['downcase_name'].isin(ct_terms["unmapped_interventions"])]
#     unmapped_interventions_possible_terms = unmapped_interventions_possible_terms.drop('intervention_input', axis=1) # drop the redundant column now
          
        
#     """   Output all to TSVs   """    
#     pd.Series(ct_terms["unmapped_conditions"]).to_csv('unmapped_conditions.tsv', sep="\t", index=False, header=False) # convert the list to a pandas series, then output to TSV
#     pd.Series(ct_terms["unmapped_interventions"]).to_csv('unmapped_interventions.tsv', sep="\t", index=False, header=False) # convert the list to a pandas series, then output to TSV
#     ct_terms["mapped_conditions"].to_csv('mapped_conditions.tsv', sep="\t", index=False)
#     ct_terms["mapped_interventions"].to_csv('mapped_interventions.tsv', sep="\t", index=False)
#     unmapped_conditions_possible_terms.to_csv('unmapped_conditions_possible_mappings.tsv', sep="\t", index=False)
#     unmapped_interventions_possible_terms.to_csv('unmapped_interventions_possible_mappings.tsv', sep="\t", index=False)
    



In [None]:
# def test_or_prod():
#     print("The test run of this code performs the construction of the KG on a subset of 200 Conditions and 200 Interventions from Clinical Trials.\n")
#     test_or_prod = input("Is this a test run or the production of a new version of the KG? Write T for test, or P for production: ")
#     if test_or_prod == "T":
#         flag_and_path = get_raw_ct_data() # uncomment for production
#         flag_and_path["term_program_flag"] = False
#         run_ETL_mapping(flag_and_path)
#     elif test_or_prod == "P":
#         flag_and_path = get_raw_ct_data() 
#         run_ETL_mapping(flag_and_path)
#     else:
#         print("Bad input")
#         sys.exit(0)
        

        
        

In [None]:
# def run_ETL_mapping(flag_and_path):
#     df_dict = read_raw_ct_data(flag_and_path)
#     ct_terms = exact_match_mesh(df_dict)
#     ct_terms = inexact_match_mesh(df_dict, ct_terms)

#     # pull the available MeSH terms per study out of the returned ct_terms dict 
#     mesh_conditions_per_study = ct_terms["mesh_conditions_per_study"]
#     mesh_interventions_per_study = ct_terms["mesh_interventions_per_study"]

#     ct_terms = term_list_to_nr(df_dict, ct_terms)
#     ct_terms = term_list_to_mm(df_dict, ct_terms)

#     # pull the available UMLS terms per study out of the returned ct_terms dict 
#     all_metamapped_conditions = ct_terms["all_metamapped_conditions"]
#     all_metamapped_interventions = ct_terms["all_metamapped_interventions"]

#     remaining_unmapped_possible = {"mesh_conditions_per_study": mesh_conditions_per_study,
#                                    "mesh_interventions_per_study": mesh_interventions_per_study,
#                                    "all_metamapped_conditions": all_metamapped_conditions,
#                                    "all_metamapped_interventions": all_metamapped_interventions}
#     compile_and_output(df_dict, ct_terms, remaining_unmapped_possible)


    

In [337]:
# flag_and_path = get_raw_ct_data() # uncomment for production
# The extracted-data folder can be overridden with the CT_DATA_EXTRACTED_PATH environment
# variable so the notebook is not tied to one machine; when the variable is unset the
# original local path is used.
flag_and_path = {'term_program_flag': False,
                 'data_extracted_path': os.environ.get('CT_DATA_EXTRACTED_PATH',
                                                       '/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/08_21_2023_extracted'),
                 'date_string':'08_21_2023'} # comment for production
metamap_dirs = check_os()  # module-level name; earlier cells declare `global metamap_dirs`
df_dict = read_raw_ct_data(flag_and_path)
term_list_to_mm(df_dict, flag_and_path)
merge_and_score_mappings(flag_and_path)

# # pull the available UMLS terms per study out of the returned ct_terms dict 
# all_metamapped_conditions = ct_terms["all_metamapped_conditions"]
# all_metamapped_interventions = ct_terms["all_metamapped_interventions"]

# remaining_unmapped_possible = {"mesh_conditions_per_study": mesh_conditions_per_study,
#                                "mesh_interventions_per_study": mesh_interventions_per_study,
#                                "all_metamapped_conditions": all_metamapped_conditions,
#                                "all_metamapped_interventions": all_metamapped_interventions}
# compile_and_output(df_dict, ct_terms, remaining_unmapped_possible)


Using UMLS MetaMap to get mappings for CONDITIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.
MetaMap version < 2020, conduct mapping on terms after removing ascii characters
Starting skrmedpostctl: 
started.
Starting wsdserverctl: 
started.
loading properties file /Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/WSD_Server/config/disambServer.cfg
WSD Server initializing disambiguation methods.
WSD Server databases and disambiguation methods have been initialized.
Could not listen on port : 5554 : Address already in use
Stopping skrmedpostctl: 
Stopping Tagger Server process..
Process 58466 stopped
Stopping wsdserverctl: 
Stopping WSD Server process..
Process 58468 stopped


/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/skrmedpostctl: line 50: kill: (58466) - No such process
/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/wsdserverctl: line 55: kill: (58468) - No such process


Using UMLS MetaMap to get mappings for INTERVENTIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.
MetaMap version < 2020, conduct mapping on terms after removing ascii characters
Starting skrmedpostctl: 
started.
Starting wsdserverctl: 
started.
loading properties file /Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/WSD_Server/config/disambServer.cfg
Stopping skrmedpostctl: 
Stopping Tagger Server process..
Process 63698 stopped
Stopping wsdserverctl: 
Stopping WSD Server process..
Process 63700 stopped
Starting skrmedpostctl: 


/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/skrmedpostctl: line 50: kill: (63698) - No such process


started.
Starting wsdserverctl: 
started.
loading properties file /Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/WSD_Server/config/disambServer.cfg
Stopping skrmedpostctl: 
Stopping Tagger Server process..
Process 65718 stopped
Stopping wsdserverctl: 
Stopping WSD Server process..
Process 65720 stopped


/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/bin/skrmedpostctl: line 50: kill: (65718) - No such process


In [None]:
def convert_seconds_to_hms(seconds):
    """Split a duration given in seconds into (hours, minutes, seconds).

    Hours and minutes are floor-divided quotients and the third value is the
    remainder below 60. Integer input gives integers; float input gives
    floats (e.g. 7325.5 -> (2.0, 2.0, 5.5)).
    """
    hours, leftover = divmod(seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    return hours, minutes, seconds

# Take the current local time, round-trip it through a POSIX timestamp, and print it
# formatted as day-month-year, hour:minute:second.
current = dt.datetime.now()
ts = dt.datetime.timestamp(current)
d = dt.datetime.fromtimestamp(ts)
str_date_time = d.strftime("%d-%m-%Y, %H:%M:%S")
print("Timestamp of script start: {}".format(str_date_time))

# NOTE(review): start_time and end_time are read back to back with no work in between,
# so elapsed_time is ~0 and the printed runtime does not reflect the notebook's run.
# Presumably start_time was meant to be captured before the pipeline runs - confirm.
start_time = time.time()
end_time = time.time()
elapsed_time = end_time - start_time
# elapsed_time is a float, so hours and minutes print as floats (e.g. 0.0)
hours, minutes, seconds = convert_seconds_to_hms(elapsed_time)
print(f"Runtime: {hours} hours, {minutes} minutes, {seconds} seconds")

In [None]:
remaining_unmapped_possible