In [2]:
# display cells to maximum width 
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))

In [1]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
import re
import collections
import os
import json
import numpy as np
import pickle
from functools import reduce
import time
from time import sleep
import concurrent
import multiprocessing
import datetime as dt
from datetime import date
import pathlib
import configparser
import sys
import urllib
import zipfile
import csv
sys.path.insert(0, '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/pymetamap-master')
from pymetamap import MetaMap  # https://github.com/AnthonyMRios/pymetamap/blob/master/pymetamap/SubprocessBackend.py

In [6]:
# %pip install thefuzz
# %pip install levenshtein

from thefuzz import fuzz # fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

In [5]:
global metamap_dirs
global metamap_pos_server_dir
global metamap_wsd_server_dir


In [4]:
# fuzzy matching explained: https://www.datacamp.com/tutorial/fuzzy-string-python

def get_token_sort_ratio(str1, str2):
    try:
        return fuzz.token_sort_ratio(str1, str2)
    except:
        return None
    
sort_ratio = np.vectorize(get_token_sort_ratio)

def get_token_set_ratio(str1, str2):
    try:
        return fuzz.token_set_ratio(str1, str2)
    except:
        return None  
set_ratio = np.vectorize(get_token_set_ratio)

def get_similarity_score(str1, str2):
    try:
        return fuzz.ratio(str1, str2)
    except:
        return None
sim_score = np.vectorize(get_similarity_score)

In [7]:
def get_raw_ct_data():
    term_program_flag = True
    global data_dir
    global data_extracted
    
    # get all the links and associated dates of upload into a dict called date_link
    url_all = "https://aact.ctti-clinicaltrials.org/pipe_files"
    response = requests.get(url_all)
    soup = BeautifulSoup(response.text)
    body = soup.find_all('option') #Find all
    date_link = {}
    for el in body:
        tags = el.find('a')
        try:
            zip_name = tags.contents[0].split()[0]
            date = zip_name.split("_")[0]
            date = dt.datetime.strptime(date, '%Y%m%d').date()
            date_link[date] = tags.get('href')
        except:
            pass
    latest_file_date = max(date_link.keys())   # get the date of the latest upload
    url = date_link[latest_file_date]   # get the corresponding download link of the latest upload so we can download the raw data
    date_string = latest_file_date.strftime("%m_%d_%Y")
    data_dir = "{}/data".format(pathlib.Path.cwd())
    data_extracted = data_dir + "/{}_extracted".format(date_string)
    data_path = "{}/{}_pipe-delimited-export.zip".format(data_dir, date_string)
    
    if not os.path.exists(data_path):   # if folder containing most recent data doesn't exist, download and extract it into data folder
        
        term_program_flag = False   # flag below for terminating program if latest download exists (KG is assumed up to date)
        print("Downloading Clinical Trial data as of {}".format(date_string))
        response = requests.get(url)
        if response.status_code == 200:
            with open(data_path, 'wb') as file:
                file.write(response.content)
            print("Finished download of zip")
            with zipfile.ZipFile(data_path, 'r') as download:
                print("Unzipping data")
                download.extractall(data_extracted)
        else:
            print("KG is already up to date.")
    return {"term_program_flag": term_program_flag, "data_extracted_path": data_extracted, "date_string": date_string}



In [8]:
def read_raw_ct_data(flag_and_path):
    if flag_and_path["term_program_flag"]:
        print("Exiting program. Assuming KG has already been constructed from most recent data dump from AACT.")
#         exit()
#         pass
    else:
        data_extracted = flag_and_path["data_extracted_path"]
        # read in pipe-delimited files 
        conditions_df = pd.read_csv(data_extracted + '/conditions.txt', sep='|', index_col=False, header=0)
        interventions_df = pd.read_csv(data_extracted + '/interventions.txt', sep='|', index_col=False, header=0)
        interventions_alts = pd.read_csv(data_extracted + '/intervention_other_names.txt', sep='|', index_col=False, header=0)

#         browse_conditions_df = pd.read_csv(data_extracted + '/browse_conditions.txt', sep='|', index_col=False, header=0)
#         browse_interventions_df = pd.read_csv(data_extracted + '/browse_interventions.txt', sep='|', index_col=False, header=0)
        
    ### GET RID OF....CHEAT LINE FOR TESTING
        conditions_df = conditions_df.iloc[:1000]
        interventions_df = interventions_df.iloc[:1000]

    return {"conditions": conditions_df, "interventions": interventions_df
#             "browse_conditions": browse_conditions_df, "browse_interventions": browse_interventions_df
           }



In [9]:
def de_ascii_er(text):
    non_ascii = "[^\x00-\x7F]"
    pattern = re.compile(r"[^\x00-\x7F]")
    non_ascii_text = re.sub(pattern, ' ', text)
    return non_ascii_text

In [10]:
def start_metamap_servers(metamap_dirs):
    metamap_pos_server_dir = 'bin/skrmedpostctl' # Part of speech tagger
    metamap_wsd_server_dir = 'bin/wsdserverctl' # Word sense disambiguation 
    
    # Start servers
    os.system(metamap_dirs['metamap_base_dir'] + metamap_pos_server_dir + ' start') # Part of speech tagger
    os.system(metamap_dirs['metamap_base_dir'] + metamap_wsd_server_dir + ' start') # Word sense disambiguation 
    # # Sleep a bit to give time for these servers to start up
    sleep(5)

def stop_metamap_servers(metamap_dirs):
    metamap_pos_server_dir = 'bin/skrmedpostctl' # Part of speech tagger
    metamap_wsd_server_dir = 'bin/wsdserverctl' # Word sense disambiguation 
    # Stop servers
    os.system(metamap_dirs['metamap_base_dir'] + metamap_pos_server_dir + ' stop') # Part of speech tagger
    os.system(metamap_dirs['metamap_base_dir'] + metamap_wsd_server_dir + ' stop') # Word sense disambiguation 
        

In [11]:
def check_os():
    if "linux" in sys.platform:
        print("Linux platform detected")
        metamap_base_dir = "{}/metamap/".format(pathlib.Path.cwd().parents[0])
        metamap_bin_dir = 'bin/metamap20'
    else:
        metamap_base_dir = '/Volumes/TOSHIBA_EXT/ISB/clinical_trials/public_mm/' # for running on local
        metamap_bin_dir = 'bin/metamap18'
        
    return {"metamap_base_dir":metamap_base_dir, "metamap_bin_dir":metamap_bin_dir}
        

In [58]:
def run_metamap(input_term, params, mm, cond_or_inter, csv_writer):
    from_metamap = []
    try:
        concepts,error = mm.extract_concepts([input_term],
                                             restrict_to_sts = params["restrict_to_sts"],
                                             term_processing = params["term_processing"],
                                             ignore_word_order = params["ignore_word_order"],
                                             strict_model = params["strict_model"]
                                            )

        for concept in concepts:
            concept_info = []
            concept = concept._asdict()
            concept_info.extend([cond_or_inter,input_term])
            concept_info.extend([concept.get(k) for k in ['preferred_name', 'cui', 'score', 'semtypes']])
            from_metamap.append(concept_info)
    except:
        from_metamap.extend([input_term, None, None, None, None, None, None])
    for result in from_metamap:
#         print(result)
        csv_writer.writerow(result)
    return from_metamap

In [59]:
# this cell tests if MetaMap is working on the term list below
# terms = ['infarction, myocardial', 'aneurysm', 'diabetes', 'common cold', 'fracture', 'juice blend', "hormones"]

# condition_semantic_type_restriction = ['acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,clna,fndg']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
# params = {"restrict_to_sts":condition_semantic_type_restriction, "term_processing":True, "ignore_word_order":True, "strict_model":False}
# start_metamap_servers(metamap_dirs) # start the MetaMap servers
# mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
# cond_or_inter = "condition"

# # prep file that stores MetaMap output
# col_names = ['term_type', 'clin_trial_term','metamap_preferred_name', 'metamap_cui', 'metamap_score', 'metamap_semantic_type']
# # metamap_output = open("metamap_output().tsv".format(flag_and_path["date_string"]), 'w+', newline='') 
# metamap_output = open("metamap_output.tsv", 'w+', newline='') 
# csv_writer = csv.writer(metamap_output, delimiter='\t')
# csv_writer.writerow(col_names)

# for term in terms:
# #     test = run_metamap(term, params, mm, cond_or_inter)
#     run_metamap(term, params, mm, cond_or_inter, csv_writer)
# metamap_output.close()    
# stop_metamap_servers(metamap_dirs) # stop the MetaMap servers


In [65]:
def parallelize_metamap(term_list, params, cond_or_inter, flag_and_path, csv_writer):
    start_metamap_servers(metamap_dirs) # start the MetaMap servers
    mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
    with concurrent.futures.ThreadPoolExecutor((multiprocessing.cpu_count()*2) - 1) as executor:
        _ = [executor.submit(run_metamap, term, params, mm, cond_or_inter, csv_writer) for term in term_list]
    stop_metamap_servers(metamap_dirs) # stop the MetaMap servers
    


# USE METAMAP LOCAL TO MAP REMAINING TERMS

In [69]:
df_dict["interventions"]

Unnamed: 0,id,nct_id,intervention_type,name,description
0,54367098,NCT03454451,Drug,CPI-006 + ciforadenant,Selected dose of CPI-006 administered intraven...
1,54367099,NCT03454451,Drug,CPI-006 + pembrolizumab,Selected dose of CPI-006 in combination with p...
2,54367100,NCT00089843,Drug,Testosterone,Testosterone patch 150mcg daily
3,54367101,NCT00089843,Drug,Actonel (risedronate),Actonel (risedronate) 35mg PO one time weekly
4,54367102,NCT00089843,Drug,Placebo Actonel (risedronate),Placebo tablet identical in appearance to acti...
...,...,...,...,...,...
995,53737860,NCT01784081,Other,Palliative care with decision aids,Participant is followed by the palliative care...
996,53737861,NCT01784094,Device,StromaGlide,
997,53737862,NCT01784107,Drug,Belotecan and Ifosfamide,
998,53737863,NCT01784120,Drug,Doxorubicin/Genexol-PM,


In [52]:
def term_list_to_mm(df_dict, flag_and_path):
    
    # -------    CONDITIONS    ------- #
    print("Using UMLS MetaMap to get mappings for CONDITIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.")
    unmapped_conditions = df_dict["conditions"].downcase_name
    unmapped_conditions = list(unmapped_conditions.unique())
    unmapped_conditions = list(filter(None, unmapped_conditions))

    deasciied_unmapped_conditions = list(map(de_ascii_er, unmapped_conditions)) # Metamap 2020 does not need de_asciier. Metamap 2018 and prior does. 

    conditions = pd.DataFrame({'original_unmapped_conditions': unmapped_conditions, 'de_asciied_unmapped_conditions': deasciied_unmapped_conditions})
    
    # some input terms have () with additional text, like an abbreviation, in them. split them out to facilitate better mapping 
    
    pattern_outisde = r'(?<=\().+?(?=\))|([^(]+)'
    pattern_inside = r'\(([^()]+)\)|([^(]+)'

    matches_outside = conditions['original_unmapped_conditions'].str.extract(pattern_outisde)
    conditions['original_condition_split_1'] = matches_outside[0].fillna('')
    matches_inside = conditions['original_unmapped_conditions'].str.extract(pattern_inside)
    conditions['original_condition_split_2'] = matches_inside[0].fillna('')

    matches_outside = conditions['de_asciied_unmapped_conditions'].str.extract(pattern_outisde)
    conditions['de_asciied_conditions_split_1'] = matches_outside[0].fillna('')
    matches_inside = conditions['de_asciied_unmapped_conditions'].str.extract(pattern_inside)
    conditions['de_asciied_conditions_split_2'] = matches_inside[0].fillna('')
    
    metamap_version = [int(s) for s in re.findall(r'\d+', metamap_dirs.get('metamap_bin_dir'))]
    
    # see MetaMap Usage instructions: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MM_2016_Usage.pdf
#     condition_args = ['--sldi -I -C -J acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf -z -i -f']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
    condition_semantic_type_restriction = ['acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,clna,fndg']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
    params = {"restrict_to_sts":condition_semantic_type_restriction, "term_processing":True, "ignore_word_order":True, "strict_model":False} # strict_model and relaxed_model are presumably opposites? relaxed_model = True is what I want, but that option appears to be broken in Pymetamap (returns no results when used). Using strict_model = False instead...

    # prep output file of Metamap results
    date = flag_and_path["date_string"]
    filename = f"metamap_output_{date}.tsv"
    metamap_output = open(filename, 'w+', newline='')
    col_names = ['term_type', 'clin_trial_term','metamap_preferred_name', 'metamap_cui', 'metamap_score', 'metamap_semantic_type']
    csv_writer = csv.writer(metamap_output, delimiter='\t')
    csv_writer.writerow(col_names)
    
    if metamap_version[0] >= 20:
        print("MetaMap version >= 2020, conduct mapping on original terms")
#         mm_conditions = run_parallel_threads_mm(conditions["original_condition_split_1"].tolist(), params)
        mm_conditions = parallelize_metamap(conditions["original_condition_split_1"].tolist(), params, "condition", flag_and_path, csv_writer)
    else:
        print("MetaMap version < 2020, conduct mapping on terms after removing ascii characters")
#         mm_conditions = run_parallel_threads_mm(conditions["de_asciied_conditions_split_1"].tolist(), params)
        parallelize_metamap(conditions["original_condition_split_1"].tolist(), params, "condition", flag_and_path, csv_writer)
    stop_metamap_servers(metamap_dirs) # stop the MetaMap servers
    
    # -------    INTERVENTIONS    ------- #
    print("Using UMLS MetaMap to get mappings for INTERVENTIONS. MetaMap returns mappings, CUIs, and semantic type of mapping.")
    unmapped_conditions = df_dict["interventions"].downcase_name
    unmapped_conditions = list(unmapped_conditions.unique())
    unmapped_conditions = list(filter(None, unmapped_conditions))
    


# open("metamap_output_().tsv".format(flag_and_path["date_string"]), 'w+', newline='')
#     col_names = ['term_type', 'clin_trial_term','metamap_preferred_name', 'metamap_cui', 'metamap_score', 'metamap_semantic_type']
#     csv_writer = csv.writer(metamap_output, delimiter='\t')
#     csv_writer.writerow(col_names)
    
#     csv_writer.close()

    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
#     print("Using UMLS MetaMap to get more mappings for conditions. MetaMap returns mappings, CUIs, and semantic type of mapping.")
#     unmapped_conditions = ct_terms["unmapped_conditions"]
#     conditions_unmapped_chunked = split_list_by_char_lim(unmapped_conditions)
#     # see MetaMap Usage instructions: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MM_2016_Usage.pdf
#     # removing sosy semantic type (sign or symptom) - often get MetaMap matches to the sign or symptom instead of the full disease...for example, will get back "exercise-induced" instead of "immune dysfunction" for "exercise-induced immune dysfunction" bc it matches the descriptive quality "exercise-induced" is matched on 
#     condition_args = ['--sldi -I -C -J acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,sosy -z -i -f']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")
#     mm_conditions = run_parallel_threads_mm(conditions_unmapped_chunked, condition_args)
#     flattened_mm_conditions = {key: [item for sublist in value for item in sublist] for key, value in mm_conditions.items()}
#     mm_conditions_df = pd.DataFrame({"condition_input": list(flattened_mm_conditions.keys()),
#                                      "condition_CURIE_id": [value[0] for value in flattened_mm_conditions.values()],
#                                      "condition_CURIE_name": [value[1] for value in flattened_mm_conditions.values()],
#                                      "condition_semantic_type": [value[-1] for value in flattened_mm_conditions.values()],
#                                      "source": "MetaMap via UMLS, term and CURIE"})
    
#     mm_conditions_df[['condition_CURIE_name_1', 'condition_CURIE_name_2']] = mm_conditions_df['condition_CURIE_name'].str.extract(r'^(.*?)\s*\((.*?)\)$').fillna('NA') # 

#     sort_ratio = np.vectorize(get_token_sort_ratio)
#     set_ratio = np.vectorize(get_token_set_ratio)
#     sim_score = np.vectorize(get_similarity_score)

#     # many MetaMap terms are returned as "term (term)". For example, "Nonessential Amino Acid (Nonessential amino acid)". This repetition messes up the sort ratio and sim score, so we extract the substrings out of the parenthesis to conduct scoring on those
#     mm_conditions_scored = mm_conditions_df.copy()
#     mm_conditions_scored["sort_ratio"] = sort_ratio(mm_conditions_scored[["condition_input"]].values, mm_conditions_scored[["condition_CURIE_name"]].values) # generate fuzzy scores based between original and MeSH term
#     mm_conditions_scored["sim_score"] = sim_score(mm_conditions_scored[["condition_input"]].values, mm_conditions_scored[["condition_CURIE_name"]].values)

#     mm_conditions_scored["sort_ratio_1"] = sort_ratio(mm_conditions_scored[["condition_input"]].values, mm_conditions_scored[["condition_CURIE_name_1"]].values) # generate fuzzy scores based between original and MetaMap term
#     mm_conditions_scored["sim_score_1"] = sim_score(mm_conditions_scored[["condition_input"]].values, mm_conditions_scored[["condition_CURIE_name_1"]].values)

#     mm_conditions_scored["sort_ratio_2"] = sort_ratio(mm_conditions_scored[["condition_input"]].values, mm_conditions_scored[["condition_CURIE_name_2"]].values) # generate fuzzy scores based between original and MetaMap term
#     mm_conditions_scored["sim_score_2"] = sim_score(mm_conditions_scored[["condition_input"]].values, mm_conditions_scored[["condition_CURIE_name_2"]].values)

#     mm_conditions_scored_thresholded = mm_conditions_scored.copy() 
    
#     mm_conditions_scored["sort_ratio"] = sort_ratio(mm_conditions_scored[["condition_input"]].values, mm_conditions_scored[["condition_CURIE_name"]].values) # generate fuzzy scores based between original and MetaMap term
#     mm_conditions_scored["sim_score"] = sim_score(mm_conditions_scored[["condition_input"]].values, mm_conditions_scored[["condition_CURIE_name"]].values)
#     mm_conditions_scored_thresholded = mm_conditions_scored.copy() 
#     mm_conditions_scored_thresholded = mm_conditions_scored_thresholded[(mm_conditions_scored_thresholded['sim_score'] > 88) |
#                                                                         (mm_conditions_scored_thresholded['sort_ratio'] > 88) |
#                                                                         (mm_conditions_scored_thresholded['sim_score_1'] > 88) |
#                                                                         (mm_conditions_scored_thresholded['sort_ratio_1'] > 88) |
#                                                                         (mm_conditions_scored_thresholded['sim_score_2'] > 88) |
#                                                                         (mm_conditions_scored_thresholded['sort_ratio_2'] > 88)]
    
#     print("Number of unique conditions that are mapped after using MetaMap and similarity and ratio score thresholds of 88: {}".format(mm_conditions_scored_thresholded.shape[0]))
    
#     mm_conditions_scored_thresholded = mm_conditions_scored_thresholded.drop(['condition_CURIE_name_1',
#                                                                               'condition_CURIE_name_2',
#                                                                               'sort_ratio',
#                                                                               'sim_score',
#                                                                               'sort_ratio_1',
#                                                                               'sim_score_1',
#                                                                               'sort_ratio_2',
#                                                                               'sim_score_2'], axis=1)
#     previously_mapped = ct_terms["mapped_conditions"]
#     combined_mapped_conditions = pd.concat([previously_mapped, mm_conditions_scored_thresholded], ignore_index=True) # get dataframe of combined previously mapped conditions and additional MetaMapped interventions that passed threshold scoring

#     conditions = df_dict["conditions"]
#     all_conditions_list = conditions["downcase_name"].values.tolist()
#     all_conditions_list = list(set(all_conditions_list))
#     unmapped_conditions = list(set(all_conditions_list)-set(list(combined_mapped_conditions.condition_input.values)))
#     print("Number of unique conditions that are unmapped after using MetaMap and similarity and ratio score thresholds of 88: {}".format(len(unmapped_conditions)))
          
#     # -------    INTERVENTIONS    ------- #
#     print("Using UMLS MetaMap to get more mappings for interventions. MetaMap returns mappings, CUIs, and semantic type of mapping.")
#     unmapped_interventions = ct_terms["unmapped_interventions"]
#     interventions_unmapped_chunked = split_list_by_char_lim(unmapped_interventions)
#     # see MetaMap Usage instructions: https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MM_2016_Usage.pdf
#     # removing sosy semantic type (sign or symptom) - often get MetaMap matches to the sign or symptom instead of the full disease...for example, will get back "exercise-induced" instead of "immune dysfunction" for "exercise-induced immune dysfunction" bc it matches the descriptive quality "exercise-induced" is matched on 
#     intervention_args = ['--sldi -I -C -k acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,sosy -z -i -f']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.") (I used inverse of semantic terms picked for conditions here)
#     mm_interventions = run_parallel_threads_mm(interventions_unmapped_chunked, intervention_args)
#     flattened_mm_interventions = {key: [item for sublist in value for item in sublist] for key, value in mm_interventions.items()}
#     mm_interventions_df = pd.DataFrame({"intervention_input": list(flattened_mm_interventions.keys()),
#                                         "intervention_CURIE_id": [value[0] for value in flattened_mm_interventions.values()],
#                                         "intervention_CURIE_name": [value[1] for value in flattened_mm_interventions.values()],
#                                         "intervention_semantic_type": [value[-1] for value in flattened_mm_interventions.values()],
#                                         "source": "MetaMap via UMLS, term and CURIE"})

#     mm_interventions_df[['intervention_CURIE_name_1', 'intervention_CURIE_name_2']] = mm_interventions_df['intervention_CURIE_name'].str.extract(r'^(.*?)\s*\((.*?)\)$').fillna('NA') # 

#     sort_ratio = np.vectorize(get_token_sort_ratio)
#     set_ratio = np.vectorize(get_token_set_ratio)
#     sim_score = np.vectorize(get_similarity_score)

#     # many MetaMap terms are returned as "term (term)". For example, "Nonessential Amino Acid (Nonessential amino acid)". This repetition messes up the sort ratio and sim score, so we extract the substrings out of the parenthesis to conduct scoring on those
#     mm_interventions_scored = mm_interventions_df.copy()
#     mm_interventions_scored["sort_ratio"] = sort_ratio(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name"]].values) # generate fuzzy scores based between original and MeSH term
#     mm_interventions_scored["sim_score"] = sim_score(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name"]].values)

#     mm_interventions_scored["sort_ratio_1"] = sort_ratio(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name_1"]].values) # generate fuzzy scores based between original and MetaMap term
#     mm_interventions_scored["sim_score_1"] = sim_score(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name_1"]].values)

#     mm_interventions_scored["sort_ratio_2"] = sort_ratio(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name_2"]].values) # generate fuzzy scores based between original and MetaMap term
#     mm_interventions_scored["sim_score_2"] = sim_score(mm_interventions_scored[["intervention_input"]].values, mm_interventions_scored[["intervention_CURIE_name_2"]].values)

#     mm_interventions_scored_thresholded = mm_interventions_scored.copy() 
#     mm_interventions_scored_thresholded = mm_interventions_scored_thresholded[(mm_interventions_scored_thresholded['sim_score'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sort_ratio'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sim_score_1'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sort_ratio_1'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sim_score_2'] > 88) |
#                                                                               (mm_interventions_scored_thresholded['sort_ratio_2'] > 88)]
    
#     print("Number of unique interventions that are mapped after using MetaMap and similarity and ratio score thresholds of 88: {}".format(mm_interventions_scored_thresholded.shape[0]))

#     mm_interventions_scored_thresholded = mm_interventions_scored_thresholded.drop(['intervention_CURIE_name_1',
#                                                                                     'intervention_CURIE_name_2',
#                                                                                     'sort_ratio',
#                                                                                     'sim_score',
#                                                                                     'sort_ratio_1',
#                                                                                     'sim_score_1',
#                                                                                     'sort_ratio_2',
#                                                                                     'sim_score_2'], axis=1)
#     previously_mapped = ct_terms["mapped_interventions"]
#     combined_mapped_interventions = pd.concat([previously_mapped, mm_interventions_scored_thresholded], ignore_index=True) # get dataframe of combined previously mapped interventions and additional MetaMapped interventions that passed threshold scoring
#     interventions = df_dict["interventions"]
#     all_interventions_list = interventions["downcase_name"].values.tolist()
#     all_interventions_list = list(set(all_interventions_list))
#     unmapped_interventions = list(set(all_interventions_list)-set(list(combined_mapped_interventions.intervention_input.values)))
#     print("Number of unique interventions that are unmapped after using MetaMap and similarity and ratio score thresholds of 88: {}".format(len(unmapped_interventions)))
#     ct_terms = {'mapped_conditions': combined_mapped_conditions,
#                 'unmapped_conditions': unmapped_conditions,
#                 'mapped_interventions': combined_mapped_interventions,
#                 'unmapped_interventions': unmapped_interventions,
#                 'all_metamapped_conditions': mm_conditions_df,
#                 'all_metamapped_interventions': mm_interventions_df}


#     return ct_terms


In [35]:
# output all results to TSVs
def compile_and_output(df_dict, ct_terms, remaining_unmapped_possible):
    print("\n")
    print("#   -------- -------- -------- --------  ")
    print("Final Tallies:")
    print("Total # of conditions mapped: {}".format(ct_terms["mapped_conditions"].shape[0]))
    print("Total # of interventions mapped: {}".format(ct_terms["mapped_interventions"].shape[0]))
    print("Total # of conditions unmapped or not mapped: {}".format(len(ct_terms["unmapped_conditions"])))
    print("Total # of interventions unmapped or not mapped: {}".format(len(ct_terms["unmapped_interventions"])))    
    # How many Clinical Trials are there? Well, it's different depending on the Conditions or Interventions dataframes...
    conditions_nctids = len(df_dict["conditions"].nct_id.unique())
    interventions_nctids = len(df_dict["interventions"].nct_id.unique())
    print("Number of Clinical Trials NCITs in Conditions table: {}".format(conditions_nctids))      
    print("Number of Clinical Trials NCITs in Interventions table: {}".format(interventions_nctids))
    print("#   -------- -------- -------- --------  ")

    """ create tables of unused MeSH and MetaMap CURIEs that could be used for unmapped Conditions and Interventions """
    # -------    CONDITIONS    ------- #
    all_conditions = df_dict["conditions"][["nct_id", "downcase_name"]]
    conditions_mesh = pd.merge(all_conditions, 
                               remaining_unmapped_possible["mesh_conditions_per_study"],
                               how='left',
                               left_on=['nct_id'],
                               right_on = ['nct_id'])
    
    metamap_possibilities = remaining_unmapped_possible["all_metamapped_conditions"][["condition_input", "condition_CURIE_id", "condition_CURIE_name", "condition_semantic_type"]]
    conditions_mesh_metamap = pd.merge(conditions_mesh, 
                                       metamap_possibilities,
                                       how='left',
                                       left_on=['downcase_name'],
                                       right_on = ['condition_input'])
    
    unmapped_conditions_possible_terms = conditions_mesh_metamap[conditions_mesh_metamap['downcase_name'].isin(ct_terms["unmapped_conditions"])]
    unmapped_conditions_possible_terms = unmapped_conditions_possible_terms.drop('condition_input', axis=1) # drop the redundant column now
    
    # -------    INTERVENTIONS    ------- #
    all_interventions = df_dict["interventions"][["nct_id", "downcase_name"]]
    interventions_mesh = pd.merge(all_interventions, 
                               remaining_unmapped_possible["mesh_interventions_per_study"],
                               how='left',
                               left_on=['nct_id'],
                               right_on = ['nct_id'])
    
    metamap_possibilities = remaining_unmapped_possible["all_metamapped_interventions"][["intervention_input", "intervention_CURIE_id", "intervention_CURIE_name", "intervention_semantic_type"]]
    interventions_mesh_metamap = pd.merge(interventions_mesh, 
                                       metamap_possibilities,
                                       how='left',
                                       left_on=['downcase_name'],
                                       right_on = ['intervention_input'])
    
    unmapped_interventions_possible_terms = interventions_mesh_metamap[interventions_mesh_metamap['downcase_name'].isin(ct_terms["unmapped_interventions"])]
    unmapped_interventions_possible_terms = unmapped_interventions_possible_terms.drop('intervention_input', axis=1) # drop the redundant column now
          
        
    """   Output all to TSVs   """    
    pd.Series(ct_terms["unmapped_conditions"]).to_csv('unmapped_conditions.tsv', sep="\t", index=False, header=False) # convert the list to a pandas series, then output to TSV
    pd.Series(ct_terms["unmapped_interventions"]).to_csv('unmapped_interventions.tsv', sep="\t", index=False, header=False) # convert the list to a pandas series, then output to TSV
    ct_terms["mapped_conditions"].to_csv('mapped_conditions.tsv', sep="\t", index=False)
    ct_terms["mapped_interventions"].to_csv('mapped_interventions.tsv', sep="\t", index=False)
    unmapped_conditions_possible_terms.to_csv('unmapped_conditions_possible_mappings.tsv', sep="\t", index=False)
    unmapped_interventions_possible_terms.to_csv('unmapped_interventions_possible_mappings.tsv', sep="\t", index=False)
    



In [None]:
# def test_or_prod():
#     print("The test run of this code performs the construction of the KG on a subset of 200 Conditions and 200 Interventions from Clinical Trials.\n")
#     test_or_prod = input("Is this a test run or the production of a new version of the KG? Write T for test, or P for production: ")
#     if test_or_prod == "T":
#         flag_and_path = get_raw_ct_data() # uncomment for production
#         flag_and_path["term_program_flag"] = False
#         run_ETL_mapping(flag_and_path)
#     elif test_or_prod == "P":
#         flag_and_path = get_raw_ct_data() 
#         run_ETL_mapping(flag_and_path)
#     else:
#         print("Bad input")
#         sys.exit(0)
        

        
        

In [None]:
# def run_ETL_mapping(flag_and_path):
#     df_dict = read_raw_ct_data(flag_and_path)
#     ct_terms = exact_match_mesh(df_dict)
#     ct_terms = inexact_match_mesh(df_dict, ct_terms)

#     # pull the available MeSH terms per study out of the returned ct_terms dict 
#     mesh_conditions_per_study = ct_terms["mesh_conditions_per_study"]
#     mesh_interventions_per_study = ct_terms["mesh_interventions_per_study"]

#     ct_terms = term_list_to_nr(df_dict, ct_terms)
#     ct_terms = term_list_to_mm(df_dict, ct_terms)

#     # pull the available UMLS terms per study out of the returned ct_terms dict 
#     all_metamapped_conditions = ct_terms["all_metamapped_conditions"]
#     all_metamapped_interventions = ct_terms["all_metamapped_interventions"]

#     remaining_unmapped_possible = {"mesh_conditions_per_study": mesh_conditions_per_study,
#                                    "mesh_interventions_per_study": mesh_interventions_per_study,
#                                    "all_metamapped_conditions": all_metamapped_conditions,
#                                    "all_metamapped_interventions": all_metamapped_interventions}
#     compile_and_output(df_dict, ct_terms, remaining_unmapped_possible)


    

In [92]:
interventions_alts = pd.read_csv('/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/08_21_2023_extracted' + '/intervention_other_names.txt', sep='|', index_col=False, header=0)
interventions_alts['alt_downcase_name'] = interventions_alts['name'].str.lower()

interventions_alts

Unnamed: 0,id,nct_id,intervention_id,name,alt_downcase_name
0,27584249,NCT01738191,54313664,Strattera,strattera
1,27584250,NCT01737879,54313666,Omontys,omontys
2,27428744,NCT04545502,54003364,Gelsoft Plus,gelsoft plus
3,27584251,NCT01737879,54313667,Epogen,epogen
4,27273339,NCT04571879,53672522,Nebulized Xylocaine,nebulized xylocaine
...,...,...,...,...,...
387960,27583600,NCT03192215,54313417,Eliquis,eliquis
387961,27583601,NCT03192215,54313418,Aspirin Tablet,aspirin tablet
387962,27583602,NCT03052608,54313422,PF-06463922,pf-06463922
387963,27583603,NCT03052608,54313423,Xalkori,xalkori


In [93]:
interventions_df = pd.read_csv('/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/08_21_2023_extracted' + '/interventions.txt', sep='|', index_col=False, header=0)
interventions_df['orig_downcase_name'] = interventions_df['name'].str.lower()

interventions_df

Unnamed: 0,id,nct_id,intervention_type,name,description,orig_downcase_name
0,54367098,NCT03454451,Drug,CPI-006 + ciforadenant,Selected dose of CPI-006 administered intraven...,cpi-006 + ciforadenant
1,54367099,NCT03454451,Drug,CPI-006 + pembrolizumab,Selected dose of CPI-006 in combination with p...,cpi-006 + pembrolizumab
2,54367100,NCT00089843,Drug,Testosterone,Testosterone patch 150mcg daily,testosterone
3,54367101,NCT00089843,Drug,Actonel (risedronate),Actonel (risedronate) 35mg PO one time weekly,actonel (risedronate)
4,54367102,NCT00089843,Drug,Placebo Actonel (risedronate),Placebo tablet identical in appearance to acti...,placebo actonel (risedronate)
...,...,...,...,...,...,...
786249,54367034,NCT01191411,Other,Mailed invitations for FIT test kits,Mailed invitations for the non-invasive immuno...,mailed invitations for fit test kits
786250,54367035,NCT01191411,Other,Mailed invitations for a colonoscopy,These patients will be mailed invitations to d...,mailed invitations for a colonoscopy
786251,54367036,NCT01191411,Other,Visit Based Care,Visit based standard care at John Peter Smith ...,visit based care
786252,54367037,NCT03707873,Other,Education,Education + Medication adherence monitoring + ...,education


In [95]:
interventions_all = pd.merge(interventions_df[["id", "nct_id", "intervention_type", "orig_downcase_name", "description"]], interventions_alts[["nct_id", "intervention_id", "alt_downcase_name"]], how='left', left_on=['id', 'nct_id'], right_on = ['intervention_id', 'nct_id'])
interventions_all = interventions_all.sort_values(by='nct_id', ascending=False, na_position='first')

print(len(interventions_all))
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(interventions_all[2000:4000])
    

955979


Unnamed: 0,id,nct_id,intervention_type,orig_downcase_name,description,intervention_id,alt_downcase_name
2000,54052315,NCT01115322,Drug,no treatment with uva and uvb irradiation,Each subject will be exposed to a blank patch ...,,
2001,54367377,NCT03150407,Other,sham control ring,"Self-insertion of Sham Ring for 7 days, follow...",54367377.0,sham
2002,54367377,NCT03150407,Other,sham control ring,"Self-insertion of Sham Ring for 7 days, follow...",54367377.0,sham ring
2003,54367377,NCT03150407,Other,sham control ring,"Self-insertion of Sham Ring for 7 days, follow...",54367377.0,control
2004,53738058,NCT01786499,Behavioral,relaxation response training,,,
2005,53895303,NCT00843193,Drug,fluticasone propionate,Subjects will be supplied fluticasone propiona...,,
2006,53895304,NCT00829621,Other,75 mmhg suction,Incisional Vacuum Assisted Closure (IVAC) Devi...,,
2007,53895305,NCT00829621,Other,125 mmhg,Incisional Vacuum Assisted Closure (IVAC) Devi...,,
2008,53895306,NCT01157078,Drug,tc-5214,"Tablet, oral, twice daily for 8 weeks",,
2009,53895307,NCT01157078,Drug,placebo,"Tablet, oral, twice daily for 8 weeks",,


Unnamed: 0,id,nct_id,intervention_type,orig_downcase_name,description,intervention_id,alt_downcase_name
119713,54238820,NCT05983913,Behavioral,cognitive-motor training,The cognitive-motor training program consists ...,,
119715,54238822,NCT05983900,Procedure,brix3000,chemo-mechanical caries removal agent,,
119716,54238823,NCT05983900,Procedure,papacarie,chemo-mechanical caries removal agent,,
119717,54238824,NCT05983900,Procedure,hand excavation,mechanical caries removal,,
119718,54238825,NCT05983887,Other,therapeutic climbing,The protocol consists of using an in-door clim...,,
119722,54238826,NCT05983874,Biological,"bg505 sosip.664 gp140 vaccine, adjuvanted (3m-...",100µg Month 0 and Month 3,,
119723,54238827,NCT05983861,Diagnostic Test,identification o multi drug resistant bacteria,Identification o Multi Drug Resistant Bacteria,,
119724,54238828,NCT05983848,Radiation,99mmaracticaltide imaging,7.3.1 99mTc-maraciclatide imaging This will be...,,
119725,54238829,NCT05983835,Procedure,patients receive subsegmentectomy,Patients receive lobectomy,,
119726,54238830,NCT05983822,Device,technological group,Specific rehabilitation for the recovery of ha...,54238830.0,"amadeo® (tyromotion, austria)"


In [None]:
test[~test['Media'].str.contains('Site')]

In [88]:
test1 = pd.merge(interventions_df, interventions_alts[["nct_id", "intervention_id", "name"]], how='left', left_on=['id'], right_on = ['intervention_id'])
print(len(test))

955979


In [13]:
df_dict["conditions"]

NameError: name 'df_dict' is not defined

In [None]:
start_metamap_servers(metamap_dirs)
mm = MetaMap.get_instance(metamap_dirs["metamap_base_dir"] + metamap_dirs["metamap_bin_dir"])
condition_semtype_restriction = ['acab,anab,cgab,comd,dsyn,inpo,mobd,neop,patf,clna,fndg']  # see https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt for semantic types ("acab,anab,etc.")

params = {"restrict_to_sts":condition_semtype_restriction, "term_processing":True, "ignore_word_order":True, "relaxed_model":True, "strict_model":False}

terms = ['infarction, myocardial', 'aneurysm', 'diabetes', 'common cold', 'fracture', 'juice blend']

concepts,error = mm.extract_concepts(terms,
                                     term_processing = params["term_processing"],
                                     ignore_word_order = params["ignore_word_order"],
#                                      relaxed_model = params["relaxed_model"]
#                                      restrict_to_sts=params["restrict_to_sts"]
                                    strict_model = params["strict_model"]
                                    )
for concept in concepts:
    print(concept)
    print("\n")
stop_metamap_servers(metamap_dirs)

In [12]:
# flag_and_path = get_raw_ct_data() # uncomment for production
flag_and_path = {'term_program_flag': False,
                 'data_extracted_path': '/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/08_21_2023_extracted',
                 'date_string':'08_21_2023'} # comment for production
metamap_dirs = check_os()
df_dict = read_raw_ct_data(flag_and_path)
# term_list_to_mm(df_dict, flag_and_path)

# # pull the available UMLS terms per study out of the returned ct_terms dict 
# all_metamapped_conditions = ct_terms["all_metamapped_conditions"]
# all_metamapped_interventions = ct_terms["all_metamapped_interventions"]

# remaining_unmapped_possible = {"mesh_conditions_per_study": mesh_conditions_per_study,
#                                "mesh_interventions_per_study": mesh_interventions_per_study,
#                                "all_metamapped_conditions": all_metamapped_conditions,
#                                "all_metamapped_interventions": all_metamapped_interventions}
# compile_and_output(df_dict, ct_terms, remaining_unmapped_possible)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/Kamileh/Work/ISB/NCATS_BiomedicalTranslator/Projects/ClinicalTrials/ETL_Python/data/08_21_2023_extracted/intervention_other_names.txt.txt'

In [None]:
def convert_seconds_to_hms(seconds):

    """ converts the elapsed or run_time to hours, min, sec """
    hours = seconds // 3600
    seconds %= 3600
    minutes = seconds // 60
    seconds %= 60
    return hours, minutes, seconds

current = dt.datetime.now()
ts = dt.datetime.timestamp(current)
d = dt.datetime.fromtimestamp(ts)
str_date_time = d.strftime("%d-%m-%Y, %H:%M:%S")
print("Timestamp of script start: {}".format(str_date_time))

start_time = time.time()
end_time = time.time()
elapsed_time = end_time - start_time
hours, minutes, seconds = convert_seconds_to_hms(elapsed_time)
print(f"Runtime: {hours} hours, {minutes} minutes, {seconds} seconds")

In [None]:
remaining_unmapped_possible