## This notebook constructs the KG/TSVs for Clinical Trials Using Gwenlyn's analysis

In [1]:
# Stretch notebook cells and rendered outputs to the full browser width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))

# Print the value of every top-level expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import pathlib
import re
import numpy as np

#### Load Gwen's TSV, clean the Conditions and Interventions/Treatments columns, and expand them to one row per NCT ID, condition, and treatment

In [2]:
# Read the gzipped, tab-separated trials table; the first line holds the column
# names and no column is used as the index.
trials_list = pd.read_csv(
    "trials_list.txt.gz",
    sep="\t",
    header=0,
    index_col=False,
)
trials_list

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,n_conditions,n_interventions,is_fda_regulated_drug,conditions,exp_only,ctr_only,both
0,NCT01049399,Treatment,Parallel Assignment,2009-12,1,3,?,Progressive Supranuclear Palsy|1.000,tideglusib,placebo,
1,NCT00807209,Treatment,Parallel Assignment,2008-12,1,5,?,Postoperative Pain|-0.092,High Dose SKY0402|Low Dose SKY0402,Placebo,Bupivacaine via epidural|Fentanyl via PCA
2,NCT05287724,Treatment,Crossover Assignment,2022-6-19,2,2,t,Pruritus|1.000|Skin Disorder|-0.091,N-acetyl cysteine|Placebo,,
3,NCT01432236,Treatment,Crossover Assignment,2011-10,1,2,?,Fibromyalgia|1.000,Pregabalin,placebo,
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,1,1,t,Relapsing Multiple Sclerosis|1.000,CNM-Au8,,
...,...,...,...,...,...,...,...,...,...,...,...
137111,NCT06310811,Treatment,Single Group Assignment,2024-3-7,2,1,f,Safety|1.000|Effective|-0.048,RD06-04 Cells injection,,
137112,NCT02375672,Treatment,Single Group Assignment,2015-5-28,1,2,?,Colorectal Cancer|1.000,Pembrolizumab|mFOLFOX6,,
137113,NCT04252118,Treatment,Parallel Assignment,2020-1-27,1,1,f,COVID-19|1.000,MSCs,,
137114,NCT04264533,Treatment,Parallel Assignment,2020-2-14,3,2,f,"Vitamin C|1.000|Pneumonia, Viral|0.462|Pneumon...",VC,Sterile Water for Injection,


In [3]:
# trials_list_check = trials_list[trials_list['ctr_only'].isnull()] # sometimes exp_only or ctr_only have nulls
# trials_list_check = trials_list[trials_list['exp_only'].isnull()]

# trials_list_check

In [4]:
# A condition whose score is strictly above this value is treated as "high score".
HIGH_SCORE_THRESHOLD = 0.9

# Every parsed score in encounter order (consumed by the currently commented-out histogram cell).
scores_list = []
# Matches a token that consists solely of an optionally signed decimal number.
regexp = re.compile(r"^[-+]?[0-9]*\.?[0-9]+$")

# Wide view of the "name|score|name|score|..." strings, displayed for inspection only.
conditions = trials_list["conditions"].str.split("|", expand=True)
conditions
high_score_conditions = []
low_score_conditions = []
columns = conditions.columns.tolist()

# Walk each trial's own token list instead of scanning every cell of the padded
# wide frame with iterrows()/.loc: same pairs in the same order, far less work.
for index, tokens in trials_list["conditions"].str.split("|").items():
    if not isinstance(tokens, list):
        # Missing conditions string (NaN): nothing to parse for this trial.
        continue
    # Start at 1: a score needs a preceding token to name its condition, so a
    # numeric-looking first token can never be a score.
    for position in range(1, len(tokens)):
        value = tokens[position]
        if not regexp.search(value):
            continue
        condition = tokens[position - 1]
        score = float(value)
        scores_list.append(score)
        if score > HIGH_SCORE_THRESHOLD:
            high_score_conditions.append([index, condition])
        else:
            low_score_conditions.append([index, condition])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,348,349,350,351,352,353,354,355,356,357
0,Progressive Supranuclear Palsy,1.000,,,,,,,,,...,,,,,,,,,,
1,Postoperative Pain,-0.092,,,,,,,,,...,,,,,,,,,,
2,Pruritus,1.000,Skin Disorder,-0.091,,,,,,,...,,,,,,,,,,
3,Fibromyalgia,1.000,,,,,,,,,...,,,,,,,,,,
4,Relapsing Multiple Sclerosis,1.000,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137111,Safety,1.000,Effective,-0.048,,,,,,,...,,,,,,,,,,
137112,Colorectal Cancer,1.000,,,,,,,,,...,,,,,,,,,,
137113,COVID-19,1.000,,,,,,,,,...,,,,,,,,,,
137114,Vitamin C,1.000,"Pneumonia, Viral",0.462,"Pneumonia, Ventilator-Associated",0.071,,,,,...,,,,,,,,,,


In [5]:
# Peek at the last ten [trial row label, condition name] pairs whose score exceeded 0.9.
high_score_conditions[-10:]

[[137101, 'Provoked, Localized Vulvodynia'],
 [137105, 'Acute Myeloid Leukemia'],
 [137106, 'Discoid Lupus Erythematosus'],
 [137108, 'Postoperative Pain'],
 [137108, 'Oocyte Retrieval'],
 [137110, 'Allergic Conjunctivitis'],
 [137111, 'Safety'],
 [137112, 'Colorectal Cancer'],
 [137113, 'COVID-19'],
 [137114, 'Vitamin C']]

#### Histogram of condition scores

In [6]:
# import matplotlib.pyplot as plt

# from matplotlib.ticker import PercentFormatter

# # data = [1000, 1000, 5000, 3000, 4000, 16000, 2000]

# plt.hist(scores_list, weights=np.ones(len(scores_list)) / len(scores_list), bins=20, edgecolor='black')

# plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
# plt.xlim(0.5, 1)
# plt.show()


In [7]:
# Re-display the raw trials table for reference before it is expanded per condition.
trials_list

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,n_conditions,n_interventions,is_fda_regulated_drug,conditions,exp_only,ctr_only,both
0,NCT01049399,Treatment,Parallel Assignment,2009-12,1,3,?,Progressive Supranuclear Palsy|1.000,tideglusib,placebo,
1,NCT00807209,Treatment,Parallel Assignment,2008-12,1,5,?,Postoperative Pain|-0.092,High Dose SKY0402|Low Dose SKY0402,Placebo,Bupivacaine via epidural|Fentanyl via PCA
2,NCT05287724,Treatment,Crossover Assignment,2022-6-19,2,2,t,Pruritus|1.000|Skin Disorder|-0.091,N-acetyl cysteine|Placebo,,
3,NCT01432236,Treatment,Crossover Assignment,2011-10,1,2,?,Fibromyalgia|1.000,Pregabalin,placebo,
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,1,1,t,Relapsing Multiple Sclerosis|1.000,CNM-Au8,,
...,...,...,...,...,...,...,...,...,...,...,...
137111,NCT06310811,Treatment,Single Group Assignment,2024-3-7,2,1,f,Safety|1.000|Effective|-0.048,RD06-04 Cells injection,,
137112,NCT02375672,Treatment,Single Group Assignment,2015-5-28,1,2,?,Colorectal Cancer|1.000,Pembrolizumab|mFOLFOX6,,
137113,NCT04252118,Treatment,Parallel Assignment,2020-1-27,1,1,f,COVID-19|1.000,MSCs,,
137114,NCT04264533,Treatment,Parallel Assignment,2020-2-14,3,2,f,"Vitamin C|1.000|Pneumonia, Viral|0.462|Pneumon...",VC,Sterile Water for Injection,


In [8]:
def _rows_for_scored_conditions(trials, scored_conditions, predicate):
    """Return one copy of a trial row per (row label, condition name) pair.

    Parameters
    ----------
    trials : pandas.DataFrame
        Trials table; its index labels are the ones stored in the pairs.
    scored_conditions : list of [row label, condition name]
        Pairs collected while parsing the ``conditions`` strings.
    predicate : str
        Value written to the ``predicate`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        The columns of ``trials`` followed by ``condition`` and ``predicate``,
        on a fresh 0..n-1 index. An empty pair list yields an empty frame
        with those columns instead of raising.
    """
    row_labels = [pair[0] for pair in scored_conditions]
    condition_names = [pair[1] for pair in scored_conditions]
    # One vectorised label lookup replaces the per-pair .loc[idx].tolist() loop.
    expanded = trials.loc[row_labels].reset_index(drop=True)
    expanded["condition"] = condition_names
    expanded["predicate"] = predicate
    return expanded


#   ---    ----  high score conditions   ----    ---   #
ct_high = _rows_for_scored_conditions(
    trials_list, high_score_conditions, "in_clinical_trials_for"
)

#   ---    ----  low score conditions   ----    ---   #
ct_low = _rows_for_scored_conditions(
    trials_list, low_score_conditions, "mentioned_in_clinical_trials_for"
)

# Show the first ten rows of each table without truncating rows, columns or cell text.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):
    display(ct_high[:10])
    display(ct_low[:10])
len(ct_high)
len(ct_low)

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,n_conditions,n_interventions,is_fda_regulated_drug,conditions,exp_only,ctr_only,both,condition,predicate
0,NCT01049399,Treatment,Parallel Assignment,2009-12,1,3,?,Progressive Supranuclear Palsy|1.000,tideglusib,placebo,,Progressive Supranuclear Palsy,in_clinical_trials_for
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,2,2,t,Pruritus|1.000|Skin Disorder|-0.091,N-acetyl cysteine|Placebo,,,Pruritus,in_clinical_trials_for
2,NCT01432236,Treatment,Crossover Assignment,2011-10,1,2,?,Fibromyalgia|1.000,Pregabalin,placebo,,Fibromyalgia,in_clinical_trials_for
3,NCT04626921,Treatment,Single Group Assignment,2020-10-22,1,1,t,Relapsing Multiple Sclerosis|1.000,CNM-Au8,,,Relapsing Multiple Sclerosis,in_clinical_trials_for
4,NCT04681417,Treatment,Parallel Assignment,2021-3-25,1,7,f,Retinoblastoma|1.000,Melphalan or Melphalan + Topotecan,"Carboplatin administered on Day 1|etoposide, carboplatin and vincristine or local ophthalmological treatment without IV chemotherapy",Cryotherapy (local treatment)|Intravitreal Melphalan chemotherapy injections (local treatment)|Iodine-125 plaques (local treatment)|Thermotherapy (local treatment),Retinoblastoma,in_clinical_trials_for
5,NCT04036227,Treatment,Parallel Assignment,2019-7-3,1,2,f,Healthy|1.000,GS-248,Placebo,,Healthy,in_clinical_trials_for
6,NCT05624918,Treatment,Single Group Assignment,2024-4,2,3,f,Pancreatic Adenocarcinoma|1.000|Resectable Pancreatic Cancer|0.566,Gemcitabine|Nab paclitaxel|NovoTTF-200T(P),,,Pancreatic Adenocarcinoma,in_clinical_trials_for
7,NCT02886065,Prevention,Parallel Assignment,2017-3-7,1,4,t,Smoldering Multiple Myeloma|1.000,Citarinostat|Hiltonol|Lenalidomide|PVX-410,,,Smoldering Multiple Myeloma,in_clinical_trials_for
8,NCT03683576,Treatment,Parallel Assignment,2018-10-22,1,2,t,Asthma|1.000,GB001,Placebo,,Asthma,in_clinical_trials_for
9,NCT00738673,Treatment,Single Group Assignment,2008-7,1,1,?,Prostate Cancer|1.000,degarelix,,,Prostate Cancer,in_clinical_trials_for


Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,n_conditions,n_interventions,is_fda_regulated_drug,conditions,exp_only,ctr_only,both,condition,predicate
0,NCT00807209,Treatment,Parallel Assignment,2008-12,1,5,?,Postoperative Pain|-0.092,High Dose SKY0402|Low Dose SKY0402,Placebo,Bupivacaine via epidural|Fentanyl via PCA,Postoperative Pain,mentioned_in_clinical_trials_for
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,2,2,t,Pruritus|1.000|Skin Disorder|-0.091,N-acetyl cysteine|Placebo,,,Skin Disorder,mentioned_in_clinical_trials_for
2,NCT05624918,Treatment,Single Group Assignment,2024-4,2,3,f,Pancreatic Adenocarcinoma|1.000|Resectable Pancreatic Cancer|0.566,Gemcitabine|Nab paclitaxel|NovoTTF-200T(P),,,Resectable Pancreatic Cancer,mentioned_in_clinical_trials_for
3,NCT01003288,Prevention,Single Group Assignment,2009-10,1,1,?,Healthy|-0.400,Adjuvanted influenza H1N1split virion vaccine,,,Healthy,mentioned_in_clinical_trials_for
4,NCT06257693,Treatment,Single Group Assignment,2024-2,1,1,t,Prostate Adenocarcinoma|0.033,enzalutamide,,,Prostate Adenocarcinoma,mentioned_in_clinical_trials_for
5,NCT00003494,Treatment,Single Group Assignment,1996-3-26,1,1,t,Bronchial Alveolar; Tumor|0.567,Antineoplaston therapy (Atengenal + Astugenal),,,Bronchial Alveolar; Tumor,mentioned_in_clinical_trials_for
6,NCT06096909,Treatment,Parallel Assignment,2023-11-1,2,3,f,Acute Coronary Syndrome|1.000|Non ST Segment Elevation Acute Coronary Syndrome|0.279,Tafolecimab,Cholesterol Absorption Inhibitor,Statin,Non ST Segment Elevation Acute Coronary Syndrome,mentioned_in_clinical_trials_for
7,NCT00012012,Treatment,Single Group Assignment,2001-8,2,4,?,Cervical Cancer|1.000|Radiation Toxicity|0.242,Amifostine trihydrate|Cisplatin|External beam radiation therapy|Intracavitary brachytherapy,,,Radiation Toxicity,mentioned_in_clinical_trials_for
8,NCT05377203,Treatment,Crossover Assignment,2022-7-13,2,2,f,Hypertension|1.000|Arterial Hypertension|0.304,Dual combination of standard dose therapy→ Quadruple combination of half doses therapy|Quadruple combination of half doses therapy→Dual combination of standard dose therapy,,,Arterial Hypertension,mentioned_in_clinical_trials_for
9,NCT05762107,Prevention,Crossover Assignment,2023-7-28,1,4,t,Type 1 Diabetes Mellitus With Hypoglycemia|0.733,"Placebo|ZT-01, 15 mg|ZT-01, 22 mg|ZT-01, 7 mg",,,Type 1 Diabetes Mellitus With Hypoglycemia,mentioned_in_clinical_trials_for


104829

131559

In [9]:
def _explode_arm(pairs, arm_col, other_arm_col):
    """Expand one arm's '|'-separated interventions into one row per intervention.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Trial/condition rows holding ``arm_col``, ``other_arm_col`` and ``both``.
    arm_col : str
        Column whose '|'-separated text is split into individual treatments.
    other_arm_col : str
        The other arm's column; dropped together with ``both``.

    Returns
    -------
    pandas.DataFrame
        ``pairs`` without ``other_arm_col`` and ``both``, with a trailing
        ``treatment`` column and one row per split value. Rows whose
        ``arm_col`` is missing keep a single missing ``treatment``.
    """
    arm = pairs.drop([other_arm_col, "both"], axis=1).reset_index(drop=True)
    arm["treatment"] = arm[arm_col].str.split('|')
    return arm.explode("treatment")


ct = pd.concat([ct_high, ct_low], axis=0)
print(len(ct))

# NOTE(review): interventions listed in the "both" column are dropped here and
# never become treatments - confirm this is intended.
ct_exp_only = _explode_arm(ct, "exp_only", "ctr_only")

ct_ctr_only = _explode_arm(ct, "ctr_only", "exp_only")
# Control-arm interventions always get the weaker predicate, whatever the condition score.
ct_ctr_only["predicate"] = "mentioned_in_clinical_trials_for"

ct = pd.concat([ct_exp_only, ct_ctr_only], axis=0)
# An arm with no interventions produces a missing treatment; discard those rows.
ct = ct.dropna(subset=["treatment"])
ct = ct.drop(["conditions", "exp_only", "ctr_only", "n_conditions", "n_interventions"], axis=1).reset_index(drop=True)
print(len(ct))

with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[-10:])

236388
544550


Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment
544540,NCT01996384,Treatment,Parallel Assignment,2013-11,?,Provoked Vestibulodynia,mentioned_in_clinical_trials_for,Non-classical acupuncture
544541,NCT01996384,Treatment,Parallel Assignment,2013-11,?,Vulvar Vestibulitis,mentioned_in_clinical_trials_for,Non-classical acupuncture
544542,NCT01247064,Treatment,Parallel Assignment,2010-10,?,"Bronchiolitis, Viral",mentioned_in_clinical_trials_for,Nebulized 0.9% Normal Saline
544543,NCT01247064,Treatment,Parallel Assignment,2010-10,?,"Saline Solution, Hypertonic",mentioned_in_clinical_trials_for,Nebulized 0.9% Normal Saline
544544,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Carboplatin
544545,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Gemcitabine
544546,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Paclitaxel
544547,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Pemetrexed
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection


In [10]:
# List of interventions to block 
undesirable_interventions = [
    "placebo", "standard of care", "laboratory biomarker analysis",
    "questionnaire", "standard treatment",
    "data collection", "educational intervention",
    "intervention group", "training", "management of therapy complications", 
    "contingency management", "active control", "experimental group",
    " sham ", "sham intervention", "active comparator",
    "patient navigation", "self-management", "quality of life",
    "treatment group", "study", "routine care",
]

# Escape each term so it is matched as literal text even if a future entry
# contains regex metacharacters, then OR the terms into a single pattern.
undesirable_pattern = '|'.join(re.escape(term) for term in undesirable_interventions)


def _drop_undesirable_rows(frame, source_col, lowered_col, pattern):
    """Add a lower-cased copy of ``source_col`` and drop blocked or missing rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to filter; it is not modified.
    source_col : str
        Text column to lower-case and test.
    lowered_col : str
        Name of the lower-cased column added to the result.
    pattern : str
        Regular expression; a row is dropped when its lower-cased text contains a match.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only rows whose lower-cased text is present and
        does not match ``pattern``, with ``lowered_col`` appended.
    """
    lowered = frame[source_col].str.lower()
    # na=False keeps the mask strictly boolean when the text is missing;
    # missing text is excluded explicitly through notna().
    keep = lowered.notna() & ~lowered.str.contains(pattern, na=False)
    # assign() builds a new frame, so later column writes do not hit a slice.
    return frame.loc[keep].assign(**{lowered_col: lowered.loc[keep]})


ct = _drop_undesirable_rows(ct, "treatment", "treatment_lower", undesirable_pattern)
print(len(ct))

# The same block list is applied to condition names as well.
ct = _drop_undesirable_rows(ct, "condition", "condition_lower", undesirable_pattern)
print(len(ct))

with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[:10])

470185
469833


Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,treatment_lower,condition_lower
0,NCT01049399,Treatment,Parallel Assignment,2009-12,?,Progressive Supranuclear Palsy,in_clinical_trials_for,tideglusib,tideglusib,progressive supranuclear palsy
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,t,Pruritus,in_clinical_trials_for,N-acetyl cysteine,n-acetyl cysteine,pruritus
3,NCT01432236,Treatment,Crossover Assignment,2011-10,?,Fibromyalgia,in_clinical_trials_for,Pregabalin,pregabalin,fibromyalgia
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,t,Relapsing Multiple Sclerosis,in_clinical_trials_for,CNM-Au8,cnm-au8,relapsing multiple sclerosis
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,melphalan or melphalan + topotecan,retinoblastoma
6,NCT04036227,Treatment,Parallel Assignment,2019-7-3,f,Healthy,in_clinical_trials_for,GS-248,gs-248,healthy
7,NCT05624918,Treatment,Single Group Assignment,2024-4,f,Pancreatic Adenocarcinoma,in_clinical_trials_for,Gemcitabine,gemcitabine,pancreatic adenocarcinoma
8,NCT05624918,Treatment,Single Group Assignment,2024-4,f,Pancreatic Adenocarcinoma,in_clinical_trials_for,Nab paclitaxel,nab paclitaxel,pancreatic adenocarcinoma
9,NCT05624918,Treatment,Single Group Assignment,2024-4,f,Pancreatic Adenocarcinoma,in_clinical_trials_for,NovoTTF-200T(P),novottf-200t(p),pancreatic adenocarcinoma
10,NCT02886065,Prevention,Parallel Assignment,2017-3-7,t,Smoldering Multiple Myeloma,in_clinical_trials_for,Citarinostat,citarinostat,smoldering multiple myeloma


In [11]:
# Optional debugging export of the intermediate table (disabled).
# ct.to_csv("ct_check.tsv", sep='\t', index=False, header=True)
# Inspect the dtype of every column in ct.
ct.dtypes

nct_id                   object
primary_purpose          object
intervention_model       object
start_month_year         object
is_fda_regulated_drug    object
condition                object
predicate                object
treatment                object
treatment_lower          object
condition_lower          object
dtype: object

In [12]:
# For both text fields, derive the text with every enclosed group removed
# ("outside") and the content of the first enclosed group ("inside"), first
# for parentheses (suffix "p") and then for square brackets (suffix "b").
enclosure_patterns = (
    ("p", r"\((.*?)\)"),
    ("b", r'\[([^\]]+)\]'),
)
for suffix, enclosure_regex in enclosure_patterns:
    for field in ("condition", "treatment"):
        lowered_text = ct[f"{field}_lower"]
        ct[f"{field}_outside_{suffix}"] = [re.sub(enclosure_regex, '', str(text)) for text in lowered_text]
        ct[f"{field}_inside_{suffix}"] = lowered_text.str.extract(enclosure_regex, expand=True)

# Trim leading/trailing whitespace in every object (text) column.
ct = ct.apply(lambda column: column.str.strip() if column.dtype == "object" else column)
ct = ct[ct['condition_lower'].notna()]
ct = ct[ct['treatment_lower'].notna()]

# Separators on which names are broken into parts, escaped for use inside a regex.
split_chars = [re.escape(separator) for separator in [",", "+", "/", " and ", "&", " or "]]

def split_column(df, column, split_chars):
    """Split a text column on any of several separators into numbered columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the text column to split.
    split_chars : list of str
        Separator patterns; they are joined with '|' into one split pattern.

    Returns
    -------
    pandas.DataFrame
        One column per split position, named ``{column}_1``, ``{column}_2``, ...
    """
    separator_pattern = '|'.join(split_chars)
    pieces = df[column].str.split(separator_pattern, expand=True)
    # Number the resulting columns from 1, prefixed with the source column name.
    pieces.columns = [f'{column}_{position}' for position in range(1, pieces.shape[1] + 1)]
    return pieces

# Break each derived outside/inside column into its separator-delimited parts.
(
    split_con_outside_p, split_con_inside_p, split_trmnt_outside_p, split_trmnt_inside_p,
    split_con_outside_b, split_con_inside_b, split_trmnt_outside_b, split_trmnt_inside_b,
) = [
    split_column(ct, source_col, split_chars)
    for source_col in (
        'condition_outside_p', 'condition_inside_p', 'treatment_outside_p', 'treatment_inside_p',
        'condition_outside_b', 'condition_inside_b', 'treatment_outside_b', 'treatment_inside_b',
    )
]

# Lay all part columns side by side, in the same order as above.
split_parts = pd.concat(
    [
        split_con_outside_p, split_con_inside_p, split_trmnt_outside_p, split_trmnt_inside_p,
        split_con_outside_b, split_con_inside_b, split_trmnt_outside_b, split_trmnt_inside_b,
    ],
    axis=1,
)
# Represent missing parts as None rather than NaN (just for consistency).
split_parts = split_parts.fillna(np.nan).replace([np.nan], [None])

# Put the full lower-cased names first, followed by their parts.
split_df = pd.concat([ct["condition_lower"], ct["treatment_lower"], split_parts], axis=1)

# Trim leading/trailing whitespace in every object (text) column.
split_df = split_df.apply(lambda column: column.str.strip() if column.dtype == "object" else column)

with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(split_df[:10])

Unnamed: 0,condition_lower,treatment_lower,condition_outside_p_1,condition_outside_p_2,condition_outside_p_3,condition_outside_p_4,condition_outside_p_5,condition_outside_p_6,condition_outside_p_7,condition_outside_p_8,condition_outside_p_9,condition_outside_p_10,condition_outside_p_11,condition_outside_p_12,condition_inside_p_1,condition_inside_p_2,condition_inside_p_3,condition_inside_p_4,condition_inside_p_5,condition_inside_p_6,treatment_outside_p_1,treatment_outside_p_2,treatment_outside_p_3,treatment_outside_p_4,treatment_outside_p_5,treatment_outside_p_6,treatment_outside_p_7,treatment_outside_p_8,treatment_outside_p_9,treatment_outside_p_10,treatment_outside_p_11,treatment_outside_p_12,treatment_outside_p_13,treatment_outside_p_14,treatment_outside_p_15,treatment_outside_p_16,treatment_inside_p_1,treatment_inside_p_2,treatment_inside_p_3,treatment_inside_p_4,treatment_inside_p_5,treatment_inside_p_6,treatment_inside_p_7,treatment_inside_p_8,treatment_inside_p_9,condition_outside_b_1,condition_outside_b_2,condition_outside_b_3,condition_outside_b_4,condition_outside_b_5,condition_outside_b_6,condition_outside_b_7,condition_outside_b_8,condition_outside_b_9,condition_outside_b_10,condition_outside_b_11,condition_outside_b_12,condition_inside_b_1,treatment_outside_b_1,treatment_outside_b_2,treatment_outside_b_3,treatment_outside_b_4,treatment_outside_b_5,treatment_outside_b_6,treatment_outside_b_7,treatment_outside_b_8,treatment_outside_b_9,treatment_outside_b_10,treatment_outside_b_11,treatment_outside_b_12,treatment_outside_b_13,treatment_outside_b_14,treatment_outside_b_15,treatment_outside_b_16,treatment_inside_b_1,treatment_inside_b_2,treatment_inside_b_3,treatment_inside_b_4,treatment_inside_b_5
0,progressive supranuclear palsy,tideglusib,progressive supranuclear palsy,,,,,,,,,,,,,,,,,,tideglusib,,,,,,,,,,,,,,,,,,,,,,,,,progressive supranuclear palsy,,,,,,,,,,,,,tideglusib,,,,,,,,,,,,,,,,,,,,
1,pruritus,n-acetyl cysteine,pruritus,,,,,,,,,,,,,,,,,,n-acetyl cysteine,,,,,,,,,,,,,,,,,,,,,,,,,pruritus,,,,,,,,,,,,,n-acetyl cysteine,,,,,,,,,,,,,,,,,,,,
3,fibromyalgia,pregabalin,fibromyalgia,,,,,,,,,,,,,,,,,,pregabalin,,,,,,,,,,,,,,,,,,,,,,,,,fibromyalgia,,,,,,,,,,,,,pregabalin,,,,,,,,,,,,,,,,,,,,
4,relapsing multiple sclerosis,cnm-au8,relapsing multiple sclerosis,,,,,,,,,,,,,,,,,,cnm-au8,,,,,,,,,,,,,,,,,,,,,,,,,relapsing multiple sclerosis,,,,,,,,,,,,,cnm-au8,,,,,,,,,,,,,,,,,,,,
5,retinoblastoma,melphalan or melphalan + topotecan,retinoblastoma,,,,,,,,,,,,,,,,,,melphalan,melphalan,topotecan,,,,,,,,,,,,,,,,,,,,,,,retinoblastoma,,,,,,,,,,,,,melphalan,melphalan,topotecan,,,,,,,,,,,,,,,,,,
6,healthy,gs-248,healthy,,,,,,,,,,,,,,,,,,gs-248,,,,,,,,,,,,,,,,,,,,,,,,,healthy,,,,,,,,,,,,,gs-248,,,,,,,,,,,,,,,,,,,,
7,pancreatic adenocarcinoma,gemcitabine,pancreatic adenocarcinoma,,,,,,,,,,,,,,,,,,gemcitabine,,,,,,,,,,,,,,,,,,,,,,,,,pancreatic adenocarcinoma,,,,,,,,,,,,,gemcitabine,,,,,,,,,,,,,,,,,,,,
8,pancreatic adenocarcinoma,nab paclitaxel,pancreatic adenocarcinoma,,,,,,,,,,,,,,,,,,nab paclitaxel,,,,,,,,,,,,,,,,,,,,,,,,,pancreatic adenocarcinoma,,,,,,,,,,,,,nab paclitaxel,,,,,,,,,,,,,,,,,,,,
9,pancreatic adenocarcinoma,novottf-200t(p),pancreatic adenocarcinoma,,,,,,,,,,,,,,,,,,novottf-200t,,,,,,,,,,,,,,,,p,,,,,,,,,pancreatic adenocarcinoma,,,,,,,,,,,,,novottf-200t(p),,,,,,,,,,,,,,,,,,,,
10,smoldering multiple myeloma,citarinostat,smoldering multiple myeloma,,,,,,,,,,,,,,,,,,citarinostat,,,,,,,,,,,,,,,,,,,,,,,,,smoldering multiple myeloma,,,,,,,,,,,,,citarinostat,,,,,,,,,,,,,,,,,,,,


In [13]:
# Within each original row, keep only the first occurrence of every value and
# blank out the repeats. Rows are turned into columns so duplicated() can run
# over one row's values at a time, then the frame is turned back.
# NOTE(review): the de-duplication spans ALL columns of a row, so a
# treatment_lower equal to the row's condition_lower is blanked and the row is
# then removed by the filter below - presumably why split_df ends up shorter
# than ct; confirm this is intended.
split_t = split_df.T.apply(lambda row_values: row_values.mask(row_values.duplicated(), None))
split_df = split_t.T

has_condition = split_df['condition_lower'].notna()
has_treatment = split_df['treatment_lower'].notna()
split_df = split_df[has_condition & has_treatment]

with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(split_df[:10])

Unnamed: 0,condition_lower,treatment_lower,condition_outside_p_1,condition_outside_p_2,condition_outside_p_3,condition_outside_p_4,condition_outside_p_5,condition_outside_p_6,condition_outside_p_7,condition_outside_p_8,condition_outside_p_9,condition_outside_p_10,condition_outside_p_11,condition_outside_p_12,condition_inside_p_1,condition_inside_p_2,condition_inside_p_3,condition_inside_p_4,condition_inside_p_5,condition_inside_p_6,treatment_outside_p_1,treatment_outside_p_2,treatment_outside_p_3,treatment_outside_p_4,treatment_outside_p_5,treatment_outside_p_6,treatment_outside_p_7,treatment_outside_p_8,treatment_outside_p_9,treatment_outside_p_10,treatment_outside_p_11,treatment_outside_p_12,treatment_outside_p_13,treatment_outside_p_14,treatment_outside_p_15,treatment_outside_p_16,treatment_inside_p_1,treatment_inside_p_2,treatment_inside_p_3,treatment_inside_p_4,treatment_inside_p_5,treatment_inside_p_6,treatment_inside_p_7,treatment_inside_p_8,treatment_inside_p_9,condition_outside_b_1,condition_outside_b_2,condition_outside_b_3,condition_outside_b_4,condition_outside_b_5,condition_outside_b_6,condition_outside_b_7,condition_outside_b_8,condition_outside_b_9,condition_outside_b_10,condition_outside_b_11,condition_outside_b_12,condition_inside_b_1,treatment_outside_b_1,treatment_outside_b_2,treatment_outside_b_3,treatment_outside_b_4,treatment_outside_b_5,treatment_outside_b_6,treatment_outside_b_7,treatment_outside_b_8,treatment_outside_b_9,treatment_outside_b_10,treatment_outside_b_11,treatment_outside_b_12,treatment_outside_b_13,treatment_outside_b_14,treatment_outside_b_15,treatment_outside_b_16,treatment_inside_b_1,treatment_inside_b_2,treatment_inside_b_3,treatment_inside_b_4,treatment_inside_b_5
0,progressive supranuclear palsy,tideglusib,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,pruritus,n-acetyl cysteine,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,fibromyalgia,pregabalin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,relapsing multiple sclerosis,cnm-au8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,retinoblastoma,melphalan or melphalan + topotecan,,,,,,,,,,,,,,,,,,,melphalan,,topotecan,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,healthy,gs-248,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,pancreatic adenocarcinoma,gemcitabine,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,pancreatic adenocarcinoma,nab paclitaxel,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,pancreatic adenocarcinoma,novottf-200t(p),,,,,,,,,,,,,,,,,,,novottf-200t,,,,,,,,,,,,,,,,p,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10,smoldering multiple myeloma,citarinostat,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [14]:
# Compare row counts: ct versus split_df after per-row de-duplication and null filtering.
len(ct)
len(split_df)

469833

469034

In [15]:
# Sanity check: list rows of split_df whose treatment_lower is missing.
# Rows with a null treatment_lower were already filtered out of split_df in the
# de-duplication cell, so this selection is expected to be empty.
test = split_df[split_df['treatment_lower'].isnull()]
test

Unnamed: 0,condition_lower,treatment_lower,condition_outside_p_1,condition_outside_p_2,condition_outside_p_3,condition_outside_p_4,condition_outside_p_5,condition_outside_p_6,condition_outside_p_7,condition_outside_p_8,...,treatment_outside_b_12,treatment_outside_b_13,treatment_outside_b_14,treatment_outside_b_15,treatment_outside_b_16,treatment_inside_b_1,treatment_inside_b_2,treatment_inside_b_3,treatment_inside_b_4,treatment_inside_b_5


In [16]:
# Group split_df's columns by whether their name mentions "condition" or "treatment".
condition_cols = [name for name in split_df.columns if 'condition' in name]
treatment_cols = [name for name in split_df.columns if 'treatment' in name]
condition_cols[:5]
treatment_cols[:5]

['condition_lower',
 'condition_outside_p_1',
 'condition_outside_p_2',
 'condition_outside_p_3',
 'condition_outside_p_4']

['treatment_lower',
 'treatment_outside_p_1',
 'treatment_outside_p_2',
 'treatment_outside_p_3',
 'treatment_outside_p_4']

In [17]:
# For each row, gather its non-missing condition values (and likewise its
# treatment values) into a single list column; stack() skips the missing cells.
for field, source_cols in (("condition", condition_cols), ("treatment", treatment_cols)):
    values_per_row = split_df[source_cols].stack().groupby(level=0).apply(list)
    split_df[f"{field}_collections"] = values_per_row.values.tolist()

In [18]:
# Copy the per-row value lists onto ct (matched on index labels), then discard
# the helper columns whose names contain 'inside', 'outside' or 'lower'.
ct = ct.assign(
    condition_collections=split_df["condition_collections"],
    treatment_collections=split_df["treatment_collections"],
)
helper_markers = ('inside', 'outside', 'lower')
cols_to_keep = [name for name in ct.columns if all(marker not in name for marker in helper_markers)]
ct = ct[cols_to_keep]
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[-10:])

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections
544540,NCT01996384,Treatment,Parallel Assignment,2013-11,?,Provoked Vestibulodynia,mentioned_in_clinical_trials_for,Non-classical acupuncture,[provoked vestibulodynia],[non-classical acupuncture]
544541,NCT01996384,Treatment,Parallel Assignment,2013-11,?,Vulvar Vestibulitis,mentioned_in_clinical_trials_for,Non-classical acupuncture,[vulvar vestibulitis],[non-classical acupuncture]
544542,NCT01247064,Treatment,Parallel Assignment,2010-10,?,"Bronchiolitis, Viral",mentioned_in_clinical_trials_for,Nebulized 0.9% Normal Saline,"[bronchiolitis, viral, bronchiolitis, viral]",[nebulized 0.9% normal saline]
544543,NCT01247064,Treatment,Parallel Assignment,2010-10,?,"Saline Solution, Hypertonic",mentioned_in_clinical_trials_for,Nebulized 0.9% Normal Saline,"[saline solution, hypertonic, saline solution, hypertonic]",[nebulized 0.9% normal saline]
544544,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Carboplatin,[first line non-small cell lung cancer],[carboplatin]
544545,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Gemcitabine,[first line non-small cell lung cancer],[gemcitabine]
544546,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Paclitaxel,[first line non-small cell lung cancer],[paclitaxel]
544547,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Pemetrexed,[first line non-small cell lung cancer],[pemetrexed]
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,"[pneumonia, viral, pneumonia, viral]",[sterile water for injection]
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,"[pneumonia, ventilator-associated, pneumonia, ventilator-associated]",[sterile water for injection]


In [19]:
# Unpack the list-valued candidate columns so each candidate term gets its own row:
# conditions first, then treatments, which yields one row per
# (condition candidate, treatment candidate) pairing. explode() repeats the
# original index label on every row it produces.
ct = (
    ct
    .explode('condition_collections')
    .explode('treatment_collections')
)
len(ct)

795705

In [20]:
# Show the last ten exploded rows with all display truncation switched off.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', None,
):
    display(ct.tail(10))

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections
544544,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Carboplatin,first line non-small cell lung cancer,carboplatin
544545,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Gemcitabine,first line non-small cell lung cancer,gemcitabine
544546,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Paclitaxel,first line non-small cell lung cancer,paclitaxel
544547,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Pemetrexed,first line non-small cell lung cancer,pemetrexed
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, viral",sterile water for injection
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,viral,sterile water for injection
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, ventilator-associated",sterile water for injection
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,ventilator-associated,sterile water for injection


In [21]:
def _keep_if_long_enough(term):
    """Return ``term`` unchanged if its string form is longer than 3 characters, else None.

    A missing value (NaN) stringifies to 'nan' (3 characters), so it is turned into None too.
    """
    return term if len(str(term)) > 3 else None


# Blank out very short candidate terms in both columns, then drop every row
# that is left without a condition or without a treatment candidate.
candidate_columns = ['condition_collections', 'treatment_collections']
for column in candidate_columns:
    ct[column] = ct[column].apply(_keep_if_long_enough)
ct = ct.dropna(subset=candidate_columns)
ct

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections
0,NCT01049399,Treatment,Parallel Assignment,2009-12,?,Progressive Supranuclear Palsy,in_clinical_trials_for,tideglusib,progressive supranuclear palsy,tideglusib
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,t,Pruritus,in_clinical_trials_for,N-acetyl cysteine,pruritus,n-acetyl cysteine
3,NCT01432236,Treatment,Crossover Assignment,2011-10,?,Fibromyalgia,in_clinical_trials_for,Pregabalin,fibromyalgia,pregabalin
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,t,Relapsing Multiple Sclerosis,in_clinical_trials_for,CNM-Au8,relapsing multiple sclerosis,cnm-au8
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,retinoblastoma,melphalan or melphalan + topotecan
...,...,...,...,...,...,...,...,...,...,...
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,viral,sterile water for injection
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, ventilator-associated",sterile water for injection
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection


In [22]:
# Spot-check ten rows from the middle of the frame (selected by position, not
# by index label) with display truncation switched off.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', None,
):
    display(ct.iloc[8700:8710])

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections
6084,NCT06012760,Treatment,Parallel Assignment,2024-4-1,f,Cardiac Surgery,in_clinical_trials_for,"Iron sucrose, Human Erythropoietin Injection, Vitamin C",cardiac surgery,human erythropoietin injection
6084,NCT06012760,Treatment,Parallel Assignment,2024-4-1,f,Cardiac Surgery,in_clinical_trials_for,"Iron sucrose, Human Erythropoietin Injection, Vitamin C",cardiac surgery,vitamin c
6085,NCT03676309,Treatment,Parallel Assignment,2017-9-1,f,"Diabetes Mellitus, Type 2",in_clinical_trials_for,"Nutriceutical Oral Capsule,","diabetes mellitus, type 2","nutriceutical oral capsule,"
6085,NCT03676309,Treatment,Parallel Assignment,2017-9-1,f,"Diabetes Mellitus, Type 2",in_clinical_trials_for,"Nutriceutical Oral Capsule,","diabetes mellitus, type 2",nutriceutical oral capsule
6085,NCT03676309,Treatment,Parallel Assignment,2017-9-1,f,"Diabetes Mellitus, Type 2",in_clinical_trials_for,"Nutriceutical Oral Capsule,",diabetes mellitus,"nutriceutical oral capsule,"
6085,NCT03676309,Treatment,Parallel Assignment,2017-9-1,f,"Diabetes Mellitus, Type 2",in_clinical_trials_for,"Nutriceutical Oral Capsule,",diabetes mellitus,nutriceutical oral capsule
6085,NCT03676309,Treatment,Parallel Assignment,2017-9-1,f,"Diabetes Mellitus, Type 2",in_clinical_trials_for,"Nutriceutical Oral Capsule,",type 2,"nutriceutical oral capsule,"
6085,NCT03676309,Treatment,Parallel Assignment,2017-9-1,f,"Diabetes Mellitus, Type 2",in_clinical_trials_for,"Nutriceutical Oral Capsule,",type 2,nutriceutical oral capsule
6086,NCT04015492,Treatment,Crossover Assignment,2019-8-8,t,Hemophilia A,in_clinical_trials_for,"Damoctocog-alfa-pegol (BAY94-9027, Jivi)",hemophilia a,"damoctocog-alfa-pegol (bay94-9027, jivi)"
6086,NCT04015492,Treatment,Crossover Assignment,2019-8-8,t,Hemophilia A,in_clinical_trials_for,"Damoctocog-alfa-pegol (BAY94-9027, Jivi)",hemophilia a,damoctocog-alfa-pegol


#### Map CURIEs from mapper to ct dataframe

In [23]:
# Load the term -> CURIE mapping cache. Everything is read as text, and malformed
# lines are skipped silently (on_bad_lines="skip").
# Then: discard rows whose score is the literal 'unscored', convert the remaining
# scores to numbers (anything unparseable becomes NaN), and order the rows by
# term and score, both descending.
mapping_cache = (
    pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, dtype="object", on_bad_lines="skip")
    .loc[lambda cache: cache['score'] != 'unscored']
    .assign(score=lambda cache: pd.to_numeric(cache["score"], errors='coerce'))
    .sort_values(by=['clintrial_term', 'score'], ascending=[False, False])
)


In [24]:
# For every (clinical-trial term, term type) pair, keep only the candidate row(s)
# whose score equals that pair's best score. The aggregation is requested by its
# string alias 'max' instead of the Python builtin `max`: recent pandas versions
# emit a FutureWarning for the builtin and will stop translating it to the
# optimized groupby max.
best_score_per_term = mapping_cache.groupby(['clintrial_term', 'term_type'])['score'].transform('max')
idx = best_score_per_term == mapping_cache['score']
max_scores = mapping_cache[idx]

# Discard best matches scoring below 70.
# NOTE(review): 70 looks like a confidence cut-off chosen for the mapper's score scale - confirm.
max_scores = max_scores[max_scores.score >= 70]

# Split the surviving best matches by the kind of term they were mapped for.
conditions_maxscores = max_scores[max_scores.term_type == "condition"]
interventions_maxscores = max_scores[max_scores.term_type == "intervention"]
alt_intervention_maxscores = max_scores[max_scores.term_type == "alternate_intervention"]

In [25]:
# Build term -> mapper-response lookups, one per term type. If a term has several
# rows (tied best scores), the last row encountered wins.
conditions_dict = dict(zip(conditions_maxscores.clintrial_term, conditions_maxscores.mapping_tool_response))
interventions_dict = dict(zip(interventions_maxscores.clintrial_term, interventions_maxscores.mapping_tool_response))
# Keys and values must both come from the alternate-intervention rows. Pairing the
# alternate terms with `interventions_maxscores` responses (as before) matched each
# term with the response of an unrelated row.
alt_intervention_dict = dict(zip(alt_intervention_maxscores.clintrial_term, alt_intervention_maxscores.mapping_tool_response))

In [26]:
# Attach the mapper response to every candidate term; terms absent from the
# lookup dictionaries come back as NaN.
curie_lookups = (
    ('condition_curie_info', 'condition_collections', conditions_dict),
    ('treatment_curie_info', 'treatment_collections', interventions_dict),
)
for target_column, term_column, lookup in curie_lookups:
    ct[target_column] = ct[term_column].map(lookup)
ct

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections,condition_curie_info,treatment_curie_info
0,NCT01049399,Treatment,Parallel Assignment,2009-12,?,Progressive Supranuclear Palsy,in_clinical_trials_for,tideglusib,progressive supranuclear palsy,tideglusib,{'mapped_name': 'Progressive supranuclear pals...,"{'mapped_name': 'Tideglusib', 'mapped_curie': ..."
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,t,Pruritus,in_clinical_trials_for,N-acetyl cysteine,pruritus,n-acetyl cysteine,"{'mapped_name': 'Pruritus', 'mapped_curie': 'C...","{'mapped_name': 'acetylcysteine', 'mapped_curi..."
3,NCT01432236,Treatment,Crossover Assignment,2011-10,?,Fibromyalgia,in_clinical_trials_for,Pregabalin,fibromyalgia,pregabalin,"{'mapped_name': 'Fibromyalgia', 'mapped_curie'...","{'mapped_name': 'pregabalin', 'mapped_curie': ..."
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,t,Relapsing Multiple Sclerosis,in_clinical_trials_for,CNM-Au8,relapsing multiple sclerosis,cnm-au8,"{'mapped_name': 'Multiple Sclerosis', 'mapped_...",
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,retinoblastoma,melphalan or melphalan + topotecan,"{'mapped_name': 'Retinoblastoma', 'mapped_curi...",
...,...,...,...,...,...,...,...,...,...,...,...,...
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection,"{'mapped_name': 'Pneumonia', 'mapped_curie': '...","{'mapped_name': 'Sterile Water for Injection',..."
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,viral,sterile water for injection,,"{'mapped_name': 'Sterile Water for Injection',..."
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, ventilator-associated",sterile water for injection,"{'mapped_name': 'Pneumonia, Ventilator-Associa...","{'mapped_name': 'Sterile Water for Injection',..."
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection,"{'mapped_name': 'Pneumonia', 'mapped_curie': '...","{'mapped_name': 'Sterile Water for Injection',..."


In [27]:
# Keep only rows where both the condition and the treatment received a mapping.
ct = ct.dropna(subset=["condition_curie_info", "treatment_curie_info"])

# Remove rows whose subject and object coincide, either by their original trial
# text or by their complete mapper response.
# NOTE(review): the second test compares the whole response value, not just the
# CURIE inside it - confirm that is the intended notion of "equal".
text_differs = ct['condition'] != ct['treatment']
mapping_differs = ct['condition_curie_info'] != ct['treatment_curie_info']
ct = ct[text_differs & mapping_differs]
ct

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections,condition_curie_info,treatment_curie_info
0,NCT01049399,Treatment,Parallel Assignment,2009-12,?,Progressive Supranuclear Palsy,in_clinical_trials_for,tideglusib,progressive supranuclear palsy,tideglusib,{'mapped_name': 'Progressive supranuclear pals...,"{'mapped_name': 'Tideglusib', 'mapped_curie': ..."
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,t,Pruritus,in_clinical_trials_for,N-acetyl cysteine,pruritus,n-acetyl cysteine,"{'mapped_name': 'Pruritus', 'mapped_curie': 'C...","{'mapped_name': 'acetylcysteine', 'mapped_curi..."
3,NCT01432236,Treatment,Crossover Assignment,2011-10,?,Fibromyalgia,in_clinical_trials_for,Pregabalin,fibromyalgia,pregabalin,"{'mapped_name': 'Fibromyalgia', 'mapped_curie'...","{'mapped_name': 'pregabalin', 'mapped_curie': ..."
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,retinoblastoma,melphalan,"{'mapped_name': 'Retinoblastoma', 'mapped_curi...","{'mapped_name': 'melphalan', 'mapped_curie': '..."
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,retinoblastoma,topotecan,"{'mapped_name': 'Retinoblastoma', 'mapped_curi...","{'mapped_name': 'topotecan', 'mapped_curie': '..."
...,...,...,...,...,...,...,...,...,...,...,...,...
544547,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Pemetrexed,first line non-small cell lung cancer,pemetrexed,{'mapped_name': 'Non-Small Cell Lung Carcinoma...,"{'mapped_name': 'pemetrexed', 'mapped_curie': ..."
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, viral",sterile water for injection,"{'mapped_name': 'Pneumonia, Viral', 'mapped_cu...","{'mapped_name': 'Sterile Water for Injection',..."
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection,"{'mapped_name': 'Pneumonia', 'mapped_curie': '...","{'mapped_name': 'Sterile Water for Injection',..."
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, ventilator-associated",sterile water for injection,"{'mapped_name': 'Pneumonia, Ventilator-Associa...","{'mapped_name': 'Sterile Water for Injection',..."


In [28]:
# Candidate condition terms that are dropped outright. Matching is exact, against
# the lower-cased candidate strings in `condition_collections`.
conditions_to_exclude = [
    "chronic",
    "viral",
    "vaccines",
    "virus diseases",
    "health volunteers",
    "healthy volunteers",
    "safety",
    "severe",
    "intravenous",
    "injectable",
    "liver",
    "healthy",
    "healthy men",
    "healthy women",
    "systemic",
]
is_excluded = ct['condition_collections'].isin(conditions_to_exclude)
ct = ct.loc[~is_excluded]
len(ct)

# Preview the first ten remaining rows with display truncation switched off.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', None,
):
    display(ct.head(10))

336390

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections,condition_curie_info,treatment_curie_info
0,NCT01049399,Treatment,Parallel Assignment,2009-12,?,Progressive Supranuclear Palsy,in_clinical_trials_for,tideglusib,progressive supranuclear palsy,tideglusib,"{'mapped_name': 'Progressive supranuclear palsy', 'mapped_curie': 'C0038868', 'mapped_score': '5.18', 'mapped_semtypes': '[dsyn]'}","{'mapped_name': 'Tideglusib', 'mapped_curie': 'C3273375', 'mapped_score': '5.18', 'mapped_semtypes': '[orch,phsu]'}"
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,t,Pruritus,in_clinical_trials_for,N-acetyl cysteine,pruritus,n-acetyl cysteine,"{'mapped_name': 'Pruritus', 'mapped_curie': 'C0033774', 'mapped_score': '5.18', 'mapped_semtypes': '[fndg]'}","{'mapped_name': 'acetylcysteine', 'mapped_curie': 'C0001047', 'mapped_score': '5.18', 'mapped_semtypes': '[aapp,phsu]'}"
3,NCT01432236,Treatment,Crossover Assignment,2011-10,?,Fibromyalgia,in_clinical_trials_for,Pregabalin,fibromyalgia,pregabalin,"{'mapped_name': 'Fibromyalgia', 'mapped_curie': 'C0016053', 'mapped_score': '5.18', 'mapped_semtypes': '[dsyn]'}","{'mapped_name': 'pregabalin', 'mapped_curie': 'C0657912', 'mapped_score': '5.18', 'mapped_semtypes': '[aapp,phsu]'}"
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,retinoblastoma,melphalan,"{'mapped_name': 'Retinoblastoma', 'mapped_curie': 'C0035335', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'melphalan', 'mapped_curie': 'C0025241', 'mapped_score': '5.18', 'mapped_semtypes': '[aapp,phsu]'}"
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,retinoblastoma,topotecan,"{'mapped_name': 'Retinoblastoma', 'mapped_curie': 'C0035335', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'topotecan', 'mapped_curie': 'C0146224', 'mapped_score': '5.18', 'mapped_semtypes': '[orch,phsu]'}"
7,NCT05624918,Treatment,Single Group Assignment,2024-4,f,Pancreatic Adenocarcinoma,in_clinical_trials_for,Gemcitabine,pancreatic adenocarcinoma,gemcitabine,"{'mapped_name': 'Adenocarcinoma of pancreas', 'mapped_curie': 'C0281361', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'gemcitabine', 'mapped_curie': 'C0045093', 'mapped_score': '5.18', 'mapped_semtypes': '[nnon,phsu]'}"
9,NCT05624918,Treatment,Single Group Assignment,2024-4,f,Pancreatic Adenocarcinoma,in_clinical_trials_for,NovoTTF-200T(P),pancreatic adenocarcinoma,novottf-200t,"{'mapped_name': 'Adenocarcinoma of pancreas', 'mapped_curie': 'C0281361', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'NovoTTF-200A Device', 'mapped_curie': 'UMLS:C4683819', 'mapped_score': 14.063435, 'mapped_semtypes': 'biolink:Device'}"
10,NCT02886065,Prevention,Parallel Assignment,2017-3-7,t,Smoldering Multiple Myeloma,in_clinical_trials_for,Citarinostat,smoldering multiple myeloma,citarinostat,"{'mapped_name': 'Smoldering myeloma', 'mapped_curie': 'C1531608', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'Citarinostat', 'mapped_curie': 'C4724808', 'mapped_score': '5.18', 'mapped_semtypes': '[orch,phsu]'}"
11,NCT02886065,Prevention,Parallel Assignment,2017-3-7,t,Smoldering Multiple Myeloma,in_clinical_trials_for,Hiltonol,smoldering multiple myeloma,hiltonol,"{'mapped_name': 'Smoldering myeloma', 'mapped_curie': 'C1531608', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'hiltonol', 'mapped_curie': 'C2698842', 'mapped_score': '5.18', 'mapped_semtypes': '[phsu]'}"
12,NCT02886065,Prevention,Parallel Assignment,2017-3-7,t,Smoldering Multiple Myeloma,in_clinical_trials_for,Lenalidomide,smoldering multiple myeloma,lenalidomide,"{'mapped_name': 'Smoldering myeloma', 'mapped_curie': 'C1531608', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'lenalidomide', 'mapped_curie': 'C1144149', 'mapped_score': '5.18', 'mapped_semtypes': '[orch,phsu]'}"


In [32]:
# # check ct dataframe before tacking on other trial-related information
# ct.to_csv("ct_check.tsv", sep='\t', index=False, header=True)

### Add other trial information

In [31]:
# Add a ClinicalTrials.gov link per row. assign() returns a fresh frame, so the
# new column is never written into a slice of an earlier frame.
ct = ct.assign(
    study_url=ct['nct_id'].map(lambda nct_id: f'https://www.clinicaltrials.gov/study/{nct_id}')
)
ct

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections,condition_curie_info,treatment_curie_info,study_url
0,NCT01049399,Treatment,Parallel Assignment,2009-12,?,Progressive Supranuclear Palsy,in_clinical_trials_for,tideglusib,progressive supranuclear palsy,tideglusib,{'mapped_name': 'Progressive supranuclear pals...,"{'mapped_name': 'Tideglusib', 'mapped_curie': ...",https://www.clinicaltrials.gov/study/NCT01049399
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,t,Pruritus,in_clinical_trials_for,N-acetyl cysteine,pruritus,n-acetyl cysteine,"{'mapped_name': 'Pruritus', 'mapped_curie': 'C...","{'mapped_name': 'acetylcysteine', 'mapped_curi...",https://www.clinicaltrials.gov/study/NCT05287724
3,NCT01432236,Treatment,Crossover Assignment,2011-10,?,Fibromyalgia,in_clinical_trials_for,Pregabalin,fibromyalgia,pregabalin,"{'mapped_name': 'Fibromyalgia', 'mapped_curie'...","{'mapped_name': 'pregabalin', 'mapped_curie': ...",https://www.clinicaltrials.gov/study/NCT01432236
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,retinoblastoma,melphalan,"{'mapped_name': 'Retinoblastoma', 'mapped_curi...","{'mapped_name': 'melphalan', 'mapped_curie': '...",https://www.clinicaltrials.gov/study/NCT04681417
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,retinoblastoma,topotecan,"{'mapped_name': 'Retinoblastoma', 'mapped_curi...","{'mapped_name': 'topotecan', 'mapped_curie': '...",https://www.clinicaltrials.gov/study/NCT04681417
...,...,...,...,...,...,...,...,...,...,...,...,...,...
544547,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Pemetrexed,first line non-small cell lung cancer,pemetrexed,{'mapped_name': 'Non-Small Cell Lung Carcinoma...,"{'mapped_name': 'pemetrexed', 'mapped_curie': ...",https://www.clinicaltrials.gov/study/NCT02576574
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, viral",sterile water for injection,"{'mapped_name': 'Pneumonia, Viral', 'mapped_cu...","{'mapped_name': 'Sterile Water for Injection',...",https://www.clinicaltrials.gov/study/NCT04264533
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection,"{'mapped_name': 'Pneumonia', 'mapped_curie': '...","{'mapped_name': 'Sterile Water for Injection',...",https://www.clinicaltrials.gov/study/NCT04264533
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, ventilator-associated",sterile water for injection,"{'mapped_name': 'Pneumonia, Ventilator-Associa...","{'mapped_name': 'Sterile Water for Injection',...",https://www.clinicaltrials.gov/study/NCT04264533


In [50]:
# Read the additional pipe-delimited tables that sit in the extracted data directory.
# NOTE(review): this is an absolute path on one machine; consider moving it to a
# configurable data directory so the notebook runs elsewhere.
data_extracted = "/Users/kamhome/all_projects/ISB/Translator/Projects/ClinicalTrials/ETL_Python/data/6zy655l6br1i6q5o2h32k4h9wjx8"

# Shared parser settings: '|' separator, first line is the header, no index
# column, every value kept as a Python object (no dtype inference).
pipe_table_options = dict(sep='|', dtype=object, index_col=False, header=0)

study_references = pd.read_csv(data_extracted + '/study_references.txt', **pipe_table_options)
studies = pd.read_csv(data_extracted + '/studies.txt', **pipe_table_options)
calculated_values = pd.read_csv(data_extracted + '/calculated_values.txt', **pipe_table_options)


In [47]:
# Show the raw reference table first (the notebook is configured to display
# every bare expression in a cell, not only the last one).
study_references

# Build a PubMed link only for references that actually carry a PMID.
# na_action='ignore' leaves missing PMIDs as missing instead of formatting them
# into a bogus ".../nan" URL.
study_references["result_url"] = study_references['pmid'].map(
    lambda pmid: f'https://pubmed.ncbi.nlm.nih.gov/{pmid}',
    na_action='ignore',
)

# Collect all available PubMed links for each study into one list per nct_id.
# References without a link are left out, so a study whose references all lack
# a PMID has no row here.
study_results_agg = (
    study_references
    .dropna(subset=["result_url"])
    .groupby("nct_id")["result_url"]
    .apply(list)
    .to_frame()
)
study_results_agg

Unnamed: 0,id,nct_id,pmid,reference_type,citation
0,87323472,NCT00624156,21678181,derived,"Mosher CE, Duhamel KN, Lam J, Dickler M, Li Y,..."
1,87323473,NCT05396586,35106729,background,"Pahor A, Mester RE, Carrillo AA, Ghil E, Reime..."
2,87323474,NCT05396586,30367386,background,"Pahor A, Stavropoulos T, Jaeggi SM, Seitz AR. ..."
3,87323475,NCT05396586,17852284,background,"Royle J, Lincoln NB. The Everyday Memory Quest..."
4,87323476,NCT05396586,34485810,background,"Pahor A, Collins C, Smith RN, Moon A, Stavropo..."
...,...,...,...,...,...
871607,87323467,NCT00624078,16197974,background,"Vazquez H, Chavez-Haro A, Garcia-Ubbelohde W, ..."
871608,87323468,NCT00624078,6516334,background,"Likes K, Banner W Jr, Chavez M. Centruroides e..."
871609,87323469,NCT00624078,17015284,background,"Lai MW, Klein-Schwartz W, Rodgers GC, Abrams J..."
871610,87323470,NCT00624078,10533010,background,"Gibly R, Williams M, Walter FG, McNally J, Con..."


Unnamed: 0_level_0,result_url
nct_id,Unnamed: 1_level_1
NCT00000112,"[https://pubmed.ncbi.nlm.nih.gov/35235641, htt..."
NCT00000113,"[https://pubmed.ncbi.nlm.nih.gov/12657584, htt..."
NCT00000114,"[https://pubmed.ncbi.nlm.nih.gov/8512476, http..."
NCT00000115,[https://pubmed.ncbi.nlm.nih.gov/8684794]
NCT00000116,"[https://pubmed.ncbi.nlm.nih.gov/8512476, http..."
...,...
NCT06366048,"[https://pubmed.ncbi.nlm.nih.gov/31145929, htt..."
NCT06366100,"[https://pubmed.ncbi.nlm.nih.gov/34098412, htt..."
NCT06366412,"[https://pubmed.ncbi.nlm.nih.gov/34418401, htt..."
NCT06366425,"[https://pubmed.ncbi.nlm.nih.gov/18395077, htt..."


In [49]:
# attach relevant dates

# Select nct_id plus every `studies` column whose name contains the substring
# "date". This is a plain substring test, so it also picks up `updated_at`.
date_cols = [col for col in studies.columns if 'date' in col]
cols = ["nct_id", *date_cols]
studies_dates = studies[cols]
studies_dates

# Left-join the duration / results-reporting fields onto the dates, keeping
# every row of `studies_dates` even when `calculated_values` has no match.
reporting_fields = calculated_values[
    ["nct_id", "actual_duration", "were_results_reported", "months_to_report_results"]
]
all_dates = studies_dates.merge(reporting_fields, how='left', on='nct_id')
all_dates
all_dates.dtypes

Unnamed: 0,nct_id,nlm_download_date_description,study_first_submitted_date,results_first_submitted_date,disposition_first_submitted_date,last_update_submitted_date,study_first_submitted_qc_date,study_first_posted_date,study_first_posted_date_type,results_first_submitted_qc_date,...,last_update_posted_date,last_update_posted_date_type,start_date_type,start_date,verification_date,completion_date_type,completion_date,primary_completion_date_type,primary_completion_date,updated_at
0,NCT00214500,,2005-09-13,2018-08-10,2010-08-17,2018-10-01,2005-09-14,2005-09-22,Estimate,2018-08-10,...,2018-10-30,Actual,Actual,2006-01-02,2018-10-31,Actual,2008-01-29,Actual,2008-01-29,2024-04-13 15:18:37.689771
1,NCT04002726,,2019-06-19,,,2022-08-18,2019-06-28,2019-07-01,Actual,,...,2022-08-22,Actual,Actual,2019-07-08,2022-05-31,Actual,2019-07-12,Actual,2019-07-12,2024-04-13 15:18:38.732734
2,NCT03597984,,2018-05-31,,,2019-07-01,2018-07-22,2018-07-24,Actual,,...,2019-07-05,Actual,Anticipated,2019-07-01,2019-07-31,Anticipated,2020-12-28,Actual,2019-07-01,2024-04-13 15:18:39.514159
3,NCT03597906,,2018-06-21,,,2020-01-26,2018-07-22,2018-07-24,Actual,,...,2020-01-28,Actual,Actual,2018-08-15,2020-01-31,Actual,2020-01-20,Actual,2019-11-15,2024-04-13 15:18:40.664541
4,NCT03597828,,2018-06-27,,,2020-06-29,2018-07-21,2018-07-24,Actual,,...,2020-06-30,Actual,Actual,2018-08-05,2020-06-30,Actual,2020-06-30,Actual,2020-06-30,2024-04-13 15:18:41.432351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491081,NCT03593460,,2018-06-07,,,2018-12-31,2018-07-09,2018-07-20,Actual,,...,2019-01-03,Actual,Anticipated,2019-01-01,2018-12-31,Anticipated,2019-12-31,Anticipated,2019-11-01,2024-04-13 15:18:27.19171
491082,NCT03024294,,2017-01-06,,,2018-07-12,2017-01-13,2017-01-18,Estimate,,...,2018-07-16,Actual,Actual,2017-01-31,2018-07-31,Anticipated,2019-12-31,Anticipated,2019-12-31,2024-04-13 15:18:27.880336
491083,NCT03593382,,2012-09-16,,,2018-07-09,2018-07-09,2018-07-20,Actual,,...,2018-07-20,Actual,,2011-11-30,2018-07-31,Actual,2013-12-31,Actual,2013-06-30,2024-04-13 15:18:28.57842
491084,NCT04407078,,2020-05-25,,,2021-01-06,2020-05-25,2020-05-29,Actual,,...,2021-01-08,Actual,Actual,2019-12-02,2021-01-31,Actual,2021-01-06,Actual,2021-01-06,2024-04-13 15:18:29.292283


NameError: name 'calculated_values' is not defined