## This notebook constructs the KG/TSVs for Clinical Trials Using Gwenlyn's analysis

In [1]:
# display cells to maximum width 
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
display(HTML("<style>.output_result { max-width:100% !important; }</style>"))

# lets you preint multiple outputs per cell, not just last
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import pathlib
import re
import numpy as np

#### Get Gwen's TSV, clean Conditions and Interventions/Treatments columns, get each per NCT ID per each row

In [2]:
trials_list = pd.read_csv("trials_list.txt.gz", sep='\t', index_col=False, header=0)
trials_list

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,n_conditions,n_interventions,is_fda_regulated_drug,conditions,exp_only,ctr_only,both
0,NCT01049399,Treatment,Parallel Assignment,2009-12,1,3,?,Progressive Supranuclear Palsy|1.000,tideglusib,placebo,
1,NCT00807209,Treatment,Parallel Assignment,2008-12,1,5,?,Postoperative Pain|-0.092,High Dose SKY0402|Low Dose SKY0402,Placebo,Bupivacaine via epidural|Fentanyl via PCA
2,NCT05287724,Treatment,Crossover Assignment,2022-6-19,2,2,t,Pruritus|1.000|Skin Disorder|-0.091,N-acetyl cysteine|Placebo,,
3,NCT01432236,Treatment,Crossover Assignment,2011-10,1,2,?,Fibromyalgia|1.000,Pregabalin,placebo,
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,1,1,t,Relapsing Multiple Sclerosis|1.000,CNM-Au8,,
...,...,...,...,...,...,...,...,...,...,...,...
137111,NCT06310811,Treatment,Single Group Assignment,2024-3-7,2,1,f,Safety|1.000|Effective|-0.048,RD06-04 Cells injection,,
137112,NCT02375672,Treatment,Single Group Assignment,2015-5-28,1,2,?,Colorectal Cancer|1.000,Pembrolizumab|mFOLFOX6,,
137113,NCT04252118,Treatment,Parallel Assignment,2020-1-27,1,1,f,COVID-19|1.000,MSCs,,
137114,NCT04264533,Treatment,Parallel Assignment,2020-2-14,3,2,f,"Vitamin C|1.000|Pneumonia, Viral|0.462|Pneumon...",VC,Sterile Water for Injection,


In [3]:
# trials_list_check = trials_list[trials_list['ctr_only'].isnull()] # sometimes exp_only or ctr_only have nulls
# trials_list_check = trials_list[trials_list['exp_only'].isnull()]

# trials_list_check

In [4]:
scores_list = []
regexp = re.compile(r"^[-+]?[0-9]*\.?[0-9]+$")
conditions = trials_list["conditions"].str.split("|", expand=True)
conditions
high_score_conditions = []
low_score_conditions = []
columns = conditions.columns.tolist()
for index, row in conditions.iterrows():
    for col in columns:
        value = row[col]
        # print(type(value))
        if not value is None:
            # print(value)
            if regexp.search(value):
                condition = conditions.loc[index,col-1]
                # print(condition)
                score = float(value)
                scores_list.append(score)
                if score > 0.9:
                    high_score_conditions.extend([[index, condition]]) 
                else:
                    low_score_conditions.extend([[index, condition]]) 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,348,349,350,351,352,353,354,355,356,357
0,Progressive Supranuclear Palsy,1.000,,,,,,,,,...,,,,,,,,,,
1,Postoperative Pain,-0.092,,,,,,,,,...,,,,,,,,,,
2,Pruritus,1.000,Skin Disorder,-0.091,,,,,,,...,,,,,,,,,,
3,Fibromyalgia,1.000,,,,,,,,,...,,,,,,,,,,
4,Relapsing Multiple Sclerosis,1.000,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137111,Safety,1.000,Effective,-0.048,,,,,,,...,,,,,,,,,,
137112,Colorectal Cancer,1.000,,,,,,,,,...,,,,,,,,,,
137113,COVID-19,1.000,,,,,,,,,...,,,,,,,,,,
137114,Vitamin C,1.000,"Pneumonia, Viral",0.462,"Pneumonia, Ventilator-Associated",0.071,,,,,...,,,,,,,,,,


In [5]:
high_score_conditions[-10:]

[[137101, 'Provoked, Localized Vulvodynia'],
 [137105, 'Acute Myeloid Leukemia'],
 [137106, 'Discoid Lupus Erythematosus'],
 [137108, 'Postoperative Pain'],
 [137108, 'Oocyte Retrieval'],
 [137110, 'Allergic Conjunctivitis'],
 [137111, 'Safety'],
 [137112, 'Colorectal Cancer'],
 [137113, 'COVID-19'],
 [137114, 'Vitamin C']]

#### HISTOGRAM of scores

In [6]:
# import matplotlib.pyplot as plt

# from matplotlib.ticker import PercentFormatter

# # data = [1000, 1000, 5000, 3000, 4000, 16000, 2000]

# plt.hist(scores_list, weights=np.ones(len(scores_list)) / len(scores_list), bins=20, edgecolor='black')

# plt.gca().yaxis.set_major_formatter(PercentFormatter(1))
# plt.xlim(0.5, 1)
# plt.show()


In [7]:
trials_list

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,n_conditions,n_interventions,is_fda_regulated_drug,conditions,exp_only,ctr_only,both
0,NCT01049399,Treatment,Parallel Assignment,2009-12,1,3,?,Progressive Supranuclear Palsy|1.000,tideglusib,placebo,
1,NCT00807209,Treatment,Parallel Assignment,2008-12,1,5,?,Postoperative Pain|-0.092,High Dose SKY0402|Low Dose SKY0402,Placebo,Bupivacaine via epidural|Fentanyl via PCA
2,NCT05287724,Treatment,Crossover Assignment,2022-6-19,2,2,t,Pruritus|1.000|Skin Disorder|-0.091,N-acetyl cysteine|Placebo,,
3,NCT01432236,Treatment,Crossover Assignment,2011-10,1,2,?,Fibromyalgia|1.000,Pregabalin,placebo,
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,1,1,t,Relapsing Multiple Sclerosis|1.000,CNM-Au8,,
...,...,...,...,...,...,...,...,...,...,...,...
137111,NCT06310811,Treatment,Single Group Assignment,2024-3-7,2,1,f,Safety|1.000|Effective|-0.048,RD06-04 Cells injection,,
137112,NCT02375672,Treatment,Single Group Assignment,2015-5-28,1,2,?,Colorectal Cancer|1.000,Pembrolizumab|mFOLFOX6,,
137113,NCT04252118,Treatment,Parallel Assignment,2020-1-27,1,1,f,COVID-19|1.000,MSCs,,
137114,NCT04264533,Treatment,Parallel Assignment,2020-2-14,3,2,f,"Vitamin C|1.000|Pneumonia, Viral|0.462|Pneumon...",VC,Sterile Water for Injection,


In [8]:
#   ---    ----  low score conditions   ----    ---   #
new_rows = []
for l in high_score_conditions:
    idx = l[0]
    corresponding_row = trials_list.loc[idx].tolist()
    new_row = corresponding_row + [l[1]]
    new_rows.append(new_row)

# first_length = len(new_rows[0])
# Check if all inner lists have the same length
# all(len(sublist) == first_length for sublist in new_rows)

ct_high = pd.DataFrame(new_rows)
cols = trials_list.columns.tolist() + ["condition"]
ct_high.columns = cols
ct_high["predicate"] = "in_clinical_trials_for"

#   ---    ----  high score conditions   ----    ---   #
new_rows = []
for l in low_score_conditions:
    idx = l[0]
    corresponding_row = trials_list.loc[idx].tolist()
    new_row = corresponding_row + [l[1]]
    new_rows.append(new_row)

# first_length = len(new_rows[0])
# Check if all inner lists have the same length
# all(len(sublist) == first_length for sublist in new_rows)

ct_low = pd.DataFrame(new_rows)
cols = trials_list.columns.tolist() + ["condition"]
ct_low.columns = cols
ct_low["predicate"] = "mentioned_in_clinical_trials_for"

with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct_high[:10])
    
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct_low[:10])
len(ct_high)
len(ct_low)

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,n_conditions,n_interventions,is_fda_regulated_drug,conditions,exp_only,ctr_only,both,condition,predicate
0,NCT01049399,Treatment,Parallel Assignment,2009-12,1,3,?,Progressive Supranuclear Palsy|1.000,tideglusib,placebo,,Progressive Supranuclear Palsy,in_clinical_trials_for
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,2,2,t,Pruritus|1.000|Skin Disorder|-0.091,N-acetyl cysteine|Placebo,,,Pruritus,in_clinical_trials_for
2,NCT01432236,Treatment,Crossover Assignment,2011-10,1,2,?,Fibromyalgia|1.000,Pregabalin,placebo,,Fibromyalgia,in_clinical_trials_for
3,NCT04626921,Treatment,Single Group Assignment,2020-10-22,1,1,t,Relapsing Multiple Sclerosis|1.000,CNM-Au8,,,Relapsing Multiple Sclerosis,in_clinical_trials_for
4,NCT04681417,Treatment,Parallel Assignment,2021-3-25,1,7,f,Retinoblastoma|1.000,Melphalan or Melphalan + Topotecan,"Carboplatin administered on Day 1|etoposide, carboplatin and vincristine or local ophthalmological treatment without IV chemotherapy",Cryotherapy (local treatment)|Intravitreal Melphalan chemotherapy injections (local treatment)|Iodine-125 plaques (local treatment)|Thermotherapy (local treatment),Retinoblastoma,in_clinical_trials_for
5,NCT04036227,Treatment,Parallel Assignment,2019-7-3,1,2,f,Healthy|1.000,GS-248,Placebo,,Healthy,in_clinical_trials_for
6,NCT05624918,Treatment,Single Group Assignment,2024-4,2,3,f,Pancreatic Adenocarcinoma|1.000|Resectable Pancreatic Cancer|0.566,Gemcitabine|Nab paclitaxel|NovoTTF-200T(P),,,Pancreatic Adenocarcinoma,in_clinical_trials_for
7,NCT02886065,Prevention,Parallel Assignment,2017-3-7,1,4,t,Smoldering Multiple Myeloma|1.000,Citarinostat|Hiltonol|Lenalidomide|PVX-410,,,Smoldering Multiple Myeloma,in_clinical_trials_for
8,NCT03683576,Treatment,Parallel Assignment,2018-10-22,1,2,t,Asthma|1.000,GB001,Placebo,,Asthma,in_clinical_trials_for
9,NCT00738673,Treatment,Single Group Assignment,2008-7,1,1,?,Prostate Cancer|1.000,degarelix,,,Prostate Cancer,in_clinical_trials_for


Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,n_conditions,n_interventions,is_fda_regulated_drug,conditions,exp_only,ctr_only,both,condition,predicate
0,NCT00807209,Treatment,Parallel Assignment,2008-12,1,5,?,Postoperative Pain|-0.092,High Dose SKY0402|Low Dose SKY0402,Placebo,Bupivacaine via epidural|Fentanyl via PCA,Postoperative Pain,mentioned_in_clinical_trials_for
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,2,2,t,Pruritus|1.000|Skin Disorder|-0.091,N-acetyl cysteine|Placebo,,,Skin Disorder,mentioned_in_clinical_trials_for
2,NCT05624918,Treatment,Single Group Assignment,2024-4,2,3,f,Pancreatic Adenocarcinoma|1.000|Resectable Pancreatic Cancer|0.566,Gemcitabine|Nab paclitaxel|NovoTTF-200T(P),,,Resectable Pancreatic Cancer,mentioned_in_clinical_trials_for
3,NCT01003288,Prevention,Single Group Assignment,2009-10,1,1,?,Healthy|-0.400,Adjuvanted influenza H1N1split virion vaccine,,,Healthy,mentioned_in_clinical_trials_for
4,NCT06257693,Treatment,Single Group Assignment,2024-2,1,1,t,Prostate Adenocarcinoma|0.033,enzalutamide,,,Prostate Adenocarcinoma,mentioned_in_clinical_trials_for
5,NCT00003494,Treatment,Single Group Assignment,1996-3-26,1,1,t,Bronchial Alveolar; Tumor|0.567,Antineoplaston therapy (Atengenal + Astugenal),,,Bronchial Alveolar; Tumor,mentioned_in_clinical_trials_for
6,NCT06096909,Treatment,Parallel Assignment,2023-11-1,2,3,f,Acute Coronary Syndrome|1.000|Non ST Segment Elevation Acute Coronary Syndrome|0.279,Tafolecimab,Cholesterol Absorption Inhibitor,Statin,Non ST Segment Elevation Acute Coronary Syndrome,mentioned_in_clinical_trials_for
7,NCT00012012,Treatment,Single Group Assignment,2001-8,2,4,?,Cervical Cancer|1.000|Radiation Toxicity|0.242,Amifostine trihydrate|Cisplatin|External beam radiation therapy|Intracavitary brachytherapy,,,Radiation Toxicity,mentioned_in_clinical_trials_for
8,NCT05377203,Treatment,Crossover Assignment,2022-7-13,2,2,f,Hypertension|1.000|Arterial Hypertension|0.304,Dual combination of standard dose therapy→ Quadruple combination of half doses therapy|Quadruple combination of half doses therapy→Dual combination of standard dose therapy,,,Arterial Hypertension,mentioned_in_clinical_trials_for
9,NCT05762107,Prevention,Crossover Assignment,2023-7-28,1,4,t,Type 1 Diabetes Mellitus With Hypoglycemia|0.733,"Placebo|ZT-01, 15 mg|ZT-01, 22 mg|ZT-01, 7 mg",,,Type 1 Diabetes Mellitus With Hypoglycemia,mentioned_in_clinical_trials_for


104829

131559

In [9]:
ct = pd.concat([ct_high, ct_low], axis=0)
print(len(ct))

ct_exp_only = ct.copy()
ct_exp_only = ct_exp_only.drop(["ctr_only", "both"], axis=1).reset_index(drop=True)
ct_exp_only["exp_only_copy"] = ct_exp_only["exp_only"]
ct_exp_only.exp_only_copy = ct_exp_only.exp_only_copy.str.split('|')
ct_exp_only = ct_exp_only.explode('exp_only_copy')
ct_exp_only = ct_exp_only.rename(columns={'exp_only_copy': 'treatment'})

ct_ctr_only = ct.copy()
ct_ctr_only = ct_ctr_only.drop(["exp_only", "both"], axis=1).reset_index(drop=True)
ct_ctr_only["ctr_only_copy"] = ct_ctr_only["ctr_only"]
ct_ctr_only.ctr_only_copy = ct_ctr_only.ctr_only_copy.str.split('|')
ct_ctr_only = ct_ctr_only.explode('ctr_only_copy')
ct_ctr_only = ct_ctr_only.rename(columns={'ctr_only_copy': 'treatment'})
ct_ctr_only["predicate"] = "mentioned_in_clinical_trials_for"

ct = pd.concat([ct_exp_only, ct_ctr_only], axis=0)
ct.dropna(subset=['treatment'], inplace=True)
ct = ct[~ct['treatment'].isnull()]
ct = ct.drop(["conditions", "exp_only", "ctr_only", "n_conditions", "n_interventions"], axis=1).reset_index(drop=True)
print(len(ct))

with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[-10:])

236388
544550


Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment
544540,NCT01996384,Treatment,Parallel Assignment,2013-11,?,Provoked Vestibulodynia,mentioned_in_clinical_trials_for,Non-classical acupuncture
544541,NCT01996384,Treatment,Parallel Assignment,2013-11,?,Vulvar Vestibulitis,mentioned_in_clinical_trials_for,Non-classical acupuncture
544542,NCT01247064,Treatment,Parallel Assignment,2010-10,?,"Bronchiolitis, Viral",mentioned_in_clinical_trials_for,Nebulized 0.9% Normal Saline
544543,NCT01247064,Treatment,Parallel Assignment,2010-10,?,"Saline Solution, Hypertonic",mentioned_in_clinical_trials_for,Nebulized 0.9% Normal Saline
544544,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Carboplatin
544545,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Gemcitabine
544546,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Paclitaxel
544547,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Pemetrexed
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection


In [10]:
# List of interventions to block 
undesirable_interventions = [
    "placebo", "standard of care", "laboratory biomarker analysis",
    "questionnaire", "standard treatment",
    "data collection", "educational intervention",
    "intervention group", "training", "management of therapy complications", 
    "contingency management", "active control", "experimental group",
    " sham ", "sham intervention", "active comparator",
    "patient navigation", "self-management", "quality of life",
    "treatment group", "study", "routine care",
]

ct["treatment_lower"] = ct["treatment"].str.lower()
# Create a boolean mask for each substring and combine them with logical OR
mask = ct['treatment_lower'].str.contains('|'.join(undesirable_interventions))
# Filter the DataFrame using the mask and replace matching rows with None
ct.loc[mask, 'treatment_lower'] = None
ct = ct[ct['treatment_lower'].notna()]
ct = ct[~ct['treatment_lower'].isnull()]
print(len(ct))

ct["condition_lower"] = ct["condition"].str.lower()
# Create a boolean mask for each substring and combine them with logical OR
mask = ct['condition_lower'].str.contains('|'.join(undesirable_interventions))
# Filter the DataFrame using the mask and replace matching rows with None
ct.loc[mask, 'condition_lower'] = None
ct = ct[ct['condition_lower'].notna()]
ct = ct[~ct['condition_lower'].isnull()]
print(len(ct))

with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[:10])

470185
469833


Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,treatment_lower,condition_lower
0,NCT01049399,Treatment,Parallel Assignment,2009-12,?,Progressive Supranuclear Palsy,in_clinical_trials_for,tideglusib,tideglusib,progressive supranuclear palsy
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,t,Pruritus,in_clinical_trials_for,N-acetyl cysteine,n-acetyl cysteine,pruritus
3,NCT01432236,Treatment,Crossover Assignment,2011-10,?,Fibromyalgia,in_clinical_trials_for,Pregabalin,pregabalin,fibromyalgia
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,t,Relapsing Multiple Sclerosis,in_clinical_trials_for,CNM-Au8,cnm-au8,relapsing multiple sclerosis
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,melphalan or melphalan + topotecan,retinoblastoma
6,NCT04036227,Treatment,Parallel Assignment,2019-7-3,f,Healthy,in_clinical_trials_for,GS-248,gs-248,healthy
7,NCT05624918,Treatment,Single Group Assignment,2024-4,f,Pancreatic Adenocarcinoma,in_clinical_trials_for,Gemcitabine,gemcitabine,pancreatic adenocarcinoma
8,NCT05624918,Treatment,Single Group Assignment,2024-4,f,Pancreatic Adenocarcinoma,in_clinical_trials_for,Nab paclitaxel,nab paclitaxel,pancreatic adenocarcinoma
9,NCT05624918,Treatment,Single Group Assignment,2024-4,f,Pancreatic Adenocarcinoma,in_clinical_trials_for,NovoTTF-200T(P),novottf-200t(p),pancreatic adenocarcinoma
10,NCT02886065,Prevention,Parallel Assignment,2017-3-7,t,Smoldering Multiple Myeloma,in_clinical_trials_for,Citarinostat,citarinostat,smoldering multiple myeloma


In [11]:
# ct.to_csv("ct_check.tsv", sep='\t', index=False, header=True)
ct.dtypes

nct_id                   object
primary_purpose          object
intervention_model       object
start_month_year         object
is_fda_regulated_drug    object
condition                object
predicate                object
treatment                object
treatment_lower          object
condition_lower          object
dtype: object

In [12]:
ct['condition_outside_p'] =  [re.sub(r"\((.*?)\)",'', str(x)) for x in ct['condition_lower']]
ct['condition_inside_p'] = ct['condition_lower'].str.extract(r"\((.*?)\)", expand=True)

ct['treatment_outside_p'] =  [re.sub(r"\((.*?)\)",'', str(x)) for x in ct['treatment_lower']]
ct['treatment_inside_p'] = ct['treatment_lower'].str.extract(r"\((.*?)\)", expand=True)

ct['condition_outside_b'] =  [re.sub(r'\[([^\]]+)\]','', str(x)) for x in ct['condition_lower']]
ct['condition_inside_b'] = ct['condition_lower'].str.extract(r'\[([^\]]+)\]', expand=True)

ct['treatment_outside_b'] =  [re.sub(r'\[([^\]]+)\]','', str(x)) for x in ct['treatment_lower']]
ct['treatment_inside_b'] = ct['treatment_lower'].str.extract(r'\[([^\]]+)\]', expand=True)

ct = ct.apply(lambda x: x.str.strip() if x.dtype == "object" else x) # strip leading and trailing whitespace
ct = ct[~ct['condition_lower'].isnull()]
ct = ct[~ct['treatment_lower'].isnull()]

split_chars = [",", "+", "/", " and ", "&", " or "]
split_chars = [re.escape(char) for char in split_chars]

# Function to split column based on list of characters
def split_column(df, column, split_chars):
    # Create a temporary dataframe to store split values
    temp_df = df[column].str.split('|'.join(split_chars), expand=True)
    # Rename columns of the temporary dataframe
    temp_df.columns = [f'{column}_{i+1}' for i in range(temp_df.shape[1])]
    return temp_df

# Apply split_column function to the desired column
split_con_outside_p = split_column(ct, 'condition_outside_p', split_chars)
split_con_inside_p = split_column(ct, 'condition_inside_p', split_chars)
split_trmnt_outside_p = split_column(ct, 'treatment_outside_p', split_chars)
split_trmnt_inside_p = split_column(ct, 'treatment_inside_p', split_chars)

split_con_outside_b = split_column(ct, 'condition_outside_b', split_chars)
split_con_inside_b = split_column(ct, 'condition_inside_b', split_chars)
split_trmnt_outside_b = split_column(ct, 'treatment_outside_b', split_chars)
split_trmnt_inside_b = split_column(ct, 'treatment_inside_b', split_chars)

# Concatenate the split dataframe with the original dataframe
split_df = pd.concat([split_con_outside_p, split_con_inside_p, split_trmnt_outside_p, split_trmnt_inside_p,
                     split_con_outside_b, split_con_inside_b, split_trmnt_outside_b, split_trmnt_inside_b], axis=1)
split_df = split_df.fillna(np.nan).replace([np.nan], [None]) # replace NaN values in dataframe with None (just for consistency)
split_df = pd.concat([ct["condition_lower"], ct["treatment_lower"], split_df], axis=1)

split_df = split_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x) # strip leading and trailing whitespace

with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(split_df[:10])

Unnamed: 0,condition_lower,treatment_lower,condition_outside_p_1,condition_outside_p_2,condition_outside_p_3,condition_outside_p_4,condition_outside_p_5,condition_outside_p_6,condition_outside_p_7,condition_outside_p_8,condition_outside_p_9,condition_outside_p_10,condition_outside_p_11,condition_outside_p_12,condition_inside_p_1,condition_inside_p_2,condition_inside_p_3,condition_inside_p_4,condition_inside_p_5,condition_inside_p_6,treatment_outside_p_1,treatment_outside_p_2,treatment_outside_p_3,treatment_outside_p_4,treatment_outside_p_5,treatment_outside_p_6,treatment_outside_p_7,treatment_outside_p_8,treatment_outside_p_9,treatment_outside_p_10,treatment_outside_p_11,treatment_outside_p_12,treatment_outside_p_13,treatment_outside_p_14,treatment_outside_p_15,treatment_outside_p_16,treatment_inside_p_1,treatment_inside_p_2,treatment_inside_p_3,treatment_inside_p_4,treatment_inside_p_5,treatment_inside_p_6,treatment_inside_p_7,treatment_inside_p_8,treatment_inside_p_9,condition_outside_b_1,condition_outside_b_2,condition_outside_b_3,condition_outside_b_4,condition_outside_b_5,condition_outside_b_6,condition_outside_b_7,condition_outside_b_8,condition_outside_b_9,condition_outside_b_10,condition_outside_b_11,condition_outside_b_12,condition_inside_b_1,treatment_outside_b_1,treatment_outside_b_2,treatment_outside_b_3,treatment_outside_b_4,treatment_outside_b_5,treatment_outside_b_6,treatment_outside_b_7,treatment_outside_b_8,treatment_outside_b_9,treatment_outside_b_10,treatment_outside_b_11,treatment_outside_b_12,treatment_outside_b_13,treatment_outside_b_14,treatment_outside_b_15,treatment_outside_b_16,treatment_inside_b_1,treatment_inside_b_2,treatment_inside_b_3,treatment_inside_b_4,treatment_inside_b_5
0,progressive supranuclear palsy,tideglusib,progressive supranuclear palsy,,,,,,,,,,,,,,,,,,tideglusib,,,,,,,,,,,,,,,,,,,,,,,,,progressive supranuclear palsy,,,,,,,,,,,,,tideglusib,,,,,,,,,,,,,,,,,,,,
1,pruritus,n-acetyl cysteine,pruritus,,,,,,,,,,,,,,,,,,n-acetyl cysteine,,,,,,,,,,,,,,,,,,,,,,,,,pruritus,,,,,,,,,,,,,n-acetyl cysteine,,,,,,,,,,,,,,,,,,,,
3,fibromyalgia,pregabalin,fibromyalgia,,,,,,,,,,,,,,,,,,pregabalin,,,,,,,,,,,,,,,,,,,,,,,,,fibromyalgia,,,,,,,,,,,,,pregabalin,,,,,,,,,,,,,,,,,,,,
4,relapsing multiple sclerosis,cnm-au8,relapsing multiple sclerosis,,,,,,,,,,,,,,,,,,cnm-au8,,,,,,,,,,,,,,,,,,,,,,,,,relapsing multiple sclerosis,,,,,,,,,,,,,cnm-au8,,,,,,,,,,,,,,,,,,,,
5,retinoblastoma,melphalan or melphalan + topotecan,retinoblastoma,,,,,,,,,,,,,,,,,,melphalan,melphalan,topotecan,,,,,,,,,,,,,,,,,,,,,,,retinoblastoma,,,,,,,,,,,,,melphalan,melphalan,topotecan,,,,,,,,,,,,,,,,,,
6,healthy,gs-248,healthy,,,,,,,,,,,,,,,,,,gs-248,,,,,,,,,,,,,,,,,,,,,,,,,healthy,,,,,,,,,,,,,gs-248,,,,,,,,,,,,,,,,,,,,
7,pancreatic adenocarcinoma,gemcitabine,pancreatic adenocarcinoma,,,,,,,,,,,,,,,,,,gemcitabine,,,,,,,,,,,,,,,,,,,,,,,,,pancreatic adenocarcinoma,,,,,,,,,,,,,gemcitabine,,,,,,,,,,,,,,,,,,,,
8,pancreatic adenocarcinoma,nab paclitaxel,pancreatic adenocarcinoma,,,,,,,,,,,,,,,,,,nab paclitaxel,,,,,,,,,,,,,,,,,,,,,,,,,pancreatic adenocarcinoma,,,,,,,,,,,,,nab paclitaxel,,,,,,,,,,,,,,,,,,,,
9,pancreatic adenocarcinoma,novottf-200t(p),pancreatic adenocarcinoma,,,,,,,,,,,,,,,,,,novottf-200t,,,,,,,,,,,,,,,,p,,,,,,,,,pancreatic adenocarcinoma,,,,,,,,,,,,,novottf-200t(p),,,,,,,,,,,,,,,,,,,,
10,smoldering multiple myeloma,citarinostat,smoldering multiple myeloma,,,,,,,,,,,,,,,,,,citarinostat,,,,,,,,,,,,,,,,,,,,,,,,,smoldering multiple myeloma,,,,,,,,,,,,,citarinostat,,,,,,,,,,,,,,,,,,,,


In [13]:
split_t = split_df.transpose()
split_t = split_t.apply(lambda x: x.mask(x.duplicated(), None))
split_df = split_t.transpose()
split_df = split_df[~split_df['condition_lower'].isnull()]
split_df = split_df[~split_df['treatment_lower'].isnull()]

with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(split_df[:10])

Unnamed: 0,condition_lower,treatment_lower,condition_outside_p_1,condition_outside_p_2,condition_outside_p_3,condition_outside_p_4,condition_outside_p_5,condition_outside_p_6,condition_outside_p_7,condition_outside_p_8,condition_outside_p_9,condition_outside_p_10,condition_outside_p_11,condition_outside_p_12,condition_inside_p_1,condition_inside_p_2,condition_inside_p_3,condition_inside_p_4,condition_inside_p_5,condition_inside_p_6,treatment_outside_p_1,treatment_outside_p_2,treatment_outside_p_3,treatment_outside_p_4,treatment_outside_p_5,treatment_outside_p_6,treatment_outside_p_7,treatment_outside_p_8,treatment_outside_p_9,treatment_outside_p_10,treatment_outside_p_11,treatment_outside_p_12,treatment_outside_p_13,treatment_outside_p_14,treatment_outside_p_15,treatment_outside_p_16,treatment_inside_p_1,treatment_inside_p_2,treatment_inside_p_3,treatment_inside_p_4,treatment_inside_p_5,treatment_inside_p_6,treatment_inside_p_7,treatment_inside_p_8,treatment_inside_p_9,condition_outside_b_1,condition_outside_b_2,condition_outside_b_3,condition_outside_b_4,condition_outside_b_5,condition_outside_b_6,condition_outside_b_7,condition_outside_b_8,condition_outside_b_9,condition_outside_b_10,condition_outside_b_11,condition_outside_b_12,condition_inside_b_1,treatment_outside_b_1,treatment_outside_b_2,treatment_outside_b_3,treatment_outside_b_4,treatment_outside_b_5,treatment_outside_b_6,treatment_outside_b_7,treatment_outside_b_8,treatment_outside_b_9,treatment_outside_b_10,treatment_outside_b_11,treatment_outside_b_12,treatment_outside_b_13,treatment_outside_b_14,treatment_outside_b_15,treatment_outside_b_16,treatment_inside_b_1,treatment_inside_b_2,treatment_inside_b_3,treatment_inside_b_4,treatment_inside_b_5
0,progressive supranuclear palsy,tideglusib,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,pruritus,n-acetyl cysteine,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,fibromyalgia,pregabalin,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,relapsing multiple sclerosis,cnm-au8,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,retinoblastoma,melphalan or melphalan + topotecan,,,,,,,,,,,,,,,,,,,melphalan,,topotecan,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,healthy,gs-248,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,pancreatic adenocarcinoma,gemcitabine,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,pancreatic adenocarcinoma,nab paclitaxel,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,pancreatic adenocarcinoma,novottf-200t(p),,,,,,,,,,,,,,,,,,,novottf-200t,,,,,,,,,,,,,,,,p,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10,smoldering multiple myeloma,citarinostat,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [14]:
len(ct)
len(split_df)

469833

469034

In [15]:
test = split_df[split_df['treatment_lower'].isnull()]
test

Unnamed: 0,condition_lower,treatment_lower,condition_outside_p_1,condition_outside_p_2,condition_outside_p_3,condition_outside_p_4,condition_outside_p_5,condition_outside_p_6,condition_outside_p_7,condition_outside_p_8,...,treatment_outside_b_12,treatment_outside_b_13,treatment_outside_b_14,treatment_outside_b_15,treatment_outside_b_16,treatment_inside_b_1,treatment_inside_b_2,treatment_inside_b_3,treatment_inside_b_4,treatment_inside_b_5


In [16]:
condition_cols = [ele for ele in split_df.columns if any([substr in ele for substr in ['condition']])] # find columns with "condition" in it 
treatment_cols = [ele for ele in split_df.columns if any([substr in ele for substr in ['treatment']])] # find columns with "treatment" in it 
condition_cols[:5]
treatment_cols[:5]

['condition_lower',
 'condition_outside_p_1',
 'condition_outside_p_2',
 'condition_outside_p_3',
 'condition_outside_p_4']

['treatment_lower',
 'treatment_outside_p_1',
 'treatment_outside_p_2',
 'treatment_outside_p_3',
 'treatment_outside_p_4']

In [17]:
split_df["condition_collections"] = split_df[condition_cols].stack().groupby(level=0).apply(list).values.tolist()  # get the columns with conditions into a list in a single column, ignoring the None values
split_df["treatment_collections"] = split_df[treatment_cols].stack().groupby(level=0).apply(list).values.tolist()  # get the columns with treatments into a list in a single column, ignoring the None values

# split_df
# trmnts = split_df[treatment_cols].stack().groupby(level=0).apply(list).values.tolist()
# len(trmnts)

# conds = split_df[condition_cols].stack().groupby(level=0).apply(list).values.tolist()
# len(conds)

In [18]:
ct["condition_collections"] = split_df["condition_collections"]
ct["treatment_collections"] = split_df["treatment_collections"]
cols_to_keep = [element for element in ct.columns if not any(substring in element for substring in ['inside', 'outside', 'lower'])]
ct = ct[cols_to_keep]
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[-10:])

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections
544540,NCT01996384,Treatment,Parallel Assignment,2013-11,?,Provoked Vestibulodynia,mentioned_in_clinical_trials_for,Non-classical acupuncture,[provoked vestibulodynia],[non-classical acupuncture]
544541,NCT01996384,Treatment,Parallel Assignment,2013-11,?,Vulvar Vestibulitis,mentioned_in_clinical_trials_for,Non-classical acupuncture,[vulvar vestibulitis],[non-classical acupuncture]
544542,NCT01247064,Treatment,Parallel Assignment,2010-10,?,"Bronchiolitis, Viral",mentioned_in_clinical_trials_for,Nebulized 0.9% Normal Saline,"[bronchiolitis, viral, bronchiolitis, viral]",[nebulized 0.9% normal saline]
544543,NCT01247064,Treatment,Parallel Assignment,2010-10,?,"Saline Solution, Hypertonic",mentioned_in_clinical_trials_for,Nebulized 0.9% Normal Saline,"[saline solution, hypertonic, saline solution, hypertonic]",[nebulized 0.9% normal saline]
544544,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Carboplatin,[first line non-small cell lung cancer],[carboplatin]
544545,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Gemcitabine,[first line non-small cell lung cancer],[gemcitabine]
544546,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Paclitaxel,[first line non-small cell lung cancer],[paclitaxel]
544547,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Pemetrexed,[first line non-small cell lung cancer],[pemetrexed]
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,"[pneumonia, viral, pneumonia, viral]",[sterile water for injection]
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,"[pneumonia, ventilator-associated, pneumonia, ventilator-associated]",[sterile water for injection]


In [19]:
ct = ct.explode('condition_collections')
ct = ct.explode('treatment_collections')
len(ct)

795705

In [20]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[-10:])

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections
544544,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Carboplatin,first line non-small cell lung cancer,carboplatin
544545,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Gemcitabine,first line non-small cell lung cancer,gemcitabine
544546,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Paclitaxel,first line non-small cell lung cancer,paclitaxel
544547,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Pemetrexed,first line non-small cell lung cancer,pemetrexed
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, viral",sterile water for injection
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,viral,sterile water for injection
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, ventilator-associated",sterile water for injection
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,ventilator-associated,sterile water for injection


In [21]:
ct['condition_collections'] = ct['condition_collections'].apply(lambda x: x if len(str(x)) > 3 else None)
ct['treatment_collections'] = ct['treatment_collections'].apply(lambda x: x if len(str(x)) > 3 else None)
ct = ct[~ct['condition_collections'].isnull()]
ct = ct[~ct['treatment_collections'].isnull()]
ct

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections
0,NCT01049399,Treatment,Parallel Assignment,2009-12,?,Progressive Supranuclear Palsy,in_clinical_trials_for,tideglusib,progressive supranuclear palsy,tideglusib
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,t,Pruritus,in_clinical_trials_for,N-acetyl cysteine,pruritus,n-acetyl cysteine
3,NCT01432236,Treatment,Crossover Assignment,2011-10,?,Fibromyalgia,in_clinical_trials_for,Pregabalin,fibromyalgia,pregabalin
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,t,Relapsing Multiple Sclerosis,in_clinical_trials_for,CNM-Au8,relapsing multiple sclerosis,cnm-au8
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,retinoblastoma,melphalan or melphalan + topotecan
...,...,...,...,...,...,...,...,...,...,...
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,viral,sterile water for injection
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, ventilator-associated",sterile water for injection
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection


In [22]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[8700:8710])

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections
6084,NCT06012760,Treatment,Parallel Assignment,2024-4-1,f,Cardiac Surgery,in_clinical_trials_for,"Iron sucrose, Human Erythropoietin Injection, Vitamin C",cardiac surgery,human erythropoietin injection
6084,NCT06012760,Treatment,Parallel Assignment,2024-4-1,f,Cardiac Surgery,in_clinical_trials_for,"Iron sucrose, Human Erythropoietin Injection, Vitamin C",cardiac surgery,vitamin c
6085,NCT03676309,Treatment,Parallel Assignment,2017-9-1,f,"Diabetes Mellitus, Type 2",in_clinical_trials_for,"Nutriceutical Oral Capsule,","diabetes mellitus, type 2","nutriceutical oral capsule,"
6085,NCT03676309,Treatment,Parallel Assignment,2017-9-1,f,"Diabetes Mellitus, Type 2",in_clinical_trials_for,"Nutriceutical Oral Capsule,","diabetes mellitus, type 2",nutriceutical oral capsule
6085,NCT03676309,Treatment,Parallel Assignment,2017-9-1,f,"Diabetes Mellitus, Type 2",in_clinical_trials_for,"Nutriceutical Oral Capsule,",diabetes mellitus,"nutriceutical oral capsule,"
6085,NCT03676309,Treatment,Parallel Assignment,2017-9-1,f,"Diabetes Mellitus, Type 2",in_clinical_trials_for,"Nutriceutical Oral Capsule,",diabetes mellitus,nutriceutical oral capsule
6085,NCT03676309,Treatment,Parallel Assignment,2017-9-1,f,"Diabetes Mellitus, Type 2",in_clinical_trials_for,"Nutriceutical Oral Capsule,",type 2,"nutriceutical oral capsule,"
6085,NCT03676309,Treatment,Parallel Assignment,2017-9-1,f,"Diabetes Mellitus, Type 2",in_clinical_trials_for,"Nutriceutical Oral Capsule,",type 2,nutriceutical oral capsule
6086,NCT04015492,Treatment,Crossover Assignment,2019-8-8,t,Hemophilia A,in_clinical_trials_for,"Damoctocog-alfa-pegol (BAY94-9027, Jivi)",hemophilia a,"damoctocog-alfa-pegol (bay94-9027, jivi)"
6086,NCT04015492,Treatment,Crossover Assignment,2019-8-8,t,Hemophilia A,in_clinical_trials_for,"Damoctocog-alfa-pegol (BAY94-9027, Jivi)",hemophilia a,damoctocog-alfa-pegol


#### Map CURIEs from mapper to ct dataframe

In [23]:
mapping_cache = pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, dtype="object", on_bad_lines="skip")
mapping_cache = mapping_cache[mapping_cache['score'] != 'unscored'] 
mapping_cache["score"] = pd.to_numeric(mapping_cache["score"], errors='coerce')
mapping_cache = mapping_cache.sort_values(by=['clintrial_term', 'score'], ascending=[False, False])


In [24]:
idx = mapping_cache.groupby(['clintrial_term', 'term_type'])['score'].transform(max) == mapping_cache['score']
max_scores = mapping_cache[idx]
max_scores = max_scores[max_scores.score >= 70]
conditions_maxscores = max_scores[max_scores.term_type == "condition"]
interventions_maxscores = max_scores[max_scores.term_type == "intervention"]
alt_intervention_maxscores = max_scores[max_scores.term_type == "alternate_intervention"]

In [25]:
conditions_dict = dict(zip(conditions_maxscores.clintrial_term, conditions_maxscores.mapping_tool_response))
interventions_dict = dict(zip(interventions_maxscores.clintrial_term, interventions_maxscores.mapping_tool_response))
alt_intervention_dict = dict(zip(alt_intervention_maxscores.clintrial_term, interventions_maxscores.mapping_tool_response))

In [26]:
ct['condition_curie_info'] = ct.condition_collections.map(conditions_dict)
ct['treatment_curie_info'] = ct.treatment_collections.map(interventions_dict)
ct

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections,condition_curie_info,treatment_curie_info
0,NCT01049399,Treatment,Parallel Assignment,2009-12,?,Progressive Supranuclear Palsy,in_clinical_trials_for,tideglusib,progressive supranuclear palsy,tideglusib,{'mapped_name': 'Progressive supranuclear pals...,"{'mapped_name': 'Tideglusib', 'mapped_curie': ..."
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,t,Pruritus,in_clinical_trials_for,N-acetyl cysteine,pruritus,n-acetyl cysteine,"{'mapped_name': 'Pruritus', 'mapped_curie': 'C...","{'mapped_name': 'acetylcysteine', 'mapped_curi..."
3,NCT01432236,Treatment,Crossover Assignment,2011-10,?,Fibromyalgia,in_clinical_trials_for,Pregabalin,fibromyalgia,pregabalin,"{'mapped_name': 'Fibromyalgia', 'mapped_curie'...","{'mapped_name': 'pregabalin', 'mapped_curie': ..."
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,t,Relapsing Multiple Sclerosis,in_clinical_trials_for,CNM-Au8,relapsing multiple sclerosis,cnm-au8,"{'mapped_name': 'Multiple Sclerosis', 'mapped_...",
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,retinoblastoma,melphalan or melphalan + topotecan,"{'mapped_name': 'Retinoblastoma', 'mapped_curi...",
...,...,...,...,...,...,...,...,...,...,...,...,...
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection,"{'mapped_name': 'Pneumonia', 'mapped_curie': '...","{'mapped_name': 'Sterile Water for Injection',..."
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,viral,sterile water for injection,,"{'mapped_name': 'Sterile Water for Injection',..."
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, ventilator-associated",sterile water for injection,"{'mapped_name': 'Pneumonia, Ventilator-Associa...","{'mapped_name': 'Sterile Water for Injection',..."
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection,"{'mapped_name': 'Pneumonia', 'mapped_curie': '...","{'mapped_name': 'Sterile Water for Injection',..."


In [27]:
ct = ct.dropna(subset=["condition_curie_info", "treatment_curie_info"])
ct

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,condition_collections,treatment_collections,condition_curie_info,treatment_curie_info
0,NCT01049399,Treatment,Parallel Assignment,2009-12,?,Progressive Supranuclear Palsy,in_clinical_trials_for,tideglusib,progressive supranuclear palsy,tideglusib,{'mapped_name': 'Progressive supranuclear pals...,"{'mapped_name': 'Tideglusib', 'mapped_curie': ..."
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,t,Pruritus,in_clinical_trials_for,N-acetyl cysteine,pruritus,n-acetyl cysteine,"{'mapped_name': 'Pruritus', 'mapped_curie': 'C...","{'mapped_name': 'acetylcysteine', 'mapped_curi..."
3,NCT01432236,Treatment,Crossover Assignment,2011-10,?,Fibromyalgia,in_clinical_trials_for,Pregabalin,fibromyalgia,pregabalin,"{'mapped_name': 'Fibromyalgia', 'mapped_curie'...","{'mapped_name': 'pregabalin', 'mapped_curie': ..."
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,retinoblastoma,melphalan,"{'mapped_name': 'Retinoblastoma', 'mapped_curi...","{'mapped_name': 'melphalan', 'mapped_curie': '..."
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,retinoblastoma,topotecan,"{'mapped_name': 'Retinoblastoma', 'mapped_curi...","{'mapped_name': 'topotecan', 'mapped_curie': '..."
...,...,...,...,...,...,...,...,...,...,...,...,...
544547,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,Pemetrexed,first line non-small cell lung cancer,pemetrexed,{'mapped_name': 'Non-Small Cell Lung Carcinoma...,"{'mapped_name': 'pemetrexed', 'mapped_curie': ..."
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, viral",sterile water for injection,"{'mapped_name': 'Pneumonia, Viral', 'mapped_cu...","{'mapped_name': 'Sterile Water for Injection',..."
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Viral",mentioned_in_clinical_trials_for,Sterile Water for Injection,pneumonia,sterile water for injection,"{'mapped_name': 'Pneumonia', 'mapped_curie': '...","{'mapped_name': 'Sterile Water for Injection',..."
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,Sterile Water for Injection,"pneumonia, ventilator-associated",sterile water for injection,"{'mapped_name': 'Pneumonia, Ventilator-Associa...","{'mapped_name': 'Sterile Water for Injection',..."


In [28]:
ct.to_csv("ct_check.tsv", sep='\t', index=False, header=True)


In [None]:
ct["conditions_split"] = pd.Series(condition_collections)
ct["treatments_split"] = pd.Series(treatment_collections)
ct

In [None]:
condition_cols = [ele for ele in split_df.columns if any([substr in ele for substr in ['condition']])] # find columns with "condition" in it 
treatment_cols = [ele for ele in split_df.columns if any([substr in ele for substr in ['treatment']])] # find columns with "treatment" in it 

def filter_sublist(sublist):
    return [x for x in sublist if x is not None and (not isinstance(x, float) or not np.isnan(x)) and len(x) >= 3]

condition_collections = split_df[condition_cols].values.tolist()
condition_collections = [filter_sublist(sublist) for sublist in condition_collections]

treatment_collections = split_df[treatment_cols].values.tolist()
treatment_collections = [filter_sublist(sublist) for sublist in treatment_collections]


In [None]:
ct["conditions_split"] = pd.Series(condition_collections)
ct["treatments_split"] = pd.Series(treatment_collections)
cols_to_keep = [element for element in ct.columns if not any(substring in element for substring in ['inside', 'outside'])]
ct = ct[cols_to_keep]
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[:10])

In [None]:
ct = ct.explode('conditions_split')
ct = ct.explode('treatments_split')
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[:10])


#### Map CURIEs from mapper to ct dataframe

In [16]:
mapping_cache = pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, dtype="object", on_bad_lines="skip")
mapping_cache = mapping_cache[mapping_cache['score'] != 'unscored'] 
mapping_cache["score"] = pd.to_numeric(mapping_cache["score"], errors='coerce')
mapping_cache = mapping_cache.sort_values(by=['clintrial_term', 'score'], ascending=[False, False])


In [17]:
idx = mapping_cache.groupby(['clintrial_term', 'term_type'])['score'].transform(max) == mapping_cache['score']
max_scores = mapping_cache[idx]
max_scores = max_scores[max_scores.score >= 70]
conditions_maxscores = max_scores[max_scores.term_type == "condition"]
interventions_maxscores = max_scores[max_scores.term_type == "intervention"]
alt_intervention_maxscores = max_scores[max_scores.term_type == "alternate_intervention"]

In [18]:
conditions_dict = dict(zip(conditions_maxscores.clintrial_term, conditions_maxscores.mapping_tool_response))
interventions_dict = dict(zip(interventions_maxscores.clintrial_term, interventions_maxscores.mapping_tool_response))
alt_intervention_dict = dict(zip(alt_intervention_maxscores.clintrial_term, interventions_maxscores.mapping_tool_response))

In [19]:
ct['condition_curie_info'] = ct.conditions_split.map(conditions_dict)
ct['treatment_curie_info'] = ct.treatments_split.map(interventions_dict)
# ct = ct.drop(["conditions_split", "treatments_split", "condition_lower", "treatment_lower"], axis=1).reset_index(drop=True)
ct = ct.drop(["condition_lower", "treatment_lower"], axis=1).reset_index(drop=True)
ct.dropna(subset=['condition_curie_info', 'treatment_curie_info'], inplace=True)

with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[:10])

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,conditions_split,treatments_split,condition_curie_info,treatment_curie_info
0,NCT01049399,Treatment,Parallel Assignment,2009-12,?,Progressive Supranuclear Palsy,in_clinical_trials_for,tideglusib,progressive supranuclear palsy,tideglusib,"{'mapped_name': 'Progressive supranuclear palsy', 'mapped_curie': 'C0038868', 'mapped_score': '5.18', 'mapped_semtypes': '[dsyn]'}","{'mapped_name': 'Tideglusib', 'mapped_curie': 'C3273375', 'mapped_score': '5.18', 'mapped_semtypes': '[orch,phsu]'}"
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,t,Pruritus,in_clinical_trials_for,N-acetyl cysteine,pruritus,n-acetyl cysteine,"{'mapped_name': 'Pruritus', 'mapped_curie': 'C0033774', 'mapped_score': '5.18', 'mapped_semtypes': '[fndg]'}","{'mapped_name': 'acetylcysteine', 'mapped_curie': 'C0001047', 'mapped_score': '5.18', 'mapped_semtypes': '[aapp,phsu]'}"
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,t,Relapsing Multiple Sclerosis,in_clinical_trials_for,CNM-Au8,retinoblastoma,melphalan,"{'mapped_name': 'Retinoblastoma', 'mapped_curie': 'C0035335', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'melphalan', 'mapped_curie': 'C0025241', 'mapped_score': '5.18', 'mapped_semtypes': '[aapp,phsu]'}"
5,NCT04626921,Treatment,Single Group Assignment,2020-10-22,t,Relapsing Multiple Sclerosis,in_clinical_trials_for,CNM-Au8,retinoblastoma,topotecan,"{'mapped_name': 'Retinoblastoma', 'mapped_curie': 'C0035335', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'topotecan', 'mapped_curie': 'C0146224', 'mapped_score': '5.18', 'mapped_semtypes': '[orch,phsu]'}"
7,NCT04036227,Treatment,Parallel Assignment,2019-7-3,f,Healthy,in_clinical_trials_for,GS-248,pancreatic adenocarcinoma,gemcitabine,"{'mapped_name': 'Adenocarcinoma of pancreas', 'mapped_curie': 'C0281361', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'gemcitabine', 'mapped_curie': 'C0045093', 'mapped_score': '5.18', 'mapped_semtypes': '[nnon,phsu]'}"
10,NCT05624918,Treatment,Single Group Assignment,2024-4,f,Pancreatic Adenocarcinoma,in_clinical_trials_for,Nab paclitaxel,pancreatic adenocarcinoma,novottf-200t,"{'mapped_name': 'Adenocarcinoma of pancreas', 'mapped_curie': 'C0281361', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'NovoTTF-200A Device', 'mapped_curie': 'UMLS:C4683819', 'mapped_score': 14.063435, 'mapped_semtypes': 'biolink:Device'}"
11,NCT05624918,Treatment,Single Group Assignment,2024-4,f,Pancreatic Adenocarcinoma,in_clinical_trials_for,NovoTTF-200T(P),smoldering multiple myeloma,citarinostat,"{'mapped_name': 'Smoldering myeloma', 'mapped_curie': 'C1531608', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'Citarinostat', 'mapped_curie': 'C4724808', 'mapped_score': '5.18', 'mapped_semtypes': '[orch,phsu]'}"
12,NCT02886065,Prevention,Parallel Assignment,2017-3-7,t,Smoldering Multiple Myeloma,in_clinical_trials_for,Citarinostat,smoldering multiple myeloma,hiltonol,"{'mapped_name': 'Smoldering myeloma', 'mapped_curie': 'C1531608', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'hiltonol', 'mapped_curie': 'C2698842', 'mapped_score': '5.18', 'mapped_semtypes': '[phsu]'}"
13,NCT02886065,Prevention,Parallel Assignment,2017-3-7,t,Smoldering Multiple Myeloma,in_clinical_trials_for,Hiltonol,smoldering multiple myeloma,lenalidomide,"{'mapped_name': 'Smoldering myeloma', 'mapped_curie': 'C1531608', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'lenalidomide', 'mapped_curie': 'C1144149', 'mapped_score': '5.18', 'mapped_semtypes': '[orch,phsu]'}"
16,NCT03683576,Treatment,Parallel Assignment,2018-10-22,t,Asthma,in_clinical_trials_for,GB001,prostate cancer,degarelix,"{'mapped_name': 'Prostate carcinoma', 'mapped_curie': 'C0600139', 'mapped_score': '5.18', 'mapped_semtypes': '[neop]'}","{'mapped_name': 'degarelix', 'mapped_curie': 'C1455035', 'mapped_score': '5.18', 'mapped_semtypes': '[aapp,horm,phsu]'}"


In [20]:
ct.to_csv("ct_check.tsv", sep='\t', index=False, header=True)


In [21]:
ct.columns

Index(['nct_id', 'primary_purpose', 'intervention_model', 'start_month_year',
       'is_fda_regulated_drug', 'condition', 'predicate', 'treatment',
       'conditions_split', 'treatments_split', 'condition_curie_info',
       'treatment_curie_info'],
      dtype='object')

In [None]:
# ct = ct.drop(["condition_lower", "treatment_lower"], axis=1).reset_index(drop=True)

In [204]:
high_conditions = {}
for sublist in high_score_conditions:
    if sublist[0] not in high_conditions:
        high_conditions[sublist[0]] = []
    high_conditions[sublist[0]].append(sublist[1]) # dict where key is index of df, and value is the condition
# high_score_conditions

low_conditions = {}
for sublist in low_score_conditions:
    if sublist[0] not in low_conditions:
        low_conditions[sublist[0]] = []
    low_conditions[sublist[0]].append(sublist[1]) # dict where key is index of df, and value is the condition
# high_score_conditions

In [205]:
ct_high = trials_list.copy()
ct_high['conditions_cleaned'] = ct_high.index.map(high_conditions)
ct_high = ct_high.explode('conditions_cleaned')
ct_high.dropna(subset=['conditions_cleaned'], inplace=True)
ct_high["predicate"] = "in_clinical_trials_for"

ct_low = trials_list.copy()
ct_low['conditions_cleaned'] = ct_low.index.map(low_conditions)
ct_low = ct_low.explode('conditions_cleaned')
ct_low.dropna(subset=['conditions_cleaned'], inplace=True)
ct_low["predicate"] = "mentioned_in_clinical_trials_for"

ct = pd.concat([ct_high, ct_low], axis=0)
ct = ct.replace(np.nan, None)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(ct[:10])

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,n_conditions,n_interventions,is_fda_regulated_drug,conditions,exp_only,ctr_only,both,conditions_cleaned,predicate
0,NCT01049399,Treatment,Parallel Assignment,2009-12,1,3,?,Progressive Supranuclear Palsy|1.000,tideglusib,placebo,,Progressive Supranuclear Palsy,in_clinical_trials_for
2,NCT05287724,Treatment,Crossover Assignment,2022-6-19,2,2,t,Pruritus|1.000|Skin Disorder|-0.091,N-acetyl cysteine|Placebo,,,Pruritus,in_clinical_trials_for
3,NCT01432236,Treatment,Crossover Assignment,2011-10,1,2,?,Fibromyalgia|1.000,Pregabalin,placebo,,Fibromyalgia,in_clinical_trials_for
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,1,1,t,Relapsing Multiple Sclerosis|1.000,CNM-Au8,,,Relapsing Multiple Sclerosis,in_clinical_trials_for
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,1,7,f,Retinoblastoma|1.000,Melphalan or Melphalan + Topotecan,"Carboplatin administered on Day 1|etoposide, c...",Cryotherapy (local treatment)|Intravitreal Mel...,Retinoblastoma,in_clinical_trials_for
6,NCT04036227,Treatment,Parallel Assignment,2019-7-3,1,2,f,Healthy|1.000,GS-248,Placebo,,Healthy,in_clinical_trials_for
7,NCT05624918,Treatment,Single Group Assignment,2024-4,2,3,f,Pancreatic Adenocarcinoma|1.000|Resectable Pan...,Gemcitabine|Nab paclitaxel|NovoTTF-200T(P),,,Pancreatic Adenocarcinoma,in_clinical_trials_for
9,NCT02886065,Prevention,Parallel Assignment,2017-3-7,1,4,t,Smoldering Multiple Myeloma|1.000,Citarinostat|Hiltonol|Lenalidomide|PVX-410,,,Smoldering Multiple Myeloma,in_clinical_trials_for
12,NCT03683576,Treatment,Parallel Assignment,2018-10-22,1,2,t,Asthma|1.000,GB001,Placebo,,Asthma,in_clinical_trials_for
13,NCT00738673,Treatment,Single Group Assignment,2008-7,1,1,?,Prostate Cancer|1.000,degarelix,,,Prostate Cancer,in_clinical_trials_for


In [206]:
ct_treatments = ct.copy()
interventions = ct_treatments["exp_only"].str.split("|", expand=True)
interventions['treatments']= interventions.values.tolist()
interventions['treatments'] = interventions['treatments'].apply(lambda x: list(filter(None, x)))
ct_treatments["treatment"] = interventions['treatments']
ct_treatments = ct_treatments[ct_treatments['treatment'].map(lambda d: len(d)) > 0]

ct_controls = ct.copy()
controls = ct_controls["ctr_only"].str.split("|", expand=True)
controls['control']= controls.values.tolist()
controls['control'] = controls['control'].apply(lambda x: list(filter(None, x)))
controls = controls.rename(columns={'control': 'treatment'})
ct_controls["treatment"] = controls['treatment']
ct_controls["predicate"] = "mentioned_in_clincal_trials_for"
ct_controls = ct_controls[ct_controls['treatment'].map(lambda d: len(d)) > 0]





# data['Power'] = np.where(data['Power'].str.contains('null'), np.nan, data['Power'])

# ct_treatments["treatments"] = interventions['treatments']
# ct_treatments = ct_treatments.explode('treatments')

# ct_controls = ct.copy()
# intervention_controls = ct_controls["ctr_only"].str.split("|", expand=True)
# intervention_controls['controls']= intervention_controls.values.tolist()
# intervention_controls['predicate'] = "mentioned_in_clincal_trials_for"
# intervention_controls['controls'] = intervention_controls['controls'].apply(lambda x: list(filter(None, x)))
# intervention_controls = intervention_controls.rename(columns={'controls': 'treatments'})
# intervention_controls = intervention_controls.explode('treatments')

# ct = pd.concat([ct_treatments, ct_controls], axis=0)

# ct["treatments_cleaned"] = interventions["treatments"]
# ct["controls_cleaned"] = intervention_controls["controls"]
# ct = ct.explode('treatments_cleaned')
# ct = ct.explode('controls_cleaned')
# ct['condition'] = ct['conditions_cleaned'].str.lower()
# ct['treatment'] = ct['treatments_cleaned'].str.lower()
# ct['control'] = ct['controls_cleaned'].str.lower()
# ct = ct.drop(["conditions", "exp_only", "ctr_only", "both", "n_conditions", "n_interventions", "conditions_cleaned", "treatments_cleaned"], axis=1).reset_index(drop=True)

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     display(ct_treatments[:10])
    
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     display(ct[:1000])

ct_controls
ct_treatments


Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,n_conditions,n_interventions,is_fda_regulated_drug,conditions,exp_only,ctr_only,both,conditions_cleaned,predicate,treatment
0,NCT01049399,Treatment,Parallel Assignment,2009-12,1,3,?,Progressive Supranuclear Palsy|1.000,tideglusib,placebo,,Progressive Supranuclear Palsy,mentioned_in_clincal_trials_for,[placebo]
3,NCT01432236,Treatment,Crossover Assignment,2011-10,1,2,?,Fibromyalgia|1.000,Pregabalin,placebo,,Fibromyalgia,mentioned_in_clincal_trials_for,[placebo]
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,1,7,f,Retinoblastoma|1.000,Melphalan or Melphalan + Topotecan,"Carboplatin administered on Day 1|etoposide, c...",Cryotherapy (local treatment)|Intravitreal Mel...,Retinoblastoma,mentioned_in_clincal_trials_for,"[Carboplatin administered on Day 1, etoposide,..."
6,NCT04036227,Treatment,Parallel Assignment,2019-7-3,1,2,f,Healthy|1.000,GS-248,Placebo,,Healthy,mentioned_in_clincal_trials_for,[Placebo]
12,NCT03683576,Treatment,Parallel Assignment,2018-10-22,1,2,t,Asthma|1.000,GB001,Placebo,,Asthma,mentioned_in_clincal_trials_for,[Placebo]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137104,NCT01247064,Treatment,Parallel Assignment,2010-10,2,2,?,"Bronchiolitis, Viral|0.449|Saline Solution, Hy...",Nebulized 3% saline,Nebulized 0.9% Normal Saline,,"Bronchiolitis, Viral",mentioned_in_clincal_trials_for,[Nebulized 0.9% Normal Saline]
137104,NCT01247064,Treatment,Parallel Assignment,2010-10,2,2,?,"Bronchiolitis, Viral|0.449|Saline Solution, Hy...",Nebulized 3% saline,Nebulized 0.9% Normal Saline,,"Saline Solution, Hypertonic",mentioned_in_clincal_trials_for,[Nebulized 0.9% Normal Saline]
137109,NCT02576574,Treatment,Parallel Assignment,2015-10-29,1,9,?,First Line Non-Small Cell Lung Cancer|0.581,Avelumab|Avelumab Weekly|Cisplatin,Carboplatin|Gemcitabine|Paclitaxel|Pemetrexed,,First Line Non-Small Cell Lung Cancer,mentioned_in_clincal_trials_for,"[Carboplatin, Gemcitabine, Paclitaxel, Pemetre..."
137114,NCT04264533,Treatment,Parallel Assignment,2020-2-14,3,2,f,"Vitamin C|1.000|Pneumonia, Viral|0.462|Pneumon...",VC,Sterile Water for Injection,,"Pneumonia, Viral",mentioned_in_clincal_trials_for,[Sterile Water for Injection]


Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,n_conditions,n_interventions,is_fda_regulated_drug,conditions,exp_only,ctr_only,both,conditions_cleaned,predicate,treatment
0,NCT01049399,Treatment,Parallel Assignment,2009-12,1,3,?,Progressive Supranuclear Palsy|1.000,tideglusib,placebo,,Progressive Supranuclear Palsy,in_clinical_trials_for,[tideglusib]
2,NCT05287724,Treatment,Crossover Assignment,2022-6-19,2,2,t,Pruritus|1.000|Skin Disorder|-0.091,N-acetyl cysteine|Placebo,,,Pruritus,in_clinical_trials_for,"[N-acetyl cysteine, Placebo]"
3,NCT01432236,Treatment,Crossover Assignment,2011-10,1,2,?,Fibromyalgia|1.000,Pregabalin,placebo,,Fibromyalgia,in_clinical_trials_for,[Pregabalin]
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,1,1,t,Relapsing Multiple Sclerosis|1.000,CNM-Au8,,,Relapsing Multiple Sclerosis,in_clinical_trials_for,[CNM-Au8]
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,1,7,f,Retinoblastoma|1.000,Melphalan or Melphalan + Topotecan,"Carboplatin administered on Day 1|etoposide, c...",Cryotherapy (local treatment)|Intravitreal Mel...,Retinoblastoma,in_clinical_trials_for,[Melphalan or Melphalan + Topotecan]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137109,NCT02576574,Treatment,Parallel Assignment,2015-10-29,1,9,?,First Line Non-Small Cell Lung Cancer|0.581,Avelumab|Avelumab Weekly|Cisplatin,Carboplatin|Gemcitabine|Paclitaxel|Pemetrexed,,First Line Non-Small Cell Lung Cancer,mentioned_in_clinical_trials_for,"[Avelumab, Avelumab Weekly, Cisplatin]"
137111,NCT06310811,Treatment,Single Group Assignment,2024-3-7,2,1,f,Safety|1.000|Effective|-0.048,RD06-04 Cells injection,,,Effective,mentioned_in_clinical_trials_for,[RD06-04 Cells injection]
137114,NCT04264533,Treatment,Parallel Assignment,2020-2-14,3,2,f,"Vitamin C|1.000|Pneumonia, Viral|0.462|Pneumon...",VC,Sterile Water for Injection,,"Pneumonia, Viral",mentioned_in_clinical_trials_for,[VC]
137114,NCT04264533,Treatment,Parallel Assignment,2020-2-14,3,2,f,"Vitamin C|1.000|Pneumonia, Viral|0.462|Pneumon...",VC,Sterile Water for Injection,,"Pneumonia, Ventilator-Associated",mentioned_in_clinical_trials_for,[VC]


In [192]:
ct = pd.concat([ct_treatments, ct_controls], axis=0)
ct = ct.drop(["conditions", "exp_only", "ctr_only", "both", "n_conditions", "n_interventions"], axis=1).reset_index(drop=True)

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     display(ct.iloc[-1000:])
ct = ct.explode('treatment')
ct['condition'] = ct['conditions_cleaned'].str.lower()
ct['treatment'] = ct['treatment'].str.lower()
ct = ct.drop(["conditions_cleaned"], axis=1).reset_index(drop=True)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(ct.iloc[-10:])

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,predicate,treatment,condition
544540,NCT01996384,Treatment,Parallel Assignment,2013-11,?,mentioned_in_clincal_trials_for,non-classical acupuncture,provoked vestibulodynia
544541,NCT01996384,Treatment,Parallel Assignment,2013-11,?,mentioned_in_clincal_trials_for,non-classical acupuncture,vulvar vestibulitis
544542,NCT01247064,Treatment,Parallel Assignment,2010-10,?,mentioned_in_clincal_trials_for,nebulized 0.9% normal saline,"bronchiolitis, viral"
544543,NCT01247064,Treatment,Parallel Assignment,2010-10,?,mentioned_in_clincal_trials_for,nebulized 0.9% normal saline,"saline solution, hypertonic"
544544,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,mentioned_in_clincal_trials_for,carboplatin,first line non-small cell lung cancer
544545,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,mentioned_in_clincal_trials_for,gemcitabine,first line non-small cell lung cancer
544546,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,mentioned_in_clincal_trials_for,paclitaxel,first line non-small cell lung cancer
544547,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,mentioned_in_clincal_trials_for,pemetrexed,first line non-small cell lung cancer
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,mentioned_in_clincal_trials_for,sterile water for injection,"pneumonia, viral"
544549,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,mentioned_in_clincal_trials_for,sterile water for injection,"pneumonia, ventilator-associated"


In [193]:
# List of interventions to block 
undesirable_interventions = [
    "placebo", "standard of care", "laboratory biomarker analysis",
    "questionnaire", "standard treatment",
    "data collection", "educational intervention",
    "intervention group", "training", "management of therapy complications", 
    "contingency management", "active control", "experimental group",
    "sham intervention", "active comparator", "patient navigation",
    "self-management", "quality of life",
    "treatment group", "study"
]

# Create a boolean mask for each substring and combine them with logical OR
mask = ct['treatment'].str.contains('|'.join(undesirable_interventions))
# Filter the DataFrame using the mask and replace matching rows with None
ct.loc[mask, 'treatment'] = None
ct = ct[ct['treatment'].notna()]
ct = ct[ct['condition'].notna()]
ct

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,predicate,treatment,condition
0,NCT01049399,Treatment,Parallel Assignment,2009-12,?,in_clinical_trials_for,tideglusib,progressive supranuclear palsy
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,t,in_clinical_trials_for,n-acetyl cysteine,pruritus
3,NCT01432236,Treatment,Crossover Assignment,2011-10,?,in_clinical_trials_for,pregabalin,fibromyalgia
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,t,in_clinical_trials_for,cnm-au8,relapsing multiple sclerosis
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,in_clinical_trials_for,melphalan or melphalan + topotecan,retinoblastoma
...,...,...,...,...,...,...,...,...
544545,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,mentioned_in_clincal_trials_for,gemcitabine,first line non-small cell lung cancer
544546,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,mentioned_in_clincal_trials_for,paclitaxel,first line non-small cell lung cancer
544547,NCT02576574,Treatment,Parallel Assignment,2015-10-29,?,mentioned_in_clincal_trials_for,pemetrexed,first line non-small cell lung cancer
544548,NCT04264533,Treatment,Parallel Assignment,2020-2-14,f,mentioned_in_clincal_trials_for,sterile water for injection,"pneumonia, viral"


In [311]:
ct['condition_outside_()'] =  [re.sub(r"\((.*?)\)",'', str(x)) for x in ct['condition']]
ct['condition_inside_()'] = ct['condition'].str.extract(r"\((.*?)\)", expand=True)

ct['treatment_outside_()'] =  [re.sub(r"\((.*?)\)",'', str(x)) for x in ct['treatment']]
ct['treatment_inside_()'] = ct['treatment'].str.extract(r"\((.*?)\)", expand=True)

split_chars = [",", "+", "/", " and ", "&", " or "]
split_chars = [re.escape(char) for char in split_chars]

# Function to split column based on list of characters
def split_column(df, column, split_chars):
    # Create a temporary dataframe to store split values
    temp_df = df[column].str.split('|'.join(split_chars), expand=True)
    # Rename columns of the temporary dataframe
    temp_df.columns = [f'{column}_{i+1}' for i in range(temp_df.shape[1])]
    return temp_df

# Apply split_column function to the desired column
split_con_outside = split_column(ct, 'condition_outside_()', split_chars)
split_con_inside = split_column(ct, 'condition_inside_()', split_chars)
split_trmnt_outside = split_column(ct, 'treatment_outside_()', split_chars)
split_trmnt_inside = split_column(ct, 'treatment_inside_()', split_chars)

# Concatenate the split dataframe with the original dataframe
split_df = pd.concat([split_con_outside, split_con_inside, split_trmnt_outside, split_trmnt_inside], axis=1)
split_df = split_df.fillna(np.nan).replace([np.nan], [None]) # replace NaN values in dataframe with None (just for consistency)
split_df = pd.concat([ct["condition"], ct["treatment"], split_df], axis=1)
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(split_df[:100])

Unnamed: 0,condition,treatment,condition_outside_()_1,condition_outside_()_2,condition_outside_()_3,condition_outside_()_4,condition_outside_()_5,condition_outside_()_6,condition_outside_()_7,condition_outside_()_8,condition_outside_()_9,condition_outside_()_10,condition_outside_()_11,condition_outside_()_12,condition_inside_()_1,condition_inside_()_2,condition_inside_()_3,condition_inside_()_4,condition_inside_()_5,condition_inside_()_6,treatment_outside_()_1,treatment_outside_()_2,treatment_outside_()_3,treatment_outside_()_4,treatment_outside_()_5,treatment_outside_()_6,treatment_outside_()_7,treatment_outside_()_8,treatment_outside_()_9,treatment_outside_()_10,treatment_outside_()_11,treatment_outside_()_12,treatment_outside_()_13,treatment_outside_()_14,treatment_outside_()_15,treatment_outside_()_16,treatment_inside_()_1,treatment_inside_()_2,treatment_inside_()_3,treatment_inside_()_4,treatment_inside_()_5,treatment_inside_()_6,treatment_inside_()_7,treatment_inside_()_8,treatment_inside_()_9
0,Progressive Supranuclear Palsy,tideglusib,Progressive Supranuclear Palsy,,,,,,,,,,,,,,,,,,tideglusib,,,,,,,,,,,,,,,,,,,,,,,,
1,Pruritus,N-acetyl cysteine,Pruritus,,,,,,,,,,,,,,,,,,N-acetyl cysteine,,,,,,,,,,,,,,,,,,,,,,,,
2,Fibromyalgia,Pregabalin,Fibromyalgia,,,,,,,,,,,,,,,,,,Pregabalin,,,,,,,,,,,,,,,,,,,,,,,,
3,Relapsing Multiple Sclerosis,CNM-Au8,Relapsing Multiple Sclerosis,,,,,,,,,,,,,,,,,,CNM-Au8,,,,,,,,,,,,,,,,,,,,,,,,
4,Retinoblastoma,Melphalan or Melphalan + Topotecan,Retinoblastoma,,,,,,,,,,,,,,,,,,Melphalan,Melphalan,Topotecan,,,,,,,,,,,,,,,,,,,,,,
5,Healthy,GS-248,Healthy,,,,,,,,,,,,,,,,,,GS-248,,,,,,,,,,,,,,,,,,,,,,,,
6,Pancreatic Adenocarcinoma,Gemcitabine,Pancreatic Adenocarcinoma,,,,,,,,,,,,,,,,,,Gemcitabine,,,,,,,,,,,,,,,,,,,,,,,,
7,Pancreatic Adenocarcinoma,Nab paclitaxel,Pancreatic Adenocarcinoma,,,,,,,,,,,,,,,,,,Nab paclitaxel,,,,,,,,,,,,,,,,,,,,,,,,
8,Pancreatic Adenocarcinoma,NovoTTF-200T(P),Pancreatic Adenocarcinoma,,,,,,,,,,,,,,,,,,NovoTTF-200T,,,,,,,,,,,,,,,,P,,,,,,,,
9,Smoldering Multiple Myeloma,Citarinostat,Smoldering Multiple Myeloma,,,,,,,,,,,,,,,,,,Citarinostat,,,,,,,,,,,,,,,,,,,,,,,,


In [195]:
split_df

Unnamed: 0,condition,treatment,condition_outside_()_1,condition_outside_()_2,condition_outside_()_3,condition_outside_()_4,condition_outside_()_5,condition_outside_()_6,condition_outside_()_7,condition_outside_()_8,...,treatment_outside_()_16,treatment_inside_()_1,treatment_inside_()_2,treatment_inside_()_3,treatment_inside_()_4,treatment_inside_()_5,treatment_inside_()_6,treatment_inside_()_7,treatment_inside_()_8,treatment_inside_()_9
0,progressive supranuclear palsy,tideglusib,progressive supranuclear palsy,,,,,,,,...,,,,,,,,,,
1,pruritus,n-acetyl cysteine,pruritus,,,,,,,,...,,,,,,,,,,
3,fibromyalgia,pregabalin,fibromyalgia,,,,,,,,...,,,,,,,,,,
4,relapsing multiple sclerosis,cnm-au8,relapsing multiple sclerosis,,,,,,,,...,,,,,,,,,,
5,retinoblastoma,melphalan or melphalan + topotecan,retinoblastoma,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544545,first line non-small cell lung cancer,gemcitabine,first line non-small cell lung cancer,,,,,,,,...,,,,,,,,,,
544546,first line non-small cell lung cancer,paclitaxel,first line non-small cell lung cancer,,,,,,,,...,,,,,,,,,,
544547,first line non-small cell lung cancer,pemetrexed,first line non-small cell lung cancer,,,,,,,,...,,,,,,,,,,
544548,"pneumonia, viral",sterile water for injection,pneumonia,viral,,,,,,,...,,,,,,,,,,


In [196]:
split_t = split_df.transpose()
split_t = split_t.apply(lambda x: x.mask(x.duplicated(), None))
split_df = split_t.transpose()
split_df

Unnamed: 0,condition,treatment,condition_outside_()_1,condition_outside_()_2,condition_outside_()_3,condition_outside_()_4,condition_outside_()_5,condition_outside_()_6,condition_outside_()_7,condition_outside_()_8,...,treatment_outside_()_16,treatment_inside_()_1,treatment_inside_()_2,treatment_inside_()_3,treatment_inside_()_4,treatment_inside_()_5,treatment_inside_()_6,treatment_inside_()_7,treatment_inside_()_8,treatment_inside_()_9
0,progressive supranuclear palsy,tideglusib,,,,,,,,,...,,,,,,,,,,
1,pruritus,n-acetyl cysteine,,,,,,,,,...,,,,,,,,,,
3,fibromyalgia,pregabalin,,,,,,,,,...,,,,,,,,,,
4,relapsing multiple sclerosis,cnm-au8,,,,,,,,,...,,,,,,,,,,
5,retinoblastoma,melphalan or melphalan + topotecan,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544545,first line non-small cell lung cancer,gemcitabine,,,,,,,,,...,,,,,,,,,,
544546,first line non-small cell lung cancer,paclitaxel,,,,,,,,,...,,,,,,,,,,
544547,first line non-small cell lung cancer,pemetrexed,,,,,,,,,...,,,,,,,,,,
544548,"pneumonia, viral",sterile water for injection,pneumonia,viral,,,,,,,...,,,,,,,,,,


In [336]:
condition_cols = [ele for ele in split_df.columns if any([substr in ele for substr in ['condition']])] # find columns with "condition" in it 
treatment_cols = [ele for ele in split_df.columns if any([substr in ele for substr in ['treatment']])] # find columns with "treatment" in it 

def filter_sublist(sublist):
    return [x for x in sublist if x is not None and (not isinstance(x, float) or not np.isnan(x)) and len(x) >= 3]

condition_collections = split_df[condition_cols].values.tolist()
condition_collections = [filter_sublist(sublist) for sublist in condition_collections]

treatment_collections = split_df[treatment_cols].values.tolist()
treatment_collections = [filter_sublist(sublist) for sublist in treatment_collections]


In [337]:
ct["conditions_split"] = pd.Series(condition_collections)
ct["treatments_split"] = pd.Series(treatment_collections)
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[:10])

Unnamed: 0,nct_id,primary_purpose,intervention_model,start_month_year,is_fda_regulated_drug,condition,predicate,treatment,treatment_lower,condition_lower,condition_outside_(),condition_inside_(),treatment_outside_(),treatment_inside_(),conditions_split,treatments_split
0,NCT01049399,Treatment,Parallel Assignment,2009-12,?,Progressive Supranuclear Palsy,in_clinical_trials_for,tideglusib,tideglusib,progressive supranuclear palsy,progressive supranuclear palsy,,tideglusib,,[progressive supranuclear palsy],[tideglusib]
1,NCT05287724,Treatment,Crossover Assignment,2022-6-19,t,Pruritus,in_clinical_trials_for,N-acetyl cysteine,n-acetyl cysteine,pruritus,pruritus,,n-acetyl cysteine,,[pruritus],[n-acetyl cysteine]
3,NCT01432236,Treatment,Crossover Assignment,2011-10,?,Fibromyalgia,in_clinical_trials_for,Pregabalin,pregabalin,fibromyalgia,fibromyalgia,,pregabalin,,[relapsing multiple sclerosis],[cnm-au8]
4,NCT04626921,Treatment,Single Group Assignment,2020-10-22,t,Relapsing Multiple Sclerosis,in_clinical_trials_for,CNM-Au8,cnm-au8,relapsing multiple sclerosis,relapsing multiple sclerosis,,cnm-au8,,[retinoblastoma],"[melphalan or melphalan + topotecan, melphalan, topotecan]"
5,NCT04681417,Treatment,Parallel Assignment,2021-3-25,f,Retinoblastoma,in_clinical_trials_for,Melphalan or Melphalan + Topotecan,melphalan or melphalan + topotecan,retinoblastoma,retinoblastoma,,melphalan or melphalan + topotecan,,[healthy],[gs-248]
6,NCT04036227,Treatment,Parallel Assignment,2019-7-3,f,Healthy,in_clinical_trials_for,GS-248,gs-248,healthy,healthy,,gs-248,,[pancreatic adenocarcinoma],[gemcitabine]
7,NCT05624918,Treatment,Single Group Assignment,2024-4,f,Pancreatic Adenocarcinoma,in_clinical_trials_for,Gemcitabine,gemcitabine,pancreatic adenocarcinoma,pancreatic adenocarcinoma,,gemcitabine,,[pancreatic adenocarcinoma],[nab paclitaxel]
8,NCT05624918,Treatment,Single Group Assignment,2024-4,f,Pancreatic Adenocarcinoma,in_clinical_trials_for,Nab paclitaxel,nab paclitaxel,pancreatic adenocarcinoma,pancreatic adenocarcinoma,,nab paclitaxel,,[pancreatic adenocarcinoma],"[novottf-200t(p), novottf-200t]"
9,NCT05624918,Treatment,Single Group Assignment,2024-4,f,Pancreatic Adenocarcinoma,in_clinical_trials_for,NovoTTF-200T(P),novottf-200t(p),pancreatic adenocarcinoma,pancreatic adenocarcinoma,,novottf-200t,p,[smoldering multiple myeloma],[citarinostat]
10,NCT02886065,Prevention,Parallel Assignment,2017-3-7,t,Smoldering Multiple Myeloma,in_clinical_trials_for,Citarinostat,citarinostat,smoldering multiple myeloma,smoldering multiple myeloma,,citarinostat,,[smoldering multiple myeloma],[hiltonol]


In [199]:
ct = ct.explode('conditions_split')
ct = ct.explode('treatments_split')
ct = ct.drop_duplicates(keep='first')
ct = ct.drop(["condition", "treatment", "condition_outside_()", "condition_inside_()", "treatment_outside_()", "treatment_inside_()"], axis=1).reset_index(drop=True)


In [338]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
#     display(ct[:100])

# Mapping CURIEs from cache to trials

In [None]:
mapping_cache = pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, dtype="object", on_bad_lines="skip")
mapping_cache = mapping_cache[mapping_cache['score'] != 'unscored'] 
mapping_cache["score"] = pd.to_numeric(mapping_cache["score"], errors='coerce')
mapping_cache = mapping_cache.sort_values(by=['clintrial_term', 'score'], ascending=[False, False])


In [None]:
idx = mapping_cache.groupby(['clintrial_term', 'term_type'])['score'].transform(max) == mapping_cache['score']
max_scores = mapping_cache[idx]
max_scores = max_scores[max_scores.score >= 70]
conditions_maxscores = max_scores[max_scores.term_type == "condition"]
interventions_maxscores = max_scores[max_scores.term_type == "intervention"]
alt_intervention_maxscores = max_scores[max_scores.term_type == "alternate_intervention"]

In [None]:
conditions_dict = dict(zip(conditions_maxscores.clintrial_term, conditions_maxscores.mapping_tool_response))
interventions_dict = dict(zip(interventions_maxscores.clintrial_term, interventions_maxscores.mapping_tool_response))
alt_intervention_dict = dict(zip(alt_intervention_maxscores.clintrial_term, interventions_maxscores.mapping_tool_response))

In [None]:
ct['condition_curie_info'] = ct.conditions_split.map(conditions_dict)
ct['treatment_curie_info'] = ct.treatments_split.map(interventions_dict)
ct = ct.drop_duplicates(keep='first')
ct

In [None]:
ct['condition_curie_info'] = ct.condition.map(conditions_dict)
ct['treatment_curie_info'] = ct.treatment.map(interventions_dict)
ct

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[:10])

In [None]:
temp = ct.copy()

In [None]:
ct = temp.copy()

In [None]:
ct['condition_outside_()'] =  [re.sub(r"\((.*?)\)",'', str(x)) for x in ct['condition']]
ct['condition_inside_()'] = ct['condition'].str.extract(r"\((.*?)\)", expand=True)

ct['treatment_outside_()'] =  [re.sub(r"\((.*?)\)",'', str(x)) for x in ct['treatment']]
ct['treatment_inside_()'] = ct['treatment'].str.extract(r"\((.*?)\)", expand=True)

split_chars = [",", "+", "/", " and ", "&", " or "]
split_chars = [re.escape(char) for char in split_chars]

# Function to split column based on list of characters
def split_column(df, column, split_chars):
    # Create a temporary dataframe to store split values
    temp_df = df[column].str.split('|'.join(split_chars), expand=True)
    # Rename columns of the temporary dataframe
    temp_df.columns = [f'{column}_{i+1}' for i in range(temp_df.shape[1])]
    return temp_df

# Apply split_column function to the desired column
split_con_outside = split_column(ct, 'condition_outside_()', split_chars)
split_con_inside = split_column(ct, 'condition_inside_()', split_chars)
split_trmnt_outside = split_column(ct, 'treatment_outside_()', split_chars)
split_trmnt_inside = split_column(ct, 'treatment_inside_()', split_chars)

# Concatenate the split dataframe with the original dataframe
split_df = pd.concat([split_con_outside, split_con_inside, split_trmnt_outside, split_trmnt_inside], axis=1)
split_df = split_df.fillna(np.nan).replace([np.nan], [None]) # replace NaN values in dataframe with None (just for consistency)
split_df = pd.concat([ct["condition"], ct["treatment"], split_df], axis=1)

In [None]:
split_df

In [None]:
split_t = split_df.transpose()
split_t = split_t.apply(lambda x: x.mask(x.duplicated(), None))
split_df = split_t.transpose()
split_df

In [None]:
condition_cols = [ele for ele in split_df.columns if any([substr in ele for substr in ['condition']])] # find columns with "condition" in it 
treatment_cols = [ele for ele in split_df.columns if any([substr in ele for substr in ['treatment']])] # find columns with "treatment" in it 

def filter_sublist(sublist):
    return [x for x in sublist if x is not None and (not isinstance(x, float) or not np.isnan(x))]

condition_collections = split_df[condition_cols].values.tolist()
condition_collections = [filter_sublist(sublist) for sublist in condition_collections]

treatment_collections = split_df[treatment_cols].values.tolist()
treatment_collections = [filter_sublist(sublist) for sublist in treatment_collections]

print(len(condition_collections))
print(len(treatment_collections))

In [None]:
ct["conditions_split"] = pd.Series(condition_collections)
ct["treatments_split"] = pd.Series(treatment_collections)
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[:10])

In [None]:
ct = ct.explode('conditions_split')
ct = ct.explode('treatments_split')
ct = ct.drop_duplicates(keep='first')
ct = ct.drop(["condition", "treatment", "condition_outside_()", "condition_inside_()", "treatment_outside_()", "treatment_inside_()"], axis=1).reset_index(drop=True)


In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    display(ct[:10])


#### Map CURIEs from mapper to ct dataframe

In [None]:
mapping_cache = pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, dtype="object", on_bad_lines="skip")
mapping_cache = mapping_cache[mapping_cache['score'] != 'unscored'] 
mapping_cache["score"] = pd.to_numeric(mapping_cache["score"], errors='coerce')
mapping_cache = mapping_cache.sort_values(by=['clintrial_term', 'score'], ascending=[False, False])


In [None]:
idx = mapping_cache.groupby(['clintrial_term', 'term_type'])['score'].transform(max) == mapping_cache['score']
max_scores = mapping_cache[idx]
max_scores = max_scores[max_scores.score >= 70]
conditions_maxscores = max_scores[max_scores.term_type == "condition"]
interventions_maxscores = max_scores[max_scores.term_type == "intervention"]
alt_intervention_maxscores = max_scores[max_scores.term_type == "alternate_intervention"]

In [None]:
conditions_dict = dict(zip(conditions_maxscores.clintrial_term, conditions_maxscores.mapping_tool_response))
interventions_dict = dict(zip(interventions_maxscores.clintrial_term, interventions_maxscores.mapping_tool_response))
alt_intervention_dict = dict(zip(alt_intervention_maxscores.clintrial_term, interventions_maxscores.mapping_tool_response))

In [None]:
ct['condition_curie_info'] = ct.conditions_split.map(conditions_dict)
ct['treatment_curie_info'] = ct.treatments_split.map(interventions_dict)
ct = ct.drop_duplicates(keep='first')
ct

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
    # display(ct[78000:79000])
    display(ct[:10])

#### Map CURIEs from mapper to ct dataframe

In [None]:
mapping_cache = pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, dtype="object", on_bad_lines="skip")
mapping_cache = mapping_cache[mapping_cache['score'] != 'unscored'] 
mapping_cache["score"] = pd.to_numeric(mapping_cache["score"], errors='coerce')
mapping_cache = mapping_cache.sort_values(by=['clintrial_term', 'score'], ascending=[False, False])


In [None]:
idx = mapping_cache.groupby(['clintrial_term', 'term_type'])['score'].transform(max) == mapping_cache['score']
max_scores = mapping_cache[idx]
max_scores = max_scores[max_scores.score >= 70]
conditions_maxscores = max_scores[max_scores.term_type == "condition"]
interventions_maxscores = max_scores[max_scores.term_type == "intervention"]
alt_intervention_maxscores = max_scores[max_scores.term_type == "alternate_intervention"]

In [None]:
conditions_dict = dict(zip(conditions_maxscores.clintrial_term, conditions_maxscores.mapping_tool_response))
interventions_dict = dict(zip(interventions_maxscores.clintrial_term, interventions_maxscores.mapping_tool_response))
alt_intervention_dict = dict(zip(alt_intervention_maxscores.clintrial_term, interventions_maxscores.mapping_tool_response))

In [None]:
ct['condition_curie_info'] = ct.condition.map(conditions_dict)
ct['treatment_curie_info'] = ct.treatment.map(interventions_dict)
ct

In [None]:
ct.to_csv('test.csv', sep="\t", index=False, header=True)

In [None]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):  # more options can be specified also
#     display(split_df[55000:56000])

In [None]:
ct['condition_outside_p'] =  [re.sub(r"\((.*?)\)",'', str(x)) for x in ct['condition']]
ct['condition_inside_p'] = ct['condition'].str.extract(r"\((.*?)\)", expand=True)

ct['treatment_outside_p'] =  [re.sub(r"\((.*?)\)",'', str(x)) for x in ct['treatment']]
ct['treatment_inside_p'] = ct['treatment'].str.extract(r"\((.*?)\)", expand=True)

ct['condition_outside_b'] =  [re.sub(r'\[([^\]]+)\]','', str(x)) for x in ct['condition']]
ct['condition_inside_b'] = ct['condition'].str.extract(r'\[([^\]]+)\]', expand=True)

ct['treatment_outside_b'] =  [re.sub(r'\[([^\]]+)\]','', str(x)) for x in ct['treatment']]
ct['treatment_inside_b'] = ct['treatment'].str.extract(r'\[([^\]]+)\]', expand=True)

split_chars = [",", "+", "/", " and ", "&", " or "]
split_chars = [re.escape(char) for char in split_chars]

# Function to split column based on list of characters
def split_column(df, column, split_chars):
    # Create a temporary dataframe to store split values
    temp_df = df[column].str.split('|'.join(split_chars), expand=True)
    # Rename columns of the temporary dataframe
    temp_df.columns = [f'{column}_{i+1}' for i in range(temp_df.shape[1])]
    return temp_df

# Apply split_column function to the desired column
split_con_outside_p = split_column(ct, 'condition_outside_p', split_chars)
split_con_inside_p = split_column(ct, 'condition_inside_p', split_chars)
split_trmnt_outside_p = split_column(ct, 'treatment_outside_p', split_chars)
split_trmnt_inside_p = split_column(ct, 'treatment_inside_p', split_chars)

split_con_outside_b = split_column(ct, 'condition_outside_b', split_chars)
split_con_inside_b = split_column(ct, 'condition_inside_b', split_chars)
split_trmnt_outside_b = split_column(ct, 'treatment_outside_b', split_chars)
split_trmnt_inside_b = split_column(ct, 'treatment_inside_b', split_chars)

# Concatenate the split dataframe with the original dataframe
split_df = pd.concat([split_con_outside_p, split_con_inside_p, split_trmnt_outside_p, split_trmnt_inside_p,
                     split_con_outside_b, split_con_inside_b, split_trmnt_outside_b, split_trmnt_inside_b], axis=1)
split_df = split_df.fillna(np.nan).replace([np.nan], [None]) # replace NaN values in dataframe with None (just for consistency)
split_df = pd.concat([ct["condition"], ct["treatment"], split_df], axis=1)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(split_df[-100:])

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(split_df[55000:56000])

In [None]:
ct = pd.concat([ct, split_df], axis=1)

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(split_df[55000:55010])

mapping_cache = pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, dtype="object", on_bad_lines="skip")
mapping_cache = mapping_cache[mapping_cache['score'] != 'unscored'] 
mapping_cache["score"] = pd.to_numeric(mapping_cache["score"], errors='coerce')
mapping_cache = mapping_cache.sort_values(by=['clintrial_term', 'score'], ascending=[False, False])
mapping_cache = pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, dtype="object", on_bad_lines="skip")
mapping_cache = mapping_cache[mapping_cache['score'] != 'unscored'] 
mapping_cache["score"] = pd.to_numeric(mapping_cache["score"], errors='coerce')
mapping_cache = mapping_cache.sort_values(by=['clintrial_term', 'score'], ascending=[False, False])
mapping_cache = pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, dtype="object", on_bad_lines="skip")
mapping_cache = mapping_cache[mapping_cache['score'] != 'unscored'] 
mapping_cache["score"] = pd.to_numeric(mapping_cache["score"], errors='coerce')
mapping_cache = mapping_cache.sort_values(by=['clintrial_term', 'score'], ascending=[False, False])
mapping_cache = pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, dtype="object", on_bad_lines="skip")
mapping_cache = mapping_cache[mapping_cache['score'] != 'unscored'] 
mapping_cache["score"] = pd.to_numeric(mapping_cache["score"], errors='coerce')
mapping_cache = mapping_cache.sort_values(by=['clintrial_term', 'score'], ascending=[False, False])
#### Map CURIEs from mapper to ct dataframe

In [None]:
mapping_cache = pd.read_csv("mapping_cache.tsv", sep='\t', index_col=False, header=0, dtype="object", on_bad_lines="skip")
mapping_cache = mapping_cache[mapping_cache['score'] != 'unscored'] 
mapping_cache["score"] = pd.to_numeric(mapping_cache["score"], errors='coerce')
mapping_cache = mapping_cache.sort_values(by=['clintrial_term', 'score'], ascending=[False, False])


In [None]:
idx = mapping_cache.groupby(['clintrial_term', 'term_type'])['score'].transform(max) == mapping_cache['score']
max_scores = mapping_cache[idx]
max_scores = max_scores[max_scores.score >= 70]
conditions_maxscores = max_scores[max_scores.term_type == "condition"]
interventions_maxscores = max_scores[max_scores.term_type == "intervention"]
alt_intervention_maxscores = max_scores[max_scores.term_type == "alternate_intervention"]

In [None]:
conditions_dict = dict(zip(conditions_maxscores.clintrial_term, conditions_maxscores.mapping_tool_response))
interventions_dict = dict(zip(interventions_maxscores.clintrial_term, interventions_maxscores.mapping_tool_response))
alt_intervention_dict = dict(zip(alt_intervention_maxscores.clintrial_term, interventions_maxscores.mapping_tool_response))

In [None]:
ct['condition_curie_info'] = ct.condition.map(conditions_dict)
ct['treatment_curie_info'] = ct.treatment.map(interventions_dict)
ct

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(ct[:10])

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    display(ct[:10])

In [None]:
data = {} # data dictionary
pattern = "\((.*?)\)"

# df = pd.Series(data)
# new_data = {}

key = 0
for item in df.items():
    new_data[key] = re.match(pattern, item[1])[1]
    key += 1

print(new_data)