In [1]:
import json
import logging
import xml.etree.ElementTree as ET
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import xmltodict

## Utils

In [2]:
def get_file_names(raw_data_folder_path):
    """
    Get all the file paths in a list
    :param raw_data_folder_path: path to the raw data folder
    :return: list of file paths
    """
    file_paths = []
    for splits in os.scandir(raw_data_folder_path):
        if splits.is_dir():
            for nct_dir in os.scandir(splits.path):
                for file in os.scandir(nct_dir.path):
                    if file.is_file():
                        file_paths.append(file.path)
    return file_paths


def find_xml_without_eligibility_tag(data_dir='data/raw'):
    """
    Find all the XML files without eligibility tag in TREC 2023 CT
    :param data_dir: path to the raw data folder
    :return: list of file paths without eligibility tag
    """
    xml_files = get_file_names(data_dir)
    xml_without_eligibility = set()
    for xml_file in xml_files:
        tree = ET.parse(xml_file)
        root = tree.getroot()

        # Check if <eligibility> tag exists as an immediate child of root
        eligibility_tag = root.find('eligibility')
        if not eligibility_tag:
            xml_without_eligibility.add(xml_file)
    return list(xml_without_eligibility)


def get_xml_full_path_from_nct_ids(indexes, raw_data_folder_path):
    """
    Get the full path of the XML files from a list of NCT IDs and a path to the raw data folder
    :param indexes: list of NCT IDs
    :param raw_data_folder_path: path to the raw data folder
    :return: list of full paths of the XML files
    """
    file_paths = get_file_names(raw_data_folder_path)
    indexes_set = set(indexes)
    res_paths = [file_path for file_path in file_paths if file_path[-15:-4] in indexes_set]
    return res_paths


def make_json_dump_from_xml_file_paths(file_paths, output_file_name='data_none'):
    """
    Dumps all the contente from a specific XML file in a json file
    :param file_paths: list of XML file paths
    :param output_file_name: name of the output file
    :return: None (stores the data in a json file)
    """
    output_path = os.path.normpath(os.getcwd() + '/..' + '/data/processed/' + output_file_name + '.json')

    dict_list = []
    for file_path in tqdm(file_paths, desc='Parsing XML files to json:', disable=False, position=0, leave=True):
        with open(file_path, 'r') as f:
            xml_string = f.read()
        json_data = xmltodict.parse(xml_string)
        dict_list.append(json_data)

    df = pd.DataFrame.from_dict(dict_list)

    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(df.to_dict(), file, ensure_ascii=False, indent=4)

    return None


def make_json_dump_from_df(df, output_path):
    """
    Dumps all the contents from a specific dataframe in a json file
    :param df: dataframe
    :param output_path: path to the output json file
    :return: None (stores the data in a json file)
    """
    df = pd.DataFrame.from_dict(dict_list)['eligibility']

    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(df.to_dict(), file, ensure_ascii=False, indent=4)

    return None


## Data exporation from json file for only sum of the main XML tags

In [3]:
# To define for later usage
raw_data_folder_path = os.path.normpath(os.getcwd() + '/..' + '/data/raw/')

In [4]:
filepath = os.path.normpath(os.getcwd() + '/..' + '/data/processed/data.json')
filepath

'/home/hazot/code/trec-clinical-trials-2023/data/processed/data.json'

In [5]:
df = pd.read_json(filepath)

In [6]:
print(df.shape)
print(df.columns)

(451538, 17)
Index(['nct_id', 'link_text', 'url', 'id_info', 'brief_title', 'sponsors',
       'brief_summary', 'detailed_description', 'primary_purpose',
       'intervention', 'eligibility', 'gender', 'minimum_age', 'maximum_age',
       'healthy_volunteers', 'keyword', 'condition_browse'],
      dtype='object')


##### Some numbers

In [7]:
print("Sum of column:", np.sum(list(df['gender'].value_counts())))
print('-----------')
print(df['gender'].value_counts())

Sum of column: 450667
-----------
gender
All       387617
Female     43441
Male       19609
Name: count, dtype: int64


In [8]:
print("Sum of column:", np.sum(list(df['minimum_age'].value_counts())))
print('-----------')
print(df['minimum_age'].value_counts())

Sum of column: 450667
-----------
minimum_age
18 Years     279575
N/A           30526
20 Years      17938
40 Years       9334
21 Years       9238
              ...  
84 Days           1
118 Years         1
73 Hours          1
167 Days          1
23 Days           1
Name: count, Length: 318, dtype: int64


In [9]:
# Full list of possibilities
dict(df['minimum_age'].value_counts())['N/A']

30526

In [10]:
print("Sum of column:", np.sum(list(df['maximum_age'].value_counts())))
print('-----------')
print(df['maximum_age'].value_counts())

Sum of column: 450667
-----------
maximum_age
N/A           214375
65 Years       29033
80 Years       23198
75 Years       22838
70 Years       18586
               ...  
54 Days            1
263 Months         1
93 Days            1
26 Hours           1
37 Days            1
Name: count, Length: 473, dtype: int64


In [None]:
print("Sum of column:", np.sum(list(df['healthy_volunteers'].value_counts())))
print('-----------')
print(df['healthy_volunteers'].value_counts())

### More complicated cases (more subtags, more intricate dicts)

#### Eligibility field

##### Extract full xml file for NONE eligibility fields in one json file

In [14]:
# How many None in Eligibility column
print("Number of 'None' in Eligibility field:", df['eligibility'].isnull().sum())

Number of 'None' in Eligibility field: 871


In [20]:
indexes_with_none_eligbility = [i for i, x in enumerate(list(df['eligibility'].isnull())) if x is True]
print(indexes_with_none_eligbility[:10], "...")

[854, 936, 977, 2962, 3354, 3359, 4953, 5235, 5307, 5517] ...


In [27]:
nct_id_with_none_eligibility = df.iloc[indexes_with_none_eligbility]['nct_id'].tolist()
print(nct_id_with_none_eligibility[:10], "...")

['NCT00800891', 'NCT00809406', 'NCT00806052', 'NCT00843752', 'NCT00840619', 'NCT00846820', 'NCT00506220', 'NCT00758095', 'NCT00758979', 'NCT00759694'] ...


In [31]:
raw_data_folder_path = os.path.normpath(os.getcwd() + '/..' + '/data/raw/')
xml_non_eligible_file_paths = get_xml_full_path_from_nct_ids(nct_id_with_none_eligibility, raw_data_folder_path)
xml_non_eligible_file_paths[:5]

['/home/hazot/code/trec-clinical-trials-2023/data/raw/ClinicalTrials.2023-05-08.trials0/NCT0080xxxx/NCT00800891.xml',
 '/home/hazot/code/trec-clinical-trials-2023/data/raw/ClinicalTrials.2023-05-08.trials0/NCT0080xxxx/NCT00809406.xml',
 '/home/hazot/code/trec-clinical-trials-2023/data/raw/ClinicalTrials.2023-05-08.trials0/NCT0080xxxx/NCT00806052.xml',
 '/home/hazot/code/trec-clinical-trials-2023/data/raw/ClinicalTrials.2023-05-08.trials0/NCT0084xxxx/NCT00843752.xml',
 '/home/hazot/code/trec-clinical-trials-2023/data/raw/ClinicalTrials.2023-05-08.trials0/NCT0084xxxx/NCT00840619.xml']

##### Checking for all possible sub tags and their values

In [39]:
list(df['eligibility'][0].keys())

['criteria', 'gender', 'minimum_age', 'maximum_age', 'healthy_volunteers']

In [None]:
# for i in tqdm(range(df.shape[0]), desc='Calculations...', disable=False, position=0, leave=True):
#     if df['eligibility'][i] is None:
#         continue
#     for value in df['eligibility'][0].get('criteria'):

In [40]:
# Check if criteria has the word "inclusion" or "exclusion" and count them
inclusion_exclusion_nb = {'inclusion': 0, 
                          'exclusion': 0,
                          'inclusion criteria': 0,
                          'exclusion criteria': 0}

for i in tqdm(range(df.shape[0]), desc='Calculations for inclusion_exclusion_nb', disable=False, position=0, leave=True):
    if df['eligibility'][i] is None:
        continue
    value = df['eligibility'][i].get('criteria')
    if value == None:
        continue
    if 'inclusion' in value.lower():
        inclusion_exclusion_nb['inclusion'] += 1
    if 'exclusion' in value.lower():
        inclusion_exclusion_nb['exclusion'] += 1
    if 'inclusion criteria' in value.lower():
        inclusion_exclusion_nb['inclusion criteria'] += 1
    if 'exclusion criteria' in value.lower():
        inclusion_exclusion_nb['exclusion criteria'] += 1

Calculations for inclusion_exclusion_nb: 100%|███████████████████████████████| 451538/451538 [00:11<00:00, 39888.94it/s]


In [47]:
inclusion_exclusion_nb

{'inclusion': 440269,
 'exclusion': 434400,
 'inclusion criteria': 438764,
 'exclusion criteria': 433051}

In [55]:
str(df['eligibility'][i].items())

"dict_items([('criteria', 'Inclusion Criteria The inclusion criteria are the following Consenting, English speaking women between ages 18 and 80 who will undergo same day vaginal female pelvic reconstructive surgery at MetroHealth Medical Center Ability to read VAS Scores Specific vaginal procedures include, but are not limited to Periurethral bulking Perineoplasty Complete vaginectomy Le Forte colpocleisis Anterior repair Posterior repair Enterocele repair Anterior and posterior repair Anterior, posterior and enterocele repair Transvaginal mesh use Sacrospinous ligament fixation Uterosacral ligament suspension Vaginal paravaginal defect repair Midurethral Sling Sphincteroplasty Vaginal hysterectomy, for uterus 250 g or less Vaginal hysterectomy, for uterus 250 g or less; with removal of tube(s), and/or ovary(s) Vaginal hysterectomy, for uterus 250 g or less; with removal of tube(s), and/or ovary(s), with repair of enterocele Vaginal hysterectomy, for uterus 250 g or less; with repair 

In [244]:
# Takes 10 seconds to run on i7 7700k @ WSL2
# Find NCT IDs without the word inclusion or exclusion inside the eligibility tag (does not work with broad text research)
nct_ids_without_anything_in_criteria = []
low_words_amount = []
nct_ids_without_anything_in_eligbility = []

nct_ids_without_inclusion_exclusion_in_criteria_dict = {
    'inclusion': [],
    'exclusion': [],
}
nct_ids_without_inclusion_exclusion_in_eligibility_dict = {
    'inclusion': [],
    'exclusion': []
}

nct_ids_with_synonyms = {'enrollment': [],
                         'disease characteristic': []}

for i in tqdm(range(df.shape[0]), desc='Calculations for inclusion_exclusion_nb', disable=False, position=0, leave=True):
    
    if df['eligibility'][i] is None:
        nct_ids_without_anything_in_eligbility.append(df['nct_id'][i])
        continue
        
    value_criteria = df['eligibility'][i].get('criteria')
    value_eligibility = str(df['eligibility'][i].items())
    
    if value_criteria == None:
        nct_ids_without_anything_in_criteria.append(df['nct_id'][i])
        nct_ids_without_inclusion_exclusion_in_criteria_dict['inclusion'].append(df['nct_id'][i])
        nct_ids_without_inclusion_exclusion_in_criteria_dict['exclusion'].append(df['nct_id'][i])
    else:
        value_criteria = value_criteria.lower()
        if 'inclusion' not in value_criteria:
            nct_ids_without_inclusion_exclusion_in_criteria_dict['inclusion'].append(df['nct_id'][i])
        if 'exclusion' not in value_criteria:
            nct_ids_without_inclusion_exclusion_in_criteria_dict['exclusion'].append(df['nct_id'][i])
        if len(value_criteria.split()) == 5:
            low_words_amount.append(df['nct_id'][i])
        if 'inclusion' not in value_criteria and 'enrollment' in value_criteria:
            nct_ids_with_synonyms['enrollment'].append(df['nct_id'][i])
        if 'inclusion' not in value_criteria and 'disease characteristic' in value_criteria:
            nct_ids_with_synonyms['disease characteristic'].append(df['nct_id'][i])
    
    value_eligibility = value_eligibility.lower()
    if 'inclusion' not in value_eligibility:
        nct_ids_without_inclusion_exclusion_in_eligibility_dict['inclusion'].append(df['nct_id'][i])
    if 'exclusion' not in value_eligibility:
        nct_ids_without_inclusion_exclusion_in_eligibility_dict['exclusion'].append(df['nct_id'][i])

print('Number of trials without the eligibility tag', len(nct_ids_without_anything_in_eligbility))

Calculations for inclusion_exclusion_nb: 100%|███████████████████████████████| 451538/451538 [00:16<00:00, 27418.44it/s]

Number of trials without the eligibility tag 871





In [245]:
print("Number of trials where <criteria> has 1 to 5 words:", len(low_words_amount))

Number of trials where <criteria> has 1 to 5 words: 294


In [246]:
print('nct_ids_without_anything_in_criteria:', len(nct_ids_without_anything_in_criteria))

nct_ids_without_anything_in_criteria: 75


In [247]:
print("nct ids without 'inclusion' or 'exclusion' in criteria.")
print('# inclusion:', len(nct_ids_without_inclusion_exlucion_dict['inclusion']))
print('# exclusion:', len(nct_ids_without_inclusion_exlucion_dict['exclusion']))

nct_ids_without_inclusion_exclusion_in_criteria_list = list(set(nct_ids_without_inclusion_exclusion_in_criteria_dict['inclusion'] + nct_ids_without_inclusion_exclusion_in_criteria_dict['exclusion']))
print('len(nct_ids_without_inclusion_exlucion_list:', len(nct_ids_without_inclusion_exclusion_in_criteria_list))

nct ids without 'inclusion' or 'exclusion' in criteria.
# inclusion: 10323
# exclusion: 16192
len(nct_ids_without_inclusion_exlucion_list: 17022


In [248]:
print("nct ids without 'inclusion' or 'exclusion' in eligibility.")
print('# inclusion:', len(nct_ids_without_inclusion_exclusion_in_eligibility_dict['inclusion']))
print('# exclusion:', len(nct_ids_without_inclusion_exclusion_in_eligibility_dict['exclusion']))

nct_ids_without_inclusion_exclusion_in_eligibility_list = list(set(nct_ids_without_inclusion_exclusion_in_eligibility_dict['inclusion'] + nct_ids_without_inclusion_exclusion_in_eligibility_dict['exclusion']))
print('len(nct_ids_without_inclusion_exlucion_list:', len(nct_ids_without_inclusion_exclusion_in_eligibility_list))

nct ids without 'inclusion' or 'exclusion' in eligibility.
# inclusion: 10377
# exclusion: 16245
len(nct_ids_without_inclusion_exlucion_list: 16994


In [249]:
# Difference between eligibility and criteria
diff_nct_ids = set(nct_ids_without_inclusion_exclusion_in_criteria_list) - set(nct_ids_without_inclusion_exclusion_in_eligibility_list)
print('len(diff_nct_ids):', len(diff_nct_ids))

len(diff_nct_ids): 28


In [250]:
print('# enrollment:', len(nct_ids_with_synonyms['enrollment']))

# enrollment: 532


In [252]:
print('# enrollment:', len(nct_ids_with_synonyms['disease characteristic']))

# enrollment: 6317


In [251]:
nct_ids_with_synonyms['enrollment'][0]

'NCT00640861'

##### Analyzing some trials with "inclusion" outside of the 'criteria' sub tag

In [241]:
print(list(diff_nct_ids))

['NCT02450916', 'NCT05161429', 'NCT03962699', 'NCT05208463', 'NCT03005197', 'NCT04003064', 'NCT02096692', 'NCT01717001', 'NCT05541302', 'NCT02219555', 'NCT02197780', 'NCT00786708', 'NCT05029843', 'NCT00643487', 'NCT04638179', 'NCT00898781', 'NCT00677378', 'NCT01788943', 'NCT00910884', 'NCT05847361', 'NCT02318550', 'NCT04754854', 'NCT01667276', 'NCT01366248', 'NCT03729557', 'NCT03437486', 'NCT05026580', 'NCT02841670']


In [265]:
df.shape[0]

451538

In [263]:
# id = nct_ids_with_synonyms['disease characteristic'][0]
id = nct_ids_with_synonyms['enrollment'][-4]
# id = list(diff_nct_ids)[3]
print(id)
print()
print(str(df[df['nct_id'] == id]['eligibility'][int(str(df[df['nct_id'] == id]['eligibility'].keys()[0]))].items()))

NCT03323151

dict_items([('criteria', "- Relapsed or refractory, pathologically proven mantle cell lymphoma. Must have a current or prior tissue sample that is IHC positive for cyclin D 1 or that is positive by FISH or cytogenetics for t(11;14). Must have been refractory to and/or relapsed/progressed after at least 1 prior therapy. Prior autologous or allogeneic transplant are allowed. Patients may not have active grade II-IV acute graft-versus-host disease (GVHD) or moderate/severe chronic GVHD by NIH criteria and may not require immunosuppressive medications and/or corticosteroids for the management of acute or chronic GVHD. Phase I Prior proteasome inhibitor and/or Bruton's tyrosine kinase (BTK) inhibitors are allowed but patients may not have been exposed to the combination of proteasome inhibitor and BTK inhibitor. Patients who have progressed on ibrutinib that are felt to be at high risk for rapid progression on this study shall not be eligible for the phase I portion of the stud

In [None]:
special_case_df = df[df['nct_id'].isin(special_case_nct_combined_normal)]
special_case_df.shape

In [None]:
special_case_df['eligibility'].iloc[0].get('criteria').split()[:2]

In [None]:
# get first two words from special cases

words = {}
idsss = []
enrollments = []

for i in range(special_case_df.shape[0]):
    first_word = special_case_df['eligibility'].iloc[i].get('criteria').split()[0].lower()
    if first_word in words:
        words[first_word] += 1
    elif first_word not in words:
        words[first_word] = 1
    if 'inclusion' in first_word:
        idsss.append(special_case_df['nct_id'].iloc[i])
    if 'enrollment' in first_word:
        enrollments.append(special_case_df['nct_id'].iloc[i])

In [None]:
len(idsss)

In [None]:
enrollments

In [None]:
df[df['nct_id'] == 'NCT00077831']['eligibility'][1881]['criteria']

In [None]:
df[df['nct_id'] == 'NCT00077831']['eligibility']

In [None]:
words

In [None]:
all_possible_eligibility_sub_tags = {}

for i in tqdm(range(df.shape[0]), desc='Calculations for all_possible_eligibility_sub_tags', disable=False, position=0, leave=True):
    if df['eligibility'][i] is None:
        continue
    for key in tuple(df['eligibility'][i].keys()):
        if key not in all_possible_tags:
            all_possible_eligibility_sub_tags[key] = 1
        else:
            all_possible_eligibility_sub_tags[key] += 1

In [None]:
all_possible_tags

In [None]:
all_possible_values_sample = {}

for i in tqdm(range(df.shape[0]), desc='Calculations...', disable=False, position=0, leave=True):
    if df['eligibility'][i] is None:
        continue
    value = df['eligibility'][i].get('sampling_method')
    if value is None:
        continue
    if value not in all_possible_values_sample:
        all_possible_values_sample[value] = 1
    else:
        all_possible_values_sample[value] += 1
    

In [None]:
all_possible_values_sample

#### Intervention field

In [None]:
df['intervention'][0]

In [None]:
df['keyword'][:10]

In [None]:
df['condition_browse'][:10]

### Less complicated cases (less subtags)

In [None]:
df['primary_purpose'].value_counts()

In [None]:
df['nct_id'].value_counts()

In [None]:
count_url = 0
for i in range(len(df['url'])):
    count_url += 1 if df['url'][i] else 0
print(count_url)

In [None]:
len(df['id_info'][2].keys())

In [None]:
max_keys = 0
max_string_keys = ''
for i in range(len(df['id_info'])):
    if max_keys >= len(df['id_info'][i].keys()):
        break
    max_keys = len(df['id_info'][i].keys())
    max_string_keys = df['id_info'][i].keys()
print(max_keys)
print(max_string_keys)

min_keys = 100
min_string_keys = ''
for i in range(len(df['id_info'])):
    if min_keys <= len(df['id_info'][i].keys()):
        break
    min_keys = len(df['id_info'][i].keys())
    min_string_keys = df['id_info'][i].keys()
print(min_keys)
print(min_string_keys)

In [None]:
df['id_info'][0].keys()

In [None]:
count_nct_id = 0
for i in range(len(df['id_info'])):
    count_nct_id += 1 if df['id_info'][i].get('nct_id', 0) else 0
print(count_nct_id)

In [None]:
df['brief_title']

In [None]:
df['brief_title'].isna().sum()

In [None]:
df['brief_summary'].isna().sum()

In [None]:
df['brief_summary'].apply(type).value_counts()

In [None]:
df['brief_summary'].value_counts()[:10]

In [None]:
bad_indexes = df.loc[pd.isna(df["brief_summary"]), :].index
print(bad_indexes)
print('len(bad_indexes):', len(bad_indexes))

### Check word (tokens) frequency

1. Word frequency in all strings

In [None]:
df['brief_summary'][:10000].str.split(expand=True).stack().value_counts()[:30]

In [None]:
df['brief_summary']

2. Token frequency in all strings

In [None]:
tokenizer = None

## Exploration of the XML tags

In [None]:
def get_file_names(raw_data_folder_path):
    file_paths = []
    for splits in os.scandir(raw_data_folder_path):
        if splits.is_dir():
            for nct_dir in os.scandir(splits.path):
                for file in os.scandir(nct_dir.path):
                    if file.is_file():
                        file_paths.append(file.path)
    return file_paths

In [None]:
raw_data_folder_path = os.path.normpath(os.getcwd() + '/..' + '/data/raw/')
raw_data_folder_path

In [None]:
def tag_exploration_loop_raw_numbers(raw_data_folder_path):
    file_paths = get_file_names(raw_data_folder_path)
    
    nb_elems_all = []
    max_elem_list = []
    all_tags_list = set()
    for file_path in tqdm(file_paths, desc='Parsing XML files to json:', disable=False, position=0, leave=True):
        # load and parse the file
        xml_tree = ET.parse(file_path)
        
        elem_list = []
        
        for elem in xml_tree.iter():
            elem_list.append(elem.tag)
        
        # now I remove duplicities - by convertion to set and back to list
        elem_list = list(set(elem_list))
    
        nb_elem = len(elem_list)
        nb_elems_all.append(nb_elem)
        
        if nb_elem > len(max_elem_list):
            max_elem_list = elem_list

        for tag in elem_list:
            if tag not in all_tags_list:
                all_tags_list.add(tag)

    return nb_elems_all, max_elem_list, all_tags_list

In [None]:
result_tuple = tag_exploration_loop_raw_numbers(raw_data_folder_path)

In [None]:
list_nb_of_elems = result_tuple[0]
list_max_elem = result_tuple[1]
all_possible_tags_list = result_tuple[2]

In [None]:
# Statistics on the number of elems in list
import statistics as stats
print('Number of Clinical Trials possible:', len(list_nb_of_elems))
print('Average number of tags in a list:', np.mean(list_nb_of_elems))
print('std of number of tags in a list:', np.std(list_nb_of_elems))

In [None]:
# analysis of a list with the most elements in it
print('Number of tags in the XML file with the most tags:', len(list_max_elem))
print('Number of possible tags in all files:', len(all_possible_tags_list))

In [None]:
# analysis of a list of all possible tags
print(all_possible_tags_list)

## Eligibility exploration

In [None]:
# new file
filepath = os.path.normpath(os.getcwd() + '/..' + '/data/processed/data_trials0.json')
filepath

In [None]:
df = pd.read_json(filepath)

In [None]:
df.shape

In [None]:
df.head()