In [1]:
import xml
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
import numpy as np

### Exploration of the XML tags

In [2]:
def get_file_names(raw_data_folder_path):
    file_paths = []
    for splits in os.scandir(raw_data_folder_path):
        if splits.is_dir():
            for nct_dir in os.scandir(splits.path):
                for file in os.scandir(nct_dir.path):
                    if file.is_file():
                        file_paths.append(file.path)
    return file_paths

In [3]:
raw_data_folder_path = os.path.normpath(os.getcwd() + '/..' + '/data/raw/')
raw_data_folder_path

'/home/hazot/code/trec-clinical-trials-2023/data/raw'

In [4]:
def tag_exploration_loop_raw_numbers(raw_data_folder_path):
    file_paths = get_file_names(raw_data_folder_path)
    
    nb_elems_all = []
    max_elem_list = []
    all_tags_list = set()
    for file_path in tqdm(file_paths, desc='Parsing XML files to json:', disable=False, position=0, leave=True):
        # load and parse the file
        xml_tree = ET.parse(file_path)
        
        elem_list = []
        
        for elem in xml_tree.iter():
            elem_list.append(elem.tag)
        
        # now I remove duplicities - by convertion to set and back to list
        elem_list = list(set(elem_list))
    
        nb_elem = len(elem_list)
        nb_elems_all.append(nb_elem)
        
        if nb_elem > len(max_elem_list):
            max_elem_list = elem_list

        for tag in elem_list:
            if tag not in all_tags_list:
                all_tags_list.add(tag)

    return nb_elems_all, max_elem_list, all_tags_list

In [5]:
result_tuple = tag_exploration_loop_raw_numbers(raw_data_folder_path)

Parsing XML files to json:: 100%|█████████████████████████████████████████████| 451538/451538 [05:16<00:00, 1428.90it/s]


In [14]:
list_nb_of_elems = result_tuple[0]
list_max_elem = result_tuple[1]
all_possible_tags_list = result_tuple[2]

In [17]:
# Statistics on the number of elems in list
import statistics as stats
print('Number of Clinical Trials possible:', len(list_nb_of_elems))
print('Average number of tags in a list:', np.mean(list_nb_of_elems))
print('std of number of tags in a list:', np.std(list_nb_of_elems))

Number of Clinical Trials possible: 451538
Average number of tags in a list: 87.2884186934433
std of number of tags in a list: 22.63627100621393


In [18]:
# analysis of a list with the most elements in it
print('Number of tags in the XML file with the most tags:', len(list_max_elem))
print('Number of possible tags in all files:', len(all_possible_tags_list))

Number of tags in the XML file with the most tags: 183
Number of possible tags in all files: 235


In [20]:
# analysis of a list of all possible tags
print(all_possible_tags_list)

{'textblock', 'facility', 'description', 'secondary_id', 'document_url', 'counts', 'dispersion_value', 'removed_countries', 'results_first_submitted_qc', 'submitted', 'nct_alias', 'link_text', 'study_type', 'drop_withdraw_reason_list', 'patient_data', 'investigator', 'name_title', 'status', 'city', 'milestone', 'doc_url', 'number_of_arms', 'ci_upper_limit', 'participants', 'is_unapproved_device', 'download_date', 'period', 'study_first_submitted', 'eligibility', 'last_known_status', 'submission_canceled', 'participants_list', 'is_fda_regulated_drug', 'zip', 'analyzed', 'period_list', 'outcome', 'title', 'pre_assignment_details', 'units', 'pending_results', 'biospec_descr', 'required_header', 'dispersion_type', 'phone_ext', 'group_id', 'overall_contact_backup', 'method_desc', 'study_design_info', 'affiliation', 'last_update_submitted', 'expanded_access_type_treatment', 'arm_group_type', 'dispersion', 'agency_class', 'expanded_access_type_intermediate', 'limitations_and_caveats', 'lead_s

In [None]:
def tag_specific_exploration_loop(tags):
    file_paths = get_file_names(raw_data_folder_path)
    
    nb_elems_all = []
    max_elem_list = []
    all_tags_list = set()
    for file_path in tqdm(file_paths, desc='Parsing XML files to json:', disable=False, position=0, leave=True):
        # load and parse the file
        xml_tree = ET.parse(file_path)
        
        elem_list = []
        
        for elem in xml_tree.iter():
            elem_list.append(elem.tag)
        
        # now I remove duplicities - by convertion to set and back to list
        elem_list = list(set(elem_list))
    
        nb_elem = len(elem_list)
        nb_elems_all.append(nb_elem)
        
        if nb_elem > len(max_elem_list):
            max_elem_list = elem_list

        for tag in elem_list:
            if tag in tags:
                all_tags_list.add(tag)


### Analysis of NCT01160822

In [24]:
path = os.path.normpath(os.getcwd() + '/..' + '/data/raw/ClinicalTrials.2023-05-08.trials1/NCT0116xxxx/NCT01160822.xml')

In [25]:
xml_tree = ET.parse(path)

In [28]:
elem_list = []
        
for elem in xml_tree.iter():
    elem_list.append(elem.tag)

# now I remove duplicities - by convertion to set and back to list
elem_list = list(set(elem_list))