In [1]:
import xml
import os
import xml.etree.ElementTree as ET
from tqdm import tqdm
import numpy as np

### Exploration of the XML tags

In [2]:
def get_file_names(raw_data_folder_path):
    """Collect the paths of all files stored two directory levels below a root.

    The scan walks ``<root>/<split dir>/<sub dir>/<file>``: only regular files
    found directly inside a second-level directory are returned. Anything
    that is not a directory at the first or second level is skipped, and
    directories nested inside a second-level directory are not descended into.

    Args:
        raw_data_folder_path: Path of the root folder to scan.

    Returns:
        list[str]: Paths of the files found, in the (arbitrary) order in which
        ``os.scandir`` yields them. Empty when nothing matches the layout.

    Raises:
        OSError: If the root folder (or a sub-folder) cannot be scanned.
    """
    file_paths = []
    # Using the scandir iterators as context managers releases the directory
    # handles deterministically instead of waiting for garbage collection.
    with os.scandir(raw_data_folder_path) as split_entries:
        for split in split_entries:
            if not split.is_dir():
                continue
            with os.scandir(split.path) as sub_entries:
                for nct_dir in sub_entries:
                    # A stray file sitting next to the sub-folders would make
                    # os.scandir() raise NotADirectoryError, so skip it.
                    if not nct_dir.is_dir():
                        continue
                    with os.scandir(nct_dir.path) as file_entries:
                        file_paths.extend(
                            entry.path for entry in file_entries if entry.is_file()
                        )
    return file_paths

In [3]:
# Raw-data folder, resolved relative to the kernel's current working directory
# (<cwd>/../data/raw). The value therefore depends on where the notebook is
# launched from.
raw_data_folder_path = os.path.normpath(os.getcwd() + '/..' + '/data/raw/')
raw_data_folder_path

'/home/hazot/code/trec-clinical-trials-2023/data/raw'

In [4]:
def tag_exploration_loop_raw_numbers(raw_data_folder_path):
    """Survey which XML tags occur in the files under ``raw_data_folder_path``.

    Every file returned by ``get_file_names`` is parsed with ElementTree and
    reduced to its collection of distinct tag names.

    Args:
        raw_data_folder_path: Root folder handed to ``get_file_names``.

    Returns:
        tuple: ``(counts, richest, seen)`` where

        * ``counts`` is a list with the number of distinct tags of each file,
          in the order the files were processed;
        * ``richest`` is the list of distinct tags of the file that has the
          most of them (the earliest such file on ties);
        * ``seen`` is the set of every tag met in any file.
    """
    xml_paths = get_file_names(raw_data_folder_path)

    tag_counts_per_file = []
    richest_file_tags = []
    seen_tags = set()

    progress = tqdm(xml_paths, desc='Parsing XML files to json:', disable=False, position=0, leave=True)
    for xml_path in progress:
        # Parse the document and collapse it to its distinct tags
        # (duplicates removed through a set, then turned back into a list).
        document = ET.parse(xml_path)
        distinct_tags = list({node.tag for node in document.iter()})

        tag_counts_per_file.append(len(distinct_tags))

        # Strict comparison: on equal sizes the file seen first is kept.
        if len(distinct_tags) > len(richest_file_tags):
            richest_file_tags = distinct_tags

        seen_tags.update(distinct_tags)

    return tag_counts_per_file, richest_file_tags, seen_tags

In [5]:
# Run the tag survey over the whole raw folder. Every file is parsed, so this
# is slow: the recorded run below took about 5 minutes for 451,538 files.
result_tuple = tag_exploration_loop_raw_numbers(raw_data_folder_path)

Parsing XML files to json:: 100%|█████████████████████████████████████████████| 451538/451538 [04:52<00:00, 1541.92it/s]


In [6]:
# Unpack the survey results: distinct-tag count per file, tags of the file
# with the most distinct tags, and the set of all tags seen.
list_nb_of_elems, list_max_elem, all_possible_tags_list = result_tuple

In [7]:
# Statistics on the number of distinct tags per file (the list holds one
# entry per parsed file).
# NOTE: `stats` is imported here but not used by this cell.
import statistics as stats
print('Number of Clinical Trials possible:', len(list_nb_of_elems))
print('Average number of tags in a list:', np.mean(list_nb_of_elems))
# np.std is called with its default ddof=0, i.e. the population standard deviation.
print('std of number of tags in a list:', np.std(list_nb_of_elems))

Number of Clinical Trials possible: 451538
Average number of tags in a list: 87.2884186934433
std of number of tags in a list: 22.63627100621393


In [8]:
# analysis of a list with the most elements in it
print('Number of tags in the XML file with the most tags:', len(list_max_elem))
print('Number of possible tags in all files:', len(all_possible_tags_list))

Number of tags in the XML file with the most tags: 183
Number of possible tags in all files: 235


In [9]:
# Dump every distinct tag name collected across all files. This is a set, so
# the printed order is arbitrary.
print(all_possible_tags_list)

{'citation', 'organization', 'reported_events', 'facility', 'has_dmc', 'results_first_submitted', 'contact', 'is_us_export', 'expanded_access_info', 'description', 'condition_browse', 'PMID', 'overall_contact', 'has_expanded_access', 'results_first_posted', 'dispersion', 'title', 'time_perspective', 'target_duration', 'dispersion_value', 'study_first_submitted', 'last_known_status', 'number_of_groups', 'doc_type', 'expanded_access_type_individual', 'collaborator', 'ci_n_sides', 'sharing_ipd', 'lead_sponsor', 'study_design_info', 'measurement', 'intervention_model_description', 'expanded_access_type_treatment', 'document_has_protocol', 'param_value', 'participants', 'pi_employee', 'event_list', 'method', 'address', 'param', 'p_value', 'p_value_desc', 'study_docs', 'drop_withdraw_reason_list', 'count_list', 'group_id_list', 'other_name', 'study_first_submitted_qc', 'category', 'participants_list', 'clinical_study', 'certain_agreements', 'status', 'intervention_browse', 'criteria', 'respo

In [10]:
def tag_specific_exploration_loop(tags):
    """Scan every raw XML file and report which of the requested tags occur.

    The files are taken from ``get_file_names`` applied to the module-level
    ``raw_data_folder_path``. Each file is parsed and reduced to its distinct
    tag names, as in ``tag_exploration_loop_raw_numbers``.

    Args:
        tags: Container of tag names to look for; membership is tested with
            ``in``.

    Returns:
        tuple: ``(nb_elems_all, max_elem_list, all_tags_list)`` where

        * ``nb_elems_all`` is a list with the number of distinct tags of each
          file, in processing order;
        * ``max_elem_list`` is the list of distinct tags of the file that has
          the most of them (the earliest such file on ties);
        * ``all_tags_list`` is the set of requested ``tags`` that were found
          in at least one file.
    """
    # TODO: work on this function - check the frequency for each tag in the dataset
    file_paths = get_file_names(raw_data_folder_path)

    nb_elems_all = []
    max_elem_list = []
    all_tags_list = set()
    for file_path in tqdm(file_paths, desc='Parsing XML files to json:', disable=False, position=0, leave=True):
        # load and parse the file
        xml_tree = ET.parse(file_path)

        # remove duplicates by collecting the tags into a set, then back to a list
        elem_list = list({elem.tag for elem in xml_tree.iter()})

        nb_elem = len(elem_list)
        nb_elems_all.append(nb_elem)

        # strict comparison: on equal sizes the file seen first is kept
        if nb_elem > len(max_elem_list):
            max_elem_list = elem_list

        # keep only the tags the caller asked about
        all_tags_list.update(tag for tag in elem_list if tag in tags)

    # The accumulators used to be dropped (the function returned None);
    # return them in the same shape as tag_exploration_loop_raw_numbers.
    return nb_elems_all, max_elem_list, all_tags_list


In [None]:
# TODO: measure the length of the text in each tag and compute summary statistics

In [None]:
# Helper functions to get text from XML files
def extract_data(element, tag):
    """Extract the content of every ``tag`` element found under ``element``.

    Matching elements are located with ``element.iter(tag)`` (which, per the
    ElementTree documentation, also yields ``element`` itself when its own tag
    matches) and each one is converted with ``extract_content``.

    Args:
        element: XML element to search; must expose ``iter(tag)``.
        tag: Tag name to look for.

    Returns:
        ``None`` when no element matches or when extraction fails, the single
        extracted content when exactly one element matches, otherwise a list
        with one extracted content per matching element.
    """
    try:
        data = [extract_content(child) for child in element.iter(tag)]
    except Exception:
        # Deliberately best-effort: any extraction problem (e.g. ``element``
        # being None) yields None. ``Exception`` rather than a bare ``except``
        # so that KeyboardInterrupt and SystemExit still propagate.
        return None

    if not data:
        # No matching element.
        return None
    return data if len(data) > 1 else data[0]


def extract_content(element):
    """Recursively convert an XML element into plain Python data.

    * An element without children becomes its text with runs of whitespace
      collapsed to single spaces and the sequences ``': '`` and ``' - '``
      replaced by a single space. An element with no text (``<tag/>``)
      yields ``''``.
    * An element with children becomes a dict mapping each child's tag to its
      extracted content; the element's own text is ignored. When several
      children share a tag, the last one wins.
    * If that dict has a non-empty ``'textblock'`` entry, the textblock
      content is returned in place of the dict.

    Args:
        element: The XML element to convert.

    Returns:
        str | dict: The cleaned text or the nested mapping described above.
    """
    if len(element) == 0:
        # ``element.text`` is None for empty elements; treat that as ''.
        text = element.text or ''
        # Normalise whitespace, then drop ': ' and ' - ' separators.
        return ' '.join(text.split()).replace(': ', ' ').replace(' - ', ' ')

    content = {}
    for child in element:
        # NOTE: repeated sibling tags overwrite each other here.
        content[child.tag] = extract_content(child)

    # Unwrap the textblock child explicitly rather than whichever child
    # happens to come first in the dict.
    textblock = content.get('textblock')
    if textblock:
        return textblock
    return content