In [None]:
import re
import xml.etree.ElementTree as ET

def fixClinicalTrialsXMl(input_path, output_path, xml):
    """Regroup the ``<line>`` elements of a clinical-trial XML file into sections.

    Every ``<line>`` element below the root of the input file is visited in
    document order. A line whose stripped text starts with one of the known
    section headings opens a new ``<section>`` under a fresh ``<document>``
    root; that line and all following lines are copied into the open section
    until the next heading. Lines that appear before the first heading are
    dropped. The result is written under the same file name in *output_path*.

    :param input_path: Directory that contains the source XML file.
    :param output_path: Directory the regrouped file is written to. It is not
        created here and must already exist.
    :param xml: File name, used for both the input and the output file.
    :raises OSError: If the input cannot be read or the output cannot be written.
    :raises xml.etree.ElementTree.ParseError: If the input is not well-formed XML.
    """
    # Imported locally because the imports preceding this function do not
    # include ``os``.
    import os

    print(f"Now fixing file:'{xml}'")
    # os.path.join tolerates directories given with or without a trailing slash.
    xml_file = os.path.join(input_path, xml)
    output_file = os.path.join(output_path, xml)
    root = ET.parse(xml_file).getroot()

    # Section headings, written as regular expressions. They are applied with
    # ``match``, i.e. anchored at the start of the line only.
    sections_regex = [
        r"Generic Drug Name",
        r"Protocol Number",
        r"Study Start/End Dates",
        r"Reason for Termination",
        r"Study Design/Methodology",
        r"Centers",
        r"Objectives",
        r"Test Product\(s\), Dose\(s\), and Mode\(s\) of Administration",
        r"Statistical Methods",
        # NOTE(review): the "rrr" run looks like an extraction artefact of the
        # source documents - confirm before changing; kept exactly as it was.
        r"Study Population: rrrKey Inclusion/Exclusion Criteria",
        r"Participant Flow Table",
        r"Baseline Characteristics",
        r"Primary Outcome Result\(s\)",
        r"Secondary Outcome Result\(s\)",
        r"Summary of Safety",
        r"Safety Results",
        r"All-Cause Mortality",
        r"Serious Adverse Events",
        r"Other .* Adverse Events",
        r"Other Relevant Findings",
        r"Conclusion",
        r"Date of Clinical Trial Report",
    ]
    # Compile once instead of once per line.
    compiled_sections = [re.compile(pattern) for pattern in sections_regex]

    new_root = ET.Element('document')
    current_section = None

    for line in root.findall(".//line"):
        # An empty <line/> has text None; treat it as an empty string.
        text = (line.text or "").strip()

        # Open at most one section per line. When several headings match, the
        # last one in the list wins: that is the section that used to receive
        # the lines, but the empty sibling sections are no longer created.
        matched = None
        for pattern in compiled_sections:
            if pattern.match(text):
                matched = pattern
        if matched is not None:
            # The section name is the pattern text itself (including any
            # regex escapes), as before.
            current_section = ET.SubElement(
                new_root, 'section', name=matched.pattern
            )

        if current_section is not None:
            ET.SubElement(current_section, 'line').text = text

    # Serialise the regrouped tree and save it.
    new_xml_str = ET.tostring(new_root, encoding='unicode', method='xml')
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(new_xml_str)

# XML file names iterated by the loop at the end of this cell.
ct_files = [
    "COAV101A12302.xml",
    "CPEA001A12201.xml",
    "CMHU650A12101.xml",
    "CTMT212X2102.xml",
    "CEMA401A2202.xml",
    "CSAF312B12201.xml",
    "CFTY720DUS40.xml",
    "COMB157G2102.xml",
    "CUNR844A2202.xml",
    "COAV101A12301.xml",
    "COAV101A12306.xml",
    "CSKO136A12201J.xml",
    "CRTH258AFR03.xml",
    "CAIN457AUS02.xml",
    "COAV101A1IC01.xml",
    "COAV101A12102.xml",
    "CAMG334ADE01.xml",
    "COAV101A12303.xml",
    "CLKA651X2104.xml",
    "CMIJ821X2201.xml",
    "CLKA651X2202.xml",
    "CQAW039B2201.xml",
    "COAV101A12304.xml",
    "CNJH395X2101.xml",
    "CLNA043X2201.xml",
    "CEMA401A2201.xml",
    "CRTH258B2301.xml",
    "CLEE011XDE01.xml",
    "CLCZ696D2302.xml",
    "CMCS110X2201.xml",
]

# Source and destination directories; both are written with a trailing slash.
input_path = "Database/ClinicalTrials_xml/"
output_path = "Database/ClinicalTrials_xml2/"

# The conversion call is commented out, so this loop only prints each name.
for xml in ct_files:
    # fixClinicalTrialsXMl(input_path, output_path, xml)
    print(xml)


In [None]:
import re
import xml.etree.ElementTree as ET
import os

def find_file_by_partial_name(directory, partial_name):
    """
    Search *directory* for an entry whose name contains *partial_name*.

    The lookup is a plain substring test on the names returned by
    ``os.listdir``. When several names match, the shortest one is returned
    (ties broken alphabetically). This makes the result independent of the
    arbitrary order of ``os.listdir`` and prefers the tightest match when one
    identifier is a prefix of another (e.g. ``"ABC123"`` vs ``"ABC123E1"``).

    Failures to list the directory are reported with ``print`` and yield
    ``None``; an error message is never returned in place of a file name.

    :param directory: The directory to search in.
    :param partial_name: The partial name of the file to search for.
    :return: The matching name (without the directory part) if found,
        otherwise None.
    """
    try:
        candidates = [
            filename
            for filename in os.listdir(directory)
            if partial_name in filename
        ]
    except FileNotFoundError:
        print(f"Directory '{directory}' does not exist.")
        return None
    except (OSError, TypeError, ValueError) as e:
        # Keep the function non-raising for unreadable directories and
        # unusable arguments, but report the problem instead of returning it.
        print(str(e))
        return None

    if not candidates:
        return None
    return min(candidates, key=lambda filename: (len(filename), filename))

def fixPatientsSummariesXMl(input_path, output_path, xml):
    """Regroup the ``<line>`` elements of a patient-summary XML file into sections.

    The input file is located with ``find_file_by_partial_name(input_path, xml)``.
    Every ``<line>`` element below its root is visited in document order. A
    line whose stripped text starts with one of the known heading variants
    opens a new ``<section>`` (named after the canonical heading the variant
    maps to) under a fresh ``<document>`` root; that line and all following
    lines are copied into the open section until the next heading. Lines that
    appear before the first heading are dropped. The result is written under
    the same file name in *output_path*.

    :param input_path: Directory that contains the source XML file.
    :param output_path: Directory the regrouped file is written to. It is not
        created here and must already exist.
    :param xml: Substring identifying the file inside *input_path*.
    :raises FileNotFoundError: If no file name in *input_path* contains *xml*.
    :raises OSError: If the input cannot be read or the output cannot be written.
    :raises xml.etree.ElementTree.ParseError: If the input is not well-formed XML.
    """
    full_filename = find_file_by_partial_name(input_path, xml)
    if full_filename is None:
        # Fail with a clear message instead of a TypeError from path building.
        raise FileNotFoundError(
            f"No file containing '{xml}' found in '{input_path}'"
        )
    print(f"Now fixing file:'{full_filename}'")
    # os.path.join tolerates directories given with or without a trailing slash.
    xml_file = os.path.join(input_path, full_filename)
    output_file = os.path.join(output_path, full_filename)
    root = ET.parse(xml_file).getroot()

    # Heading variant (regular expression) -> canonical section name.
    # The keys are applied with ``match`` (anchored at the line start only) and
    # are not escaped, so a trailing "?" is a regex quantifier that makes the
    # preceding character optional rather than a literal question mark.
    mappingSections = {
        r"Did any patients have serious adverse events?": r"What adverse events did participants report?",
        r"How many participants had adverse events?": r"What adverse events did participants report?",
        r"How many participants reported serious adverse events?": r"What adverse events did participants report?",
        r"How many patients had adverse events during the trial?": r"What adverse events did participants report?",
        r"What adverse events did participants report?": r"What adverse events did participants report?",
        r"What adverse events did the participants have?": r"What adverse events did participants report?",
        r"What serious adverse events did participants have?": r"What adverse events did participants report?",
        r"What serious adverse events did the participants have?": r"What adverse events did participants report?",
        r"What was the most common serious adverse event?": r"What adverse events did participants report?",
        r"What were the most common serious adverse events?": r"What adverse events did participants report?",
        r"What were the serious adverse events?": r"What adverse events did participants report?",
        r"What non-serious adverse events did participants have?": r"What non-serious adverse events did participants have?",
        r"What other adverse events did the participants have?": r"What non-serious adverse events did participants have?",
        r"What was the most common non-serious adverse event?": r"What non-serious adverse events did participants have?",
        r"What were the most common non-serious adverse events?": r"What non-serious adverse events did participants have?",
        r"What were the non-serious adverse events?": r"What non-serious adverse events did participants have?",
        r"What were the results of the trial?": r"What were the results of the trial?",
        r"What were the results of this study?": r"What were the results of the trial?",
        r"What were the key results of this trial?": r"What were the results of the trial?",
        r"What were the main results of the trial?": r"What were the results of the trial?",
        r"What were the main results of this clinical trial?": r"What were the results of the trial?",
        r"What were the main results of this trial?": r"What were the results of the trial?",
        r"What was the main result of this trial?": r"What were the results of the trial?",
        r"What was learned from this trial?": r"What were the results of the trial?",
        r"What medical problems did patients have?": r"What medical problems did patients have?",
        r"What medical problems did the participants have during the entire trial, up to Week 60?": r"What medical problems did patients have?",
        r"What medical problems did the participants have during the trial?": r"What medical problems did patients have?",
        r"What medical problems happened during the trial?": r"What medical problems did patients have?",
        r"How has this clinical trial helped patients and researchers?": r"How has this trial helped?",
        r"How has this trial helped patients and researchers?": r"How has this trial helped?",
        r"How has this trial helped?": r"How has this trial helped?",
        r"How was this trial useful?": r"How has this trial helped?",
        r"What happened during the trial?": r"What happened during the trial?",
        r"What happened during this clinical trial?": r"What happened during the trial?",
        r"What happened during this trial?": r"What happened during the trial?",
        r"What treatments did the participants receive?": r"What treatments did the participants take?",
        r"What treatments did the participants take?": r"What treatments did the participants take?",
        r"What trial treatments did the participants take?": r"What treatments did the participants take?",
        r"What other key results were learned?": r"What other results were learned?",
        r"What other results were learned?": r"What other results were learned?",
        r"What were the other results of this trial?": r"What other results were learned?",
        r"Who was in the trial?": r"Who was in this clinical trial?",
        r"Who was in this clinical trial?": r"Who was in this clinical trial?",
        r"Who was in this trial?": r"Who was in this clinical trial?",
        r"What kind of trial was this?": r"What kind of trial was this?",
        r"What type of clinical trial was this?": r"What kind of trial was this?",
        r"What was the purpose of this clinical trial?": r"What was the purpose of this clinical trial?",
        r"What was the purpose of this trial?": r"What was the purpose of this clinical trial?",
        r"What was the main purpose of this trial?": r"What was the purpose of this clinical trial?",
        r"What was the goal of this observational study?": r"What was the purpose of this clinical trial?",
        r"Why was the research needed?": r"Why was the research needed?",
        r"How long was the trial?": r"How long was the trial?",
        r"How long was this trial?": r"How long was the trial?",
        r"How many participants stopped trial drug due to adverse events?": r"How many participants stopped trial drug due to adverse events?",
        r"How this trial was designed": r"How this trial was designed",
        r"What has happened since the trial ended?": r"What has happened since the trial ended?",
        r"Where can I learn more about this trial?": r"Where can I learn more about this trial?",
        r"Thank you": r"Thank you",
    }
    # Compile once instead of once per line.
    compiled_sections = [
        (re.compile(pattern), canonical)
        for pattern, canonical in mappingSections.items()
    ]

    new_root = ET.Element('document')
    current_section = None

    for line in root.findall(".//line"):
        # An empty <line/> has text None; treat it as an empty string.
        text = (line.text or "").strip()

        # Open at most one section per line. Some headings match more than one
        # key (e.g. "How has this trial helped patients and researchers?" also
        # matches "How has this trial helped?"). The last matching key wins:
        # that is the section that used to receive the lines, but the empty
        # sibling sections are no longer created.
        section_name = None
        for pattern, canonical in compiled_sections:
            if pattern.match(text):
                section_name = canonical
        if section_name is not None:
            current_section = ET.SubElement(
                new_root, 'section', name=section_name
            )

        if current_section is not None:
            ET.SubElement(current_section, 'line').text = text

    # Serialise the regrouped tree and save it.
    new_xml_str = ET.tostring(new_root, encoding='unicode', method='xml')
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(new_xml_str)

# Identifiers used as partial file names when looking up each summary file.
ct_files = [
    "CAIN457A2324",
    "CLCZ696DUS01",
    "CETB115J2411",
    "CBYL719A03201",
    "CAIN457A2325",
    "CQGE031C2201E1",
    "CLCZ696BDE03",
    "CLEE011G2301",
    "CAIN457S12201",
    "CAIN457ADE08",
    "CZPL389A2203",
    "CZPL389A2203E1",
    "CLCZ696BUS01",
    "CCTL019C2202",
    "CLEE011XDE01",
    "CLCZ696D2302",
    "CLOU064A2201",
]

# Source and destination directories; both are written with a trailing slash.
input_path = "Database/PatientSummary_xml/"
output_path = "Database/PatientSummary_xml2/"

# Convert every listed summary, one call per identifier.
for xml in ct_files:
    fixPatientsSummariesXMl(input_path, output_path, xml)
