In [2]:
import xml.etree.ElementTree as ET
import pandas as pd

# Skin-condition terms used to select QA pairs (compared case-insensitively)
keywords = [
    'Acne',
    'Eczema',
    'Psoriasis',
]

def _stripped_text(element):
    """Return the element's own text with surrounding whitespace removed.

    Yields "" both when the element is missing (``None``) and when it exists
    but is empty, in which case ElementTree reports its ``.text`` as ``None``.
    """
    if element is None or element.text is None:
        return ""
    return element.text.strip()


def parse_and_filter_xml(file_path, keywords):
    """Extract question/answer pairs that mention any keyword from an XML file.

    Every ``QAPair`` element anywhere in the document is visited. The text of
    its first ``Question`` child is paired with the text of each direct
    ``Answer`` child, and a pair is kept when the question or the answer
    contains at least one keyword (case-insensitive substring match).

    Args:
        file_path: Source handed to ``ElementTree.parse`` (a file name or a
            file object).
        keywords: Iterable of keyword strings to look for.

    Returns:
        A list of ``{'question': str, 'answer': str}`` dicts in document
        order, with surrounding whitespace stripped from both texts. Missing
        or empty elements contribute an empty string.
    """
    root = ET.parse(file_path).getroot()

    # Lowercase the keywords once rather than on every comparison.
    lowered_keywords = [keyword.lower() for keyword in keywords]

    def mentions_keyword(text):
        lowered = text.lower()
        return any(keyword in lowered for keyword in lowered_keywords)

    qa_list = []

    for qapair in root.findall(".//QAPair"):
        question_text = _stripped_text(qapair.find('Question'))
        # The question is shared by all answers of the pair, so test it once.
        question_matches = mentions_keyword(question_text)

        for answer in qapair.findall('Answer'):
            answer_text = _stripped_text(answer)

            if question_matches or mentions_keyword(answer_text):
                qa_list.append({
                    'question': question_text,
                    'answer': answer_text
                })

    return qa_list

# Input XML files to scan for matching QA pairs
file_paths = [
    r'D:\FYP\datasets\Github\All Datasets\MEDIQA2019-Task3-QA-TrainingSet2-Alexa.xml',
    r'D:\FYP\datasets\Github\All Datasets\MEDIQA2019-Task3-QA-ValidationSet.xml',
    r'D:\FYP\datasets\Github\All Datasets\MEDIQA2019-Task3-QA-TestSet.xml',
    r'D:\FYP\datasets\Github\All Datasets\MEDIQA2019-Task3-QA-TestSet-wLabels.xml',
    r'D:\FYP\datasets\Github\All Datasets\MEDIQA2019-Task3-QA-TrainingSet1-LiveQAMed.xml'
]

# Gather the matching QA pairs from every input file, in file order
all_filtered_qa = []

for file_path in file_paths:
    filtered_qa = parse_and_filter_xml(file_path, keywords)
    all_filtered_qa.extend(filtered_qa)

# Name the columns explicitly: without them an empty result list produces a
# DataFrame with no columns, and the CSV would be written without a header row.
filtered_df = pd.DataFrame(all_filtered_qa, columns=['question', 'answer'])

# Write the combined, filtered dataset to a CSV file (no index column)
output_csv_path = 'filtered_qa_acne_eczema_psoriasis.csv'
filtered_df.to_csv(output_csv_path, index=False)

print(f"Filtered data saved to {output_csv_path}")


Filtered data saved to filtered_qa_acne_eczema_psoriasis.csv
