In [4]:
import os
import pandas as pd
from xml.etree import ElementTree as ET

def extract_qa_from_xml(file_path):
    """
    Extract Q&A pairs from the given XML file if they contain Acne, Psoriasis, or Eczema.

    Every ``QAPair`` element anywhere in the document is inspected. A pair is
    kept when the text of its ``Question`` or ``Answer`` child contains one of
    the keywords "acne", "eczema" or "psoriasis" (case-insensitive substring
    match).

    Args:
        file_path: Path of the XML file to parse.

    Returns:
        A list of dicts with the keys ``"File"`` (base name of ``file_path``),
        ``"Question"`` and ``"Answer"``. The two text values are stripped of
        surrounding whitespace; a missing or empty element is stored as
        ``None``.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    keywords = ("acne", "eczema", "psoriasis")

    root = ET.parse(file_path).getroot()
    source_name = os.path.basename(file_path)

    filtered_data = []
    for qa_pair in root.findall(".//QAPair"):
        # findtext() yields None when the child element is absent, so a pair
        # lacking <Question> or <Answer> no longer raises AttributeError.
        question = qa_pair.findtext("Question")
        answer = qa_pair.findtext("Answer")

        # Lowercase each text once instead of once per keyword.
        question_lc = (question or "").lower()
        answer_lc = (answer or "").lower()

        # Filter by keyword
        if any(kw in question_lc or kw in answer_lc for kw in keywords):
            filtered_data.append({
                "File": source_name,
                "Question": question.strip() if question else None,
                "Answer": answer.strip() if answer else None,
            })
    return filtered_data

def process_directory(directory):
    """
    Process all XML files in the given directory and filter Q&A pairs.

    Each regular file in ``directory`` whose name ends in ".xml" (any letter
    case) is passed to ``extract_qa_from_xml``. When at least one matching pair
    is found, the collected rows are written to
    ``filtered_qa_acne_eczema_psoriasis.xlsx`` inside ``directory`` and the
    output path is printed; otherwise a "no data" message is printed.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        None.
    """
    all_filtered_data = []

    # os.listdir() returns names in arbitrary order; sorting makes the row
    # order of the exported workbook identical from run to run.
    for file_name in sorted(os.listdir(directory)):
        # Case-insensitive match so files such as "DATA.XML" are not skipped.
        if not file_name.lower().endswith(".xml"):
            continue
        file_path = os.path.join(directory, file_name)
        # A sub-directory that happens to be named "*.xml" cannot be parsed.
        if not os.path.isfile(file_path):
            continue
        all_filtered_data.extend(extract_qa_from_xml(file_path))

    if not all_filtered_data:
        print("No relevant data found.")
        return

    # Convert to DataFrame
    df = pd.DataFrame(all_filtered_data)
    output_path = os.path.join(directory, "filtered_qa_acne_eczema_psoriasis.xlsx")
    df.to_excel(output_path, index=False)
    print(f"Filtered data saved to {output_path}")

# Specify the directory containing the XML files.
# Raw string literal: in a normal string "\1" is an octal escape (chr(1)) and
# the other backslash pairs are invalid escape sequences, which corrupts this
# Windows path.
directory_path = r"D:\FYP\datasets\Github\All Datasets\MedQud\MedQuAD-master\1_CancerGov_QA"
process_directory(directory_path)


Filtered data saved to D:\FYP\datasets\Github\All Datasets\MedQud\MedQuAD-master\1_CancerGov_QA\filtered_qa_acne_eczema_psoriasis.xlsx
