In [None]:
# %pip installs into the running kernel's environment; the version is pinned so a fresh run is reproducible.
%pip install -q xmltodict==0.13.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0


In [None]:
# %pip installs into the running kernel's environment; the version is pinned so a fresh run is reproducible.
%pip install -q jsonpath==0.82

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jsonpath
  Downloading jsonpath-0.82.tar.gz (9.6 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: jsonpath
  Building wheel for jsonpath (setup.py) ... done
  Created wheel for jsonpath: filename=jsonpath-0.82-py3-none-any.whl size=5610 sha256=0eb6034fe92ebc68b0a928b280bcab909bf1d2e9ac897d70fe6fcb40b13bce53
  Stored in directory: /root/.cache/pip/wheels/93/9d/2f/4dcbb0d8fdc7901bba9976a8f31f29f476460de7cb27259e2b
Successfully built jsonpath
Installing collected packages: jsonpath
Successfully installed jsonpath-0.82


In [None]:
import json
import os
import xmltodict
import re
import time
import pandas as pd
from jsonpath import jsonpath

In [None]:
# Column store filled by the XML processing step: one independent, initially
# empty list per output column (each key gets its own list object).
data = {columnName: [] for columnName in ("Questions", "Answers", "Focus")}

# Function to process an XML file
def processXmlFile(completePath):
    """Parse one XML file and append its question/answer pairs to the global ``data``.

    The file is parsed with ``xmltodict`` (attributes ignored). The first
    ``QAPair`` and ``Focus`` entries found anywhere in the document are used.
    Every usable pair adds exactly one entry to each of ``data['Questions']``,
    ``data['Answers']`` and ``data['Focus']``, so the three columns always
    stay the same length.

    Answer text is cleaned before it is stored: runs of spaces are collapsed
    to one space, and the literal ``Key Points``, newlines and hyphens are
    removed.

    Files that cannot be parsed, or that have no ``QAPair`` or ``Focus``
    entry, are skipped silently. A pair without a question, or whose answer
    is not text (for example an empty element), is skipped on its own; the
    remaining pairs of the same file are still processed.

    Args:
        completePath: Path of the XML file to read.

    Returns:
        None. Results are appended to the module-level ``data`` dictionary.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(completePath, encoding="utf-8") as f:
        xmlstring = f.read()

    try:
        # Parse the XML string into a dictionary using xmltodict library
        dataDict = xmltodict.parse(xmlstring, xml_attribs=False)

        # jsonpath returns False when nothing matches, so the [0] lookup
        # raises and the file is skipped below.
        listOfQA = jsonpath(dataDict, '$..QAPair')[0]
        focus = jsonpath(dataDict, '$..Focus')[0]
    except Exception:
        # Unparseable file, or no QAPair / Focus entry: nothing to collect.
        return

    # A file with a single QA pair yields a dict rather than a list.
    if isinstance(listOfQA, dict):
        listOfQA = [listOfQA]
    # Anything else (e.g. None for an empty element) holds no pairs.
    if not isinstance(listOfQA, list):
        return

    for qaPair in listOfQA:
        try:
            # Read both fields before touching ``data`` so that a failure
            # can never leave the columns with different lengths.
            question = qaPair['Question']
            answer = re.sub(' +', ' ', qaPair['Answer'])
        except (KeyError, TypeError):
            # Missing key, or a pair/answer that is not text: skip only
            # this pair and keep going with the rest of the file.
            continue

        answer = answer.replace('Key Points', "").replace("\n", "").replace("-", "")

        data['Answers'].append(answer)
        data['Questions'].append(question)
        data['Focus'].append(focus)

In [None]:
# Entries directly under BASE_PATH that must not be processed
foldersWithEmptyAnswers = [
    "10_MPlus_ADAM_QA",
    "11_MPlusDrugs_QA",
    "12_MPlusHerbsSupplements_QA",
    "readme.txt",  # As it does not contain any QAs
    "QA-TestSet-LiveQA-Med-Qrels-2479-Answers.zip",  # Will use it later,
    "ProcessedData.csv"
]

# Base path for the folders
BASE_PATH = "/content/drive/MyDrive/MedQuAD-master"

# Start from empty columns so that re-running this cell does not append
# every QA pair a second time.
for column in data.values():
    column.clear()

# Sorted iteration makes the row order reproducible; os.listdir returns
# entries in arbitrary order.
for folder in sorted(os.listdir(BASE_PATH)):
    folderPath = os.path.join(BASE_PATH, folder)

    # Skip the excluded entries and anything that is not a directory
    # (listing a plain file would raise NotADirectoryError).
    if folder in foldersWithEmptyAnswers or not os.path.isdir(folderPath):
        continue

    print("Processing folder:", folder)
    start = time.time()

    # Iterate over the XML files in the current folder
    for xmlFileName in sorted(os.listdir(folderPath)):
        # Only XML files hold QA pairs; ignore any other file type.
        if not xmlFileName.lower().endswith(".xml"):
            continue
        processXmlFile(os.path.join(folderPath, xmlFileName))

    print("Took", time.time() - start)

Processing folder: 2_GARD_QA
Took 37.40929365158081
Processing folder: 7_SeniorHealth_QA
Took 0.0029649734497070312
Processing folder: 5_NIDDK_QA
Took 0.0028579235076904297
Processing folder: 6_NINDS_QA
Took 0.0016994476318359375
Processing folder: 4_MPlus_Health_Topics_QA
Took 0.0016057491302490234
Processing folder: 3_GHR_QA
Took 0.0015909671783447266
Processing folder: 1_CancerGov_QA
Took 1.5483970642089844
Processing folder: 8_NHLBI_QA_XML
Took 0.0019779205322265625
Processing folder: 9_CDC_QA
Took 0.0012118816375732422


In [None]:
# Build the table from the collected columns, then preview its first five rows.
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,Questions,Answers,Focus
0,What is (are) Achalasia ?,"Achalasia is a disorder of the esophagus, the ...",Achalasia
1,What are the symptoms of Achalasia ?,What are the signs and symptoms of achalasia? ...,Achalasia
2,What causes Achalasia ?,What causes achalasia? The lower esophageal sp...,Achalasia
3,How to diagnose Achalasia ?,How is achalasia diagnosed? Achalasia is suspe...,Achalasia
4,What are the treatments for Achalasia ?,How might achalasia be treated? The aim of tre...,Achalasia
