# In [1]:
import pandas as pd
import xml.etree.ElementTree as et

# In [2]:
def collapseAspects(aspects):
    """Collapse several aspect terms of one sentence into a single label.

    Every aspect whose ``polarity`` attribute is "neutral", "negative" or
    "positive" casts one vote for that polarity; aspects with any other
    polarity value are ignored. The polarity with strictly the most votes
    wins. Any tie (including the case of no votes at all) resolves to
    "neutral".

    Args:
        aspects: Iterable of elements exposing an ``attrib`` mapping with
            "polarity" and "term" keys (e.g. the children of an
            ``<aspectTerms>`` element).

    Returns:
        A ``(polarity, term)`` tuple. ``term`` is the term of the last aspect
        seen with the returned polarity, or "" when no aspect carried that
        polarity (possible when a tie falls back to "neutral").

    Raises:
        KeyError: If an aspect lacks the "polarity" or "term" attribute.
    """
    polarityLookUp = {"neutral": 0, "negative": 0, "positive": 0}
    termLookUp = {"neutral": "", "negative": "", "positive": ""}

    # NOTE: deliberately no `break` in this loop. Python has no switch-style
    # fall-through, so a `break` after handling a polarity would exit the
    # whole `for` loop and only the first aspect would ever be counted.
    for aspect in aspects:
        aspectPol = aspect.attrib["polarity"]
        aspectTerm = aspect.attrib["term"]

        if aspectPol in polarityLookUp:
            polarityLookUp[aspectPol] += 1
            termLookUp[aspectPol] = aspectTerm

    maxVal = max(polarityLookUp.values())
    maxKey = [k for k, v in polarityLookUp.items() if v == maxVal]

    # A unique maximum is a clear majority; anything else is ambiguous.
    polarity = maxKey[0] if len(maxKey) == 1 else "neutral"

    return (polarity, termLookUp[polarity])


def XMLtoDF(_path: str) -> pd.DataFrame:
    """Parse an aspect-annotated XML file into one row per labelled sentence.

    Each ``<sentence>`` directly under the document root contributes one row,
    provided it has an ``<aspectTerms>`` element with at least one child.
    A single aspect is taken as-is; several aspects are reduced to one label
    by ``collapseAspects``.

    Args:
        _path: Path of the XML file to read.

    Returns:
        A DataFrame with the columns "sentence", "aspect" and "polarity".
        The columns are present even when no sentence qualifies.

    Raises:
        KeyError: If an aspect lacks the "polarity" or "term" attribute.
    """
    columns = ["sentence", "aspect", "polarity"]

    tree = et.parse(_path)
    root = tree.getroot()

    rows = []

    for sentence in root.findall("sentence"):
        aspectTerms = sentence.find("aspectTerms")

        # Ignore data if it's not labelled. An <aspectTerms> element without
        # children carries no label either, so it is skipped as well instead
        # of being turned into a made-up "neutral" row.
        if aspectTerms is None or len(aspectTerms) == 0:
            continue

        # findtext() returns "" for a missing or empty <text>, so strip()
        # cannot fail on None.
        txt = sentence.findtext("text", default="").strip()

        if len(aspectTerms) == 1:
            aspect = aspectTerms.find("aspectTerm")
            pol    = aspect.attrib["polarity"]
            term   = aspect.attrib["term"]
        else:
            pol, term = collapseAspects(aspectTerms)

        rows.append(
            {
                "sentence": txt,
                "aspect": term,
                "polarity": pol
            }
        )

    # Fix the column set explicitly so an empty result keeps its schema.
    return pd.DataFrame(rows, columns=columns)


    
# Convert the aspect-annotated XML file into a flat table and write it out as
# CSV; index=False keeps the DataFrame's row index out of the file.
dfRaw = XMLtoDF("./Pars-ABSA_xml.xml")
dfRaw.to_csv("./data_unnormalized.csv", index=False)
# Preview of the first rows (disabled): dfRaw.head(n=6)