In [1]:
from pathlib import Path


data_path = Path("ParlaMint")
samples_dir = data_path / "Samples"
# Checking the Samples directory (not just the corpus root) means a partial
# download also produces the helpful hint instead of a bare iterdir() error.
if not samples_dir.is_dir():
    raise FileNotFoundError("Please run ./download_data.sh to download the data.")

# Load XML files from ALL countries.
# Both directory listings are sorted: iterdir()/rglob() yield entries in a
# filesystem-dependent order, which would make the corpus order (and every
# downstream row order) differ between machines and runs.
xml_en_files = []
for country_dir in sorted(samples_dir.iterdir()):
    if country_dir.is_dir():
        xml_en_files.extend(sorted(country_dir.rglob("*-en_*.ana.xml")))

print(f"Found {len(xml_en_files)} XML files from all countries.")

Found 87 XML files from all countries.


In [6]:
from lxml import etree
import pandas as pd 

# Prefix-to-URI map passed as `namespaces=` to the XPath queries so that the
# "tei:" prefix used in the expressions resolves to the TEI namespace.
NS = {"tei": "http://www.tei-c.org/ns/1.0"}

def parse_parlamint_xml(xml_path):
    """Parse one TEI-encoded ParlaMint XML file into per-sentence records.

    Args:
        xml_path: Location of the XML file (anything ``str()`` turns into a
            path that ``lxml.etree.parse`` accepts).

    Returns:
        A list with one dict per ``<s>`` element nested in a ``<u>`` element.
        Each dict has the keys ``u_id``, ``s_id``, ``speaker``, ``topics``,
        ``sentiment`` (float or None), ``entities`` (distinct ``<name>``
        ``type`` values in order of first appearance), ``text`` (sentence
        rebuilt from its tokens), ``tokens`` (list of per-token dicts) and
        ``date`` (pandas Timestamp or None).

    Raises:
        ValueError: If a sentiment ``quantity`` attribute is present but is
            not a valid float literal.
    """
    xml_id_attr = "{http://www.w3.org/XML/1998/namespace}id"

    root = etree.parse(str(xml_path)).getroot()

    date_str = root.xpath(".//tei:settingDesc/tei:setting/tei:date/@when", namespaces=NS)
    speech_date = pd.to_datetime(date_str[0]) if date_str else None

    data = []
    for u in root.xpath("//tei:u", namespaces=NS):
        u_id = u.get(xml_id_attr)
        speaker = u.get("who")
        ana = u.get("ana", "")
        topics = [a.replace("topic:", "") for a in ana.split() if "topic:" in a]

        for s in u.xpath(".//tei:s", namespaces=NS):
            s_id = s.get(xml_id_attr)

            # Only the first sentiment measure is used; a measure without a
            # `quantity` attribute yields None instead of crashing float().
            sentiment_node = s.xpath(".//tei:measure[@type='sentiment']", namespaces=NS)
            quantity = sentiment_node[0].get("quantity") if sentiment_node else None
            sentiment_score = float(quantity) if quantity is not None else None

            # XPath attribute results are lxml "smart strings" that reference
            # their parent element; str() detaches them so the stored records
            # do not keep the parsed tree alive. dict.fromkeys de-duplicates
            # while keeping a deterministic (first-appearance) order, unlike
            # set(), whose string order changes between interpreter runs.
            entity_types = s.xpath(".//tei:name/@type", namespaces=NS)
            entities = list(dict.fromkeys(str(t) for t in entity_types))

            token_elements = s.xpath(".//tei:w | .//tei:pc", namespaces=NS)
            last_index = len(token_elements) - 1

            tokens = []
            text_parts = []
            for i, token_el in enumerate(token_elements):
                token_text = token_el.text or ""

                tokens.append({
                    "text": token_text,
                    # The query only returns <w> and <pc>; of those two tag
                    # names only <w> ends in "w".
                    "type": "word" if token_el.tag.endswith("w") else "punct",
                    "lemma": token_el.get("lemma"),
                    "pos": token_el.get("pos"),
                    "msd": token_el.get("msd"),
                    "sem": token_el.get("function"),
                })

                text_parts.append(token_text)
                # join="right" marks a token glued to the following one, so no
                # space is emitted after it; the last token never gets one.
                if token_el.get("join") != "right" and i < last_index:
                    text_parts.append(" ")

            data.append({
                "u_id": u_id,
                "s_id": s_id,
                "speaker": speaker,
                "topics": topics,
                "sentiment": sentiment_score,
                "entities": entities,
                "text": "".join(text_parts),
                "tokens": tokens,
                "date": speech_date,
            })

    return data

# Parse every discovered file and gather the sentence records in one flat
# list, then build the DataFrame once (avoids row-by-row frame growth).
all_data = []
for xml_file in xml_en_files:
    all_data.extend(parse_parlamint_xml(xml_file))

xml_df = pd.DataFrame(all_data)
print(f"Extracted {len(xml_df)} sentences from {len(xml_en_files)} XML files.")

columns_to_show = ['u_id', 's_id', 'speaker', 'topics', 'sentiment', 'entities', 'date']

# All previews live under the same guard: a DataFrame built from zero records
# has no columns, so selecting `columns_to_show` on it would raise KeyError.
if not xml_df.empty:
    print("\nClean Text for first sentence:")
    print(xml_df.iloc[0]['text'])
    print("\nToken metadata for first 3 tokens of first sentence:")
    print(pd.DataFrame(xml_df.iloc[0]['tokens']).head(3))

    print("\nFirst 5 rows of extracted data:")
    print(xml_df[columns_to_show].head(5))

Extracted 5432 sentences from 87 XML files.

Clean Text for first sentence:
Madam President, I would like

Token metadata for first 3 tokens of first sentence:
        text   type      lemma  pos                        msd  sem
0      Madam   word      madam  NNP  UPosTag=PROPN|Number=Sing  Z1f
1  President   word  President  NNP  UPosTag=PROPN|Number=Sing  Z1f
2          ,  punct       None    Z              UPosTag=PUNCT   Z9

First 5 rows of extracted data:
                       u_id                      s_id  ...    entities       date
0  i-BQ8dtftkuwhpor9i8CsgBV  i-7Gocwwgm1SAsUELPMWzXuE  ...          [] 2022-05-13
1  i-BQ8dtftkuwhpor9i8CsgBV  i-7GodASKwaGTGUA8Kq8nQCL  ...  [PER, LOC] 2022-05-13
2  i-BQ8dtftkuwhpor9i8CsgBV  i-7GodHw7T7ZphsKCgv74KaC  ...       [ORG] 2022-05-13
3  i-BQ8dtftkuwhpor9i8CsgBV  i-7GodPmH4ygtVQDNXYCAFye  ...          [] 2022-05-13
4  i-BQ8dtftkuwhpor9i8CsgBV  i-7GodVbSgqoxGw7YNAHGCP6  ...       [ORG] 2022-05-13

[5 rows x 7 columns]
