In [48]:
from pathlib import Path


# Root folder of the locally downloaded corpus; fail early with a hint if it is absent.
data_path = Path("ParlaMint")
if not data_path.exists():
    raise FileNotFoundError("Please run ./download_data.sh to download the data.")

# Code that selects which "ParlaMint-<code>" sample folder is explored below.
exploration_language = 'GB'
exploration_path = data_path / "Samples" / f"ParlaMint-{exploration_language}"

# rglob() yields paths in a filesystem-dependent order. Sorting keeps the file
# order (and therefore the row order of everything parsed later) reproducible
# across machines and re-runs.
xml_en_files = sorted(exploration_path.rglob("*-en_*.ana.xml"))
print(f"Found {len(xml_en_files)} XML files in {exploration_path}.")

Found 3 XML files in ParlaMint/Samples/ParlaMint-GB.


In [49]:
from lxml import etree
import pandas as pd

NS = {"tei": "http://www.tei-c.org/ns/1.0"}

def parse_parlamint_xml(xml_path):
    """Parse one TEI ``.ana.xml`` file into a list of per-sentence records.

    Parameters
    ----------
    xml_path : str or path-like
        Location of the XML file; it is converted with ``str()`` before parsing.

    Returns
    -------
    list of dict
        One dict per ``<s>`` element found inside a ``<u>`` element, with keys:

        - ``u_id`` / ``s_id``: ``xml:id`` of the utterance / sentence.
        - ``speaker``: the utterance's ``who`` attribute.
        - ``topics``: values from the utterance's ``ana`` attribute that contain
          ``topic:``, with that marker removed.
        - ``sentiment``: ``quantity`` of the first ``<measure type="sentiment">``
          as a float, or ``None`` when the element or the attribute is absent.
        - ``entities``: sorted, de-duplicated ``type`` values of ``<name>``
          elements in the sentence.
        - ``text``: token texts joined with single spaces, except after tokens
          carrying ``join="right"``.
        - ``tokens``: list of dicts (``text``, ``type``, ``lemma``, ``pos``,
          ``msd``, ``sem``) for every ``<w>`` / ``<pc>`` element.

    Raises
    ------
    ValueError
        If a sentiment ``quantity`` attribute is present but not numeric.
    """
    xml_id_attr = "{http://www.w3.org/XML/1998/namespace}id"

    root = etree.parse(str(xml_path)).getroot()

    data = []
    for u in root.xpath(".//tei:u", namespaces=NS):
        u_id = u.get(xml_id_attr)
        speaker = u.get("who")
        ana = u.get("ana", "")
        topics = [a.replace("topic:", "") for a in ana.split() if "topic:" in a]

        for s in u.xpath(".//tei:s", namespaces=NS):
            s_id = s.get(xml_id_attr)

            # A missing <measure> or a missing @quantity both mean "no score";
            # float(None) would otherwise abort the whole file with a TypeError.
            sentiment_score = None
            sentiment_nodes = s.xpath(".//tei:measure[@type='sentiment']", namespaces=NS)
            if sentiment_nodes:
                quantity = sentiment_nodes[0].get("quantity")
                if quantity is not None:
                    sentiment_score = float(quantity)

            # XPath attribute results are lxml "smart strings" that reference
            # their parent element; str() detaches them so the stored records
            # do not keep the parsed tree alive. Sorting gives a stable order
            # (plain set iteration order varies between interpreter runs).
            entity_types = sorted(
                {str(entity_type) for entity_type in s.xpath(".//tei:name/@type", namespaces=NS)}
            )

            token_elements = s.xpath(".//tei:w | .//tei:pc", namespaces=NS)
            last_index = len(token_elements) - 1

            tokens = []
            text_pieces = []
            for i, token_el in enumerate(token_elements):
                token_text = token_el.text or ""

                tokens.append({
                    "text": token_text,
                    # Compare the namespace-free tag name instead of a suffix test.
                    "type": "word" if etree.QName(token_el).localname == "w" else "punct",
                    "lemma": token_el.get("lemma"),
                    "pos": token_el.get("pos"),
                    "msd": token_el.get("msd"),
                    "sem": token_el.get("function")
                })

                text_pieces.append(token_text)
                # join="right" marks a token glued to the following one;
                # the last token never gets a trailing space.
                if token_el.get("join") != "right" and i < last_index:
                    text_pieces.append(" ")

            data.append({
                "u_id": u_id,
                "s_id": s_id,
                "speaker": speaker,
                # Copy per record so sentences of one utterance do not share
                # (and accidentally co-mutate) a single list object.
                "topics": list(topics),
                "sentiment": sentiment_score,
                "entities": entity_types,
                "text": "".join(text_pieces),
                "tokens": tokens
            })

    return data

# Run the parser over every sample file and collect all sentence records in one flat list.
all_data = []
for xml_file in xml_en_files:
    all_data += parse_parlamint_xml(xml_file)

# One row per sentence.
xml_df = pd.DataFrame(all_data)
print(f"Extracted {len(xml_df)} sentences from {len(xml_en_files)} XML files.")

# Spot-check the first record: its reconstructed text and its first few tokens.
if not xml_df.empty:
    first_sentence = xml_df.iloc[0]
    print("\nClean Text for first sentence:")
    print(first_sentence['text'])
    print("\nToken metadata for first 3 tokens of first sentence:")
    print(pd.DataFrame(first_sentence['tokens']).head(3))


Extracted 198 sentences from 3 XML files.

Clean Text for first sentence:
Before we come to today’s business, I want to mark the departure of two senior members of the House Service.

Token metadata for first 3 tokens of first sentence:
     text  type   lemma  pos  \
0  Before  word  before   IN   
1      we  word      we  PRP   
2    come  word    come  VBP   

                                                 msd  \
0                                        UPosTag=ADP   
1  UPosTag=PRON|Case=Nom|Number=Plur|Person=1|Pro...   
2      UPosTag=VERB|Mood=Ind|Tense=Pres|VerbForm=Fin   

                                                 sem  
0                                                 Z5  
1                                                 Z8  
2  M1,N3.1,A2.2,A3+,X2.1,A9+,S4,N4,S3.2,T1,A1.1.1,N5  


In [50]:
def load_taxonomy(taxonomy_path):
    """Read a TEI taxonomy file into an ``{xml:id: label}`` dictionary.

    Every ``<category>`` element contributes one entry keyed by its ``xml:id``.
    The label is the string value of the category's first ``catDesc`` child;
    when that text contains a colon, only the part after the first colon is
    kept. Labels are stripped of surrounding whitespace. Categories without an
    id or without description text are skipped.

    Parameters
    ----------
    taxonomy_path : str or path-like
        Location of the taxonomy XML file.

    Returns
    -------
    dict
        Mapping from category id to label.
    """
    xml_id_attr = "{http://www.w3.org/XML/1998/namespace}id"
    document = etree.parse(str(taxonomy_path))

    mapping = {}
    for category in document.xpath("//tei:category", namespaces=NS):
        key = category.get(xml_id_attr)
        description = category.xpath("string(./tei:catDesc)", namespaces=NS)
        if not (key and description):
            continue
        # partition() leaves `separator` empty when there is no colon, in which
        # case the whole description is the label.
        _, separator, after_colon = description.partition(":")
        mapping[key] = (after_colon if separator else description).strip()
    return mapping

# Load the USAS taxonomy shipped with the sample and spot-check two of its codes.
usas_tax_path = exploration_path.joinpath("ParlaMint-taxonomy-USAS.ana.xml")
usas_map = load_taxonomy(usas_tax_path)

print(f"Z5 means: {usas_map.get('Z5')}")
print(f"A1.1.1 means: {usas_map.get('A1.1.1')}")

Z5 means: Grammatical bin
A1.1.1 means: General actions / making


In [51]:
# Load the topic taxonomy (topic code -> label) and spot-check two entries.
topic_tax_path = exploration_path.joinpath("ParlaMint-taxonomy-topic.xml")
topic_map = load_taxonomy(topic_tax_path)

print(f"Topic 'educa' means: {topic_map.get('educa')}")
print(f"Topic 'healt' means: {topic_map.get('healt')}")

Topic 'educa' means: Education
Topic 'healt' means: Health


In [52]:
# Load the political-orientation taxonomy. Despite its name, `speaker_map` maps
# orientation ids to labels, not speakers.
speaker_tax_path = exploration_path.joinpath("ParlaMint-taxonomy-politicalOrientation.xml")
speaker_map = load_taxonomy(speaker_tax_path)

print(f"Speaker orientation 'orientation.C' means: {speaker_map.get('orientation.C')}")

Speaker orientation 'orientation.C' means: Centre


In [None]:
def load_person_list(person_path):
    """Build a speaker lookup from a TEI ``listPerson`` file.

    Parameters
    ----------
    person_path : str or path-like
        Location of the person list XML file.

    Returns
    -------
    dict
        Maps ``"#<xml:id>"`` to a dict with keys:

        - ``name``: forename and surname texts found under the person's
          ``persName`` elements, joined with single spaces (``""`` if none).
        - ``sex``: ``value`` of the person's direct ``<sex>`` child, or ``None``.
        - ``party``: ``ref`` (with ``#`` removed) of the last ``<affiliation>``
          whose ``ref`` contains ``#party``, or ``None``.

        ``<person>`` elements without an ``xml:id`` are skipped.
    """
    xml_id_attr = "{http://www.w3.org/XML/1998/namespace}id"
    root = etree.parse(str(person_path)).getroot()

    person_mapping = {}
    for p in root.xpath("//tei:person", namespaces=NS):
        p_id = p.get(xml_id_attr)
        if p_id is None:
            # Such an entry used to land under the meaningless key "#None".
            continue

        # Join every name part with a space. Surname parts were previously
        # concatenated with '' and so ran together when there was more than one.
        forenames = p.xpath(".//tei:persName/tei:forename/text()", namespaces=NS)
        surnames = p.xpath(".//tei:persName/tei:surname/text()", namespaces=NS)
        name_parts = (part.strip() for part in (*forenames, *surnames))
        full_name = " ".join(part for part in name_parts if part)

        # str() detaches the lxml "smart string" from its parent element so the
        # returned mapping does not keep the parsed tree alive.
        sex_values = p.xpath("./tei:sex/@value", namespaces=NS)
        sex = str(sex_values[0]) if sex_values else None

        # The last matching affiliation in document order is used.
        # NOTE(review): this assumes document order reflects chronology — confirm.
        affiliations = p.xpath(".//tei:affiliation[contains(@ref, '#party')]", namespaces=NS)
        party_ref = affiliations[-1].get("ref").replace("#", "") if affiliations else None

        person_mapping[f"#{p_id}"] = {
            "name": full_name,
            "sex": sex,
            "party": party_ref
        }

    return person_mapping

# Build the file name from `exploration_language`, as is done for
# `exploration_path`, so changing the language in one place keeps the paths aligned.
person_path = exploration_path / f"ParlaMint-{exploration_language}-listPerson.xml"
speaker_info = load_person_list(person_path)

# Spot-check one entry; this id is only expected to exist in the GB sample.
example_id = "#MargaretProsser"
print(f"Info for {example_id}: {speaker_info.get(example_id)}")

Info for #MargaretProsser: {'name': 'Margaret Prosser', 'sex': 'F', 'party': 'party.LAB'}


In [None]:
def load_org_list(org_path):
    """Collect organisation names and roles from a TEI ``listOrg`` file.

    Parameters
    ----------
    org_path : str or path-like
        Location of the organisation list XML file.

    Returns
    -------
    dict
        Maps each ``<org>`` element's ``xml:id`` to a dict with keys:

        - ``name``: stripped text of the first ``orgName`` child marked
          ``full="yes"``, else of the first ``orgName`` child with text,
          else ``None``.
        - ``role``: the element's ``role`` attribute.
    """
    xml_id_attr = "{http://www.w3.org/XML/1998/namespace}id"
    name_queries = (
        "./tei:orgName[@full='yes']/text()",  # preferred: name flagged as full
        "./tei:orgName/text()",               # fallback: text of any orgName
    )

    document_root = etree.parse(str(org_path)).getroot()

    org_mapping = {}
    for org_el in document_root.xpath("//tei:org", namespaces=NS):
        # Take the first query that yields a non-empty leading text node.
        label = None
        for query in name_queries:
            texts = org_el.xpath(query, namespaces=NS)
            if texts and texts[0]:
                label = texts[0]
                break

        org_mapping[org_el.get(xml_id_attr)] = {
            "name": label.strip() if label else None,
            "role": org_el.get("role"),
        }

    return org_mapping

# Build the file name from `exploration_language`, as is done for
# `exploration_path`, so changing the language in one place keeps the paths aligned.
org_path = exploration_path / f"ParlaMint-{exploration_language}-listOrg.xml"
org_info = load_org_list(org_path)

# Spot-check two entries; these ids are only expected to exist in the GB sample.
print(f"Info for 'party.LAB': {org_info.get('party.LAB')}")
print(f"Info for 'parliament.HC': {org_info.get('parliament.HC')}")


Info for 'party.LAB': {'name': 'Labour', 'role': 'politicalParty'}
Info for 'parliament.HC': {'name': 'House of Commons', 'role': 'parliament'}


In [None]:
# Work on a copy so re-running this cell never alters the parsed frame.
final_df = xml_df.copy()


def _lookup(table, key, field, default='Unknown'):
    """Return ``table[key][field]``, or ``default`` when any part is missing.

    Unlike ``dict.get(field, default)``, the default is also used when the
    field exists but holds ``None`` or an empty string, so missing values are
    reported consistently.
    """
    return (table.get(key) or {}).get(field) or default


# Speaker attributes come from the person list, keyed by the utterance's "who" value.
final_df['speaker_name'] = final_df['speaker'].map(lambda who: _lookup(speaker_info, who, 'name'))
final_df['speaker_sex'] = final_df['speaker'].map(lambda who: _lookup(speaker_info, who, 'sex'))
final_df['party_id'] = final_df['speaker'].map(lambda who: _lookup(speaker_info, who, 'party'))

# Resolve the party id to a readable name through the organisation list.
final_df['party_name'] = final_df['party_id'].map(lambda party: _lookup(org_info, party, 'name'))

# Translate topic codes to labels; codes absent from the taxonomy are kept as-is.
final_df['topic_labels'] = final_df['topics'].map(lambda t_list: [topic_map.get(t, t) for t in t_list] if t_list else [])

columns_to_show = [
    's_id', 'speaker_name', 'speaker_sex', 'party_name', 
    'sentiment', 'topic_labels', 'text'
]

print(f"Final dataset built with {len(final_df)} rows and {len(final_df.columns)} columns.")
final_df[columns_to_show].head(10)

Final dataset built with 198 rows and 13 columns.


Unnamed: 0,s_id,speaker_name,speaker_sex,party_name,sentiment,topic_labels,text
0,ParlaMint-GB_2022-07-21-commons.seg1.1,Lindsay Harvey Hoyle,M,Labour,1.938,[Government Operations],"Before we come to today’s business, I want to ..."
1,ParlaMint-GB_2022-07-21-commons.seg1.2,Lindsay Harvey Hoyle,M,Labour,2.087,[Government Operations],"Penny Young, Librarian and Managing Director o..."
2,ParlaMint-GB_2022-07-21-commons.seg1.3,Lindsay Harvey Hoyle,M,Labour,2.296,[Government Operations],"Isabel Coman, Managing Director of the Strateg..."
3,ParlaMint-GB_2022-07-21-commons.seg2.1,Lindsay Harvey Hoyle,M,Labour,4.29,[Government Operations],Penny came to the House of Commons in 2015 fol...
4,ParlaMint-GB_2022-07-21-commons.seg2.2,Lindsay Harvey Hoyle,M,Labour,5.305,[Government Operations],"In her time here, she has consistently champio..."
5,ParlaMint-GB_2022-07-21-commons.seg2.3,Lindsay Harvey Hoyle,M,Labour,5.413,[Government Operations],She has been valued by her team for her suppor...
6,ParlaMint-GB_2022-07-21-commons.seg3.1,Lindsay Harvey Hoyle,M,Labour,4.392,[Government Operations],Isabel joined Parliament in March 2020 and has...
7,ParlaMint-GB_2022-07-21-commons.seg3.2,Lindsay Harvey Hoyle,M,Labour,4.82,[Government Operations],She has overseen the completion of a number of...
8,ParlaMint-GB_2022-07-21-commons.seg3.3,Lindsay Harvey Hoyle,M,Labour,5.421,[Government Operations],The insight and experience Isabel has brought ...
9,ParlaMint-GB_2022-07-21-commons.seg4.1,Lindsay Harvey Hoyle,M,Labour,5.401,[Government Operations],I am sure the House will join me in thanking P...
