In [1]:
!pip install stopwordsiso



Preprocessing Pipeline HSA

Imports

In [2]:
import os
import glob
import time
from tqdm import tqdm
from lxml import etree
import pandas as pd
import stopwordsiso as stopwords

  import pkg_resources


In [3]:
# Prefix-to-URI mapping passed as `namespaces=` to the XPath calls so that the
# `tei:` prefix resolves to the TEI namespace.
TEI_NS = dict(tei="http://www.tei-c.org/ns/1.0")
# XML namespace in `{uri}` form; combined with a local name (e.g. `id`) to read
# namespaced attributes such as xml:id via `element.get(...)`.
XML_NS = "{http://www.w3.org/XML/1998/namespace}"

In [4]:

def extract_paragraph_metadata_from_files(file_list):
    """Build one metadata record per paragraph from a list of TEI letter files.

    Every path in ``file_list`` is parsed with ``etree.parse``. Values read once
    per file (sender, receiver, send date, language of the ``original`` div and
    the subject label) are copied onto each paragraph row of that file.

    Paragraph ids have the form ``L.<letter>-<n>``, where ``<letter>`` comes from
    the ``xml:id`` of the enclosing ``div[@type='letter']`` and ``<n>`` is the
    1-based position of the ``tei:p`` inside that div.

    Returns:
        tuple: ``(rows, df)`` -- the list of row dicts and a DataFrame built
        from them with a fixed column order.
    """
    column_order = [
        "source_file", "pid",
        "sender_id", "sender",
        "receiver_id", "receiver", "date",
        "text", "language", "keywords",
    ]
    # Common XPath prefix of all correspondence lookups in the TEI header.
    corresp = "//tei:teiHeader//tei:correspDesc/tei:correspAction"

    def first_hit(hits):
        # XPath returns a (possibly empty) list; reduce it to one value or None.
        return hits[0] if hits else None

    def person_details(root, action_type):
        # (corresp attribute, surname) of the first persName for this action type.
        names = root.xpath(
            f"{corresp}[@type='{action_type}']/tei:persName",
            namespaces=TEI_NS,
        )
        if not names:
            return None, None
        person = names[0]
        surname = person.xpath("normalize-space(tei:surname)", namespaces=TEI_NS)
        # An absent surname yields "", which is reported as None.
        return person.get("corresp"), (surname if surname else None)

    def collapsed_text(node):
        # All descendant text joined, with whitespace runs collapsed to one space.
        return " ".join("".join(node.itertext()).split())

    def subject_label_of(root):
        hits = root.xpath(
            "//tei:profileDesc/tei:textClass/tei:keywords[@scheme='https://gams.uni-graz.at/o:hsa.subjects']"
            "/tei:term[@type='skos:Concept']/tei:term[@type='skos:prefLabel']/text()",
            namespaces=TEI_NS,
        )
        if hits:
            return hits[0].strip()
        return None

    records = []
    for path in file_list:
        root = etree.parse(path).getroot()

        sender_id, sender_surname = person_details(root, "sent")
        receiver_id, receiver_surname = person_details(root, "received")
        language = first_hit(
            root.xpath("//tei:body/tei:div[@subtype='original']/@xml:lang", namespaces=TEI_NS)
        )
        subject = subject_label_of(root)
        sent_on = first_hit(
            root.xpath(f"{corresp}[@type='sent']/tei:date/@when", namespaces=TEI_NS)
        )

        # One file may hold several letter divs; paragraphs are numbered per div.
        for letter in root.xpath("//tei:text/tei:body//tei:div[@type='letter']", namespaces=TEI_NS):
            xml_id = letter.get(f"{XML_NS}id", "")
            if xml_id.startswith("L."):
                # Keep everything after the first dot, e.g. "L.10" -> "10".
                letter_num = xml_id.partition(".")[2]
            else:
                # Unexpected or missing id: use it verbatim, or "1" when empty.
                letter_num = xml_id or "1"

            for position, para in enumerate(letter.xpath(".//tei:p", namespaces=TEI_NS), start=1):
                records.append({
                    "source_file": os.path.basename(path),
                    "pid": f"L.{letter_num}-{position}",
                    "sender_id": sender_id,
                    "sender": sender_surname,
                    "receiver_id": receiver_id,
                    "receiver": receiver_surname,
                    "date": sent_on,
                    "text": collapsed_text(para),
                    "language": language,
                    "keywords": subject,
                })

    return records, pd.DataFrame(records, columns=column_order)

In [5]:
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``.

    Missing values (``None``, ``NaN``, ``pd.NA``) and empty or whitespace-only
    strings count as 0 words, so the function can be applied to a DataFrame
    column that contains gaps without raising.
    """
    # NaN is truthy and pd.NA refuses truth-testing, so check for them explicitly
    # before the generic emptiness test below.
    is_nan = isinstance(text, float) and text != text
    if text is None or text is pd.NA or is_nan:
        return 0
    if not text:
        return 0
    # str.split() with no argument already ignores leading/trailing whitespace.
    return len(text.split())


In [6]:
def filter_long_paragraphs(df, threshold=256):
    """Select the rows whose ``word_count`` is strictly greater than ``threshold``.

    Returns:
        tuple: ``(long_df, count)`` -- an independent copy of the matching rows
        and how many there are.

    Raises:
        ValueError: if ``df`` has no ``word_count`` column.
    """
    if "word_count" in df.columns:
        exceeds = df["word_count"].gt(threshold)
        selected = df.loc[exceeds].copy()
        return selected, len(selected)

    raise ValueError("DataFrame must have a 'word_count' column.")

In [7]:
def split_long_paragraphs(long_df: pd.DataFrame, chunk_size: int = 511):
    """Split every paragraph in ``long_df`` into sections of at most ``chunk_size`` words.

    Words are the whitespace-separated tokens of the ``text`` column. Each
    section gets the id ``<pid>-<n>`` (1-based) and inherits all other columns
    of its parent row unchanged. Rows whose text is missing (``None``, ``NaN``,
    ``pd.NA``) or contains no words produce no sections.

    Args:
        long_df: DataFrame with at least the columns ``pid`` and ``text``.
        chunk_size: Maximum number of words per section; must be >= 1.

    Returns:
        tuple: ``(all_sections, df_sections)`` -- the list of section dicts and
        a DataFrame with the columns ``section_id``, ``parent_pid``,
        ``section_number``, ``section_text``, ``section_word_count`` followed by
        the inherited metadata columns.

    Raises:
        ValueError: if a required column is missing or ``chunk_size`` < 1.
    """
    required_cols = {"pid", "text"}
    missing = required_cols - set(long_df.columns)
    if missing:
        raise ValueError(f"Missing required column(s): {', '.join(sorted(missing))}")
    # A negative step would make range() empty and silently drop every paragraph.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    all_sections = []
    meta_cols = [c for c in long_df.columns if c not in ("pid", "text")]

    for _, row in long_df.iterrows():
        text = row["text"]
        # NaN is truthy and pd.NA cannot be truth-tested, so `text or ""` alone
        # does not protect against them.
        if text is None or text is pd.NA or (isinstance(text, float) and text != text):
            continue
        words = (text or "").split()
        if not words:
            continue

        parent_pid = row["pid"]
        for section_num, start in enumerate(range(0, len(words), chunk_size), start=1):
            chunk_words = words[start:start + chunk_size]

            section_row = {
                "section_id": f"{parent_pid}-{section_num}",  # e.g., L.1-2-1
                "parent_pid": parent_pid,
                "section_number": section_num,
                "section_text": " ".join(chunk_words),
                "section_word_count": len(chunk_words),
            }
            # Carry the parent's metadata over to every section.
            for c in meta_cols:
                section_row[c] = row[c]

            all_sections.append(section_row)

    df_sections = pd.DataFrame(all_sections, columns=[
        "section_id", "parent_pid", "section_number",
        "section_text", "section_word_count", *meta_cols
    ])
    return all_sections, df_sections

In [8]:
def replace_long_paragraphs_with_sections(df, chunk_size=511, threshold=511, pad_sections=False):
    """Replace every paragraph longer than ``threshold`` words by its sections.

    Rows with ``word_count <= threshold`` are kept as they are. Rows with
    ``word_count > threshold`` are split with ``split_long_paragraphs`` and each
    section becomes a row of its own whose ``pid`` is ``<parent pid>-<n>``
    (``<n>`` zero-padded to two digits when ``pad_sections`` is true) and whose
    ``text`` / ``word_count`` describe the section.

    Row order of the result: by ``source_file`` when that column exists, then by
    the position of the (parent) row in ``df``, then by section number. Sections
    therefore appear exactly where their parent paragraph was.

    Args:
        df: DataFrame with unique column names including ``pid``, ``text`` and
            ``word_count``. It is not modified.
        chunk_size: Maximum number of words per section.
        threshold: Paragraphs with more words than this are split.
        pad_sections: Zero-pad the section number in the new ``pid``.

    Returns:
        DataFrame with the same columns as ``df`` and a fresh 0..n-1 index.

    Raises:
        ValueError: on duplicate column names, missing required columns, or if
            ``df`` already uses one of the internal helper column names.
    """
    # sanity
    if not df.columns.is_unique:
        dups = df.columns[df.columns.duplicated()].tolist()
        raise ValueError(f"Input df has duplicate column names: {dups}")

    required_cols = {"pid", "text", "word_count"}
    missing = required_cols - set(df.columns)
    if missing:
        raise ValueError(f"Missing required column(s) in df: {', '.join(sorted(missing))}")

    base_cols = list(df.columns)

    # Temporary bookkeeping columns used only to restore a deterministic order.
    row_key, section_key = "__source_row__", "__section_rank__"
    helper_cols = [row_key, section_key]
    clashes = sorted(set(helper_cols) & set(base_cols))
    if clashes:
        raise ValueError(f"Input df uses reserved column name(s): {', '.join(clashes)}")

    # Remember where each row sat in the input; .assign works on a copy of df.
    indexed = df.assign(**{row_key: list(range(len(df)))})

    short_df = indexed[indexed["word_count"] <= threshold].copy()
    long_df = indexed[indexed["word_count"] > threshold].copy()
    short_df[section_key] = 0

    # split_long_paragraphs carries every non-pid/text column (including row_key)
    # over to the sections it creates.
    _, sections_df = split_long_paragraphs(long_df, chunk_size=chunk_size)

    if not sections_df.empty:
        suffix = sections_df["section_number"].astype(str)
        if pad_sections:
            # for example L.x-y-01
            suffix = suffix.str.zfill(2)
        sections_df = sections_df.assign(
            pid=sections_df["parent_pid"] + "-" + suffix,
            text=sections_df["section_text"],
            word_count=sections_df["section_word_count"],
            **{section_key: sections_df["section_number"]},
        )
        # Drop the section_* helper columns and align with short_df's layout.
        sections_df = sections_df[base_cols + helper_cols]

    # Concatenating empty frames can disturb the column dtypes, so leave them out.
    parts = [part for part in (short_df, sections_df) if not part.empty]
    if parts:
        combined_df = pd.concat(parts, ignore_index=True, sort=False)
    else:
        combined_df = short_df

    sort_cols = [c for c in ["source_file"] if c in base_cols] + helper_cols
    combined_df = (
        combined_df
        .sort_values(by=sort_cols)
        .drop(columns=helper_cols)
        .reset_index(drop=True)
    )

    return combined_df[base_cols]


In [9]:

def get_para_per_amount_of_words(df, amount = 511):
    """Return ``pid`` and ``text`` of the rows whose text has more than ``amount`` words.

    The text value is converted with ``str()`` before it is split on
    whitespace, so non-string cells are counted by their string form.
    """
    def has_too_many_words(value):
        return len(str(value).split()) > amount

    too_long = df["text"].map(has_too_many_words)
    return df.loc[too_long, ["pid", "text"]]

In [10]:
def get_unique_languages(df):
    """Return the distinct non-missing values of the ``language`` column, sorted ascending."""
    distinct = set(df["language"].dropna())
    return sorted(distinct)

In [11]:
# Collect the TEI files to process.
folder_path = "get"  # <- adjust if needed
file_pattern = "*.xml"  
# glob returns paths in a filesystem-dependent order; sort them so the
# extracted rows come out in the same order on every machine and every run.
file_list = sorted(glob.glob(os.path.join(folder_path, file_pattern)))

# One row per paragraph, plus its word count.
rows, df = extract_paragraph_metadata_from_files(file_list)
df["word_count"] = df["text"].apply(count_words)

# Paragraphs with more than 511 words are replaced by sections of at most 511 words.
df_final = replace_long_paragraphs_with_sections(df, chunk_size=511, threshold=511)

print(df_final.head())

         source_file     pid                                      sender_id  \
0   hsa.letter.1.xml   L.1-1  https://gams.uni-graz.at/o:hsa.persons#P.1069   
1   hsa.letter.1.xml   L.1-2  https://gams.uni-graz.at/o:hsa.persons#P.1069   
2  hsa.letter.10.xml  L.10-1  https://gams.uni-graz.at/o:hsa.persons#P.2100   
3  hsa.letter.10.xml  L.10-2  https://gams.uni-graz.at/o:hsa.persons#P.2100   
4  hsa.letter.10.xml  L.10-3  https://gams.uni-graz.at/o:hsa.persons#P.2100   

              sender                                   receiver_id  \
0            Baissac  https://gams.uni-graz.at/o:hsa.persons#P.109   
1            Baissac  https://gams.uni-graz.at/o:hsa.persons#P.109   
2  Machado y Álvarez  https://gams.uni-graz.at/o:hsa.persons#P.109   
3  Machado y Álvarez  https://gams.uni-graz.at/o:hsa.persons#P.109   
4  Machado y Álvarez  https://gams.uni-graz.at/o:hsa.persons#P.109   

     receiver        date                                               text  \
0  Schuchardt  1885-01-2

In [12]:
# Sorted list of the language codes present in the extracted paragraphs
# (missing values are excluded by get_unique_languages).
languages = get_unique_languages(df)
print(languages)

# Sanity check: list every row of df_final whose text still has more than 511 words.
df_511 = get_para_per_amount_of_words(df_final, amount = 511)
print(df_511)

['ar', 'ca', 'cy', 'da', 'de', 'en', 'es', 'eu', 'fr', 'ft', 'hu', 'idb', 'io', 'it', 'la', 'lad', 'ms', 'nl', 'pap', 'pt', 'ro', 'roa']
Empty DataFrame
Columns: [pid, text]
Index: []


In [13]:
# Remove the file-name column from the final table.
drop = {"source_file"}
# errors="ignore" keeps this cell re-runnable: df_final is overwritten here, so
# on a second execution the column is already gone and a plain drop would raise.
df_final = df_final.drop(columns=drop, errors="ignore")
df_final.head()

Unnamed: 0,pid,sender_id,sender,receiver_id,receiver,date,text,language,keywords,word_count
0,L.1-1,https://gams.uni-graz.at/o:hsa.persons#P.1069,Baissac,https://gams.uni-graz.at/o:hsa.persons#P.109,Schuchardt,1885-01-20,Ma Doudou vous envoie une petite brochure jaun...,fr,,28
1,L.1-2,https://gams.uni-graz.at/o:hsa.persons#P.1069,Baissac,https://gams.uni-graz.at/o:hsa.persons#P.109,Schuchardt,1885-01-20,Nous sommes anxieux l’un et l’autre d’avoir de...,fr,,31
2,L.10-1,https://gams.uni-graz.at/o:hsa.persons#P.2100,Machado y Álvarez,https://gams.uni-graz.at/o:hsa.persons#P.109,Schuchardt,1882,contesto a todas sus anteriores en forma teleg...,es,Giornale di Filologia Romanza,8
3,L.10-2,https://gams.uni-graz.at/o:hsa.persons#P.2100,Machado y Álvarez,https://gams.uni-graz.at/o:hsa.persons#P.109,Schuchardt,1882,1º Sanjurjo ha sido nombrado catedrático de Ma...,es,Giornale di Filologia Romanza,78
4,L.10-3,https://gams.uni-graz.at/o:hsa.persons#P.2100,Machado y Álvarez,https://gams.uni-graz.at/o:hsa.persons#P.109,Schuchardt,1882,2º No he podido averiguar el nombre del gobern...,es,Giornale di Filologia Romanza,35


In [14]:
# Write the final table to letters_py.csv as UTF-8, without the DataFrame index.
df_final.to_csv("letters_py.csv", index=False, encoding="utf-8")

In [15]:
# For every distinct value of the language column (unique() keeps missing
# values, so None is included) print the value next to the result of
# stopwords.has_lang for it.
langs = df_final['language'].unique()
for lang in langs:
    print(lang, stopwords.has_lang(lang))

fr True
es True
de True
it True
en True
hu True
pt True
ar True
cy False
lad False
eu True
ft False
nl True
ms True
da True
roa False
ca True
la True
None False
ro True
idb False
pap False
io False
