Imports

In [136]:
import pandas as pd
import os, glob
import math

Define which files should be read into which frame

In [137]:
def define_names():

    d_path = r"ParlaMint-DK-en.ana\ParlaMint-DK-en.txt\2022"

    all_meta = glob.glob(os.path.join(d_path, "*-meta.tsv"))

    f_names_ana_meta = [f for f in all_meta if f.endswith("-ana-meta.tsv")]
    f_names_meta = [f for f in all_meta if f not in f_names_ana_meta]

    f_names_txt = glob.glob(os.path.join(d_path, "*.txt"))
    return f_names_ana_meta, f_names_meta, f_names_txt

Read data into the frames from the defined files

In [138]:
def create_df(f_names):
    m_frames = []

    for f_name in f_names:
        df = pd.read_csv(f_name, sep='\t')
        m_frames.append(df)

    df = pd.concat(m_frames)
    return df

In [139]:
def load_id_text_from_files(f_names_txt):

    lst = []
    for f_name in f_names_txt:
        with open(f_name, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if "\t" in line:
                    l_id, l_txt = line.split("\t", 1)  # split only once
                    lst.append({"ID": l_id, "Text": l_txt})
    return pd.DataFrame(lst)




Merge 2 different Dataframes

In [140]:
def merge_meta_dfs(df_ana_meta, df_meta):
    merged_df = pd.merge(
        df_ana_meta,
        df_meta,
        left_on="Parent_ID",
        right_on="ID",
        how="left",
        suffixes=('_ana', '_full')
    )
    merged_df.drop(columns=["ID_full"], inplace=True)
    merged_df.rename(columns={"ID_ana": "ID"}, inplace=True)
    return merged_df

Clean up Dataframe as needed

In [141]:
def clear_columns(merged_df):

    columns_to_keep = [
        "ID",             
        #"Text",            
        "Speaker_ID",      
        "Speaker_birth",
        "Language",
        "Party_orientation",
        "Topic",         
        "Senti_6",
        "Text",
    ]
    reduced_df = merged_df[columns_to_keep].copy()
    return reduced_df

Display Dataframes

In [142]:
def display_final_df(reduced_df):
    print(reduced_df.shape)
    display(reduced_df.head())

Pipeline

In [143]:
f_names_ana_meta, f_names_meta, f_names_txt = define_names()
df_meta = create_df(f_names_meta)
df_ana_meta = create_df(f_names_ana_meta)
df_txt = load_id_text_from_files(f_names_txt)
df_text_meta = pd.merge(df_txt, df_meta, on="ID")
merged_df =  merge_meta_dfs(df_ana_meta, df_text_meta)
reduced_df = clear_columns(merged_df)



In [144]:
display_final_df(reduced_df)

(174156, 8)


Unnamed: 0,ID,Speaker_ID,Speaker_birth,Language,Party_orientation,Topic,Senti_6,Text
0,ParlaMint-DK_20220111130003,,,English,,,-,
1,ParlaMint-DK_20220111130003.seg1.1,KristensenHenrikDam,1957.0,English,Centre-left,Mix,mixed positive,The meeting is open. Since this is the first m...
2,ParlaMint-DK_20220111130003.seg1.2,KristensenHenrikDam,1957.0,English,Centre-left,Mix,mixed positive,The meeting is open. Since this is the first m...
3,ParlaMint-DK_20220111130003.seg1.3,KristensenHenrikDam,1957.0,English,Centre-left,Mix,neutral negative,The meeting is open. Since this is the first m...
4,ParlaMint-DK_20220111130003.seg1.4,KristensenHenrikDam,1957.0,English,Centre-left,Mix,neutral positive,The meeting is open. Since this is the first m...


Everything after this point can be revoked once there is a csv with sentences

In [145]:
def reduce_by_base_id_mode_sentiment(df, id_col="ID", senti_col="Senti_6"):
    # base ID without any ".seg..." suffix
    d = df.assign(_base_id=df[id_col].str.split(".").str[0])

    # aggregate: most common sentiment, first for other cols
    agg = {c: "first" for c in d.columns if c not in [id_col, "_base_id", senti_col]}
    agg[senti_col] = lambda s: s.mode().iloc[0] if not s.mode().empty else s.iloc[0]

    out = (
        d.groupby("_base_id", as_index=False)
         .agg(agg)
         .rename(columns={"_base_id": id_col})
    )
    return out

# usage
lessened_df = reduce_by_base_id_mode_sentiment(reduced_df)
lessened_df.head()

Unnamed: 0,ID,Speaker_ID,Speaker_birth,Language,Party_orientation,Topic,Text,Senti_6
0,ParlaMint-DK_20220111130003,KristensenHenrikDam,1957.0,English,Centre-left,Mix,The meeting is open. Since this is the first m...,neutral negative
1,ParlaMint-DK_20220111130206,KristensenHenrikDam,1957.0,English,Centre-left,Other,Are there any objections to the promotion of t...,-
2,ParlaMint-DK_20220111130249,KristensenHenrikDam,1957.0,English,Centre-left,Other,"The debate is open. Sir. Troels Ravn, the Soci...",neutral positive
3,ParlaMint-DK_20220111130259,RavnTroels,1961.0,English,Centre-left,Macroeconomics,Thank you for the floor. In the Social Democra...,mixed positive
4,ParlaMint-DK_20220111130435,KristensenHenrikDam,1957.0,English,Centre-left,Other,Thank you to the rapporteur. There are no brie...,mixed positive


In [146]:
def split_text_into_segments(df, id_col="ID", text_col="Text", words_per_seg=255):
    rows = []
    other_cols = [c for c in df.columns if c not in (id_col, text_col)]

    for _, r in df.iterrows():
        text = (r[text_col] or "")
        words = str(text).split()
        if not words:
            continue

        n = math.ceil(len(words) / words_per_seg)
        for i in range(n):
            seg_words = words[i*words_per_seg:(i+1)*words_per_seg]
            seg_text = " ".join(seg_words)
            seg_id = f"{r[id_col]}-{i+1}"
            new_row = {id_col: seg_id, text_col: seg_text}
            for c in other_cols:
                new_row[c] = r[c]
            rows.append(new_row)

    # keep original column order (ID, others..., Text at end if that's how yours is)
    col_order = [id_col] + [c for c in df.columns if c not in (id_col, text_col)] + [text_col]
    return pd.DataFrame(rows, columns=col_order)

In [147]:
final_provis_parl_df = split_text_into_segments(lessened_df)
final_provis_parl_df.head()

Unnamed: 0,ID,Speaker_ID,Speaker_birth,Language,Party_orientation,Topic,Senti_6,Text
0,ParlaMint-DK_20220111130003-1,KristensenHenrikDam,1957.0,English,Centre-left,Mix,neutral negative,The meeting is open. Since this is the first m...
1,ParlaMint-DK_20220111130003-2,KristensenHenrikDam,1957.0,English,Centre-left,Mix,neutral negative,are among the highest in Europe?). Martin Geer...
2,ParlaMint-DK_20220111130003-3,KristensenHenrikDam,1957.0,English,Centre-left,Mix,neutral negative,then elected.
3,ParlaMint-DK_20220111130206-1,KristensenHenrikDam,1957.0,English,Centre-left,Other,-,Are there any objections to the promotion of t...
4,ParlaMint-DK_20220111130249-1,KristensenHenrikDam,1957.0,English,Centre-left,Other,neutral positive,"The debate is open. Sir. Troels Ravn, the Soci..."


In [148]:
final_provis_parl_df.to_csv("final_provis_parl.csv")