Imports

In [1]:
import pandas as pd
import os
import math
import glob

Define which files should be read into which frame

In [2]:


def define_names(
    d_path="ParlaMint_src",
    target_folders=("ParlaMint-IS-en.txt", "ParlaMint-IT-en.txt"),
    txt_name="parlamint-it-is-2022.txt",
):
    """Collect the paths of the metadata TSV files and of the text file(s).

    Args:
        d_path: Root directory that contains the corpus folders.
        target_folders: Names of the folders below ``d_path`` that are searched
            recursively for files matching ``*-meta.tsv``.
        txt_name: File name of the text file; it is searched recursively below
            the current working directory (not below ``d_path``).

    Returns:
        A tuple ``(f_names_ana_meta, f_names_meta, f_names_txt)`` of path lists:
        files ending in ``-ana-meta.tsv``, the remaining ``*-meta.tsv`` files,
        and every match for ``txt_name``. Each list is sorted (per folder for
        the metadata lists) so that repeated runs yield the same order.

    Raises:
        FileNotFoundError: If no file named ``txt_name`` is found.
    """
    ana_suffix = "-ana-meta.tsv"
    f_names_ana_meta, f_names_meta = [], []

    for folder in target_folders:
        pattern = os.path.join(d_path, folder, "**", "*-meta.tsv")
        # glob yields paths in arbitrary, OS-dependent order; sorting keeps the
        # row order of the concatenated frames reproducible across runs.
        for path in sorted(glob.glob(pattern, recursive=True)):
            # "*-ana-meta.tsv" also matches "*-meta.tsv", so split on the suffix.
            if path.endswith(ana_suffix):
                f_names_ana_meta.append(path)
            else:
                f_names_meta.append(path)

    f_names_txt = sorted(glob.glob(os.path.join("**", txt_name), recursive=True))
    if not f_names_txt:
        raise FileNotFoundError(
            f"Could not find '{txt_name}'. "
            "Ensure it exists in the repo and try again."
        )

    return f_names_ana_meta, f_names_meta, f_names_txt

Read data into the frames from the defined files

In [3]:
def create_df(f_names):
    """Read tab-separated files and stack them into a single DataFrame.

    Args:
        f_names: Iterable of paths to TSV files with a header row.

    Returns:
        One DataFrame holding the rows of all files in the given order, with a
        fresh 0..n-1 index (the same convention ``load_id_text_from_files``
        uses), so index labels are unique across files.

    Raises:
        ValueError: If ``f_names`` is empty.
    """
    f_names = list(f_names)
    if not f_names:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError("create_df() received no files to read; check the input paths.")

    m_frames = [pd.read_csv(f_name, sep="\t") for f_name in f_names]
    # ignore_index avoids repeated 0..k labels from the individual files.
    return pd.concat(m_frames, ignore_index=True)

In [4]:
def load_id_text_from_files(f_names_txt, header=0):
    """Read tab-separated text files into one frame with ID, Parent_ID and Text.

    Args:
        f_names_txt: Iterable of paths to tab-separated files with three columns.
        header: Passed to ``pd.read_csv``. With the default ``0`` the first line
            of every file is treated as a header and replaced by the fixed
            column names, i.e. it never becomes a data row. Pass ``None`` for
            files that have no header line.
            NOTE(review): confirm that the text file really starts with a
            header row; if it does not, the default drops its first record.

    Returns:
        DataFrame with the string columns ``ID``, ``Parent_ID`` and ``Text``
        and a fresh 0..n-1 index.

    Raises:
        ValueError: If ``f_names_txt`` is empty.
    """
    f_names_txt = list(f_names_txt)
    if not f_names_txt:
        # pd.concat([]) would raise an opaque "No objects to concatenate".
        raise ValueError(
            "load_id_text_from_files() received no files to read; check the input paths."
        )

    dfs = [
        pd.read_csv(
            file,
            sep="\t",
            header=header,
            names=["ID", "Parent_ID", "Text"],
            dtype=str,  # keep IDs and text verbatim, no numeric inference
        )
        for file in f_names_txt
    ]
    return pd.concat(dfs, ignore_index=True)




Merge the DataFrames and reduce the columns

In [5]:
def merge_meta_dfs(df_ana_meta, df_meta):
    """Attach the columns of ``df_meta`` to every row of ``df_ana_meta``.

    Rows are matched by comparing ``df_ana_meta["Parent_ID"]`` with
    ``df_meta["ID"]`` in a left join, so every row of ``df_ana_meta`` is kept
    and rows without a match get NaN in the ``df_meta`` columns.

    Column names present in both frames receive the suffixes ``_ana`` and
    ``_full``. The ``ID`` taken from ``df_meta`` is dropped and the ``ID`` from
    ``df_ana_meta`` gets its plain name back.

    Returns:
        A new DataFrame; neither input frame is modified.
    """
    joined = df_ana_meta.merge(
        df_meta,
        how="left",
        left_on="Parent_ID",
        right_on="ID",
        suffixes=("_ana", "_full"),
    )
    # "ID_full" duplicates the join key "Parent_ID"; "ID_ana" is the row's own ID.
    return joined.drop(columns=["ID_full"]).rename(columns={"ID_ana": "ID"})

In [6]:
def merge_with_text(merged_df, df_txt):
    """Add the ``Text`` column of ``df_txt`` to ``merged_df``, matched on ``ID``.

    Only the ``ID`` and ``Text`` columns of ``df_txt`` take part in the left
    join, so every row of ``merged_df`` is kept and rows without a matching
    ``ID`` get NaN as ``Text``.

    Returns:
        A new DataFrame with the columns of ``merged_df`` followed by ``Text``.
    """
    id_to_text = df_txt.loc[:, ["ID", "Text"]]
    return merged_df.merge(id_to_text, how="left", on="ID")


In [7]:
def clear_columns(merged_df):
    """Return a copy of ``merged_df`` restricted to a fixed set of columns.

    The result contains, in this order: ``ID``, ``Speaker_ID``,
    ``Speaker_birth``, ``Language``, ``Topic``, ``Senti_6`` and ``Text``.

    Raises:
        KeyError: If any of these columns is missing from ``merged_df``.
    """
    wanted = (
        "ID",
        "Speaker_ID",
        "Speaker_birth",
        "Language",
        "Topic",
        "Senti_6",
        "Text",
    )
    # .copy() so later edits to the result never touch merged_df.
    return merged_df.loc[:, list(wanted)].copy()

Display Dataframes

In [8]:
# Discover all input paths once, then load the "*-meta.tsv" files that do not
# end in "-ana-meta.tsv" into a single frame and preview it.
f_names_ana_meta, f_names_meta, f_names_txt = define_names()
df_meta = create_df(f_names_meta)
df_meta.head()

Unnamed: 0,Text_ID,ID,Title,Date,Body,Term,Session,Meeting,Sitting,Agenda,...,Speaker_minister,Speaker_party,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth,Topic
0,ParlaMint-IS-en_2022-01-17-20,ParlaMint-IS_2022-01-17-20.u1,Speeches in the national parliament of Iceland...,2022-01-17,Unicameralism,24. kjörtímabil,152. löggjafarþing,Hefðbundinn,-,-,...,Minister,-,-,-,-,KatrinJakobsdottir,"Jakobsdóttir, Katrín",F,1976,Other
1,ParlaMint-IS-en_2022-01-17-20,ParlaMint-IS_2022-01-17-20.u2,Speeches in the national parliament of Iceland...,2022-01-17,Unicameralism,24. kjörtímabil,152. löggjafarþing,Hefðbundinn,-,-,...,notMinister,-,-,-,-,BirgirArmannsson,"Ármannsson, Birgir",M,1968,Social Welfare
2,ParlaMint-IS-en_2022-01-17-20,ParlaMint-IS_2022-01-17-20.u3,Speeches in the national parliament of Iceland...,2022-01-17,Unicameralism,24. kjörtímabil,152. löggjafarþing,Hefðbundinn,-,-,...,notMinister,-,-,-,-,BirgirArmannsson,"Ármannsson, Birgir",M,1968,Other
3,ParlaMint-IS-en_2022-01-17-20,ParlaMint-IS_2022-01-17-20.u4,Speeches in the national parliament of Iceland...,2022-01-17,Unicameralism,24. kjörtímabil,152. löggjafarþing,Hefðbundinn,-,-,...,notMinister,-,-,-,-,BirgirArmannsson,"Ármannsson, Birgir",M,1968,Government Operations
4,ParlaMint-IS-en_2022-01-17-20,ParlaMint-IS_2022-01-17-20.u5,Speeches in the national parliament of Iceland...,2022-01-17,Unicameralism,24. kjörtímabil,152. löggjafarþing,Hefðbundinn,-,-,...,notMinister,-,-,-,-,LogiEinarsson,"Einarsson, Logi",M,1964,Health


Pipeline

In [9]:

# Load the "-ana-meta.tsv" files and the ID / Parent_ID / Text file(s) located
# by define_names(), then preview the text frame.
df_ana_meta = create_df(f_names_ana_meta)
df_txt = load_id_text_from_files(f_names_txt)
df_txt.head()


Unnamed: 0,ID,Parent_ID,Text
0,ParlaMint-IS_2022-01-17-20.seg2.1,ParlaMint-IS_2022-01-17-20.u1,President of the United States reports:
1,ParlaMint-IS_2022-01-17-20.seg3.1,ParlaMint-IS_2022-01-17-20.u1,"I have decided, according to the proposal of t..."
2,ParlaMint-IS_2022-01-17-20.seg4.1,ParlaMint-IS_2022-01-17-20.u1,"Arrange sites, January 11th, 2022."
3,ParlaMint-IS_2022-01-17-20.seg6.1,ParlaMint-IS_2022-01-17-20.u1,Katrín Jakobsdóttir's daughter.
4,ParlaMint-IS_2022-01-17-20.seg7.1,ParlaMint-IS_2022-01-17-20.u1,Presidential Letters for a meeting of the Gene...


Everything after this point can be removed once a CSV file containing the sentences is available.

In [10]:

# Left-join df_meta onto df_ana_meta (df_ana_meta.Parent_ID == df_meta.ID).
# Rows whose Parent_ID has no match in df_meta keep NaN in the df_meta columns.
df_meta_merged = merge_meta_dfs(df_ana_meta, df_meta)
df_meta_merged.head()

Unnamed: 0,ID,Parent_ID,Element,Language,Senti_3,Senti_6,Senti_n,Sents,Words,Tokens,...,Speaker_minister,Speaker_party,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth,Topic
0,ParlaMint-IS_2022-01-17-20.u1,ParlaMint-IS-en_2022-01-17-20,u,English,-,-,-,8,97,113,...,,,,,,,,,,
1,ParlaMint-IS_2022-01-17-20.seg1.1,ParlaMint-IS_2022-01-17-20.u1,s,English,Neutral,neutral positive,2.674,1,6,7,...,Minister,-,-,-,-,KatrinJakobsdottir,"Jakobsdóttir, Katrín",F,1976.0,Other
2,ParlaMint-IS_2022-01-17-20.seg2.1,ParlaMint-IS_2022-01-17-20.u1,s,English,Neutral,neutral positive,3.043,1,6,7,...,Minister,-,-,-,-,KatrinJakobsdottir,"Jakobsdóttir, Katrín",F,1976.0,Other
3,ParlaMint-IS_2022-01-17-20.seg3.1,ParlaMint-IS_2022-01-17-20.u1,s,English,Neutral,neutral positive,2.919,1,27,32,...,Minister,-,-,-,-,KatrinJakobsdottir,"Jakobsdóttir, Katrín",F,1976.0,Other
4,ParlaMint-IS_2022-01-17-20.seg4.1,ParlaMint-IS_2022-01-17-20.u1,s,English,Neutral,neutral positive,2.855,1,5,8,...,Minister,-,-,-,-,KatrinJakobsdottir,"Jakobsdóttir, Katrín",F,1976.0,Other


In [11]:
# Persist the merged metadata as a UTF-8 TSV; index=False keeps the row index
# out of the file.
df_meta_merged.to_csv("merged_parlamint.tsv", sep="\t", index=False, encoding="utf-8")

In [12]:
# Attach the text by ID and keep only the rows that actually received one;
# the original (gapped) index labels of the merged frame are preserved.
df_final = merge_with_text(df_meta_merged, df_txt).dropna(subset=["Text"])
df_final.head(15)

Unnamed: 0,ID,Parent_ID,Element,Language,Senti_3,Senti_6,Senti_n,Sents,Words,Tokens,...,Speaker_party,Speaker_party_name,Party_status,Party_orientation,Speaker_ID,Speaker_name,Speaker_gender,Speaker_birth,Topic,Text
2,ParlaMint-IS_2022-01-17-20.seg2.1,ParlaMint-IS_2022-01-17-20.u1,s,English,Neutral,neutral positive,3.043,1,6,7,...,-,-,-,-,KatrinJakobsdottir,"Jakobsdóttir, Katrín",F,1976.0,Other,President of the United States reports:
3,ParlaMint-IS_2022-01-17-20.seg3.1,ParlaMint-IS_2022-01-17-20.u1,s,English,Neutral,neutral positive,2.919,1,27,32,...,-,-,-,-,KatrinJakobsdottir,"Jakobsdóttir, Katrín",F,1976.0,Other,"I have decided, according to the proposal of t..."
4,ParlaMint-IS_2022-01-17-20.seg4.1,ParlaMint-IS_2022-01-17-20.u1,s,English,Neutral,neutral positive,2.855,1,5,8,...,-,-,-,-,KatrinJakobsdottir,"Jakobsdóttir, Katrín",F,1976.0,Other,"Arrange sites, January 11th, 2022."
6,ParlaMint-IS_2022-01-17-20.seg6.1,ParlaMint-IS_2022-01-17-20.u1,s,English,Positive,mixed positive,3.611,1,4,5,...,-,-,-,-,KatrinJakobsdottir,"Jakobsdóttir, Katrín",F,1976.0,Other,Katrín Jakobsdóttir's daughter.
7,ParlaMint-IS_2022-01-17-20.seg7.1,ParlaMint-IS_2022-01-17-20.u1,s,English,Neutral,neutral positive,2.773,1,17,18,...,-,-,-,-,KatrinJakobsdottir,"Jakobsdóttir, Katrín",F,1976.0,Other,Presidential Letters for a meeting of the Gene...
8,ParlaMint-IS_2022-01-17-20.seg8.1,ParlaMint-IS_2022-01-17-20.u1,s,English,Positive,mixed positive,4.247,1,27,30,...,-,-,-,-,KatrinJakobsdottir,"Jakobsdóttir, Katrín",F,1976.0,Other,I'd like to use this opportunity here after re...
12,ParlaMint-IS_2022-01-17-20.seg10.1,ParlaMint-IS_2022-01-17-20.u3,s,English,Neutral,neutral negative,2.371,1,35,38,...,-,-,-,-,BirgirArmannsson,"Ármannsson, Birgir",M,1968.0,Other,The President intends to consult the council's...
14,ParlaMint-IS_2022-01-17-20.seg10.3,ParlaMint-IS_2022-01-17-20.u3,s,English,Neutral,neutral negative,1.812,1,26,28,...,-,-,-,-,BirgirArmannsson,"Ármannsson, Birgir",M,1968.0,Other,"This is not the case, nor can a conference sta..."
16,ParlaMint-IS_2022-01-17-20.seg11.1,ParlaMint-IS_2022-01-17-20.u4,s,English,Neutral,neutral negative,2.369,1,27,34,...,-,-,-,-,BirgirArmannsson,"Ármannsson, Birgir",M,1968.0,Government Operations,"To answer, Prime Minister, Minister of Health,..."
19,ParlaMint-IS_2022-01-17-20.seg12.2,ParlaMint-IS_2022-01-17-20.u5,s,English,Negative,mixed negative,0.536,1,17,19,...,-,-,-,-,LogiEinarsson,"Einarsson, Logi",M,1964.0,Health,While there are natural limits for health cont...


In [13]:
# Keep only the fixed column subset selected by clear_columns() and preview it.
df_reduced = clear_columns(df_final)
df_reduced.head()

Unnamed: 0,ID,Speaker_ID,Speaker_birth,Language,Topic,Senti_6,Text
2,ParlaMint-IS_2022-01-17-20.seg2.1,KatrinJakobsdottir,1976.0,English,Other,neutral positive,President of the United States reports:
3,ParlaMint-IS_2022-01-17-20.seg3.1,KatrinJakobsdottir,1976.0,English,Other,neutral positive,"I have decided, according to the proposal of t..."
4,ParlaMint-IS_2022-01-17-20.seg4.1,KatrinJakobsdottir,1976.0,English,Other,neutral positive,"Arrange sites, January 11th, 2022."
6,ParlaMint-IS_2022-01-17-20.seg6.1,KatrinJakobsdottir,1976.0,English,Other,mixed positive,Katrín Jakobsdóttir's daughter.
7,ParlaMint-IS_2022-01-17-20.seg7.1,KatrinJakobsdottir,1976.0,English,Other,neutral positive,Presidential Letters for a meeting of the Gene...


In [14]:
# Minimum number of characters (after stripping surrounding whitespace) a text
# must have to be kept. Defined once so the filter and the printed report
# cannot drift apart when the threshold is adjusted.
MIN_TEXT_CHARS = 20  # adjust if needed

# Missing texts count as empty strings and are therefore filtered out as well.
keep_mask = df_final["Text"].fillna("").str.strip().str.len().ge(MIN_TEXT_CHARS)
df_filtered = df_final.loc[keep_mask].copy()

print(f"Removed {len(df_final) - len(df_filtered)} rows with text < {MIN_TEXT_CHARS} chars.")
print(f"Remaining rows: {len(df_filtered)}")

Removed 0 rows with text < 20 chars.
Remaining rows: 160545


In [15]:
# Split the filtered frame into an ID/Text table and a table with all
# remaining columns.
df_text = df_filtered.loc[:, ["ID", "Text"]]
# NOTE(review): this rebinds `df_meta`, which earlier held the frame built by
# create_df(f_names_meta). Re-running the merge_meta_dfs cell after this cell
# would therefore merge against this frame instead; a distinct name would avoid
# that, but the export cell below reads `df_meta` and would need the same change.
df_meta = df_filtered.drop(columns=["Text"])

In [16]:
# Write the three result tables as comma-separated files into the working
# directory.
# NOTE(review): `index=False` is not passed here (unlike the
# merged_parlamint.tsv export), so each file starts with an unnamed index
# column — confirm that downstream readers expect it.
# NOTE(review): df_reduced is derived from df_final, not from df_filtered, so
# the minimum-length filter is not applied to parlamintITIS.csv — confirm this
# is intended.
df_reduced.to_csv("parlamintITIS.csv")
df_text.to_csv("parlamintITIS_text.csv")
df_meta.to_csv("parlamintITIS_meta.csv")