Imports

In [None]:
import pandas as pd
import os
import math
import glob

Define which files should be read into which frame

In [None]:


def define_names(
    d_path="ParlaMint_src",
    target_folders=("ParlaMint-IS-en.txt", "ParlaMint-IT-en.txt"),
    txt_name="parlamint-it-is-2022.txt",
):
    """Collect the paths of the metadata and text files to load.

    Every folder in ``target_folders`` (relative to ``d_path``) is searched
    recursively for files matching ``*-meta.tsv``. Matches ending in
    ``-ana-meta.tsv`` go into the first list, all remaining matches into the
    second. The text file ``txt_name`` is searched for recursively starting
    at the current working directory.

    Args:
        d_path: Root directory that contains the target folders.
        target_folders: Folder names below ``d_path`` to search.
        txt_name: File name of the text file to locate.

    Returns:
        A tuple ``(f_names_ana_meta, f_names_meta, f_names_txt)`` of lists of
        paths. Each glob result is sorted so the order is reproducible.

    Raises:
        FileNotFoundError: If no file named ``txt_name`` is found.
    """
    ana_suffix = "-ana-meta.tsv"
    f_names_ana_meta, f_names_meta = [], []

    for folder in target_folders:
        pattern = os.path.join(d_path, folder, "**", "*-meta.tsv")
        # glob yields paths in arbitrary, OS-dependent order; sorting keeps
        # the downstream concatenation order stable between runs.
        for f_name in sorted(glob.glob(pattern, recursive=True)):
            # Single pass instead of a list-membership test per file.
            if f_name.endswith(ana_suffix):
                f_names_ana_meta.append(f_name)
            else:
                f_names_meta.append(f_name)

    f_names_txt = sorted(glob.glob(os.path.join("**", txt_name), recursive=True))
    if not f_names_txt:
        raise FileNotFoundError(
            f"Could not find '{txt_name}'. "
            "Ensure it exists in the repo and try again."
        )

    return f_names_ana_meta, f_names_meta, f_names_txt

Read data into the frames from the defined files

In [None]:
def create_df(f_names):
    """Read several tab-separated files and stack them into one DataFrame.

    Args:
        f_names: Iterable of paths to TSV files.

    Returns:
        A single DataFrame holding the rows of all files in the given order,
        with a fresh 0..n-1 index (consistent with
        ``load_id_text_from_files``) so index labels are not repeated per file.

    Raises:
        ValueError: If ``f_names`` is empty.
    """
    f_names = list(f_names)
    if not f_names:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError("create_df() received no file names; nothing to read.")

    m_frames = [pd.read_csv(f_name, sep="\t") for f_name in f_names]
    return pd.concat(m_frames, ignore_index=True)

In [None]:
def load_id_text_from_files(f_names_txt):
    """Load tab-separated text files into one string-typed DataFrame.

    The first line of every file is consumed as a header row and its labels
    are replaced by ``ID``, ``Parent_ID`` and ``Text``. All values are read
    as strings.

    Args:
        f_names_txt: Iterable of paths to the text files.

    Returns:
        The rows of all files stacked in order, with a fresh 0..n-1 index.
    """
    column_names = ["ID", "Parent_ID", "Text"]
    frames = [
        pd.read_csv(path, sep="\t", header=0, names=column_names, dtype=str)
        for path in f_names_txt
    ]
    return pd.concat(frames, ignore_index=True)




Merge two different DataFrames

In [None]:
def merge_meta_dfs(df_ana_meta, df_meta):
    """Left-join ``df_meta`` onto ``df_ana_meta`` via the parent ID.

    Rows of ``df_ana_meta`` are matched on ``Parent_ID`` against ``ID`` in
    ``df_meta``. Column names present in both frames receive the suffixes
    ``_ana`` / ``_full``; afterwards the ``ID`` of ``df_meta`` is dropped and
    the ``ID`` of ``df_ana_meta`` gets its original name back.

    Args:
        df_ana_meta: Left frame; supplies ``Parent_ID`` and ``ID``.
        df_meta: Right frame; supplies ``ID`` as the join key.

    Returns:
        The joined DataFrame with a single ``ID`` column.
    """
    joined = df_ana_meta.merge(
        df_meta,
        how="left",
        left_on="Parent_ID",
        right_on="ID",
        suffixes=("_ana", "_full"),
    )
    return joined.drop(columns=["ID_full"]).rename(columns={"ID_ana": "ID"})

In [None]:
def merge_with_text(merged_df, df_txt):
    """Attach the ``Text`` column of ``df_txt`` to ``merged_df``.

    Only the ``ID`` and ``Text`` columns of ``df_txt`` take part in the
    left join on ``ID``; rows of ``merged_df`` without a match get a missing
    ``Text`` value.

    Args:
        merged_df: Frame with an ``ID`` column.
        df_txt: Frame with at least ``ID`` and ``Text`` columns.

    Returns:
        ``merged_df`` extended by a ``Text`` column.
    """
    id_and_text = df_txt[["ID", "Text"]]
    return merged_df.merge(id_and_text, how="left", on="ID")


In [None]:
def clear_columns(merged_df):
    """Return a copy of ``merged_df`` restricted to a fixed set of columns.

    Args:
        merged_df: Frame containing at least the columns listed below.

    Returns:
        An independent copy holding only the selected columns, in the order
        they are listed here.
    """
    wanted = ["ID", "Speaker_ID", "Speaker_birth", "Language", "Topic", "Senti_6", "Text"]
    return merged_df[wanted].copy()

Display DataFrames

In [None]:
# Resolve the three lists of input paths, then read every file in
# f_names_meta into a single frame.
f_names_ana_meta, f_names_meta, f_names_txt = define_names()
df_meta = create_df(f_names_meta)
# Preview the first rows of the loaded frame.
df_meta.head()

Pipeline

In [None]:

# Read the files ending in "-ana-meta.tsv" and the text file(s) located by
# define_names() into their own frames.
df_ana_meta = create_df(f_names_ana_meta)
df_txt = load_id_text_from_files(f_names_txt)
# Preview the first rows of the text frame.
df_txt.head()


Everything after this point can be removed once a CSV file containing the sentences exists.

In [None]:

# Left-join df_meta onto df_ana_meta (Parent_ID -> ID) and preview the result.
df_meta_merged = merge_meta_dfs(df_ana_meta, df_meta)
df_meta_merged.head()

In [None]:
# Write the merged metadata as a UTF-8, tab-separated file without the row index.
df_meta_merged.to_csv("merged_parlamint.tsv", sep="\t", index=False, encoding="utf-8")


In [None]:
# Attach the text to the merged metadata and discard rows without any text.
df_final = merge_with_text(df_meta_merged, df_txt).dropna(subset=["Text"])
df_final.head(15)

In [None]:
# Keep only the columns selected by clear_columns() and preview the result.
df_reduced = clear_columns(df_final)
df_reduced.head()

In [None]:
# Write the reduced frame as a comma-separated file.
# NOTE(review): unlike the "merged_parlamint.tsv" export, this call keeps the
# default index=True, so the row index (left over from the earlier dropna) is
# written as an unnamed first column, and no encoding is passed explicitly.
# Confirm that downstream readers expect this before changing it.
df_reduced.to_csv("parlamintITIS.csv")