In [3]:
import pandas as pd
import os
from functools import reduce

In [4]:
scripts_dir = os.getcwd()
os.chdir("../data")
data_dir = os.getcwd()
os.chdir("../results")
results_dir = os.getcwd()
os.chdir("../metadata")
meta_dir = os.getcwd()
os.chdir(scripts_dir)

In [5]:
all_speeches = os.path.join(meta_dir, "riksdagen_speeches_with_ages.parquet")
all_speeches_df = pd.read_parquet(all_speeches)

In [6]:
dfs = dict()
splits = ["train", "dev", "test"]

for split in splits:
    across_age = f"{split}_within_speaker_across_age_comparisons.parquet"
    within_age = f"{split}_within_speaker_within_age_comparisons.parquet"
    acc_speaker_accr_age = f"{split}_across_speaker_comparisons.parquet"
    acc_speaker_with_age = f"{split}_across_speaker_within_age_comparisons.parquet"

    across_age_df = pd.read_parquet(os.path.join(meta_dir, across_age))
    within_age_df = pd.read_parquet(os.path.join(meta_dir, within_age))
    acc_speaker_accr_age_df = pd.read_parquet(os.path.join(meta_dir, acc_speaker_accr_age))
    acc_speaker_with_age_df = pd.read_parquet(os.path.join(meta_dir, acc_speaker_with_age))

    dfs[split] = [(across_age_df, 3), (within_age_df, 3), (acc_speaker_accr_age_df, 1), (acc_speaker_with_age_df, 1)]

In [7]:
danfs = dict()

for split in splits:
    tmp_danfs = set()
    for df, num_pairs in dfs[split]:
        for pair_num in range(1, num_pairs+1):
            tmp_danfs.update(set(reduce(lambda x, y: list(x) + list(y), df[f"pair_{pair_num}"].to_list())))
    danfs[split] = tmp_danfs

In [8]:
for split in splits:
    print(split)
    print(f"\tNumber of comparisons: {len(dfs[split][3][0])}")
    print(f"\tNumber of speeches: {len(danfs[split])}")

train
	Number of comparisons: 2379
	Number of speeches: 7422
dev
	Number of comparisons: 480
	Number of speeches: 1363
test
	Number of comparisons: 585
	Number of speeches: 2310


In [9]:
full_dfs = dict()

for split in splits:
    full_dfs[split] = all_speeches_df[all_speeches_df["dokid_anfnummer"].apply(lambda x: x in danfs[split])]
    full_dfs[split]["debateyear"] = full_dfs[split]["debatedate"].apply(lambda x: x.year)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_dfs[split]["debateyear"] = full_dfs[split]["debatedate"].apply(lambda x: x.year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_dfs[split]["debateyear"] = full_dfs[split]["debatedate"].apply(lambda x: x.year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_dfs[split]["debateyear"] = f

In [25]:
def print_df_data(df, split, metadata):
    min_year, max_year = min(df["debatedate"].tolist()),\
                         max(df["debatedate"].tolist())
    print(f"first debate: {min_year}, last debate: {max_year}")
    metadata["First debate"] = f"{min_year.date()}"
    metadata["Last debate"] = f"{max_year.date()}"
    
    num_debates = len(set(df["dokid"]))
    num_speeches = len(df)
    print(f"Number of debates: {num_debates}")
    print(f"Number of speeches: {num_speeches}")
    metadata["Number of debates"] = num_debates
    metadata["Number of speeches"] = num_speeches

    num_speakers = len(set(df["intressent_id"]))
    print(f"Number of speakers: {num_speakers}")
    metadata["Number of speakers"] = num_speakers

    min_age, max_age = min(df[df["age"] != 0]["age"].tolist()),\
                         max(df["age"].tolist())
    print(f"youngest age: {min_age}, oldest age: {max_age}")
    metadata["Youngest age"] = min_age
    metadata["Oldest age"] = max_age

    num_debs_per_year = df.groupby("debateyear").agg(["count"])["dokid"]
    mean_debs = num_debs_per_year.mean()[0]
    std_debs = num_debs_per_year.std()[0]
    print(f"Debates per year mean: {mean_debs:.0f}, std: {std_debs:.0f}")
    metadata["Debates per year, mean (std.)"] = f"{mean_debs:.0f} ({std_debs:.0f})"

    min_debs, min_year = num_debs_per_year.min()[0], num_debs_per_year.idxmin()[0]
    max_debs, max_year = num_debs_per_year.max()[0], num_debs_per_year.idxmax()[0]
    print(f"Lowest number of debates ({min_debs}) in {min_year}")
    print(f"Highest number of debates ({max_debs}) in {max_year}")
    metadata["Lowest number of debates (year)"] = f"{min_debs} ({min_year})"
    metadata["Highest number of debates (year)"] = f"{max_debs} ({max_year})"
    
    num_debs_per_speaker = df.groupby("intressent_id").agg(["count"])["dokid"]
    mean_speeches = num_debs_per_speaker.mean()[0]
    std_speeches = num_debs_per_speaker.std()[0]
    print(f"Mean speeches per speaker: {mean_speeches:.0f}, std: {std_speeches:.0f}")
    metadata["Number of speeches per speaker, mean (std.)"] = f"{mean_speeches:.0f} ({std_speeches:.0f})"

#     num_debs_per_speaker = df.groupby(["intressent_id", "debateyear"])\
#         .agg(["count"])["dokid"]
#     mean_speeches = num_debs_per_speaker.mean()[0]
#     std_speeches = num_debs_per_speaker.std()[0]
#     print(f"Mean speeches per speaker per year: {mean_speeches:.0f}, \
# std: {std_speeches:.0f}")
    #
##    mean_speech_length = df.groupby("duration_segment")\
##        .agg(["mean"])["dokid"]
#     mean_length = df["duration_segment"].mean()
#     std_length = df["duration_segment"].std()
#     print(f"Mean speech_length: {mean_length:.0f}, \
# std: {std_length:.0f}")

    return metadata

In [26]:
metadata = dict()
for split in splits:
    print(split)
    split_metadata = dict()
    split_metatdata = print_df_data(full_dfs[split], split, split_metadata)
    print()
    metadata[split] = split_metadata

train
first debate: 2003-11-11 00:00:00, last debate: 2023-02-03 00:00:00
Number of debates: 3156
Number of speeches: 7422
Number of speakers: 177
youngest age: 19, oldest age: 78
Debates per year mean: 353, std: 222
Lowest number of debates (6) in 2003
Highest number of debates (694) in 2016
Mean speeches per speaker: 42, std: 44

dev
first debate: 2004-01-23 00:00:00, last debate: 2023-01-31 00:00:00
Number of debates: 743
Number of speeches: 1363
Number of speakers: 36
youngest age: 24, oldest age: 68
Debates per year mean: 68, std: 32
Lowest number of debates (7) in 2023
Highest number of debates (123) in 2015
Mean speeches per speaker: 38, std: 32

test
first debate: 2006-01-25 00:00:00, last debate: 2021-12-08 00:00:00
Number of debates: 1191
Number of speeches: 2310
Number of speakers: 20
youngest age: 24, oldest age: 58
Debates per year mean: 144, std: 75
Lowest number of debates (15) in 2021
Highest number of debates (238) in 2016
Mean speeches per speaker: 116, std: 36



In [37]:
for key in metadata["train"]:
    print(f"\t\t{key}", end="")
    for split in splits:
        print(f" & {metadata[split][key]}", end="")
    print(" \\\\")

		First debate & 2003-11-11 & 2004-01-23 & 2006-01-25 \\
		Last debate & 2023-02-03 & 2023-01-31 & 2021-12-08 \\
		Number of debates & 3156 & 743 & 1191 \\
		Number of speeches & 7422 & 1363 & 2310 \\
		Number of speakers & 177 & 36 & 20 \\
		Youngest age & 19 & 24 & 24 \\
		Oldest age & 78 & 68 & 58 \\
		Debates per year, mean (std.) & 353 (222) & 68 (32) & 144 (75) \\
		Lowest number of debates (year) & 6 (2003) & 7 (2023) & 15 (2021) \\
		Highest number of debates (year) & 694 (2016) & 123 (2015) & 238 (2016) \\
		Number of speeches per speaker, mean (std.) & 42 (44) & 38 (32) & 116 (36) \\
