In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import os

In [2]:
DATA_FOLDER = "./"

MEMORIZATION_PERPLEXITY_FOLDER = "memorization_results/perplexity/"
MEMORIZATION_RANK_FOLDER = "memorization_results/rank/"

SINTACTIC_PERPLEXITY_FOLDER = "syntactic_results/perplexity/"
SINTACTIC_RANK_FOLDER = "syntactic_results/rank/"

SEMANTIC_PERPLEXITY_FOLDER = "semantic_results/perplexity/"
SEMANTIC_RANK_FOLDER = "semantic_results/rank/"

In [3]:
def get_csv_files(path):
    if not os.path.exists(path):
        return []
    return [f for f in os.listdir(path) if f.endswith(".csv")]


def read_files(path, file_list):
    data_list = []
    for file in file_list:
        full_path = os.path.join(path, file)
        data = pd.read_csv(full_path)
        data_list.append(data)
    return data_list

In [4]:
memorization_rank_files = get_csv_files(
    os.path.join(DATA_FOLDER, MEMORIZATION_RANK_FOLDER)
)
memorization_perplexity_files = get_csv_files(
    os.path.join(DATA_FOLDER, MEMORIZATION_PERPLEXITY_FOLDER)
)
syntactic_rank_files = get_csv_files(os.path.join(DATA_FOLDER, SINTACTIC_RANK_FOLDER))
syntactic_perplexity_files = get_csv_files(
    os.path.join(DATA_FOLDER, SINTACTIC_PERPLEXITY_FOLDER)
)
semantic_rank_files = get_csv_files(os.path.join(DATA_FOLDER, SEMANTIC_RANK_FOLDER))
semantic_perplexity_files = get_csv_files(
    os.path.join(DATA_FOLDER, SEMANTIC_PERPLEXITY_FOLDER)
)

In [5]:
print("Memorization Rank Files:", memorization_rank_files)
print("Memorization Perplexity Files:", memorization_perplexity_files)

print("Syntactic Rank Files:", syntactic_rank_files)
print("Syntactic Perplexity Files:", syntactic_perplexity_files)

print("Semantic Rank Files:", semantic_rank_files)
print("Semantic Perplexity Files:", semantic_perplexity_files)

Memorization Rank Files: ['wandb_export_2025-10-29T13_16_55.575+08_00.csv', 'wandb_export_2025-10-29T13_16_05.743+08_00.csv']
Memorization Perplexity Files: ['wandb_export_2025-10-29T14_40_59.161+08_00.csv', 'wandb_export_2025-10-29T14_40_43.473+08_00.csv']
Syntactic Rank Files: ['wandb_export_2025-10-29T14_41_38.626+08_00.csv', 'wandb_export_2025-10-29T14_41_31.684+08_00.csv']
Syntactic Perplexity Files: ['wandb_export_2025-10-29T14_42_28.324+08_00.csv', 'wandb_export_2025-10-29T14_42_21.704+08_00.csv']
Semantic Rank Files: ['wandb_export_2025-10-29T14_43_07.014+08_00.csv', 'wandb_export_2025-10-29T14_43_13.296+08_00.csv']
Semantic Perplexity Files: ['wandb_export_2025-10-29T14_43_38.412+08_00.csv', 'wandb_export_2025-10-29T14_43_30.299+08_00.csv']


In [6]:
memorization_rank_data = read_files(
    os.path.join(DATA_FOLDER, MEMORIZATION_RANK_FOLDER), memorization_rank_files
)
memorization_perplexity_data = read_files(
    os.path.join(DATA_FOLDER, MEMORIZATION_PERPLEXITY_FOLDER),
    memorization_perplexity_files,
)

syntactic_rank_data = read_files(
    os.path.join(DATA_FOLDER, SINTACTIC_RANK_FOLDER), syntactic_rank_files
)
syntactic_perplexity_data = read_files(
    os.path.join(DATA_FOLDER, SINTACTIC_PERPLEXITY_FOLDER), syntactic_perplexity_files
)

semantic_rank_data = read_files(
    os.path.join(DATA_FOLDER, SEMANTIC_RANK_FOLDER), semantic_rank_files
)
semantic_perplexity_data = read_files(
    os.path.join(DATA_FOLDER, SEMANTIC_PERPLEXITY_FOLDER), semantic_perplexity_files
)

In [7]:
def get_df_res(data_list):
    if not data_list:
        return None

    max_rows = max(file.shape[0] for file in data_list)

    for i, file in enumerate(data_list):
        start_index = max_rows - file.shape[0]
        end_index = max_rows
        new_index = range(start_index, end_index)
        file.index = new_index

        # Keep only desired columns
        cols_to_keep = [
            c
            for c in file.columns
            if not (c.endswith("__MIN") or c.endswith("__MAX") or c == "Step")
        ]

        data_list[i] = file[cols_to_keep]

    df_res = pd.concat(data_list, axis=1)
    df_res.sort_index(inplace=True)

    return df_res

In [8]:
df_res_rank_memorization = get_df_res(memorization_rank_data)
df_res_perplexity_memorization = get_df_res(memorization_perplexity_data)

df_res_rank_syntactic = get_df_res(syntactic_rank_data)
df_res_perplexity_syntactic = get_df_res(syntactic_perplexity_data)

df_res_rank_semantic = get_df_res(semantic_rank_data)
df_res_perplexity_semantic = get_df_res(semantic_perplexity_data)

In [9]:
def plot_df_res(df_res, title, y_label):
    if df_res is None:
        print("No data to plot.")
        return None

    fig = px.line(
        df_res,
        x=df_res.index,
        y=df_res.columns,
        labels={"value": y_label, "variable": "Experiment"},
        title=title,
    )

    fig.update_layout(
        legend_title_text="Experiment",
        template="plotly_white",
        legend=dict(
            orientation="v",
            yanchor="top",
            y=1.1,
            xanchor="right",
        ),
        hovermode="x unified",
    )

    return fig

In [10]:
plot_df_res(
    df_res_rank_memorization,
    title="Memorization Rank Average over Epochs",
    y_label="Memorization Rank Average",
).show()
plot_df_res(
    df_res_perplexity_memorization,
    title="Memorization Perplexity Average over Epochs",
    y_label="Memorization Perplexity Average",
).show()

plot_df_res(
    df_res_rank_syntactic,
    title="Syntactic Rank Average over Epochs",
    y_label="Syntactic Rank Average",
).show()
plot_df_res(
    df_res_perplexity_syntactic,
    title="Syntactic Perplexity Average over Epochs",
    y_label="Syntactic Perplexity Average",
).show()

plot_df_res(
    df_res_rank_semantic,
    title="Semantic Rank Average over Epochs",
    y_label="Semantic Rank Average",
).show()
plot_df_res(
    df_res_perplexity_semantic,
    title="Semantic Perplexity Average over Epochs",
    y_label="Semantic Perplexity Average",
).show()