### Notebook to generate LaTeX tables

### Metrics

In [52]:
# imports
import pandas as pd
import os
import pickle
import numpy as np
import pandas as pd
import glob
import json
from rich import print as rprint
from rich.table import Table
from rich.console import Console
import gzip
import datasets
from datasets import load_dataset
datasets.logging.set_verbosity_error()
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 1000, 'display.width', 1000, 'display.max_rows',1000)
pd.set_option("max_colwidth", 500)

In [46]:
# Read the metrics table exported from wandb
df = pd.read_csv("../../metrics/2014.csv")

# Short dataset keys -> display names of the Amazon categories
DATA_NAME_MAP = dict(
    beauty='Beauty',
    toys='Toys and Games',
    sports='Sports and Outdoors',
    games='Video Games',
    software='Software',
)
INVERSE_DATA_NAME_MAP = {label: key for key, label in DATA_NAME_MAP.items()}

# Keep the run identifiers plus recall/ndcg at cutoffs 5 and 10
filter_cols = ["Name", "dataset_split"] + [
    f"metrics/{metric_name}@{cutoff}"
    for cutoff in (5, 10)
    for metric_name in ("recall", "ndcg")
]
df = df.loc[:, filter_cols].copy()

# Extract model from Name
def identify_model(name):
    """Map a run name to a model label.

    The match is case-insensitive. A name containing "clip" yields "TIGRESS";
    otherwise a name containing "decoder" yields "TIGER"; anything else yields
    "UNKNOWN".
    """
    lowered = name.lower()
    # Order matters: "clip" is tested first, so names with both markers are TIGRESS
    for marker, label in (("clip", "TIGRESS"), ("decoder", "TIGER")):
        if marker in lowered:
            return label
    return "UNKNOWN"

# Tag every run with the model family derived from its name
df["Model"] = df["Name"].apply(identify_model)

# Discard runs that match neither model family
df = df.loc[df["Model"].ne("UNKNOWN")]

# One row per dataset split, one column per (source column, model) pair
pivoted = df.pivot(index="dataset_split", columns="Model")

# Collapse the two-level header, e.g. ("metrics/recall@5", "TIGER") -> "recall_5_TIGER"
pivoted.columns = [
    f"{source_col.replace('metrics/', '').replace('@', '_')}_{model_label}"
    for source_col, model_label in pivoted.columns
]

# Column order of the final table: all TIGER metrics, then all TIGRESS metrics
expected_cols = [
    f"{metric_name}_{cutoff}_{model_label}"
    for model_label in ("TIGER", "TIGRESS")
    for cutoff in (5, 10)
    for metric_name in ("recall", "ndcg")
]

# Fail loudly if the export lacks any column the table needs
missing = [col for col in expected_cols if col not in pivoted.columns]
if missing:
    print("Available columns:", pivoted.columns.tolist())
    raise KeyError(f"Missing expected columns: {missing}")

# Select in the desired order and round for display
pivoted = pivoted.loc[:, expected_cols].round(4)

# Bold max values
def bold_max(row, metric):
    """Format the TIGER and TIGRESS values of ``metric`` with four decimals.

    Reads ``row["<metric>_TIGER"]`` and ``row["<metric>_TIGRESS"]`` and returns the
    pair ``(tiger_text, tigress_text)``. The strictly larger value is wrapped in a
    LaTeX textbf command; when neither is strictly larger, both stay plain.
    """
    tiger = row[f"{metric}_TIGER"]
    tigress = row[f"{metric}_TIGRESS"]
    tiger_text, tigress_text = f"{tiger:.4f}", f"{tigress:.4f}"
    if tiger > tigress:
        tiger_text = f"\\textbf{{{tiger_text}}}"
    elif tigress > tiger:
        tigress_text = f"\\textbf{{{tigress_text}}}"
    return tiger_text, tigress_text

# Build one LaTeX row per dataset: the four TIGER cells first, then the four TIGRESS cells
latex_rows = []
for category, row in pivoted.iterrows():
    formatted_pairs = [
        bold_max(row, metric_key)
        for metric_key in ("recall_5", "ndcg_5", "recall_10", "ndcg_10")
    ]
    tiger_cells = [pair[0] for pair in formatted_pairs]
    tigress_cells = [pair[1] for pair in formatted_pairs]
    row_cells = [DATA_NAME_MAP[category], *tiger_cells, *tigress_cells]
    latex_rows.append(" & ".join(row_cells) + " \\\\")

# Static parts of the table that surround the generated rows
table_header = r"""
\begin{table}[ht]
\centering
\resizebox{1\textwidth}{!}{%
\begin{tabular}{lcccccccc}
    \toprule
    \textbf{Category} & \multicolumn{4}{c}{\textbf{TIGER}} & \multicolumn{4}{c}{\textbf{TIGRESS}} \\
    \cmidrule(r){2-5} \cmidrule(l){6-9}
    & \textbf{Recall@5} & \textbf{NDCG@5} & \textbf{Recall@10} & \textbf{NDCG@10}
    & \textbf{Recall@5} & \textbf{NDCG@5} & \textbf{Recall@10} & \textbf{NDCG@10} \\
    \midrule
"""
table_footer = r"""
    \bottomrule
    \end{tabular}
    }
\vspace{1mm}
\caption{Comparison of TIGER and TIGRESS on Recall and NDCG at cutoff 5 and 10 across categories.}
\label{tab:tiger_vs_tigress}
\end{table}
"""

# Final LaTeX table
latex_table = table_header + "\n".join(latex_rows) + table_footer

print(latex_table)


\begin{table}[ht]
\centering
\resizebox{1\textwidth}{!}{%
\begin{tabular}{lcccccccc}
    \toprule
    \textbf{Category} & \multicolumn{4}{c}{\textbf{TIGER}} & \multicolumn{4}{c}{\textbf{TIGRESS}} \\
    \cmidrule(r){2-5} \cmidrule(l){6-9}
    & \textbf{Recall@5} & \textbf{NDCG@5} & \textbf{Recall@10} & \textbf{NDCG@10}
    & \textbf{Recall@5} & \textbf{NDCG@5} & \textbf{Recall@10} & \textbf{NDCG@10} \\
    \midrule
Beauty & 0.0214 & 0.0157 & 0.0275 & 0.0176 & \textbf{0.0399} & \textbf{0.0283} & \textbf{0.0563} & \textbf{0.0336} \\
Sports and Outdoors & 0.0061 & 0.0045 & 0.0091 & 0.0055 & \textbf{0.0214} & \textbf{0.0153} & \textbf{0.0290} & \textbf{0.0178} \\
Toys and Games & 0.0227 & 0.0181 & 0.0280 & 0.0198 & \textbf{0.0287} & \textbf{0.0209} & \textbf{0.0388} & \textbf{0.0242} \\
    \bottomrule
    \end{tabular}
    }
\vspace{1mm}
\caption{Comparison of TIGER and TIGRESS on Recall and NDCG at cutoff 5 and 10 across categories.}
\label{tab:tiger_vs_tigress}
\end{table}



In [58]:
# Show which run, log directory and RQ-VAE checkpoint each 2014 metrics row comes from
df = pd.read_csv("../../metrics/2014.csv")  # path from wandb exported metrics
df.loc[:, ["Name", "dataset_split", "rqvae_uid", "uid", "log_dir", "pretrained_rqvae_path"]]

Unnamed: 0,Name,dataset_split,rqvae_uid,uid,log_dir,pretrained_rqvae_path
0,clip-sum-v2-decoder-amazon-sports/1749950218/1750312346,sports,1749950000.0,1750312346,/home/scur2745/logdir/decoder/amazon/sports/1750312346,/home/scur2745/logdir/rqvae/amazon/sports/1749950218/checkpoint_400000.pt
1,clip-sum-v2-decoder-amazon-toys/1749950278/1750312203,toys,1749950000.0,1750312203,/home/scur2745/logdir/decoder/amazon/toys/1750312203,/home/scur2745/logdir/rqvae/amazon/toys/1749950278/checkpoint_400000.pt
2,clip-sum-v2-decoder-amazon-beauty/1749948820/1750296496,beauty,1749949000.0,1750296496,/home/scur2745/logdir/decoder/amazon/beauty/1750296496,/home/scur2745/logdir/rqvae/amazon/beauty/1749948820/checkpoint_400000.pt
3,decoder-amazon-sports/1749670527/1750188998,sports,1749671000.0,1750188998,/home/scur2745/logdir/decoder/amazon/sports/1750188998,/home/scur2745/logdir/rqvae/amazon/sports/1749670527/checkpoint_400000.pt
4,decoder-amazon-toys/1749670347/1749780487,toys,,1749780487,/home/scur2745/logdir/decoder/amazon/toys/1749780487,/home/scur2745/logdir/rqvae/amazon/toys/1749670347/checkpoint_400000.pt
5,decoder-amazon-beauty/1749670228/1749779426,beauty,,1749779426,/home/scur2745/logdir/decoder/amazon/beauty/1749779426,/home/scur2745/logdir/rqvae/amazon/beauty/1749670228/checkpoint_400000.pt


In [59]:
# Show which run, log directory and RQ-VAE checkpoint each 2023 metrics row comes from
df = pd.read_csv("../../metrics/2023.csv")  # path from wandb exported metrics
df.loc[:, ["Name", "dataset_split", "rqvae_uid", "uid", "log_dir", "pretrained_rqvae_path"]]

Unnamed: 0,Name,dataset_split,rqvae_uid,uid,log_dir,pretrained_rqvae_path
0,new-2023-clip-sum-decoder-amazon-software/1750643408/1750648753,software,1750643408,1750648753,/home/scur2745/logdir/decoder/amazon/2023/software/1750648753,/home/scur2745/logdir/rqvae/amazon/2023/software/1750643408/checkpoint_400000.pt
1,new-2023-clip-sum-decoder-amazon-games/1750643230/1750648522,games,1750643230,1750648522,/home/scur2745/logdir/decoder/amazon/2023/games/1750648522,/home/scur2745/logdir/rqvae/amazon/2023/games/1750643230/checkpoint_400000.pt
2,new-2023-clip-sum-decoder-amazon-sports/1750617741/1750642896,sports,1750617741,1750642896,/home/scur2745/logdir/decoder/amazon/2023/sports/1750642896,/home/scur2745/logdir/rqvae/amazon/2023/toys/1750617741/checkpoint_400000.pt
3,new-2023-clip-sum-decoder-amazon-toys/1750617741/1750642293,toys,1750617741,1750642293,/home/scur2745/logdir/decoder/amazon/2023/toys/1750642293,/home/scur2745/logdir/rqvae/amazon/2023/toys/1750617741/checkpoint_400000.pt
4,new-2023-clip-sum-decoder-amazon-beauty/1750618001/1750619946,beauty,1750618001,1750619946,/home/scur2745/logdir/decoder/amazon/2023/beauty/1750619946,/home/scur2745/logdir/rqvae/amazon/2023/beauty/1750618001/checkpoint_400000.pt
5,new-2023-decoder-amazon-software/1750517045/1750619515,software,1750517045,1750619515,/home/scur2745/logdir/decoder/amazon/2023/software/1750619515,/home/scur2745/logdir/rqvae/amazon/2023/software/1750517045/checkpoint_400000.pt
6,new-2023-decoder-amazon-games/1750517268/1750619301,games,1750517268,1750619301,/home/scur2745/logdir/decoder/amazon/2023/games/1750619301,/home/scur2745/logdir/rqvae/amazon/2023/games/1750517268/checkpoint_400000.pt
7,new-2023-decoder-amazon-toys/1750516446/1750562505,toys,1750516446,1750562505,/home/scur2745/logdir/decoder/amazon/2023/toys/1750562505,/home/scur2745/logdir/rqvae/amazon/2023/toys/1750516446/checkpoint_400000.pt
8,new-2023-decoder-amazon-sports/1750516310/1750562267,sports,1750516310,1750562267,/home/scur2745/logdir/decoder/amazon/2023/sports/1750562267,/home/scur2745/logdir/rqvae/amazon/2023/sports/1750516310/checkpoint_400000.pt
9,new-2023-decoder-amazon-beauty/1750516729/1750518050,beauty,1750516729,1750518050,/home/scur2745/logdir/decoder/amazon/2023/beauty/1750518050,/home/scur2745/logdir/rqvae/amazon/2023/beauty/1750516729/checkpoint_400000.pt


### Fairness and Diversity

In [44]:
# Read the metrics table exported from wandb
df = pd.read_csv("../../metrics/2014.csv")

# Short dataset keys -> display names of the Amazon categories
# NOTE(review): the Metrics section maps 'beauty' to 'Beauty' for the same 2014 export;
# confirm which label is intended here.
DATA_NAME_MAP = dict(
    beauty='All Beauty',
    toys='Toys and Games',
    sports='Sports and Outdoors',
    games='Video Games',
    software='Software',
)
INVERSE_DATA_NAME_MAP = {label: key for key, label in DATA_NAME_MAP.items()}

# Metric-name prefixes treated as fairness/diversity metrics
FD_METRICS = ["gini"]

# Keep the run identifiers plus gini at cutoffs 5 and 10
filter_cols = ["Name", "dataset_split", *(f"metrics/gini@{cutoff}" for cutoff in (5, 10))]
df = df.loc[:, filter_cols].copy()

# Extract model from Name
def identify_model(name):
    """Return the model label encoded in a run name (case-insensitive).

    "clip" anywhere in the name means "TIGRESS"; failing that, "decoder" means
    "TIGER"; names with neither marker are "UNKNOWN".
    """
    lowered = name.lower()
    if "clip" in lowered:
        return "TIGRESS"
    return "TIGER" if "decoder" in lowered else "UNKNOWN"

# Tag every run with the model family derived from its name
df["Model"] = df["Name"].apply(identify_model)

# Filter out runs that match neither model family
df = df[df["Model"] != "UNKNOWN"]

# One row per dataset split, one column per (source column, model) pair
pivoted = df.pivot(index="dataset_split", columns="Model")

# Flatten column names, e.g. ("metrics/gini@5", "TIGER") -> "gini_5_TIGER"
pivoted.columns = [f"{metric.replace('metrics/', '').replace('@', '_')}_{model}" for metric, model in pivoted.columns]

# Keep every flattened column whose prefix is a fairness/diversity metric
metric_cols = [c for c in pivoted.columns if c.split("_")[0] in FD_METRICS]

# Ensure the columns read when formatting the table rows exist.
# metric_cols is derived from pivoted.columns, so checking it against pivoted.columns
# can never fail; validate against the explicitly required names instead.
required_cols = [
    f"{metric}_{cutoff}_{model}"
    for metric in FD_METRICS
    for cutoff in (5, 10)
    for model in ("TIGER", "TIGRESS")
]
missing = [c for c in required_cols if c not in metric_cols]
if missing:
    print("Available columns:", pivoted.columns.tolist())
    raise KeyError(f"Missing expected columns: {missing}")

# Select the metric columns and round for display
pivoted = pivoted[metric_cols].round(4)

# Bold max values
def bold_max(row, metric):
    """Render the TIGER/TIGRESS pair of ``metric`` as four-decimal strings.

    Looks up ``"<metric>_TIGER"`` and ``"<metric>_TIGRESS"`` in ``row`` and returns
    ``(tiger_text, tigress_text)``. Whichever value is strictly greater is wrapped
    in a LaTeX textbf command; if neither is strictly greater, nothing is bolded.
    """
    scores = (row[f"{metric}_TIGER"], row[f"{metric}_TIGRESS"])
    rendered = [f"{score:.4f}" for score in scores]

    # Index of the strictly larger value, or None when there is no strict winner
    if scores[0] > scores[1]:
        winner = 0
    elif scores[1] > scores[0]:
        winner = 1
    else:
        winner = None

    if winner is not None:
        rendered[winner] = f"\\textbf{{{rendered[winner]}}}"
    return tuple(rendered)

# Generate LaTeX rows: TIGER's Gini@5/Gini@10 first, then TIGRESS's
latex_rows = []
for category, row in pivoted.iterrows():
    g5_t, g5_tr = bold_max(row, "gini_5")
    g10_t, g10_tr = bold_max(row, "gini_10")
    latex_rows.append(
        f"{DATA_NAME_MAP[category]} & {g5_t} & {g10_t} & {g5_tr} & {g10_tr} \\\\"
    )

# Final LaTeX table.
# The tabular has five columns (category + 2 per model), so the partial rules span
# columns 2-3 (TIGER) and 4-5 (TIGRESS). The label is specific to this table so it
# does not clash with the ranking-metrics table's tab:tiger_vs_tigress.
latex_table = r"""
\begin{table}[ht]
\centering
\resizebox{1\textwidth}{!}{%
\begin{tabular}{lcccc}
    \toprule
    \textbf{Category} & \multicolumn{2}{c}{\textbf{TIGER}} & \multicolumn{2}{c}{\textbf{TIGRESS}} \\
    \cmidrule(r){2-3} \cmidrule(l){4-5}
    & \textbf{Gini@5} & \textbf{Gini@10} & \textbf{Gini@5} & \textbf{Gini@10} \\
    \midrule
""" + "\n".join(latex_rows) + r"""
    \bottomrule
    \end{tabular}
    }
\vspace{1mm}
\caption{Comparison of TIGER and TIGRESS on Fairness/Diversity Metrics at cutoff 5 and 10 across categories.}
\label{tab:tiger_vs_tigress_fairness}
\end{table}
"""

print(latex_table)


\begin{table}[ht]
\centering
\resizebox{1\textwidth}{!}{%
\begin{tabular}{lcccc}
    \toprule
    \textbf{Category} & \multicolumn{2}{c}{\textbf{TIGER}} & \multicolumn{2}{c}{\textbf{TIGRESS}} \\
    \cmidrule(r){2-5} \cmidrule(l){6-9}
    & \textbf{Gini@5} & \textbf{Gini@10} & \textbf{Gini@5} & \textbf{Gini@10} \\
    \midrule
All Beauty & 0.0827 & 0.1506 & \textbf{0.1108} & \textbf{0.2005} \\
Sports and Outdoors & 0.0958 & 0.1830 & \textbf{0.1259} & \textbf{0.2317} \\
Toys and Games & 0.0675 & 0.1331 & \textbf{0.1264} & \textbf{0.2266} \\
    \bottomrule
    \end{tabular}
    }
\vspace{1mm}
\caption{Comparison of TIGER and TIGRESS on Fairness/Diversity Metrics at cutoff 5 and 10 across categories.}
\label{tab:tiger_vs_tigress}
\end{table}



### Datasets

In [7]:
def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    :param path: path to a gzip file whose lines are each a Python literal/expression.
    :return: generator yielding the result of evaluating each line.
    """
    # The with-block closes the handle when the generator is exhausted or closed,
    # instead of leaving it open until garbage collection.
    with gzip.open(path, "r") as g:
        for l in g:
            # SECURITY: eval() executes arbitrary code contained in the file.
            # Only use this on trusted data files.
            yield eval(l)
        
def parse_2023(path):
    """Yield one JSON-decoded record per line of a gzip-compressed UTF-8 text file.

    Iteration is wrapped in a tqdm progress bar labelled with the file path.
    """
    with gzip.open(path, mode='rt', encoding='utf-8') as handle:
        progress = tqdm(handle, desc=f"Parsing {path}")
        yield from map(json.loads, progress)

def display_pickle_summary(data, title="Pickle File Contents"):
    """
    Summarize the contents of already-loaded pickle data using rich.

    For a dict, one table row is printed per key with the value's type name (and its
    length when it has one). A final "Total Size" row holds the combined length of the
    "train", "val" and "test" entries that are present, or "N/A" when none is. For any
    other object a single row with its type is printed. Finally, the first element of
    each non-empty split present in a dict is printed as a sample.

    :param data: object obtained from a .pkl file (this function does not load the file).
    :param title: Optional title for the printed table.
    """
    table = Table(title=title)

    table.add_column("Key/Type", style="cyan", no_wrap=True)
    table.add_column("Description", style="magenta")

    # Lengths of the split entries found; kept as a list so an unrelated key can never
    # turn the running total into a string and break a later addition.
    split_sizes = []
    if isinstance(data, dict):
        for key, value in data.items():
            has_len = hasattr(value, '__len__')
            desc = f"{type(value).__name__}, len={len(value)}" if has_len else type(value).__name__
            table.add_row(str(key), desc)
            if key in ("train", "test", "val") and has_len:
                split_sizes.append(len(value))
    else:
        table.add_row(type(data).__name__, f"{data}" if isinstance(data, (int, float, str)) else str(type(data)))

    total_size = sum(split_sizes) if split_sizes else "N/A"
    table.add_row("Total Size", str(total_size))
    console = Console()
    console.print(table)

    # Samples only make sense for dict data; each split is checked on its own so a
    # missing or empty split is skipped instead of raising.
    if not isinstance(data, dict):
        return
    for split_key, heading in (("train", "Train Sample:"),
                               ("val", "Val Sample:"),
                               ("test", "Test Sample:")):
        if split_key not in data:
            continue
        split = data[split_key]
        if hasattr(split, '__len__') and len(split) > 0:
            rprint(heading)
            rprint(split[0])
        
        
def df_stats(df: pd.DataFrame, title="DataFrame Stats"):
    """Print the frame's shape and a rich table with per-column statistics.

    For every column the table lists the non-null count, the number of unique
    non-null values, the null count and the dtype. Each statistic is computed
    best-effort: if computing it raises, the cell shows "Error" instead of aborting
    the whole summary.

    :param df: DataFrame to summarize.
    :param title: Optional title for the printed table.
    """
    def _safe(compute):
        # Catch Exception rather than using a bare except so KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            return compute()
        except Exception:
            return "Error"

    table = Table(title=title)
    rprint(f"DataFrame shape: {df.shape}")
    table.add_column("Column", style="cyan", no_wrap=True)
    table.add_column("Non-Null Count", style="yellow")
    table.add_column("Unique Count", style="magenta")
    table.add_column("Null/NA Count", style="red")
    table.add_column("Data Type", style="green")

    for col in df.columns:
        non_null_count = _safe(lambda: df[col].notna().sum())
        unique_count = _safe(lambda: df[col].nunique(dropna=True))
        null_count = _safe(lambda: df[col].isna().sum())
        dtype = _safe(lambda: str(df[col].dtype))
        # str(col): column labels are not necessarily strings (e.g. integer labels)
        table.add_row(str(col), str(non_null_count), str(unique_count), str(null_count), dtype)

    Console().print(table)
    
def data_user_item_stats(items_data: pd.DataFrame, 
                         interactions_data: list, 
                        #  reviews_data: pd.DataFrame,
                         dataset_split: str,
                         title="DataFrame User Interaction Stats"):
    """Print a one-row rich table with user/item/brand counts and sequence lengths.

    Each entry of ``interactions_data`` is read as ``[user_id, item, item, ...]``:
    the first element is collected as the user id and the remaining elements form
    that user's sequence. Statistics are computed best-effort; a statistic that
    raises is shown as "Error".

    :param items_data: item table; its "asin" and "brand" columns are counted.
    :param interactions_data: list of per-user interaction lists.
    :param dataset_split: label shown in the "Category" column.
    :param title: Optional title for the printed table.
    """
    def _safe(compute):
        # Catch Exception rather than using a bare except so KeyboardInterrupt and
        # SystemExit still propagate.
        try:
            return compute()
        except Exception:
            return "Error"

    table = Table(title=title)
    table.add_column("Category", style="cyan", no_wrap=True)
    table.add_column("# Users", style="yellow")
    table.add_column("# Items ", style="magenta")
    table.add_column("# Brands ", style="magenta")
    table.add_column("Sequence Length (Mean)", style="red")
    table.add_column("Sequence Length (Median)", style="green")
    table.add_column("Item Data Shape")

    user_ids, seq_length = [], []
    for interactions in interactions_data:
        if not interactions:
            # An empty entry has no user id to read; skip it instead of raising IndexError
            continue
        user_ids.append(interactions[0])
        # The leading element is the user id, so it is not part of the item sequence
        seq_length.append(len(interactions) - 1)

    users = _safe(lambda: len(set(user_ids)))
    items = _safe(lambda: items_data["asin"].nunique(dropna=True))
    brands = _safe(lambda: items_data["brand"].nunique(dropna=True))
    if seq_length:
        seq_len_mean = _safe(lambda: str(round(np.mean(seq_length), 2)))
        seq_len_median = _safe(lambda: str(int(np.median(seq_length))))
    else:
        # No sequences to aggregate
        seq_len_mean = seq_len_median = "N/A"

    table.add_row(dataset_split, str(users), str(items), str(brands), 
                  str(seq_len_mean), str(seq_len_median), 
                  str(items_data.shape))

    Console().print(table)
    

In [10]:
# Notebook-wide settings used to locate the raw Amazon data
YEAR = 2023
DATASET_SPLIT = "beauty"
DATASET_DIR = "../dataset/amazon/{}/raw".format(YEAR)

In [11]:
YEAR = 2023
DATASET_DIR = f"../dataset/amazon/{YEAR}/raw"

# Print user/item statistics for each category of the 2023 data
for dataset_split in ("beauty", "sports", "toys", "software", "games"):
    items_data = pd.read_csv(f"{DATASET_DIR}/{dataset_split}/item_data.csv")

    # reviews_df =  pd.DataFrame([
    #     meta
    #     for meta in parse_2023(
    #         path=os.path.join(DATASET_DIR, dataset_split, "reviews.json.gz")
    #     )])
    # reviews_data = reviews_df[reviews_df["parent_asin"].isin(items_data["asin"])]
    # reviews_data["text_len"] = reviews_data["text"].str.len()

    # Every line of sequential_data.txt is a whitespace-separated list of integers
    sequence_path = os.path.join(DATASET_DIR, dataset_split, "sequential_data.txt")
    with open(sequence_path, "r") as f:
        interactions_data = [[int(token) for token in line.split()] for line in f]

    data_user_item_stats(items_data, interactions_data, dataset_split)

In [12]:
YEAR = 2014
DATASET_DIR = f"../dataset/amazon/{YEAR}/raw"

# Print user/item statistics for each category of the 2014 data
for dataset_split in ("beauty", "sports", "toys"):
    items_data = pd.read_csv(f"{DATASET_DIR}/{dataset_split}/item_data.csv")

    # reviews_df =  pd.DataFrame([
    #     meta
    #     for meta in parse_2023(
    #         path=os.path.join(DATASET_DIR, dataset_split, "reviews.json.gz")
    #     )])
    # reviews_data = reviews_df[reviews_df["parent_asin"].isin(items_data["asin"])]
    # reviews_data["text_len"] = reviews_data["text"].str.len()

    # Every line of sequential_data.txt is a whitespace-separated list of integers
    sequence_path = os.path.join(DATASET_DIR, dataset_split, "sequential_data.txt")
    with open(sequence_path, "r") as f:
        interactions_data = [[int(token) for token in line.split()] for line in f]

    data_user_item_stats(items_data, interactions_data, dataset_split)

In [5]:
# NOTE: items_data is whatever the last iteration of the most recently run dataset
# loop left bound, so this output depends on cell execution order.
df_stats(items_data)

In [6]:
# Distinct brand count for the currently bound items_data (left over from a dataset loop)
items_data["brand"].nunique()

1311