In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import IPython.display as display
import json
import math
from random import sample
import os
import csv

# Data prep

In [None]:
def get_attribute(df, idx, attribute):
    """Return the value stored under ``attribute`` for the row labelled ``idx``.

    Args:
        df: DataFrame to look the value up in.
        idx: Index label of the row.
        attribute: Column name to read from that row.
    """
    row = df.loc[idx]
    return row[attribute]

def get_rating(df, user_idx, item_idx):
    """Return the "rating:float" value for one (user, item) pair.

    Args:
        df: DataFrame that is looked up first by ``user_idx`` and then by
            ``item_idx`` (e.g. a frame with a two-level user/item index).
        user_idx: Label selecting the user's rows.
        item_idx: Label selecting the item within the user's rows.
    """
    user_rows = df.loc[user_idx]
    interaction = user_rows.loc[item_idx]
    return interaction["rating:float"]

In [None]:
def _invert_remapping(mapping):
    """Turn ``{original_id: new_id}`` into ``{new_id: original_id}``.

    Original IDs are converted to ``int`` when every one of them is numeric;
    otherwise they are all kept as the strings found in the JSON file.
    """
    try:
        return {int(new_id): int(old_id) for old_id, new_id in mapping.items()}
    except ValueError:
        return {int(new_id): old_id for old_id, new_id in mapping.items()}


def get_remapped_losses(model, dataset, l_r, e_s, fair):
    """Load a run's logged training losses with the original dataset IDs restored.

    Reads ``remappings/{dataset}.json`` (the ID remappings made by the model
    during its training phase) and
    ``training_losses/{model}/{dataset}/{l_r}-{e_s}_{fair}.csv`` (header-less).

    Args:
        model: Model name; directory component of the loss CSV path.
        dataset: Dataset name; selects the remapping file and the loss directory.
        l_r: First part of the CSV file name (presumably the learning rate — confirm).
        e_s: Second part of the CSV file name (presumably the embedding size — confirm).
        fair: Run tag that ends the CSV file name.

    Returns:
        DataFrame with the columns "Iteration", "User ID", "Item ID",
        "Predicted rating" and "Loss", where both ID columns hold the original
        IDs. A logged ID with no entry in the remapping becomes NaN.
    """
    # Loads the remappings made by the model during its training phase
    with open(f"remappings/{dataset}.json") as f:
        remappings = json.load(f)

    # Removes unneeded padding; files without a padding entry are accepted as well
    for alias in ("user_id", "item_id"):
        remappings[alias].pop("[PAD]", None)

    # The file stores {old_id: new_id}; invert it so the logged new IDs can be
    # mapped back. Author's note: items without any rating were never assigned
    # a new ID, so they have no entry to reverse.
    # Each alias is inverted on its own, so non-numeric item tokens do not
    # force numeric user IDs to stay strings (and vice versa).
    remappings = {alias: _invert_remapping(mapping) for alias, mapping in remappings.items()}

    loss_info = pd.read_csv(f"training_losses/{model}/{dataset}/{l_r}-{e_s}_{fair}.csv", names=["Iteration", "User ID", "Item ID", "Predicted rating", "Loss"])
    loss_info["User ID"] = loss_info["User ID"].map(remappings["user_id"])
    loss_info["Item ID"] = loss_info["Item ID"].map(remappings["item_id"])

    return loss_info

# Example (the last argument is the run tag that ends the loss CSV's file name):
# loss_info = get_remapped_losses("bpr", "ml-1m", "0001", "128", "base")
# loss_info.head()

In [None]:
# Used to plot losses of different user groups
def plot_losses(model, dataset, l_r, e_s, fair, sensitive_attributes):
    """Plot the mean training loss per iteration for each sensitive group.

    If the plot image for this run already exists on disk it is displayed and
    nothing is recomputed. Otherwise the run's losses are loaded, split by
    group, averaged per iteration, plotted, saved to that image path and shown.

    Args:
        model: Model name; used in the file paths and (upper-cased) in the labels.
        dataset: Dataset name; ``{dataset}/{dataset}.item`` and
            ``{dataset}/{dataset}.user`` are read as tab-separated tables.
        l_r: First part of the loss/plot file name (presumably the learning rate — confirm).
        e_s: Second part of the loss/plot file name (presumably the embedding size — confirm).
        fair: Run tag in the file names; also sliced to build the lambda in the title.
        sensitive_attributes: Dict with the keys "user" and "item", each a
            column name of the user/item table or a falsy value. Both set: one
            curve per (user group, item group) pair. Only "user" set: one curve
            per user group. "user" falsy: one curve per item group.
    """
    plot_path = f"training_losses/{model}/{dataset}/{l_r}-{e_s}_{fair}_user.png"
    if os.path.exists(plot_path):
        display.display(Image.open(plot_path))
        return

    # Number of trailing iterations left out of every curve and of the ratio.
    # NOTE(review): presumably the early-stopping patience window — confirm.
    dropped_tail = 11

    loss_info = get_remapped_losses(model, dataset, l_r, e_s, fair)
    item_info = pd.read_csv(f"{dataset}/{dataset}.item", sep="\t", engine="python", encoding="latin-1", index_col=0, header=0)
    user_info = pd.read_csv(f"{dataset}/{dataset}.user", sep="\t", engine="python", index_col=0, header=0)

    # Sorted so that the x-values and the per-iteration losses are in
    # chronological order; a bare set gives no ordering guarantee.
    iterations = sorted(set(loss_info["Iteration"]))
    groups_info = {}

    user_attr = sensitive_attributes["user"]
    item_attr = sensitive_attributes["item"]
    if user_attr:
        split = "both" if item_attr else "user"
    else:
        split = "item"

    # Collect, per group, the IDs that belong to it. Missing group labels are
    # skipped; pd.isna covers both None and NaN.
    if split == "both":
        for u_group in set(user_info[user_attr]):
            if pd.isna(u_group):
                continue
            groups_info[u_group] = {}
            u_indices = list(user_info[user_info[user_attr] == u_group].index)
            for i_group in set(item_info[item_attr]):
                if pd.isna(i_group):
                    continue
                i_indices = list(item_info[item_info[item_attr] == i_group].index)
                groups_info[u_group][i_group] = {"indices": [u_indices, i_indices], "losses": []}
    elif split == "user":
        for group in set(user_info[user_attr]):
            if pd.isna(group):
                continue
            indices = list(user_info[user_info[user_attr] == group].index)
            try:
                # Integer IDs when possible, so they compare equal to the remapped loss IDs
                indices = [int(i) for i in indices]
            except ValueError:
                pass
            groups_info[group] = {"losses": [], "indices": indices}
    elif split == "item":
        for group in set(item_info[item_attr]):
            if pd.isna(group):
                continue
            groups_info[group] = {"losses": [], "indices": list(item_info[item_info[item_attr] == group].index)}

    # For every iteration, store each group's loss values (one Series per iteration)
    for i in iterations:
        iteration_loss_info = loss_info[loss_info["Iteration"] == i]
        if split == "both":
            for item_groups in groups_info.values():
                for info in item_groups.values():
                    u_indices, i_indices = info["indices"]
                    group_loss_info = iteration_loss_info[iteration_loss_info["User ID"].isin(u_indices)]
                    group_loss_info = group_loss_info[group_loss_info["Item ID"].isin(i_indices)]
                    info["losses"].append(group_loss_info["Loss"])
        else:
            id_column = "User ID" if split == "user" else "Item ID"
            for info in groups_info.values():
                group_loss_info = iteration_loss_info[iteration_loss_info[id_column].isin(info["indices"])]
                info["losses"].append(group_loss_info["Loss"])

    kept_iterations = iterations[:-dropped_tail]

    if split == "both":
        for u_group, item_groups in groups_info.items():
            for i_group, info in item_groups.items():
                # N is the number of loss rows the group has in the first iteration
                N = len(info["losses"][0])
                group_means = [losses.mean() for losses in info["losses"]][:-dropped_tail]
                plt.plot(kept_iterations, group_means, label=f"User: {str(u_group)}, Item: {str(i_group)} (N = {N})")
    else:
        final_losses = []

        for group, info in groups_info.items():
            N = len(info["losses"][0])
            group_means = [losses.mean() for losses in info["losses"]][:-dropped_tail]
            plt.plot(kept_iterations, group_means, label=f"{group} (N = {N})")
            final_losses.append(group_means[-1])

        # Ratio between the worst and the best group's last plotted mean loss
        print(f"user: {max(final_losses) / min(final_losses)}")

    plt.title(f"{model.upper()} Loss over time per user group (lambda={fair[:-2]}.{fair[-2:]})")
    plt.xlabel("Iteration")
    plt.ylabel(f"{model.upper()} Loss")
    plt.legend()
    plt.savefig(plot_path)
    plt.show()

In [None]:
# Used to plot losses of different item groups
def _item_group_mean_losses(model, dataset, l_r, e_s, fair):
    """Return the run's sorted iterations and, per popularity label ("H", "M", "T"), the mean loss at each of them."""
    loss_info = get_remapped_losses(model, dataset, l_r, e_s, fair)
    item_info = pd.read_csv(f"{dataset}/{dataset}.item", sep="\t", engine="python", encoding="latin-1", index_col=0, header=0)

    iterations = sorted(set(loss_info["Iteration"].values))
    # Both sides are compared as floats so differing numeric dtypes still match
    logged_item_ids = loss_info["Item ID"].astype(float)

    mean_losses = {}
    for item_label in ("H", "M", "T"):
        label_item_ids = item_info.loc[item_info["popular item"] == item_label].index.astype(float)
        label_rows = loss_info.loc[logged_item_ids.isin(label_item_ids)]
        per_iteration = label_rows.groupby("Iteration")["Loss"].mean()
        # Iterations without any row for this label become NaN
        mean_losses[item_label] = per_iteration.reindex(iterations).tolist()

    return iterations, mean_losses


def plot_item_losses(model, dataset, l_r, e_s, fair):
    """Plot the mean loss per epoch for head, mid and tail items.

    If the plot image for this run already exists on disk it is displayed and
    nothing is recomputed. For ``fair == "base"`` only the base curves are
    drawn; for any other run tag the run's curves (dashed, labelled "ILE") are
    drawn together with the base run's curves (solid, labelled "BPR"). The
    figure is saved under this run's own image path, so the cache check above
    finds it on the next call and the base image never contains another run's
    curves.

    For each plotted run the ratio between the largest and the smallest group
    loss at the last plotted epoch is printed.

    Args:
        model: Model name; directory component of the file paths.
        dataset: Dataset name; ``{dataset}/{dataset}.item`` must provide a
            "popular item" column holding the labels "H", "M" and "T".
        l_r: First part of the loss/plot file name (presumably the learning rate — confirm).
        e_s: Second part of the loss/plot file name (presumably the embedding size — confirm).
        fair: Run tag in the file names; "base" selects the baseline run.
    """
    colour_map = {"H": "r", "M": "g", "T": "b"}
    item_groups = {"H": "Head", "M": "Mid", "T": "Tail"}
    # Number of trailing epochs left out of every curve and of the ratio.
    # NOTE(review): presumably the early-stopping patience window — confirm.
    dropped_tail = 11
    plot_path = f"training_losses/{model}/{dataset}/{l_r}-{e_s}_{fair}_item.png"

    if os.path.exists(plot_path):
        display.display(Image.open(plot_path))
        return

    # (run tag, legend suffix, line style, name used in the printed ratio)
    runs = []
    if fair != "base":
        runs.append((fair, "ILE", "dashed", "item"))
    runs.append(("base", "BPR", "solid", "item (base)"))

    for run_fair, legend_suffix, linestyle, ratio_name in runs:
        iterations, mean_losses = _item_group_mean_losses(model, dataset, l_r, e_s, run_fair)
        kept_iterations = iterations[:-dropped_tail]
        kept_losses = {item_label: mean_losses[item_label][:-dropped_tail] for item_label in item_groups}

        # The final loss is the last value that is actually plotted
        final_losses = [kept_losses[item_label][-1] for item_label in item_groups]
        print(f"{ratio_name}: {max(final_losses) / min(final_losses)}")

        for item_label, group_name in item_groups.items():
            plt.plot(kept_iterations, kept_losses[item_label], label=f"{group_name} ({legend_suffix})", color=colour_map[item_label], linestyle=linestyle)

    plt.legend(fontsize=12, loc="upper right", ncols=2)
    plt.xlabel("Epoch", fontsize=18)
    plt.ylabel("Average loss", fontsize=18)
    # Tick sizes are set before saving so the stored image matches what is shown
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.savefig(plot_path)

    plt.show()

In [None]:
# Run tag of the fairness run to plot; it is the part of the loss-CSV and
# plot file names that follows the "_".
fair = "item020std"

# plot_losses("bpr", "goodreads", "0001", "128", fair, {"user": "mainstream class (even groups)", "item": None})
# Plot the head/mid/tail item-group losses of this run.
plot_item_losses("bpr", "goodreads", "0001", "128", fair)

# Plotting

### BPR - ML-1M - 0001-128 (DeepSVDD labels)

In [None]:
# NOTE(review): this cell uses `user_info`, `item_info` and `loss_info`, none of
# which is assigned at notebook level in the cells shown here — confirm where
# they are expected to come from before running on a fresh kernel.
# Scores loaded from disk; presumably one DeepSVDD score per user — TODO confirm.
MS_similarity = np.load('../MS_DeepSVDD.npy')

# (position, score) pairs ordered by ascending score
sorted_MS_scores = sorted(list(enumerate(MS_similarity)), key=lambda x: x[1])
# 1-based IDs in ascending-score order (assumes user ID = position + 1 — TODO confirm)
a = [t[0]+1 for t in sorted_MS_scores]
# 1208 copies of each label 0..4 in ascending order (6040 labels in total)
b = sorted([list(range(5)) * 1208][0])
# ID -> label: the 1208 lowest-scoring IDs get 0, the next 1208 get 1, and so on
c = dict(zip(a, b))
user_info["test"] = c

# NOTE(review): plot_losses as defined in this notebook takes
# (model, dataset, l_r, e_s, fair, sensitive_attributes); this four-argument
# call does not match that signature.
plot_losses(loss_info, user_info, item_info, {"user": "test", "item": None})

In [None]:
# NOTE(review): `user_info`, `item_info` and `loss_info` are not assigned at
# notebook level in the cells shown here, and no random seed is set in the
# visible cells, so the sampled users differ between runs.
# Draw 1 and 5 random users from class 4 and from class 0 of the threshold-based
# classes (the names suggest 4 = most mainstream, 0 = most niche — TODO confirm).
# The 1-user and 5-user samples are drawn independently of each other.
mainstream_1 = sample(list(user_info[user_info["mainstream class (thresholds)"] == 4].index), 1)
mainstream_5 = sample(list(user_info[user_info["mainstream class (thresholds)"] == 4].index), 5)
niche_1 = sample(list(user_info[user_info["mainstream class (thresholds)"] == 0].index), 1)
niche_5 = sample(list(user_info[user_info["mainstream class (thresholds)"] == 0].index), 5)

# Combined samples: 2 users (1 per class) and 10 users (5 per class)
sample_1, sample_5 = mainstream_1 + niche_1, mainstream_5 + niche_5

# NOTE(review): plot_losses as defined in this notebook takes
# (model, dataset, l_r, e_s, fair, sensitive_attributes); the four-argument
# calls in this cell do not match that signature.
plot_losses(loss_info[loss_info["User ID"].isin(sample_1)], user_info, item_info, {"user": "mainstream class (thresholds)", "item": None})
plot_losses(loss_info[loss_info["User ID"].isin(sample_5)], user_info, item_info, {"user": "mainstream class (thresholds)", "item": None})

# Same sampling, this time on the even-sized classes
mainstream_1 = sample(list(user_info[user_info["mainstream class (even groups)"] == 4].index), 1)
mainstream_5 = sample(list(user_info[user_info["mainstream class (even groups)"] == 4].index), 5)
niche_1 = sample(list(user_info[user_info["mainstream class (even groups)"] == 0].index), 1)
niche_5 = sample(list(user_info[user_info["mainstream class (even groups)"] == 0].index), 5)

sample_1, sample_5 = mainstream_1 + niche_1, mainstream_5 + niche_5

plot_losses(loss_info[loss_info["User ID"].isin(sample_1)], user_info, item_info, {"user": "mainstream class (even groups)", "item": None})
plot_losses(loss_info[loss_info["User ID"].isin(sample_5)], user_info, item_info, {"user": "mainstream class (even groups)", "item": None})

# Item Popularity

In [None]:
def plot_item_popularities(datasets):
    """Plot each dataset's item-popularity curve in one figure.

    For every dataset, each item's share of all interactions is computed and
    the shares are plotted from the most to the least rated item.

    Args:
        datasets: Iterable of dataset names. For each name ``ds`` the
            tab-separated files ``{ds}/{ds}.item`` (first column used as the
            item index) and ``{ds}/{ds}.inter`` (must contain an
            "item_id:token" column) are read.
    """
    for ds in datasets:
        ds_items = pd.read_csv(f"{ds}/{ds}.item", sep="\t", engine="python", encoding="latin-1", index_col=0, header=0)
        ds_inters = pd.read_csv(f"{ds}/{ds}.inter", sep="\t", engine="python", encoding="latin-1", header=0)

        # Number of interactions per item, counted in a single pass; items that
        # never occur in the interaction file get 0.
        rating_counts = ds_inters["item_id:token"].value_counts().reindex(ds_items.index, fill_value=0)

        # Fraction (0-1) of all interactions that belongs to each item
        rating_shares = rating_counts / len(ds_inters.index)

        plt.plot(range(len(rating_shares)), sorted(rating_shares, reverse=True), label=ds)

    plt.xlabel("Item rank (most to least rated)")
    plt.ylabel("Share of all ratings")
    plt.legend()
    plt.show()


# Compare the item-popularity curves of the four datasets in a single figure.
plot_item_popularities(["ml-1m", "goodreads", "google_reviews", "yelp"])