# 3 Get interaction counts for each subset for each grouping for each category

For each of the specified subsets, loads `X` and `seq_lens` and combines them to extract the interaction counts of each user, separately for each subset. These counts are then combined with the output of `./2`, which makes it possible to classify users into categories (e.g. `low`, `middle`, `high`) under different groupings (e.g. mainstreamness) and to calculate the number of interactions performed by the users in each category.

Requires:
* X.npy and seq_lens.npy files for each of the dataset's subsets.
* uid2stats.csv denoting, for each user, membership of the `low`, `middle` or `high` group for each of the stats specified during the previous step.

Returns:
* subset_grouping_counts.csv containing the absolute interaction counts of the users in each specified subset of the current dataset, per category, for each of the groupings. The notebook also computes the percentage counts, but those are not exported.


In [None]:
# Environment whose root folders are used: one of "local", "server" or "rtl".
LOCATION = "local"

# Dataset name; used as the sub-folder under the data / results roots.
DATASET = "lastfm_10_pc"

# Dataset subsets to process (any of "train", "validation", "test").
SUBSETS = ["test"]

In [None]:
import ast
import datetime
import os
import random
import string
import subprocess
import tempfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

In [None]:
# Copied from evaluation/2_evaluate_exports/RQ2.1/prev_current_dist_to_recs_vs_dt_log_bins.ipynb

def randomString(stringLength=10):
    """Generate a random string of lowercase ASCII letters.

    Relies on the standard-library ``random`` and ``string`` modules being
    imported at the top of the file.

    Args:
        stringLength: Number of characters in the returned string.

    Returns:
        A string of ``stringLength`` characters drawn from ``a``-``z``.
    """
    letters = string.ascii_lowercase
    return "".join(random.choice(letters) for _ in range(stringLength))

def load_arrays(root, *args):
    """Load the named numpy arrays from a local folder or from S3.

    Local root: each name ``k`` is read from ``<root>/<k>.npy``.

    S3 root (``root`` starts with ``"s3"``): ``<root>/<k>`` is copied
    recursively into a scratch directory with the ``aws`` CLI, every file
    directly inside the downloaded folder is loaded in sorted file-name order,
    and the pieces are concatenated along the first axis.

    Args:
        root: Local directory path or S3 prefix.
        *args: Names of the arrays to load (e.g. ``"X"``, ``"seq_lens"``).

    Returns:
        Dict mapping each requested name to its loaded ``numpy`` array
        (empty when no names are given).

    Raises:
        subprocess.CalledProcessError: If an ``aws s3 cp`` call fails.
    """
    if not root.startswith("s3"):
        return {k: np.load(os.path.join(root, k + ".npy")) for k in args}

    outputs = {}
    # The context manager removes the scratch directory even when a download
    # or a load fails, so nothing is left behind.
    with tempfile.TemporaryDirectory() as temp_path:
        for a in args:
            local_path = os.path.join(temp_path, a)
            s3_path = os.path.join(root, a)
            os.makedirs(local_path, exist_ok=True)
            # check_call surfaces a failed download right away instead of
            # letting it show up later as an empty concatenation.
            subprocess.check_call(["aws", "s3", "cp", s3_path, local_path, "--recursive"])
            # Only the files directly inside the downloaded folder are used.
            file_names = sorted(os.path.join(local_path, x) for x in next(os.walk(local_path))[-1])
            outputs[a] = np.concatenate([np.load(x) for x in file_names])
    return outputs

In [None]:
# Root folder holding the grouping results of the previous step, per environment.
groups_root = dict(
    local="/Users/nknyazev/Documents/Delft/Thesis/temporal/data/results/RQ3",
    server="/tudelft.net/staff-bulk/ewi/insy/MMC/nknyazev/RQ3",
    rtl="s3://ci-data-apps/norman/sagemaker/thesis/offline-evaluation/RQ3",
)[LOCATION]

# Root folder holding the processed numpy data, per environment.
data_root = dict(
    local="/Users/nknyazev/Documents/Delft/Thesis/temporal/data/processed/final",
    server="/home/nfs/nknyazev/thesis/data/numpy",
    rtl="s3://ci-data-apps/norman/sagemaker/thesis/data/processed/new/rtl/numpy",
)[LOCATION]

# Root folder the resulting counts are written to, per environment.
output_root = dict(
    local="/Users/nknyazev/Documents/Delft/Thesis/temporal/data/results/RQ3",
    server="/tudelft.net/staff-bulk/ewi/insy/MMC/nknyazev/RQ3",
    rtl="s3://ci-data-apps/norman/sagemaker/thesis/offline-evaluation/RQ3",
)[LOCATION]

# Path of each subset relative to the data root: "<DATASET>/<subset>", except
# for the "rtl" dataset, where the dataset folder is omitted.
data_keys = {
    subset: os.path.join("" if DATASET == "rtl" else DATASET, subset)
    for subset in ("train", "validation", "test")
}

# Full location of each subset's data.
data_paths = {
    subset: os.path.join(data_root, relative_path)
    for subset, relative_path in data_keys.items()
}

In [None]:
# Input: per-user group memberships written by the previous step.
groups_filename = "uid2stats.csv"
# Output: absolute interaction counts exported by this notebook.
output_filename = "subset_grouping_counts.csv"

# Both files live in the dataset's sub-folder of their respective root.
groups_path = os.path.join(groups_root, DATASET, groups_filename)
output_path = os.path.join(output_root, DATASET, output_filename)

In [None]:
# Load the per-user group memberships (uid2stats.csv), indexed by its first column
if LOCATION != "rtl":
    groups_df = pd.read_csv(groups_path, sep="\t", index_col=0)
else:
    # On "rtl" the file lives on S3: download it to a local scratch path first.
    tmp_folder = '/tmp'
    tmp_path = os.path.join(tmp_folder, groups_filename)
    try:
        # check_call raises when the download fails, so a stale copy left in
        # /tmp by an earlier run is never read by mistake.
        subprocess.check_call(["aws", 's3', 'cp', groups_path, tmp_folder])
        groups_df = pd.read_csv(tmp_path, sep="\t", index_col=0)
    finally:
        # Always clean up the scratch copy, whether or not loading succeeded.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

In [None]:
# Names of the groupings: one column of groups_df per stat.
# NOTE(review): this rebinds `stats`, shadowing the `scipy.stats as stats`
# import above; the scipy module is not reachable under that name afterwards.
stats = groups_df.columns

In [None]:
# Load the raw `X` and `seq_lens` arrays of every requested subset.
arrays = {
    subset: load_arrays(data_paths[subset], "X", "seq_lens")
    for subset in SUBSETS
}

In [None]:
# Reduce each subset to two columns per row: X[:, 0, 0] (used as the user id)
# and the matching sequence length.
# NOTE: this rebinds `arrays`, so the cell cannot be re-run without reloading.
arrays = {
    subset: np.stack((loaded["X"][:, 0, 0], loaded["seq_lens"]), axis=1)
    for subset, loaded in arrays.items()
}

# One DataFrame per subset with columns "uid" and "seq_lens".
dfs = {
    subset: pd.DataFrame(pairs, columns=["uid", "seq_lens"])
    for subset, pairs in arrays.items()
}

In [None]:
# Per subset: total of `seq_lens` for each user id.
interaction_counts = {
    subset: frame.groupby("uid")["seq_lens"].sum()
    for subset, frame in dfs.items()
}

In [None]:
# Ensure every user found in the data from X is also present in the grouping
# data. The bare expression previously used here discarded its result, so the
# check never fired; users without a group are now reported explicitly.
for k, v in interaction_counts.items():
    unknown_users = set(v.index) - set(groups_df.index)
    if unknown_users:
        raise ValueError(
            "{} user(s) in subset {!r} are missing from the grouping data".format(
                len(unknown_users), k
            )
        )

In [None]:
# Attach each user's group memberships to their interaction count; the inner
# join keeps only users present in both the counts and groups_df.
interaction_counts_groups = {
    subset: pd.concat([per_user_counts, groups_df], axis=1, join="inner")
    for subset, per_user_counts in interaction_counts.items()
}

In [None]:
# subset -> stat -> {category: summed interaction count of the users in it}
group_interaction_counts = {
    subset: {
        stat: grouping.groupby(stat)["seq_lens"].sum().to_dict()
        for stat in stats
    }
    for subset, grouping in interaction_counts_groups.items()
}

In [None]:
# Flatten the nested dict so that each (subset, stat) pair becomes one key.
counts_pandas_dict = {
    (subset, stat): category_counts
    for subset, per_stat in group_interaction_counts.items()
    for stat, category_counts in per_stat.items()
}

In [None]:
# One row per (subset, stat) key; columns restricted to, and ordered as,
# low / middle / high.
counts_df = (
    pd.DataFrame
    .from_dict(counts_pandas_dict, orient="index")
    .loc[:, ["low", "middle", "high"]]
)

In [None]:
# Share of each category within its row: every count divided by the row total.
percentage_df = counts_df.div(counts_df.sum(axis=1), axis=0)

In [None]:
# Save the absolute counts as a tab-separated csv
if LOCATION != "rtl":
    counts_df.to_csv(output_path, sep="\t", index=True)
else:
    # On "rtl" the destination is on S3: write locally first, then upload.
    tmp_folder = "/tmp"
    tmp_path = os.path.join(tmp_folder, output_filename)
    try:
        counts_df.to_csv(tmp_path, sep="\t", index=True)
        # check_call raises when the upload fails instead of letting the
        # export silently go missing.
        subprocess.check_call(["aws", 's3', 'cp', tmp_path, output_path])
    finally:
        # Always remove the scratch copy, whether or not the upload succeeded.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)