# 0 Extract dt stats and item consumption per user

Requires:
* X.npy files for each of the dataset's subsets.

Returns:
* A tab-separated CSV for the combined dataset (all subsets) with dt stats (e.g. median, mean, count) for each user, as well as a dict of every item the user has consumed and how many times it was consumed.

In [None]:
# Run configuration for this notebook.
# LOCATION picks the entry used from the data_root / output_root tables below.
LOCATION = "local"
# DATASET is the sub-folder name joined onto the data and output roots.
DATASET = "lastfm_10_pc"
# Subsets whose X.npy arrays are loaded and combined.
SUBSETS = [
    "train",
    "validation",
    "test",
]
# Aggregations applied to each user's dt values.
STATS_TO_CALCULATE = [
    "median",
    "mean",
    "count",
]

In [None]:
import math
import os
import random
import string
import subprocess
import sys
import tempfile

import numpy as np
import pandas as pd

In [None]:
# Copied from evaluation/2_evaluate_exports/RQ2.1/prev_current_dist_to_recs_vs_dt_log_bins.ipynb

def randomString(stringLength=10):
    """Return a string of `stringLength` random lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    picked = []
    # One random.choice call per character, in order.
    for _ in range(stringLength):
        picked.append(random.choice(alphabet))
    return "".join(picked)

def load_arrays(root, *args):
    """Load the named NumPy arrays found under ``root``.

    Args:
        root: Either a local directory or a location starting with ``"s3"``.
        *args: Array names. For a local ``root`` each name is read from
            ``<root>/<name>.npy``. For an S3 ``root`` the location
            ``<root>/<name>`` is copied recursively with the ``aws`` CLI and
            every downloaded top-level file is loaded and concatenated in
            sorted file-name order.

    Returns:
        A dict mapping each requested name to its array; empty when no names
        are given.

    Raises:
        subprocess.CalledProcessError: If ``aws s3 cp`` exits with a non-zero
            status.
        ValueError: If an S3 download produced no files for a name.
    """
    if not args:
        return {}

    if not root.startswith("s3"):
        return {name: np.load(os.path.join(root, name + ".npy")) for name in args}

    outputs = {}
    # TemporaryDirectory removes the scratch folder even when the download or
    # the loading step raises, so no stray directories are left behind.
    with tempfile.TemporaryDirectory() as temp_path:
        for name in args:
            local_path = os.path.join(temp_path, name)
            s3_path = os.path.join(root, name)
            os.makedirs(local_path, exist_ok=True)
            # check=True surfaces a failed copy instead of silently continuing.
            subprocess.run(
                ["aws", "s3", "cp", s3_path, local_path, "--recursive"],
                check=True,
            )
            # Only the files directly inside local_path are considered.
            _, _, file_names = next(os.walk(local_path))
            if not file_names:
                raise ValueError(
                    "No files were downloaded from {!r}".format(s3_path)
                )
            outputs[name] = np.concatenate(
                [np.load(os.path.join(local_path, f)) for f in sorted(file_names)]
            )
    return outputs

In [None]:
# Root of the processed numpy data for each supported LOCATION.
_data_roots = {
    "local": "/Users/nknyazev/Documents/Delft/Thesis/temporal/data/processed/final",
    "server": "/home/nfs/nknyazev/thesis/data/numpy",
    "rtl": "s3://ci-data-apps/norman/sagemaker/thesis/data/processed/new/rtl/numpy",
}
data_root = _data_roots[LOCATION]

# Root under which results are written for each supported LOCATION.
_output_roots = {
    "local": "/Users/nknyazev/Documents/Delft/Thesis/temporal/data/results/RQ3",
    "server": "/tudelft.net/staff-bulk/ewi/insy/MMC/nknyazev/RQ3",
    "rtl": "s3://ci-data-apps/norman/sagemaker/thesis/offline-evaluation/RQ3"
}
output_root = _output_roots[LOCATION]

# Subset folders sit below a DATASET folder, except when DATASET is "rtl",
# in which case they are used without a prefix.
# NOTE(review): "rtl" is otherwise a LOCATION value - confirm that comparing
# against DATASET here is intended.
_subset_prefix = "" if DATASET == "rtl" else DATASET
data_keys = {
    subset: os.path.join(_subset_prefix, subset)
    for subset in ("train", "validation", "test")
}

# {subset: full path (or S3 location) of that subset's data}
data_paths = {
    subset: os.path.join(data_root, relative_key)
    for subset, relative_key in data_keys.items()
}

In [None]:
# Folder with the numpy data of the selected dataset
dataset_path = os.path.join(data_root, DATASET)
# Results go to <output_root>/<DATASET>/user_stats.csv
output_folder = os.path.join(output_root, DATASET)
output_path = os.path.join(output_root, DATASET, "user_stats.csv")

In [None]:
# Load the "X" array of every subset -> {subset: array}
arrays = {}
for subset in SUBSETS:
    arrays[subset] = load_arrays(data_paths[subset], "X")["X"]

In [None]:
# Flatten every subset into rows of 3 values, one row per interaction:
# {subset: [interaction1, interaction2, ..., interactionN]}
arrays = {subset: values.reshape(-1, 3) for subset, values in arrays.items()}

In [None]:
def rm_padding(array):
    """Return `array` without the rows that are entirely (0, 0, 0).

    Such rows are padding - there is no user 0.
    """
    is_padding_row = np.all(array == np.array([0, 0, 0]), axis=1)
    return array[~is_padding_row]


arrays = {subset: rm_padding(rows) for subset, rows in arrays.items()}

In [None]:
# Stack the rows of all subsets into a single array (one dataset)
array = np.concatenate(tuple(arrays.values()), axis=0)

In [None]:
# Wrap the combined rows in a pandas DataFrame with named columns
interaction_columns = ["uid", "iid", "dt"]
df = pd.DataFrame(data=array, columns=interaction_columns)

In [None]:
# One group of interactions per user id
group = df.groupby(by="uid")

In [None]:
# Per-user dt statistics; the resulting columns are named dt_<stat>
dt_stats = group.agg({"dt": STATS_TO_CALCULATE})["dt"]
aggregated = dt_stats.rename(lambda stat: "dt_" + stat, axis=1)

In [None]:
# DataFrame indexed by uid with one column, "user_item_consumption", holding a
# dict {item: times_consumed} of every item the user consumed at least once.
# Series.to_dict() is used rather than dict(...) so the counts are plain Python
# ints: dict(series) keeps np.int64 values, which NumPy >= 2 renders as
# "np.int64(3)" once the dict is written out as text in the CSV.
aggregation = {"iid": lambda x: x.value_counts().to_dict()}
item_consumption = df.groupby("uid").agg(aggregation).rename({"iid": "user_item_consumption"}, axis=1)

# Join the per-user dt stats with the per-user item consumption (both indexed by uid)
aggregated_with_consumption = aggregated.join(item_consumption)

In [None]:
# Write the per-user stats as a tab-separated file.
if LOCATION != "rtl":
    # Plain filesystem target: make sure the folder exists, then write directly.
    os.makedirs(output_folder, exist_ok=True)
    aggregated_with_consumption.to_csv(output_path, sep="\t")
else:
    # S3 target: stage the file in a throw-away directory and upload it with the
    # aws CLI. The directory is removed afterwards even if the upload fails.
    output_name = os.path.split(output_path)[-1]
    with tempfile.TemporaryDirectory() as staging_dir:
        staged_csv = os.path.join(staging_dir, output_name)
        aggregated_with_consumption.to_csv(staged_csv, sep="\t")
        # check=True raises on a failed upload instead of silently discarding
        # the only copy of the result.
        subprocess.run(["aws", "s3", "cp", staged_csv, output_path], check=True)