# 1 Find user dt behaviour

Extracts statistics related to delta t (dt), as specified in ../0. Calculates the thresholds for the $\text{low, middle, high}$ groups from the provided quantiles for each statistic and assigns every user to a group on the basis of each of them. The result is exported as a csv indexed by uid, in which each column indicates the user's group according to one particular interaction dt statistic. Note that the low/high classification can have opposite meanings for different statistics. For example, a high interaction count is associated with heavy use, whereas a high mean dt is associated with light use.

Requires:

* `user_stats.csv` from `../general/0` containing statistics specified by `STATS_TO_CALCULATE` for each user.

Returns:

* `uid2dt_groups.csv` where each row contains the user index and the remaining columns denote the user's membership in the $\text{low, middle, high}$ group for each of the stats specified by `STATS_TO_USE`.

In [None]:
# Execution environment; selects which data/output root is used
LOCATION = "local"

# Dataset sub-folder that is appended to the data/output root
DATASET = "lastfm_10_pc"

# Columns of the user stats file for which group membership is computed
STATS_TO_USE = [
    "dt_median",
    "dt_mean",
    "dt_count",
]

# Quantiles that bound the groups: values up to the "low" quantile are "low",
# values above the "high" quantile are "high", everything in between is "middle"
QUANTILES = {
    "low": 1 / 3,
    "high": 2 / 3,
}

In [None]:
import ast
import datetime
import os
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns


In [None]:
# Root folders per execution environment. Input and output roots are kept as
# separate tables so that they can diverge; at the moment they coincide.
data_roots_by_location = {
    "local": "/Users/nknyazev/Documents/Delft/Thesis/temporal/data/results/RQ3",
    "server": "/tudelft.net/staff-bulk/ewi/insy/MMC/nknyazev/RQ3",
    "rtl": "s3://ci-data-apps/norman/sagemaker/thesis/offline-evaluation/RQ3",
}
output_roots_by_location = {
    "local": "/Users/nknyazev/Documents/Delft/Thesis/temporal/data/results/RQ3",
    "server": "/tudelft.net/staff-bulk/ewi/insy/MMC/nknyazev/RQ3",
    "rtl": "s3://ci-data-apps/norman/sagemaker/thesis/offline-evaluation/RQ3",
}

# Pick the roots for the configured LOCATION (KeyError for an unknown location)
data_root = data_roots_by_location[LOCATION]
output_root = output_roots_by_location[LOCATION]

In [None]:
# File names of the input (per-user stats) and the output (per-user dt groups)
data_filename = "user_stats.csv"
output_filename = "uid2dt_groups.csv"

# Input path: <data_root>/<DATASET>/user_stats.csv
data_path = os.path.join(data_root, DATASET, data_filename)

# Output path: <output_root>/<DATASET>/uid2dt_groups.csv
output_folder = os.path.join(output_root, DATASET)
output_path = os.path.join(output_folder, output_filename)

In [None]:
# Load the per-user statistics (tab-separated, first column is the index),
# keeping only the columns listed in STATS_TO_USE
if LOCATION != "rtl":
    user_stats = pd.read_csv(data_path, sep="\t", index_col=0)[STATS_TO_USE]
else:
    # On "rtl" the data lives on S3: download it to a temporary local copy first.
    # tmp_folder / tmp_path stay defined at notebook scope, as later cells may reuse them.
    tmp_folder = '/tmp'
    tmp_path = os.path.join(tmp_folder, data_filename)
    # check_call raises CalledProcessError if the download fails, instead of
    # carrying on and reading a missing (or stale) file from tmp_folder
    subprocess.check_call(["aws", "s3", "cp", data_path, tmp_folder])
    try:
        user_stats = pd.read_csv(tmp_path, sep="\t", index_col=0)[STATS_TO_USE]
    finally:
        # Always clean up the temporary copy, even if parsing fails
        os.remove(tmp_path)

In [None]:
# Compute, for every used stat, the values that separate the groups.
# The result maps stat name -> {group name from QUANTILES -> quantile value of that stat},
# i.e. the highest value still counted as low and the highest value still counted as middle.
thresholds = {}
for stat in STATS_TO_USE:
    stat_values = user_stats[stat]
    thresholds[stat] = {
        group: stat_values.quantile(quantile)
        for group, quantile in QUANTILES.items()
    }

In [None]:
# Display the resulting thresholds for each stat (in seconds or num interactions)
thresholds

In [None]:
def assign_group(x, low, high):
    """Return the group ("low", "middle" or "high") a value belongs to.

    Args:
        x: The user's value of a statistic.
        low: Highest value that is still assigned to the "low" group.
        high: Highest value that is still assigned to the "middle" group.

    Returns:
        "low" if x <= low, otherwise "high" if x > high, otherwise "middle".
    """
    # The low check comes first, so it wins whenever both conditions hold
    if x <= low:
        return "low"
    if x > high:
        return "high"
    # NOTE: NaN fails both comparisons above and therefore also ends up here
    return "middle"

In [None]:
# Calculate the user low/middle/high membership for each of the used stats.
# Every stat column gets a companion "<stat>_group" column; the names of these
# new columns are collected in new_column_names.
output_df = user_stats.copy()
new_column_names = []
for stat, stat_thresholds in thresholds.items():
    new_column_name = stat + "_group"
    # Look the two thresholds up by name. Unpacking .values() positionally would
    # silently swap them if QUANTILES were declared with "high" before "low".
    output_df[new_column_name] = output_df[stat].apply(
        assign_group,
        args=(stat_thresholds["low"], stat_thresholds["high"]),
    )
    new_column_names.append(new_column_name)

In [None]:
# Save only the group-membership columns (plus the index) as a tab-separated file
if LOCATION != "rtl":
    # Make sure the destination folder exists before writing
    os.makedirs(output_folder, exist_ok=True)
    output_df.to_csv(output_path, sep="\t", columns=new_column_names, index=True)
else:
    # On "rtl" the destination is on S3: write a temporary local file and upload it.
    # The temporary path is built here rather than reusing one left over from loading.
    tmp_output_path = os.path.join('/tmp', output_filename)
    output_df.to_csv(tmp_output_path, sep="\t", columns=new_column_names, index=True)
    try:
        # check_call raises CalledProcessError if the upload fails, so a lost
        # result does not go unnoticed
        subprocess.check_call(["aws", "s3", "cp", tmp_output_path, output_path])
    finally:
        # Always clean up the temporary file, even if the upload fails
        os.remove(tmp_output_path)