# 1 Find user mainstreamness
Builds the user-item matrix based on the previously extracted interaction counts. Calculates the user mainstreamness based on their similarity to the general item popularity.
The metrics used are described in https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0217389&type=printable. The mainstreamness levels obtained over all users are used to define cutoff points that classify users as low/middle/high mainstreamness. These are exported as a csv indexed by uid, with each column indicating a user's mainstreamness group according to a particular metric.

Requires:
* {DATASET}/user_stats.csv generated during the previous step.

Returns:
* {DATASET}/uid2mainstreamness.csv with columns indicating each user's mainstreamness according to the column's metric

In [None]:
# Execution environment. Used as the key into the data/output root tables
# below ("local", "server" or "rtl"); "rtl" switches file I/O to S3 via the
# aws CLI.
LOCATION = "local"
# Name of the dataset subfolder (under the data/output root) to process.
DATASET = "lastfm_10_pc"

In [None]:
import ast
import datetime
import os
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns


In [None]:
# Root folder that holds the input data, per supported LOCATION.
_data_roots = {
    "local": "/Users/nknyazev/Documents/Delft/Thesis/temporal/data/results/RQ3",
    "server": "/tudelft.net/staff-bulk/ewi/insy/MMC/nknyazev/RQ3",
    "rtl": "s3://ci-data-apps/norman/sagemaker/thesis/offline-evaluation/RQ3",
}
# Root folder that receives the results, per supported LOCATION.
# Kept as a separate table so input and output locations can diverge.
_output_roots = {
    "local": "/Users/nknyazev/Documents/Delft/Thesis/temporal/data/results/RQ3",
    "server": "/tudelft.net/staff-bulk/ewi/insy/MMC/nknyazev/RQ3",
    "rtl": "s3://ci-data-apps/norman/sagemaker/thesis/offline-evaluation/RQ3",
}

# An unknown LOCATION fails here with a KeyError.
data_root = _data_roots[LOCATION]
output_root = _output_roots[LOCATION]

In [None]:
# Input: per-user statistics file located at <data_root>/<DATASET>/user_stats.csv
data_filename = "user_stats.csv"
data_path = os.path.join(data_root, DATASET, data_filename)

# Output: per-user mainstreamness groups written to
# <output_root>/<DATASET>/uid2mainstreamness.csv
output_filename = "uid2mainstreamness.csv"
output_folder = os.path.join(output_root, DATASET)
output_path = os.path.join(output_root, DATASET, output_filename)

In [None]:
# Load the user df containing user histories (tab-separated, first column is
# used as the index).
if LOCATION != "rtl":
    user_stats = pd.read_csv(data_path, sep="\t", index_col=0)
else:
    # For "rtl" the data lives on S3: download it into a local temp folder
    # with the aws CLI, read it, then remove the temporary copy.
    tmp_folder = '/tmp'
    tmp_path = os.path.join(tmp_folder, data_filename)
    # check_call raises CalledProcessError when the download fails, instead of
    # silently continuing and possibly reading a stale file left in /tmp.
    subprocess.check_call(["aws", 's3', 'cp', data_path, tmp_folder])
    try:
        user_stats = pd.read_csv(tmp_path, sep="\t", index_col=0)
    finally:
        # Always clean up the temporary copy, even if parsing fails.
        os.remove(tmp_path)

In [None]:
# Largest user id present in the index of user_stats; used below (as
# max_uid + 1) for the number of rows of the user-item matrix, so uids are
# used directly as row positions.
max_uid = user_stats.index.max()

In [None]:
# The consumption column is stored as the string form of a Python literal;
# parse every entry back into an object (a dict, as it is iterated with
# .items() further down). literal_eval only accepts literals, not arbitrary code.
encoded_consumption = user_stats["user_item_consumption"]
user_stats["user_item_consumption"] = encoded_consumption.map(ast.literal_eval)

In [None]:
# Flatten the per-user dicts into {(uid, iid): value}: each key is a position
# in the 2d user-item matrix and each value is what gets written there.
updates = {
    (uid, iid): value
    for uid, consumed in user_stats["user_item_consumption"].items()
    for iid, value in consumed.items()
}

In [None]:
# Largest item id over all (uid, iid) positions; sets the matrix width below.
max_iid = np.max([iid for _, iid in updates])

In [None]:
# Initialise the dense user-item matrix with zeros. Ids are used directly as
# positions, hence the +1 on both dimensions.
n_rows, n_cols = max_uid + 1, max_iid + 1
uim = np.zeros((n_rows, n_cols))

In [None]:
# Transpose the (uid, iid) keys into two parallel sequences: all row positions
# in the first one and all column positions in the second one.
update_idx = list(zip(*updates))
# Values to write at those positions, in the same order as the keys.
update_vals = [*updates.values()]

In [None]:
# Fill in the uim.
# The index has to be a *tuple* of (row positions, column positions): NumPy
# interprets a plain list of sequences as one fancy index on the first axis,
# which would select whole rows instead of individual (uid, iid) cells.
uim[tuple(update_idx)] = update_vals

In [None]:
# Global item popularity: column-wise sum of the user-item matrix (i.e. summed
# over all users), cast to integers.
item_consumption_counts = uim.sum(axis=0).astype(int)

In [None]:
# Kendall's tau (and its p-value) between the global item consumption counts
# and every row of the user-item matrix; the result has one (tau, p) pair per row.
tau_p = np.apply_along_axis(
    lambda user_row: stats.kendalltau(item_consumption_counts, user_row),
    1,
    uim,
)
# First column holds the correlation, second column the p-value.
tau = tau_p[:, 0]
p = tau_p[:, 1]

In [None]:
# Plot the distribution of Kendall's Tau over users
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)

# The title names the dataset through DATASET only (the previous hard-coded
# "MovieLens" was wrong for any other dataset).
ax.set_title(f"Kendall's Tau for users vs global distribution ({DATASET})", fontsize=16, y=1.025)
ax.tick_params(labelsize=12)
ax.xaxis.label.set_size(10)

# Normalised histogram of the non-NaN tau values. Drawn with matplotlib
# directly instead of the deprecated sns.distplot; alpha=0.4 keeps the same
# translucent bars distplot produced.
ax.hist(tau[~np.isnan(tau)], bins=100, density=True, alpha=0.4)
plt.show()

In [None]:
# Collect each metric together with its group thresholds. The thresholds are
# the 1/3 and 2/3 percentiles of the metric over the users in user_stats.
# NOTE: np.percentile propagates NaN, so a NaN tau for any of these users
# makes both thresholds NaN.
tau_of_known_users = tau[user_stats.index]
metrics_and_thresholds = {
    "Kendalls Tau": {
        "metric": tau_of_known_users,
        "thresholds": {
            "low": np.percentile(tau_of_known_users, 1/3*100),
            "high": np.percentile(tau_of_known_users, 2/3*100),
        },
    },
}

In [None]:
# Map every user to a numeric group per metric:
#   0 -> value <= low threshold
#   1 -> low threshold < value < high threshold
#   2 -> value >= high threshold
# Implemented as the sum of two 0/1 indicator arrays.
user_to_metric_group = {}
for metric_name, spec in metrics_and_thresholds.items():
    values = spec["metric"]
    cutoffs = spec["thresholds"]
    above_low = (values > cutoffs["low"]).astype(np.int32)
    at_or_above_high = (values >= cutoffs["high"]).astype(np.int32)
    user_to_metric_group[metric_name] = above_low + at_or_above_high

In [None]:
# Build a df with one column per metric, indexed like user_stats, then swap
# the numeric group codes for their low/middle/high labels.
replacement_mapping = {0: "low", 1: "middle", 2: "high"}
metric_group_df = pd.DataFrame.from_dict(user_to_metric_group).set_index(user_stats.index)
# The same code -> label mapping applies to every metric column.
per_column_mapping = dict.fromkeys(metric_group_df.columns, replacement_mapping)
metric_group_df_str = metric_group_df.replace(per_column_mapping)
metric_group_df_str.head(10)

In [None]:
# For every metric column, print the sum of dt_count within each group.
groups_with_counts = metric_group_df_str.join(user_stats["dt_count"])
for c in metric_group_df_str.columns:
    print(groups_with_counts.groupby(c).sum())

In [None]:
# Save as a tab-separated csv
if LOCATION != "rtl":
    # Make sure the dataset's output folder exists before writing into it.
    os.makedirs(output_folder, exist_ok=True)
    metric_group_df_str.to_csv(output_path, sep="\t")
else:
    # For "rtl" the destination is on S3: write to a local temp file, upload
    # it with the aws CLI, then remove the temporary copy. The temp path is
    # built here so this step does not depend on a variable left over from
    # the loading step.
    upload_tmp_path = os.path.join("/tmp", output_filename)
    metric_group_df_str.to_csv(upload_tmp_path, sep="\t")
    try:
        # check_call raises CalledProcessError when the upload fails, so a
        # failed export is not mistaken for a successful one.
        subprocess.check_call(["aws", 's3', 'cp', upload_tmp_path, output_path])
    finally:
        os.remove(upload_tmp_path)