# 2 Combine low/middle/high groupings (e.g. mainstreamness and dt_stats)

Load the data obtained in step 1 and combine it into one user-indexed csv with values $\in [\text{low, middle, high}]$.

Requires:
* Files listed in `DATA_FILENAMES`, where each row specifies the user id and the remaining columns denote the user's membership in the $\text{low, middle, high}$ group for each of the stats that is used to partition users. All files must contain the same user ids.
    
Returns:
* `uid2stats.csv` containing the contents of all input files joined on the user id.

In [None]:
# Key used to pick the storage roots for input and output data.
LOCATION = "local"

# Name of the dataset sub-folder located under the selected root.
DATASET = "lastfm_10_pc"

# Per-user group membership files that are joined into one table.
DATA_FILENAMES = [
    "uid2mainstreamness.csv",
    "uid2dt_groups.csv",
]

In [None]:
import ast
import datetime
import functools
import os
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns


In [None]:
# Root folder (or S3 prefix) the input files are read from, selected by
# LOCATION. An unknown LOCATION raises KeyError.
data_root = dict(
    local="/Users/nknyazev/Documents/Delft/Thesis/temporal/data/results/RQ3",
    server="/tudelft.net/staff-bulk/ewi/insy/MMC/nknyazev/RQ3",
    rtl="s3://ci-data-apps/norman/sagemaker/thesis/offline-evaluation/RQ3",
)[LOCATION]

# Root folder (or S3 prefix) the combined file is written to. Kept as a
# separate mapping so that the outputs can be redirected independently.
output_root = dict(
    local="/Users/nknyazev/Documents/Delft/Thesis/temporal/data/results/RQ3",
    server="/tudelft.net/staff-bulk/ewi/insy/MMC/nknyazev/RQ3",
    rtl="s3://ci-data-apps/norman/sagemaker/thesis/offline-evaluation/RQ3",
)[LOCATION]

In [None]:
# Output: a single csv inside the dataset's folder under the output root.
output_filename = "uid2stats.csv"
output_folder = os.path.join(output_root, DATASET)
output_path = os.path.join(output_folder, output_filename)

# Inputs: one path per entry of DATA_FILENAMES inside the dataset's folder
# under the data root.
data_paths = [
    os.path.join(data_root, DATASET, filename)
    for filename in DATA_FILENAMES
]

In [None]:
# Load dfs containing all memberships and combine into one
if LOCATION != "rtl":
    # Local and server data can be read straight from the file system.
    dfs = [pd.read_csv(p, sep="\t", index_col=0) for p in data_paths]
else:
    # On "rtl" the inputs live on S3: copy them to a temporary folder with the
    # aws CLI, read them, and always remove the temporary copies afterwards.
    tmp_folder = "/tmp"
    _data_paths = [os.path.join(tmp_folder, x) for x in DATA_FILENAMES]
    try:
        for remote_path in data_paths:
            # check_call raises CalledProcessError when a download fails, so a
            # missing input is reported instead of a stale /tmp file being read.
            subprocess.check_call(["aws", "s3", "cp", remote_path, tmp_folder])
        dfs = [pd.read_csv(p, sep="\t", index_col=0) for p in _data_paths]
    finally:
        for tmp_file in _data_paths:
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

# Confirm that all dataframes have the same length
row_counts = [df.shape[0] for df in dfs]
assert len(set(row_counts)) == 1, (
    "Input files differ in number of rows: {}".format(
        dict(zip(DATA_FILENAMES, row_counts))
    )
)

# DataFrame.join defaults to a left join on the index, so the user ids of the
# first file determine the rows of the combined table.
output_df = functools.reduce(lambda left, right: left.join(right), dfs)

In [None]:
# Save as csv
if LOCATION != "rtl":
    output_df.to_csv(output_path, sep="\t", index=True)
else:
    # On "rtl" the destination is an S3 URI: write to a temporary local file,
    # upload it with the aws CLI, and always remove the temporary file.
    tmp_folder = "/tmp"
    tmp_path = os.path.join(tmp_folder, output_filename)
    try:
        output_df.to_csv(tmp_path, sep="\t", index=True)
        # check_call raises CalledProcessError when the upload fails, so a lost
        # result is reported instead of being silently discarded.
        subprocess.check_call(["aws", "s3", "cp", tmp_path, output_path])
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)