# 6 Combine results with groupings to get final results

Combine the individual user results\* from step 5 with the user groupings from step 2 to calculate the final results for every subset, for every supported grouping, for each of its categories, and for each metric.

Requires:
* user_evaluator_results.json containing each user's numerator and denominator
* uid2stats.csv denoting each user's membership of the low, middle, or high group for each of the stats specified during the previous steps.

Returns:
* final_results.csv containing the results described above

###### \* which can also be used to calculate the non-user metrics

In [None]:
# Run configuration.
# DATASET names the per-dataset sub-folder under each root.
# LOCATION selects the storage roots and the I/O path: "local", "server", or "rtl" (S3).
DATASET = "lastfm_10_pc"
LOCATION = "local"

In [None]:
import ast
import datetime
import functools
import os
import subprocess

from io import StringIO

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns


In [None]:
# Root folder for the selected LOCATION. The groupings, the per-user results and
# the final output currently all live under the same root, so one lookup serves
# all three names; split this into separate lookups if they ever diverge.
# An unknown LOCATION raises KeyError here.
grouping_root = results_root = output_root = {
    "local": "/Users/nknyazev/Documents/Delft/Thesis/temporal/data/results/RQ3",
    "server": "/tudelft.net/staff-bulk/ewi/insy/MMC/nknyazev/RQ3",
    "rtl": "s3://ci-data-apps/norman/sagemaker/thesis/offline-evaluation/RQ3",
}[LOCATION]

In [None]:
# File names, relative to <root>/<DATASET>/.
output_filename = "final_results.csv"              # written by this notebook
results_filename = "user_evaluator_results.json"   # per-user num/denom input
grouping_filename = "uid2stats.csv"                # per-user group membership input

In [None]:
# Full paths: <root>/<DATASET>/<filename> for both inputs and the output.
output_folder = os.path.join(output_root, DATASET)
output_path = os.path.join(output_folder, output_filename)
results_path = os.path.join(results_root, DATASET, results_filename)
grouping_path = os.path.join(grouping_root, DATASET, grouping_filename)

In [None]:
# Load the user groupings and the per-user results as DataFrames indexed by "uid".
# Both files are parsed as tab-separated text.
# NOTE(review): the results file has a .json extension but is read as TSV here —
# confirm that this matches what the previous step writes.
if LOCATION != "rtl":
    grouping_df = pd.read_csv(grouping_path, sep='\t', index_col="uid")
    results_df = pd.read_csv(results_path, sep='\t', index_col="uid")
else:
    client = boto3.client('s3')
    # "s3://<bucket>/<key...>".split("/", 3) -> ["s3:", "", "<bucket>", "<key...>"].
    # Plain string splitting avoids a regex (the `re` module is not imported in
    # this notebook) and treats the bucket name literally, not as a pattern.
    grouping_bucket, grouping_key = grouping_path.split("/", 3)[2:]
    results_bucket, results_key = results_path.split("/", 3)[2:]
    # Download each object fully into memory and decode it before parsing.
    grouping_string = client.get_object(Bucket=grouping_bucket, Key=grouping_key)["Body"].read().decode('utf-8')
    results_string = client.get_object(Bucket=results_bucket, Key=results_key)["Body"].read().decode('utf-8')
    grouping_df = pd.read_csv(StringIO(grouping_string), sep='\t', index_col="uid")
    results_df = pd.read_csv(StringIO(results_string), sep='\t', index_col="uid")

In [None]:
# Keep only the rows whose denominator is non-zero, so the per-row
# num / denom score computed later never divides by zero.
results_df = results_df.loc[results_df["denom"] != 0]

In [None]:
# Drop the leading "u_" from the metric names.
# Only a literal "u_" prefix is removed, so a name without it is left intact
# instead of losing its first two characters. `assign` returns a new frame,
# which avoids writing into a filtered slice of the original data.
results_df = results_df.assign(
    metric=results_df["metric"].str.replace(r"^u_", "", regex=True)
)

In [None]:
# Columns carried into the final output:
#   results_vars  - identifier/value columns kept as-is for every row
#   grouping_vars - every column of the grouping table (one per grouping)
results_vars = [
    "subset",
    "model_id",
    "metric",
    "uid",
    "num",
    "denom",
    "user_score",
]
grouping_vars = grouping_df.columns

In [None]:
# Inner-join the results with the groupings on their shared "uid" index, then
# move the index back into an ordinary column so it can be used as an id var.
df = results_df.join(grouping_df, how="inner")
df = df.reset_index()

In [None]:
# Per-row user-level score: numerator divided by denominator.
df["user_score"] = df["num"].div(df["denom"])

In [None]:
# Reshape from wide to long: every grouping column becomes a pair of columns,
# "grouping" (the column's name) and "category" (the user's value in it).
# Each input row is therefore repeated once per grouping column.
melted = pd.melt(
    df,
    id_vars=results_vars,
    value_vars=grouping_vars,
    var_name="grouping",
    value_name="category",
)
melted.head(10)

In [None]:
# Aggregate per (subset, grouping, category, model, metric).
group_keys = ["subset", "grouping", "category", "model_id", "metric"]
groupby = melted.groupby(group_keys)

# User metric: mean of the per-user scores within the group.
u_metrics = groupby["user_score"].mean()
# Standard metric: pooled numerators over pooled denominators.
metrics = groupby["num"].sum().div(groupby["denom"].sum())

In [None]:
# Put both aggregates side by side (column "" for the standard metric, "u_" for
# the user metric), then pivot the metric level of the row index into columns.
output_df = (
    pd.DataFrame([metrics, u_metrics], index=["", "u_"])
    .T
    .unstack(-1)
)
# Flatten the (prefix, metric) column pairs into single names, e.g. "u_" + metric.
output_df.columns = [prefix + metric for prefix, metric in output_df.columns.tolist()]

In [None]:
# Sort the categories as low < middle < high rather than alphabetically:
# swap the labels on index level 2 for their rank, sort, then swap them back.
cat_to_int = dict(low=0, middle=1, high=2)
int_to_cat = dict(zip(cat_to_int.values(), cat_to_int.keys()))
output_df = (
    output_df
    .rename(cat_to_int, level=2)
    .sort_index()
    .rename(int_to_cat, level=2)
)

In [None]:
# Preview the first 10 rows of the final table.
output_df.head(10)

In [None]:
# Write the results as a tab-separated file.
if LOCATION != "rtl":
    # Make sure the dataset folder exists before writing into it.
    os.makedirs(output_folder, exist_ok=True)
    output_df.to_csv(output_path, sep="\t")
else:
    # S3 target: write to a local temporary file, then copy it with the AWS CLI.
    tmp_folder = "/tmp"
    tmp_path = os.path.join(tmp_folder, output_filename)
    output_df.to_csv(tmp_path, sep="\t")
    try:
        # check_call raises CalledProcessError on a non-zero exit status, so a
        # failed upload is reported instead of being silently ignored.
        subprocess.check_call(['aws', 's3', 'cp', tmp_path, output_path])
    finally:
        # Remove the temporary file whether or not the upload succeeded.
        os.remove(tmp_path)