# 5 Extract results from user_evaluators

Loads all specified u_evaluators for the given DATASET and SUBSETS, and combines them into a single tab-separated file with columns specifying the model_id, subset, metric and user index (uid), together with the numerator and denominator values.

Requires:
* u_evaluator_folders.json - json file formatted as:
> 'unique_run_id': {'dataset': 'lastfm_10_pc',
   'model_id': 6,
   'path': 'path/to/last/common/ancestor/folder/of/all/u_evaluators'}
   
   It is also possible to specify the folder structure (see `pickle_template`) to be able to load individual evaluators from the specified path.
* Project code to be located at the path specified by `code_root`

Returns:
* user_evaluator_results.json - despite the extension, a tab-separated file containing each user's numerator and denominator as described above. Note that empty users are not discarded.

In [None]:
# Only runs whose 'dataset' entry equals this value are extracted.
DATASET = "lastfm_10_pc"

# Selects the set of environment-specific paths: "local", "server" or "rtl".
LOCATION = "local"

# Each subset name is substituted into pickle_template to find its evaluator.
SUBSETS = [
    "validation",
    "test",
]

In [None]:
import json
import glob
import os
import pickle
import subprocess
import sys

from collections import defaultdict

import boto3
import numpy as np
import pandas as pd

In [None]:
# Environment-specific settings, selected by LOCATION:
#   code_root          - project code root; appended to sys.path before unpickling
#   pickle_paths_path  - local path of u_evaluator_folders.json ("local"/"server")
#   pickle_paths_bucket / pickle_paths_key
#                      - S3 location of u_evaluator_folders.json ("rtl" only)
#   pickle_template    - evaluator file relative to each run's 'path';
#                        '{}' is replaced by the subset name
#   output_root        - folder (or S3 prefix on "rtl") that receives the results
if LOCATION == "local":
    code_root = "/Users/nknyazev/Documents/Delft/Thesis/temporal/code/model"
    pickle_paths_path = "/Users/nknyazev/Downloads/u_evaluator_folders.json"
    pickle_template = "3/{}_u_evaluator"
    output_root = "/Users/nknyazev/Documents/Delft/Thesis/temporal/data/results/RQ3"

elif LOCATION == "server":
    code_root = "/home/nfs/nknyazev/thesis/Thesis/model"
    pickle_paths_path = "/home/nfs/nknyazev/thesis/data/results/u_evaluator_folders.json"
    pickle_template = "3/{}_u_evaluator"
    output_root = "/tudelft.net/staff-bulk/ewi/insy/MMC/nknyazev/RQ3"

elif LOCATION == "rtl":
    code_root = "/home/ec2-user/SageMaker/thesis/Thesis/model"
    pickle_paths_bucket = "ci-data-apps"
    pickle_paths_key = "norman/sagemaker/thesis/offline-evaluation/results/u_evaluator_folders.json"
    pickle_template = "all/{}_u_evaluator"
    output_root = "s3://ci-data-apps/norman/sagemaker/thesis/offline-evaluation/RQ3"

else:
    # Fail here with a clear message; otherwise the first symptom would be a
    # NameError on one of the variables above in a later cell.
    raise ValueError(
        "Unknown LOCATION {!r}; expected 'local', 'server' or 'rtl'".format(LOCATION)
    )


In [None]:
# Results are written to a per-dataset sub-folder of output_root.
output_filename = "user_evaluator_results.json"
output_folder = os.path.join(output_root, DATASET)
output_path = os.path.join(output_root, DATASET, output_filename)

# The project code must be importable before unpickling, otherwise the
# evaluators fail to load with import errors.
sys.path.append(code_root)

In [None]:
# Example of the structure loaded from u_evaluator_folders.json (kept for reference):
# pickle_params = {
#     '14': {'dataset': 'lastfm_10_pc',
#   'model_id': 4,
#   'path': '/Users/nknyazev/Downloads/evaluator_test/lastfm_10_pc/500/4/0.5/0.5/0.5/0/tf.sigmoid/1147'},
#  '15': {'dataset': 'lastfm_10_pc',
#   'model_id': 6,
#   'path': '/Users/nknyazev/Downloads/evaluator_test/lastfm_10_pc/500/6/0.5/0.5/0.5/1/tf.nn.relu/1142'}}

In [None]:
# Locate (and, on "rtl", download) the pickled evaluator of every run that
# belongs to DATASET.
# Result: local_pickle_paths is {subset: {model_id (str): local pickle path}}.
local_pickle_paths = defaultdict(dict)

# Step 1: load the run index (u_evaluator_folders.json).
if LOCATION == "rtl":
    # On "rtl" the index lives on S3 and is read straight into memory.
    s3 = boto3.resource('s3')
    content_object = s3.Object(pickle_paths_bucket, pickle_paths_key)
    file_content = content_object.get()['Body'].read().decode('utf-8')
    pickle_params = json.loads(file_content)
    tmp_folder = "/tmp"
else:
    # Explicit UTF-8 so both branches decode the index the same way.
    with open(pickle_paths_path, encoding="utf-8") as input_file:
        pickle_params = json.load(input_file)

# Step 2: resolve one pickle path per (subset, model_id).
for d in pickle_params.values():
    # Runs on other datasets are ignored.
    if d["dataset"] != DATASET:
        continue
    model_id = str(d['model_id'])
    for subset in SUBSETS:
        pickle_filename = pickle_template.format(subset)
        pickle_source_path = os.path.join(d['path'], pickle_filename)
        if LOCATION == "rtl":
            # The source is an S3 path: copy it into a per-model folder under
            # tmp_folder so it can be opened as a regular file later.
            pickle_local_path = os.path.join(tmp_folder, model_id, pickle_filename)
            # check_call instead of call: a failed download raises
            # CalledProcessError here rather than surfacing later as a
            # missing file when the pickle is opened.
            subprocess.check_call(
                ['aws', 's3', 'cp', pickle_source_path, pickle_local_path]
            )
        else:
            # Evaluators are already on the local file system.
            pickle_local_path = pickle_source_path

        # NOTE(review): if two runs share the same dataset and model_id, the
        # later one replaces the earlier one here - confirm this is intended.
        local_pickle_paths[subset][model_id] = pickle_local_path

In [None]:
# Unpickle every evaluator located in the previous step.
# local_pickle_paths is {subset: {model_id: path}}; the result is keyed the
# other way round: pickles is {model_id: {subset: evaluator}}.
pickles = defaultdict(dict)
for subset, paths_by_model in local_pickle_paths.items():
    for model_id, pickle_path in paths_by_model.items():
        # NOTE: unpickling can execute arbitrary code - only load evaluator
        # files that come from a trusted source.
        # The context manager closes the file handle as soon as loading is done.
        with open(pickle_path, "rb") as pickle_file:
            pickles[model_id][subset] = pickle.load(pickle_file)
        if LOCATION == "rtl":
            # On "rtl" the pickle is a temporary download; remove it once loaded.
            os.remove(pickle_path)

In [None]:
# Keep only the per-user numerators/denominators of every evaluator:
# {model_id: {subset: {metric: ([num_u1, num_u2,...], [denom_u1, denom_u2,...]),...}}}
num_denoms = {
    model_id: {
        subset: evaluator._num_denom
        for subset, evaluator in evaluators_by_subset.items()
    }
    for model_id, evaluators_by_subset in pickles.items()
}

In [None]:
# Flatten the nested results into a single pandas DataFrame with one row per
# (model_id, subset, metric, uid) and the columns "num" and "denom".
pre_df_dict = {}
for model_id, subsets in num_denoms.items():
    for subset, metrics in subsets.items():
        for metric, (num, denom) in metrics.items():
            pre_df_dict[(model_id, subset, metric)] = [num, denom]

# Every entry is a [num, denom] pair; the pairs are joined along axis 1 and
# transposed so that each row holds one (num, denom) couple.
# (assumes num and denom are equally long 1-D arrays - TODO confirm)
stacked_pairs = np.concatenate(list(pre_df_dict.values()), axis=1)
values = stacked_pairs.T

# "uid" is the position of the user inside the num/denom arrays.
index_values = []
for key, pair in pre_df_dict.items():
    n_users = pair[0].shape[0]
    index_values.extend((*key, uid) for uid in range(n_users))

index_frame = pd.DataFrame(
    index_values,
    columns=["model_id", "subset", "metric", "uid"],
)
index = pd.MultiIndex.from_frame(index_frame)

output_df = pd.DataFrame(values, index=index, columns=["num", "denom"])

In [None]:
# Write the combined results as tab-separated text.
# NOTE: the file name ends in ".json" (see output_filename) although the
# content is tab-separated.
if LOCATION != "rtl":
    # Create the destination folder if needed so that the save does not fail
    # after all evaluators have already been processed.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_df.to_csv(output_path, sep="\t")
else:
    # On "rtl" output_path is an S3 location: write to a temporary local file
    # and upload it with the AWS CLI.
    tmp_folder = "/tmp"
    tmp_path = os.path.join(tmp_folder, output_filename)
    output_df.to_csv(tmp_path, sep="\t")
    # check_call raises if the upload fails, so the temporary file below is
    # only deleted after a successful copy (otherwise the results would be lost).
    subprocess.check_call(['aws', 's3', 'cp', tmp_path, output_path])
    os.remove(tmp_path)