In [17]:
import sys
import os
sys.path.append(os.path.join(os.path.abspath(''), '../'))

from collections import defaultdict
from evaluation.evaluation_harness import OnlineEvaluator, include_coldstart, include_no_interactions, include_some_interactions, include_all, score_time_spent, score_constant
from statistical_test import estimate_probability_each_model_is_best
from pprint import pprint
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
sys.path.append(os.path.join(os.path.abspath(''), '../backend'))
from backend.blueprints.recommendation import model_wrappers

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Shared evaluator instance reused by the cells below; each of them calls
# evaluator.reset(...) for one model before reading evaluator.metrics / results_df.
# NOTE(review): include_all and score_constant are presumably the interaction
# filter and the scoring function respectively — confirm against
# evaluation.evaluation_harness.OnlineEvaluator.
evaluator = OnlineEvaluator(include_all, score_constant)

In [19]:
# Instantiate and load every registered model wrapper, remembering each one as a
# (model name, save-file name) pair for the evaluation cells that follow.
current_models = []
for model_wrapper in model_wrappers:
    model_wrapper.model = model_wrapper.definition()
    save_file_name = model_wrapper.model_save_file_name
    model_wrapper.model.load(save_file_name, load_published_model=True)
    current_models.append((model_wrapper.model.name(), save_file_name))

# Maps a model's save-file name to the short label shown in the result tables.
# Later cells also use membership in this dict to decide which models to report.
evaluation_names = {
    'evaluation_test_random_model': 'random_model',
    'evaluation_test_popularity_model': 'popularity_model',
    'evaluation_test_common_neighbors_constant_scoring': 'common_neighbors_constant_scoring',
    'evaluation_test_common_neighbors': 'common_neighbors_gaussian_scoring',
    'evaluation_test_common_neighbors_percentile_scoring': 'common_neighbors_percentile_scoring',
    'evaluation_test_cf_low_weight_decay_increased_lr_best_model_bugfix': 'cf',
    'evaluation_test_gcf_low_weight_decay_increased_lr_best_model_bugfix': 'non_linear_cf',
    'evaluation_test_mlp_low_weight_decay_increased_lr_best_model_bugfix': 'mlp',
    'evaluation_test_ncf_low_weight_decay_increased_lr_best_model_bugfix': 'neural_cf',
    'evaluation_test_cf_embed_all_except_tags_genres_best_model_bugfix_clip_embeddings': 'cf_with_game_embeddings',
    'evaluation_test_gcf_embed_all_except_tags_genres_best_model_bugfix_clip_embeddings': 'non_linear_cf_with_game_embeddings',
    'evaluation_test_mlp_embed_all_except_tags_genres_best_model_bugfix_clip_embeddings': 'mlp_with_game_embeddings',
    'evaluation_test_ncf_embed_all_except_tags_genres_best_model_bugfix_clip_embeddings': 'neural_cf_with_game_embeddings',
}

In [58]:
# Top-50 hit percentage per model, split into equal-width buckets over the summed
# (external + local) game-interaction counts found in evaluator.results_df.
#
# Changes from the previous version of this cell:
#   * removed the dead `bucket_width = 5` (it was overwritten before any use) and
#     the never-read `cur_models_by_name`;
#   * each bucket is written straight into one record, so an empty bucket can no
#     longer leave parallel lists with different lengths;
#   * the list of records and the final DataFrame no longer share the name `data`;
#   * the label column is replaced via .assign() on the filtered frame instead of
#     item assignment, which could raise SettingWithCopyWarning.
NUM_BUCKETS = 3  # the column labels ("1/3", "num1", ...) are derived from this

bucket_records = []
for model_name, model_save_path in current_models:
    # Unrestricted reset first, only to read this model's interaction-count range.
    evaluator.reset(model_name, model_save_path)
    num_interactions = (
        evaluator.results_df["num_game_interactions_external"]
        + evaluator.results_df["num_game_interactions_local"]
    )
    min_num = num_interactions.min()
    max_num = num_interactions.max()
    bucket_width = (max_num - min_num) / NUM_BUCKETS

    record = {"model_save_path": model_save_path}
    for bucket_index in range(NUM_BUCKETS):
        start = min_num + bucket_index * bucket_width
        # NOTE(review): the two extra arguments are presumably the lower/upper bound
        # on the interaction count; whether the upper bound is inclusive is not
        # visible here — confirm in OnlineEvaluator.reset.
        evaluator.reset(model_name, model_save_path, start, start + bucket_width)
        count = evaluator.metrics["num_interactions"]
        if count == 0:
            # Nothing to score in this bucket; the None makes dropna() below
            # discard the whole model row.
            hit_rate = None
        else:
            # Only the top-50 hit rate is tabulated, so it is the only one computed.
            evaluator.compute_top_N_hit_percentage(50)
            hit_rate = evaluator.metrics["top_50_hit_percentage"]
        record[f"{bucket_index + 1}/{NUM_BUCKETS}"] = hit_rate
        record[f"num{bucket_index + 1}"] = count
    bucket_records.append(record)

# Keep only the models that have a display label, drop models with an empty
# bucket, then swap the save-file name for its label.
data = (
    pd.DataFrame(bucket_records)
    .loc[lambda frame: frame["model_save_path"].isin(evaluation_names)]
    .dropna()
    .assign(model_save_path=lambda frame: frame["model_save_path"].map(evaluation_names))
)
display(data)

Unnamed: 0,model_save_path,1/3,num1,2/3,num2,3/3,num3
0,common_neighbors_constant_scoring,0.407895,76,0.470588,34,0.491228,57
1,common_neighbors_percentile_scoring,0.521277,94,0.68,50,0.405797,70
2,common_neighbors_gaussian_scoring,0.583333,131,0.565217,23,0.532258,62
3,popularity_model,0.362069,116,0.366197,72,0.328358,68
4,random_model,0.166667,90,0.245098,133,0.252525,100
5,cf,0.217391,23,0.4,10,0.347826,23
8,neural_cf,0.344828,29,0.3,10,0.2,10
10,non_linear_cf_with_game_embeddings,0.3,10,0.2,10,0.8,5
11,mlp_with_game_embeddings,0.1,10,0.1,10,0.0,3


In [21]:
# Collect one row of overall metrics per loaded model.
# The evaluator also exposes plotting helpers (plot_top_N_hit_percentage_percentiles,
# plot_user_rank_roc_curve) and save_metrics; they are intentionally not called here.
all_metrics = []
for model_name, model_save_path in current_models:
    evaluator.reset(model_name, model_save_path)
    for top_n in (10, 50):
        evaluator.compute_top_N_hit_percentage(top_n)
    evaluator.compute_user_rank_auc_roc()

    metrics_row = {'model_name': model_name, 'model_save_path': model_save_path}
    metrics_row.update(evaluator.metrics)
    all_metrics.append(metrics_row)

results = pd.DataFrame(all_metrics)

# NOTE(review): these calls presumably add the "<metric>_best_probability" columns to
# `results` in place (the summary cell reads them from `results`) — confirm in
# statistical_test.estimate_probability_each_model_is_best.
estimate_probability_each_model_is_best(results, 'user_rank_auc_roc')
estimate_probability_each_model_is_best(results, 'top_10_hit_percentage')
estimate_probability_each_model_is_best(results, 'top_50_hit_percentage')

In [22]:
# Columns of the comparison table: model label, sample size, then for each metric
# its best-model probability, value and variance.
report_columns = [
    'model_save_path',
    'num_interactions',
    'user_rank_auc_roc_best_probability',
    'user_rank_auc_roc',
    'user_rank_auc_roc_variance',
    'top_50_hit_percentage_best_probability',
    'top_50_hit_percentage',
    'top_50_hit_percentage_variance',
    'top_10_hit_percentage_best_probability',
    'top_10_hit_percentage',
    'top_10_hit_percentage_variance',
]

# Restrict to models that have a display label; copy so the relabelling below
# does not write into `results`.
is_reported_model = results['model_save_path'].isin(evaluation_names)
df = results.loc[is_reported_model, report_columns].copy()
df['model_save_path'] = df['model_save_path'].map(evaluation_names)

display(df.sort_values(by='user_rank_auc_roc_best_probability', ascending=False))

Unnamed: 0,model_save_path,num_interactions,user_rank_auc_roc_best_probability,user_rank_auc_roc,user_rank_auc_roc_variance,top_50_hit_percentage_best_probability,top_50_hit_percentage,top_50_hit_percentage_variance,top_10_hit_percentage_best_probability,top_10_hit_percentage,top_10_hit_percentage_variance
12,neural_cf_with_game_embeddings,25,0.41855,0.657051,0.012706,0.20878,0.52,0.044921,0.22317,0.541667,0.042357
10,non_linear_cf_with_game_embeddings,35,0.29816,0.630435,0.011376,0.00199,0.342857,0.008969,0.01939,0.37037,0.019943
8,neural_cf,58,0.07886,0.567568,0.006396,0.02821,0.362069,0.026727,0.04476,0.377778,0.033333
9,cf_with_game_embeddings,30,0.06967,0.527778,0.011794,0.20387,0.4,0.131034,0.26104,0.444444,0.162393
7,mlp,32,0.06672,0.519481,0.012884,0.0002,0.34375,0.003935,0.02844,0.4,0.019753
1,common_neighbors_percentile_scoring,224,0.02613,0.573866,0.001545,0.29099,0.595092,0.024845,0.10116,0.482143,0.026342
4,random_model,333,0.02603,0.575264,0.001384,0.00223,0.218085,0.023057,0.02496,0.269841,0.046083
6,non_linear_cf,29,0.00799,0.393939,0.014251,0.00848,0.37931,0.011501,0.0045,0.333333,0.013072
2,common_neighbors_gaussian_scoring,226,0.00371,0.53252,0.001522,0.18325,0.572414,0.011973,0.13186,0.533333,0.017172
11,mlp_with_game_embeddings,33,0.00354,0.271605,0.023811,0.0,0.181818,0.003001,0.0,0.111111,0.0


In [63]:
# Median of the "time_spent" column per model, counting only rows strictly below
# the cutoff.
#
# Changes from the previous version of this cell:
#   * the per-model series is no longer called `data`, so it does not clobber the
#     DataFrame of that name produced by the bucketed hit-rate cell;
#   * the list of records and the final DataFrame no longer share the name `out`;
#   * the label column is replaced via .assign() on the filtered frame instead of
#     item assignment, which could raise SettingWithCopyWarning;
#   * the cutoff is a named constant instead of an inline literal.
# NOTE(review): the unit of "time_spent" is not visible in this notebook — confirm
# what a cutoff of 30 means before changing it.
TIME_SPENT_CUTOFF = 30

time_spent_records = []
for model_name, model_save_path in current_models:
    evaluator.reset(model_name, model_save_path)

    time_spent = evaluator.results_df["time_spent"]
    time_spent = time_spent[time_spent < TIME_SPENT_CUTOFF]

    time_spent_records.append({
        "model_save_path": model_save_path,
        "median": np.median(time_spent),
        "num_interactions": len(time_spent),
    })

# Keep only the models that have a display label and show that label instead of
# the save-file name.
out = (
    pd.DataFrame(time_spent_records)
    .loc[lambda frame: frame["model_save_path"].isin(evaluation_names)]
    .assign(model_save_path=lambda frame: frame["model_save_path"].map(evaluation_names))
)
display(out.sort_values(by="median", ascending=False))

Unnamed: 0,model_save_path,median,num_interactions
6,non_linear_cf,5.478,17
9,cf_with_game_embeddings,4.415,27
11,mlp_with_game_embeddings,3.8895,28
12,neural_cf_with_game_embeddings,3.634,25
10,non_linear_cf_with_game_embeddings,3.2095,34
8,neural_cf,2.858,53
5,cf,2.043,63
4,random_model,1.79,324
7,mlp,1.7255,32
0,common_neighbors_constant_scoring,1.5145,170
