In [None]:
import pandas as pd
from datetime import datetime
import seaborn as sns 
from matplotlib import pyplot as plt
import hashlib

from bias_tree import BiasDetectionTree, get_metric_bias_tree_for_model, evaluate_model
from data_preparation.movielens_100k import MovieLens100KData
from recommender.factorization_recommender import fit_recommendation_model, retrain_recommendation_model,\
tune_recommendation_hyperparams, BiasEvaluationCallback

### Prepare training data

In [None]:
# Build the MovieLens100KData object, pointing it at the 'data/ml-100k' directory.
# Later cells read the train/val/test splits, `user_ids`, `item_ids` and
# `attributes_dict` from this object.
data = MovieLens100KData(data_path='data/ml-100k')

### Analyze minimum and maximum biased nodes during training

In [None]:
# --- Model fitting ---------------------------------------------------------
# Passed as `epochs=` / `batch_size=` / `embedding_size=` to the fitting call.
NUM_EPOCH = 50
BATCH_SIZE = 1000
EMBEDDING_SIZE = 128

# --- Hyperparameter search -------------------------------------------------
# Passed as `max_trials=` to the tuning call.
MAX_TRIAL = 50

# --- Bias evaluation -------------------------------------------------------
# `interval=` of the bias evaluation callback and `min_child_node_size=` of
# the bias detection tree.
BIAS_EVAL_INTERVAL = 2
MIN_CHILD_NODE_SIZE = 1000

In [None]:
# One training run of the recommender; the callback records bias measurements
# while the model is being fitted.
X_train, X_val, X_test = data.get_data_splits_for_training()

bias_callback = BiasEvaluationCallback(
    X_train,
    X_val,
    data,
    interval=BIAS_EVAL_INTERVAL,
    min_child_node_size=MIN_CHILD_NODE_SIZE,
)

model, history = fit_recommendation_model(
    X_train,
    X_val,
    user_ids=data.user_ids,
    item_ids=data.item_ids,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCH,
    callbacks=[bias_callback],
    embedding_size=EMBEDDING_SIZE,
)

# Accumulator for the callback's records (a list so further runs could be appended).
bias_results_runs = []
bias_results_runs.extend(bias_callback.bias_results)

In [None]:
# Turn the recorded bias measurements into a frame with readable labels and
# plot the metric per epoch.
#
# The filter, relabelling and column rename are chained and return a new frame.
# The previous version called `replace(..., inplace=True)` on a boolean-filtered
# slice, which is the pattern pandas reports with SettingWithCopyWarning.

# Raw metric identifiers -> labels shown in the plot legend.
METRIC_LABELS = {
    'train-min node': 'train: min node value',
    'train-max node': 'train: max node value',
    'val-min node': 'validation: min node value',
    'val-max node': 'validation: max node value',
    'avg-train': 'train: average value',
    'avg-val': 'validation: average value',
}

bias_results_all = pd.DataFrame(bias_results_runs)
bias_results_epochs = (
    bias_results_all[bias_results_all['epoch'] > 0]  # keep only records with epoch > 0
    .replace(METRIC_LABELS)                          # value -> value mapping over the whole frame
    .rename(columns={'value': 'MSE'})
)

# Explicit figure/axes instead of the implicit pyplot "current axes".
fig, ax = plt.subplots(figsize=(10, 10))
sns.lineplot(data=bias_results_epochs, x='epoch', y='MSE', hue='metric', ax=ax)
# Legend is placed below the axes in three columns; the trailing `;` hides the
# Legend repr in the notebook output.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=3);

In [None]:
# Write the relabelled per-epoch bias results to a spreadsheet (relative path).
# NOTE(review): the '.xls' extension relies on a legacy Excel writer engine;
# recent pandas releases may no longer ship one — confirm against the installed
# pandas version.
# NOTE(review): the file name says "10runs", but the training cell in this
# notebook performs a single run — confirm the intended name.
bias_results_epochs.to_excel('bias_node_results_dnn_10runs.xls')

### Hyperparameter tuning

In [None]:
# Settings for the hyperparameter-tuning part of the notebook.
# NOTE: NUM_EPOCH is re-assigned here (it was 50 for the bias-tracking run), so
# cells executed after this one see 30.
NUM_EPOCH = 30

# Metric name handed to `evaluate_model` and to the bias tree (`metric_name=`).
METRIC = 'squared_error'

# `min_child_node_size=` used when building the bias tree.
MIN_CHILD_NODE_SIZE = 1000

#### Tuning for the global validation set

In [None]:
# Timestamp used as the tuning project's suffix; the per-node tuning cell
# reuses it as part of its own project suffixes.
now = datetime.now().strftime("%Y%m%d %H%M%S")

# Hyperparameter search validated on the full validation split.
model = tune_recommendation_hyperparams(
    X_train,
    X_val,
    user_ids=data.user_ids,
    item_ids=data.item_ids,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCH,
    project_suffix=now,
    max_trials=MAX_TRIAL,
    logdir='hyperparams',
)

In [None]:
# Bias tree of the tuned model, computed on the validation split
# (the variable is named `*_test`, but `X_val` is what is passed in).
bias_tree_test = get_metric_bias_tree_for_model(
    model,
    X_val,
    data.attributes_dict,
    metric_name=METRIC,
    min_child_node_size=MIN_CHILD_NODE_SIZE,
)

# Show the per-leaf metric table.
display(bias_tree_test.leaf_metrics)

##### Maximum and minimum biased nodes

In [None]:
# Node with the largest metric value, with the value rounded to 3 decimals.
max_node = bias_tree_test.max_metric_node
max_value = bias_tree_test.max_metric_value
print(max_node, round(max_value, 3))

In [None]:
# Node with the smallest metric value, with the value rounded to 3 decimals.
# Fix: this cell previously printed `max_metric_node` next to the *minimum*
# value, so the node shown did not match the number beside it.
# NOTE(review): assumes the tree exposes `min_metric_node` alongside
# `max_metric_node` — confirm against BiasDetectionTree.
print(bias_tree_test.min_metric_node, round(bias_tree_test.min_metric_value, 3))

### Hyperparameter tuning for the biased nodes

In [None]:
# For every leaf of the bias tree whose mean metric is above the model's mean
# test metric, run a separate hyperparameter search that is validated only on
# that node's slice of the validation data, then compare the node's metric
# before (globally tuned `model`) and after (`model_tuned_bias`).

BATCH_SIZE_BIAS = 256

# Reference level: mean metric of the globally tuned model on the test split.
mean_test_metric = evaluate_model(model, X_test, METRIC).mean()

# Leaves with a mean metric above the reference are treated as biased nodes.
leaf_metrics = bias_tree_test.leaf_metrics
biased_nodes = leaf_metrics[leaf_metrics['mean'] > mean_test_metric]

retrain_metrics = []
for node_name, node_row in biased_nodes.iterrows():
    # `node_name` is the leaf's index label, i.e. the rule string of the node.
    print(node_name, node_row["mean"])

    # Restrict validation and test data to the rows matched by this node.
    # `get_filtered_df` is called on the class in both places for consistency.
    X_val_filtered = BiasDetectionTree.get_filtered_df(node_name, X_val)
    X_test_filtered = BiasDetectionTree.get_filtered_df(node_name, X_test)

    # The search trains on the full training split and validates on the node
    # only. A filtered training frame used to be computed here but was never
    # passed anywhere, so it is no longer built.
    # NOTE(review): confirm that training on the unfiltered `X_train` is intended.
    # An alternative that was tried earlier is fine-tuning the existing model via
    # `retrain_recommendation_model(..., retrain_embeddings=False)`.
    # The md5 prefix gives each node its own tuning project name.
    node_suffix = hashlib.md5(node_name.encode()).hexdigest()[:5] + now
    model_tuned_bias = tune_recommendation_hyperparams(
        X_train,
        X_val_filtered,
        user_ids=data.user_ids,
        item_ids=data.item_ids,
        epochs=NUM_EPOCH,
        project_suffix=node_suffix,
        batch_size=BATCH_SIZE_BIAS,
        max_trials=MAX_TRIAL,  # same trial budget constant as the global search
    )

    # Node-level metric of the global model vs. the node-tuned model.
    retrain_metrics.append({
        'node_rules': node_name,
        'node_test_metric_before_retraining': evaluate_model(model, X_test_filtered, METRIC).mean(),
        'node_val_metric_before_retraining': evaluate_model(model, X_val_filtered, METRIC).mean(),
        'node_test_metric_after_retraining': evaluate_model(model_tuned_bias, X_test_filtered, METRIC).mean(),
        'node_val_metric_after_retraining': evaluate_model(model_tuned_bias, X_val_filtered, METRIC).mean(),
    })

# One row per biased node.
retrain_metrics_pd = pd.DataFrame(retrain_metrics)

In [None]:
# Relative change of the node metric after per-node tuning, in percent of the
# "before" value: 100 * (before - after) / before. Positive means the metric
# went down after tuning.
for split in ('test', 'val'):
    before = retrain_metrics_pd[f'node_{split}_metric_before_retraining']
    after = retrain_metrics_pd[f'node_{split}_metric_after_retraining']
    retrain_metrics_pd[f'retrain_{split}_diff'] = 100 * (before - after) / before

# Display the table rounded to 3 decimals (the stored frame is not rounded).
retrain_metrics_pd.round(3)