In [1]:
from bias_tree import BiasDetectionTree, get_metric_bias_tree_for_model
from data_preparation.movielens_100k import MovieLens100KData
from data_preparation.book_crossing import BookCrossingData
from recommender.surprise_recommender import random_search_fit_surprise_recommendation_model
import pandas as pd
import surprise

### Prepare training data

In [2]:
# Active dataset: Book-Crossing. To run on MovieLens 100K instead, construct
# MovieLens100KData(data_path='data/ml-100k') here.
data = BookCrossingData(
    data_path='data/book-crossing',
)

# use_val_set=False is passed and exactly two splits (train, test) are unpacked.
X_train, X_test = data.get_data_splits_for_training(
    use_val_set=False,
)

  


In [3]:
# Sanity check: show the dimensions of the training split.
X_train.shape

(67317, 23)

### Train CF models and tune their hyperparameters

In [None]:
# Candidate Surprise algorithms, each paired with the hyperparameter values
# handed to the random search as its grid.
#
# Commented-out alternatives kept for reference:
#   SVD extra key:  'reg_all': [0.01, 0.06, 0.1]
#   (surprise.NMF, {'biased': [True, False], 'n_factors': [10, 20, 50], 'n_epochs': [5, 20, 50],
#                   'reg_pu': [0.01, 0.06, 0.1], 'reg_qi': [0.01, 0.06, 0.1],
#                   'reg_bu': [0.02, 0.06, 0.1], 'reg_bi': [0.02, 0.06, 0.1],
#                   'lr_bu': [0.001, 0.005, 0.01], 'lr_bi': [0.001, 0.005, 0.01]})
#   (surprise.CoClustering, {'n_cltr_u': [5, 10, 50, 100], 'n_cltr_i': [5, 10, 50, 100]})
#   (KNNWithZScore, {'user_based': [True, False], 'k': [5, 10, 20, 50, 100]})
#   (surprise.SlopeOne, {})
#
# NOTE(review): Surprise's KNN algorithms read `user_based` from the
# `sim_options` dict rather than from a top-level keyword. Confirm that
# random_search_fit_surprise_recommendation_model maps this key accordingly;
# otherwise the setting may be ignored.
MODEL_GRID = [
    (
        surprise.SVD,
        dict(
            biased=[True, False],
            n_factors=[10, 20, 50],
            n_epochs=[5, 20, 50],
            lr_all=[0.001, 0.01],
        ),
    ),
    (
        surprise.KNNWithMeans,
        dict(
            user_based=[True, False],
            k=[5, 10, 50, 100],
        ),
    ),
]

In [None]:
# --- Experiment configuration ---
N_ITER = 1                    # forwarded as n_iter to the random search
MIN_CHILD_NODE_SIZE = 1000    # forwarded as min_child_node_size to the bias tree
METRIC = 'squared_error'      # metric name for the bias tree; also read back as a column of X_test
ALPHA = 0.01                  # forwarded as alpha to the bias tree

# Reset the accumulators here so that re-running this cell starts from scratch
# instead of appending duplicate entries.
leaf_metrics_models = []      # one leaf-metrics frame per model, tagged with the model name
avg_metric = {}               # model name -> mean of X_test[METRIC]

# Column-name arguments shared by the fitting helper and the bias-tree helper.
column_kwargs = dict(
    user_col=data.USER_ID_COL_TRANSFORMED,
    item_col=data.ITEM_ID_COL_TRANSFORMED,
    rating_col=data.RATING_COL,
)

for model_cls, grid in MODEL_GRID:
    print('-----------------')
    print(model_cls.__name__)

    # Fit the model on the training split using the random search helper.
    model = random_search_fit_surprise_recommendation_model(
        X_train, model_cls, grid, n_iter=N_ITER, **column_kwargs)

    # Build the bias tree for this model on the test split.
    bias_tree_test = get_metric_bias_tree_for_model(
        model, X_test, data.attributes_dict,
        metric_name=METRIC,
        min_child_node_size=MIN_CHILD_NODE_SIZE, alpha=ALPHA,
        **column_kwargs)

    # NOTE(review): this reads a METRIC column from X_test that this cell never
    # creates; presumably get_metric_bias_tree_for_model writes it as a side
    # effect for the current model. Confirm against the helper.
    avg_metric[model_cls.__name__] = X_test[METRIC].mean()

    # Tag a copy of the leaf metrics with the model name. `.assign` returns a
    # new frame, so the tree's own `leaf_metrics` is left unmodified.
    model_leaf_metrics = bias_tree_test.leaf_metrics.assign(model=model_cls.__name__)
    leaf_metrics_models.append(model_leaf_metrics)

    print("Most biased leaf: ", bias_tree_test.max_metric_node, round(bias_tree_test.max_metric_value, 3))

### Select the model with a fairness objective

In [25]:
# Per-model average of the test metric, as a Series indexed by model name.
avg_metrics_pd = pd.Series(avg_metric)

# Show the averages in ascending order.
avg_metrics_sorted = avg_metrics_pd.sort_values()
display(avg_metrics_sorted)

# Index label (model name) with the smallest average metric.
best_model_global = avg_metrics_pd.idxmin()

SVD             0.897179
KNNWithMeans    0.940138
dtype: float64

In [26]:
# Stack the per-model leaf metrics into one frame; each row carries a 'model' tag.
leaf_metrics_models_pd = pd.concat(leaf_metrics_models)

# Largest leaf 'mean' per model. The column is selected before aggregating so
# that only 'mean' is reduced; the other columns are never touched and cannot
# raise on non-orderable values.
max_bias_per_model = leaf_metrics_models_pd.groupby('model')['mean'].max()

# Left-join the per-model average metric onto the per-model maximum. Columns
# are named at construction instead of being overwritten by position afterwards.
metrics_global_max_bias = (
    max_bias_per_model.rename('max bias')
    .to_frame()
    .join(avg_metrics_pd.rename('global'))
)

In [27]:
# Weight on the 'max bias' term; the 'global' term is weighted by (1 - FAIRNESS_ALPHA).
# With FAIRNESS_ALPHA = 1 the 'global' term is multiplied by 0.
FAIRNESS_ALPHA = 1

bias_component = FAIRNESS_ALPHA * metrics_global_max_bias['max bias']
global_component = (1 - FAIRNESS_ALPHA) * metrics_global_max_bias['global']

# Stored as a new column on the existing frame; the ranking cell reads it back.
metrics_global_max_bias['weighted_fair_metric'] = bias_component + global_component

In [28]:
# Rank the models by the weighted fairness metric, smallest value first.
metrics_global_max_bias['weighted_fair_metric'].sort_values()

model
SVD             1.126339
KNNWithMeans    1.180301
Name: weighted_fair_metric, dtype: float64