In [None]:
import numpy as np
import pandas as pd
from typing import List

from collections import defaultdict
import itertools
import math

from tqdm import tqdm

# Pre-computed latent-factor matrices (presumably from a MovieLens matrix
# factorisation, judging by the path). The helpers below index the columns of
# `u_features` by user id and the columns of `i_features` by item id.
u_features = np.load('../datasets/movie_lens/mf/U_features.npy')
i_features = np.load('../datasets/movie_lens/mf/I_features.npy')
# Show both shapes, one per line.
print(u_features.shape, i_features.shape, sep='\n')

def get_items_for_user(user_id):
    """Rank every item for one user by predicted score.

    The user's factor column is multiplied with the item factor matrix to get
    one score per item.

    Returns a list of ``(item_id, score)`` tuples ordered by score, highest
    first; items with equal scores keep their original (item id) order.
    """
    predicted_scores = u_features[:, user_id] @ i_features
    return sorted(enumerate(predicted_scores), key=lambda pair: pair[1], reverse=True)

def get_items_for_users(users_id: List):
    """Predict the scores of all items for several users at once.

    Returns a 2-D array with one row per item and one column per requested
    user, in the order given by ``users_id``. The scores are returned as
    computed; they are not clipped to any rating range.
    """
    selected_user_factors = u_features[:, users_id]
    return np.matmul(i_features.T, selected_user_factors)
    
# ratings = get_items_for_users([10,20,30])
# ratings.shape

def select_top_n_idx(score_list, top_n, top='max', sort=True, exclude_idx=[]):
    """Return the indices of the ``top_n`` largest (or smallest) scores.

    Parameters
    ----------
    score_list : array-like
        One-dimensional scores (converted with ``np.asarray``).
    top_n : int
        Number of indices wanted. Fewer are returned when there are not
        enough non-excluded entries; no error is raised in that case.
    top : {'max', 'min'}
        Whether the largest or the smallest scores are selected.
    sort : bool
        If true the result is ordered best-first; otherwise the order of the
        returned indices is unspecified.
    exclude_idx : sequence of int or None
        Indices that must never be returned. Values that are not valid
        positions are ignored. The default list is never mutated.

    Returns
    -------
    numpy.ndarray when ``exclude_idx`` is empty, otherwise a ``list`` of
    numpy integers (the two return types are kept for backward compatibility).

    Raises
    ------
    ValueError
        If ``top`` is neither ``'max'`` nor ``'min'``.
    """
    if top != 'max' and top != 'min':
        raise ValueError('top must be either Max or Min')

    scores = np.asarray(score_list)
    # argpartition/argsort pick the smallest keys, so negate for 'max'.
    keys = -scores if top == 'max' else scores

    excluded = [] if exclude_idx is None else list(exclude_idx)
    if excluded:
        # Drop the excluded positions *before* partitioning. Over-selecting
        # and filtering afterwards is only correct when the partition is
        # sorted, and it breaks as soon as top_n + len(excluded) reaches the
        # array length.
        candidates = np.flatnonzero(~np.isin(np.arange(keys.shape[0]), excluded))
        candidate_keys = keys[candidates]
    else:
        candidates = None
        candidate_keys = keys

    n_available = candidate_keys.shape[0]
    n_selected = min(top_n, n_available)
    if n_selected < n_available:
        picked = np.argpartition(candidate_keys, n_selected)[:n_selected]
    else:
        # kth == len is out of bounds for argpartition; everything qualifies.
        picked = np.arange(n_available)

    if sort:
        picked = picked[np.argsort(candidate_keys[picked])]

    if candidates is None:
        return picked
    # Map positions inside the filtered view back to original indices.
    return list(candidates[picked])


# Sanity checks for select_top_n_idx on a small array of distinct values.
a = np.array([2, 1, 6, 7, 8, 9, 3, 4, 5, 10])

# Sorted results: the exact best-first order is checked.
assert list(select_top_n_idx(a, 3, top='max')) == [9, 5, 4]
assert list(select_top_n_idx(a, 3, top='min')) == [1, 0, 6]
# Unsorted results: only membership is checked.
assert {9, 5, 4} == set(select_top_n_idx(a, 3, top='max', sort=False))
assert {0, 1, 6} == set(select_top_n_idx(a, 3, top='min', sort=False))

# Same checks with index 1 (the smallest value) excluded.
assert list(select_top_n_idx(a, 3, top='max', exclude_idx=[1])) == [9, 5, 4]
assert list(select_top_n_idx(a, 3, top='min', exclude_idx=[1])) == [0, 6, 7]
assert {9, 5, 4} == set(select_top_n_idx(a, 3, top='max', sort=False, exclude_idx=[1]))
assert {0, 6, 7} == set(select_top_n_idx(a, 3, top='min', sort=False, exclude_idx=[1]))


In [None]:
from scipy.stats import rankdata

# borda count that is limited only to top-max_rel_items, if you are not in the top-max_rel_items, you get 0
def get_borda_rel(candidate_group_items_np, max_rel_items):
    """Borda-count relevance restricted to the ``max_rel_items`` best items.

    Parameters
    ----------
    candidate_group_items_np : array-like
        One-dimensional scores of the candidate items for a single user.
    max_rel_items : int
        Size of the user's "relevant" set. When it is at least the number of
        candidates, every candidate is treated as relevant.

    Returns
    -------
    numpy.ndarray
        Float array of the same length as the input. Items outside the
        relevant set get 0. Inside it the lowest-scored item gets 1 and the
        highest-scored item gets the largest value; tied items share the
        highest rank of their group (``method='max'``).
    """
    scores = np.asarray(candidate_group_items_np)
    n_items = len(scores)
    rel_all = np.zeros(n_items)
    if n_items == 0 or max_rel_items <= 0:
        return rel_all

    if max_rel_items >= n_items:
        # Everything is relevant; no partial selection needed.
        rel_idx = np.arange(n_items)
    else:
        rel_idx = select_top_n_idx(scores, max_rel_items, top='max', sort=False)

    # Rank ascending so that a better score yields a larger Borda value.
    # Ranking the negated scores would hand the best item the smallest value.
    rel_all[rel_idx] = rankdata(scores[rel_idx], method='max')
    return rel_all

# Demo: Borda relevance over the five best of eight scores; the three
# remaining entries are expected to come out as 0.
x = np.asarray([1, 1, 3, 8, 10, 100, 11, 28])
get_borda_rel(x, max_rel_items=5)

In [None]:
from time import sleep
from matplotlib.pyplot import axis

def gfar_algorithm(group_items, top_n: int, relevant_max_items: int, n_candidates: int):
    """Greedily build a top-``top_n`` recommendation list for a group.

    Each step adds the candidate item with the largest summed marginal gain
    ``p_relevant[item, member] * P(no already selected item is relevant to
    member)``, so members not yet served by the list weigh more.

    Parameters
    ----------
    group_items : numpy.ndarray
        2-D scores with one row per item and one column per group member.
    top_n : int
        Length of the recommendation list. It is capped at the number of
        candidate items.
    relevant_max_items : int
        Size of each member's relevant set, passed to ``get_borda_rel``.
    n_candidates : int
        Number of best items taken from each member to form the candidate
        pool. When it is at least the number of items, all items are
        candidates.

    Returns
    -------
    list of int
        Selected items in selection order, expressed as row indices of
        ``group_items`` (original item ids), not as positions inside the
        candidate pool.
    """
    n_items, group_size = group_items.shape

    if n_candidates >= n_items:
        # Nothing to prune: every item is a candidate.
        top_candidates_idx = np.arange(n_items)
    else:
        top_candidates_ids_per_member = np.apply_along_axis(
            lambda u_items: select_top_n_idx(u_items, n_candidates, sort=False), 0, group_items
        )
        # Sorted union of the members' candidates; these are original item ids.
        top_candidates_idx = np.unique(top_candidates_ids_per_member)

    # Row r of the candidate arrays corresponds to item top_candidates_idx[r].
    candidate_group_items = group_items[top_candidates_idx, :]
    n_pool = len(top_candidates_idx)

    borda_rel_of_candidates = np.apply_along_axis(
        lambda items_for_user: get_borda_rel(items_for_user, relevant_max_items), 0, candidate_group_items
    )
    total_relevance_for_users = borda_rel_of_candidates.sum(axis=0)
    # Normalise per member; a member with no relevance mass contributes 0
    # instead of producing NaNs.
    p_relevant = np.divide(
        borda_rel_of_candidates,
        total_relevance_for_users,
        out=np.zeros_like(borda_rel_of_candidates, dtype=float),
        where=total_relevance_for_users > 0,
    )

    selected_items = []
    already_selected = np.zeros(n_pool, dtype=bool)
    # Per member: probability that none of the selected items is relevant.
    prob_selected_not_relevant = np.ones(group_size)

    for _ in range(min(top_n, n_pool)):
        item_marginal_gain = (p_relevant * prob_selected_not_relevant).sum(axis=1)
        # Gains are non-negative, so -inf safely rules out items already chosen.
        item_marginal_gain[already_selected] = -np.inf
        item_id = int(np.argmax(item_marginal_gain))
        selected_items.append(item_id)
        already_selected[item_id] = True

        # Update the probability of the selected items not being relevant.
        prob_selected_not_relevant *= (1 - p_relevant[item_id])

    # Translate candidate-pool positions back to original item ids.
    final_top_candidates = top_candidates_idx[np.asarray(selected_items, dtype=int)]
    return final_top_candidates.tolist()


group_size = 5

# Load the groups file and keep the first `group_size` columns as a plain
# array (presumably one row per group and one member id per column).
groups = pd.read_csv('../notebooks/dfs/groups/kgrec/top_k_10.csv').iloc[:, :group_size].values

# One recommendation list per group, in file order.
rec_it = []
for group_members in tqdm(groups):
    items = get_items_for_users(group_members)
    top_n_items = gfar_algorithm(items, top_n=10, relevant_max_items=100, n_candidates=1000)
    rec_it.append(top_n_items)