In [1]:
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split
from lenskit.algorithms import Recommender
from lenskit.algorithms.user_knn import UserUser
import warnings

In [2]:
# Load the ratings and movies CSV files; the timestamp column is never used, so it is dropped right after reading
ratings = pd.read_csv("preprocessed_dataset/ratings.csv").drop(columns='timestamp')
movies = pd.read_csv("preprocessed_dataset/movies.csv", index_col="item")

In [3]:
# Method for generating random groups based on the ratings file
def getGroups(ratings, group_size=4, seed=None):
    """Randomly partition the users found in ``ratings`` into groups.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Ratings table; only its 'user' column is read.
    group_size : int, default 4
        Number of users per group. The last group is smaller when the number
        of unique users is not a multiple of ``group_size``.
    seed : int or None, default None
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        grouping is reproducible. When None, the module-level ``random``
        state is used (the original behaviour).

    Returns
    -------
    list
        Consecutive slices of the shuffled array of unique user IDs.

    Raises
    ------
    ValueError
        If ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")

    user_ids = ratings['user'].unique() # Unique user IDs (a fresh array, so shuffling it does not touch `ratings`)
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(user_ids) # In-place shuffle
    random_groups = [user_ids[i:i + group_size] for i in range(0, len(user_ids), group_size)]

    return random_groups

In [4]:
# Method for training the UserUser recommender system using the train_data because of the Hold-Out validation strategy
def trainModel(train_data, nnbrs=15, min_nbrs=3):
    """Fit a LensKit UserUser recommender on ``train_data`` and return it.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Ratings used for fitting; passed unchanged to ``fit``.
    nnbrs : int, default 15
        Forwarded as the first positional argument of ``UserUser``
        (the neighbourhood size, per the LensKit documentation).
    min_nbrs : int, default 3
        Forwarded as ``UserUser(min_nbrs=...)``.

    Returns
    -------
    The fitted object produced by ``Recommender.adapt(UserUser(...))``.
    """
    user_user = UserUser(nnbrs, min_nbrs=min_nbrs)
    recsys = Recommender.adapt(user_user)
    recsys.fit(train_data)

    return recsys

In [5]:
# Method for returning the ratings of a specific user
def getUserRatings(ratings,user):
    """Return the 'rating' values of ``user`` as a Series indexed by 'item'."""
    belongs_to_user = ratings['user'] == user # Boolean mask selecting this user's rows

    return ratings[belongs_to_user].set_index('item')['rating']

In [6]:
# Method for returning all items in the user_item_matrix for which the user does not have a rating yet (NaN)
def getNaNList(user_item_matrix,user_id):
    """Return the column labels of ``user_item_matrix`` whose value in row ``user_id`` is NaN."""
    user_row = user_item_matrix.loc[user_id]
    is_missing = user_row.isna().to_numpy() # True where the user has no value for the item

    return user_row.index[is_missing].tolist()

In [7]:
# Method for returning the recommendations for a specific user using the passed trained recommender model
def getRecommendation(recsys,user_item_matrix,user_id,known_ratings=None):
    """Fill the unrated cells of one user's row with clipped predicted scores.

    Parameters
    ----------
    recsys
        Fitted recommender exposing ``predict_for_user(user, items, ratings)``.
    user_item_matrix : pandas.DataFrame
        User x item matrix; the row ``user_id`` is modified IN PLACE.
    user_id
        Row label of the user to predict for.
    known_ratings : pandas.DataFrame or None, default None
        Ratings table from which the user's existing ratings are read and
        handed to the predictor. When None, the notebook-level ``ratings``
        frame is used (the original behaviour). NOTE: that frame is the one
        being split into train/test for hold-out validation, so it still
        contains the held-out rows; pass the training split here to keep
        test ratings out of the prediction input.

    Returns
    -------
    pandas.DataFrame
        The same ``user_item_matrix`` object that was passed in.
    """
    if known_ratings is None:
        known_ratings = ratings # Backward-compatible fallback to the notebook-level frame

    user_ratings = getUserRatings(known_ratings,user_id) # Get all existing ratings
    items = getNaNList(user_item_matrix,user_id) # Get all unrated items
    if not items: # Nothing to predict for this user
        return user_item_matrix

    predicted_scores = recsys.predict_for_user(user_id, items, user_ratings)

    with warnings.catch_warnings():
        # The assignment below can emit a pandas FutureWarning; it is silenced only for this statement
        warnings.filterwarnings("ignore",category=FutureWarning)
        # Add predicted ratings to the matrix, clipped to a 0-5 interval
        user_item_matrix.loc[user_id, items] = np.clip(predicted_scores,0,5)

    return user_item_matrix

In [8]:
# Method for returning the top x recommendations for a given group using the additive aggregation strategy
def getAdditiveOrder(user_item_matrix,group,top_n=5):
    """Rank items for a group with the additive aggregation strategy.

    Parameters
    ----------
    user_item_matrix : pandas.DataFrame
        User x item score matrix (rows are users, columns are items).
    group : list-like
        Row labels of the group members.
    top_n : int, default 5
        Number of items to return.

    Returns
    -------
    pandas.Series
        The ``top_n`` highest group scores, indexed by item, in descending order.
        An item's group score is the sum of the members' scores; NaN entries
        are skipped, so an item with no scores at all gets 0.
    """
    group_uim = user_item_matrix.loc[group]
    total_ratings = group_uim.sum() # Column-wise sum = added scores of all group members per item
    ordered_items = total_ratings.sort_values(ascending=False) # Order items by size

    return ordered_items.head(top_n) # Return highest top_n scores

In [9]:
# Method for returning the top x recommendations for a given group using the least misery aggregation strategy
def getLeastMiseryOrder(user_item_matrix,group,top_n=5):
    """Rank items for a group with the least misery aggregation strategy.

    Parameters
    ----------
    user_item_matrix : pandas.DataFrame
        User x item score matrix (rows are users, columns are items).
    group : list-like
        Row labels of the group members.
    top_n : int, default 5
        Number of items to return.

    Returns
    -------
    pandas.Series
        The ``top_n`` highest group scores, indexed by item, in descending order.
        An item's group score is the lowest score among the members; NaN
        entries are skipped, and an item with no scores at all stays NaN and
        sorts last.
    """
    group_uim = user_item_matrix.loc[group]
    min_ratings = group_uim.min() # Column-wise minimum = lowest score from all group members per item
    ordered_items = min_ratings.sort_values(ascending=False) # Order items by size

    return ordered_items.head(top_n) # Return highest top_n scores

In [10]:
# Method for returning the hits on the passed top 5 items (order) for a given group
def getHits(group,order,ratings,threshold=3):
    """Count, for each recommended item, how many group members rated it above ``threshold``.

    Parameters
    ----------
    group : list-like
        User IDs of the group members.
    order : pandas.Series
        Recommended items; only its index (the item IDs, in rank order) is used.
    ratings : pandas.DataFrame
        Ratings table with 'user', 'item' and 'rating' columns.
    threshold : number, default 3
        A rating counts as a hit only when it is strictly greater than this
        value (a rating equal to the threshold is not a hit).

    Returns
    -------
    pandas.Series
        Integer hit counts indexed like ``order``; items nobody in the group
        rated get 0.
    """
    items = order.index

    # Keep only the ratings that group members gave to the recommended items
    relevant = ratings[ratings['user'].isin(group) & ratings['item'].isin(items)]
    # If a (user, item) pair occurs more than once, only its first rating is considered
    relevant = relevant.drop_duplicates(subset=['user', 'item'], keep='first')

    liked_items = relevant.loc[relevant['rating'] > threshold, 'item']
    counts = liked_items.value_counts().reindex(items, fill_value=0) # Items without hits get 0
    hits = pd.Series(counts.to_numpy(dtype=int), index=items) # Integer Series aligned with `order`

    return hits # Return hits Series

In [11]:
# Method for calculating the DCG
def GetDCG(hits):
    """Compute the discounted cumulative gain of ``hits`` in their given order.

    DCG = rel_1 + sum over ranks i >= 2 of rel_i / log2(i)

    Parameters
    ----------
    hits : pandas.Series or sequence of numbers
        Relevance values in rank order (position 0 is rank 1).

    Returns
    -------
    float
        The DCG value; 0.0 when ``hits`` is empty.
    """
    relevances = np.asarray(hits, dtype=float)
    if relevances.size == 0: # No recommended items, so there is nothing to gain
        return 0.0

    ranks = np.arange(1, relevances.size + 1)
    # log2(max(rank, 2)) equals 1 for rank 1 and rank 2, so rank 1 is left undiscounted
    # without dividing by log2(1) = 0
    discounts = np.log2(np.maximum(ranks, 2))

    return float(np.sum(relevances / discounts))

In [12]:
# Method for calculating the IDCG (ideal DCG)
def GetIDCG(hits):
    """Compute the ideal DCG: the DCG of ``hits`` after sorting them by descending relevance.

    The ideal ranking is built only from the relevance values passed in,
    i.e. from the items that were actually recommended.

    Parameters
    ----------
    hits : pandas.Series or sequence of numbers
        Relevance values of the recommended items, in any order.

    Returns
    -------
    float
        The IDCG value; 0.0 when ``hits`` is empty.
    """
    ideal = np.sort(np.asarray(hits, dtype=float))[::-1] # Rank the relevances from highest to lowest
    if ideal.size == 0: # No recommended items, so there is nothing to gain
        return 0.0

    ranks = np.arange(1, ideal.size + 1)
    # log2(max(rank, 2)) equals 1 for rank 1 and rank 2, so rank 1 is left undiscounted
    # without dividing by log2(1) = 0
    discounts = np.log2(np.maximum(ranks, 2))

    return float(np.sum(ideal / discounts))

In [13]:
# Method for calculating the nDCG
def GetnDCG(group,order,ratings):
    """Return the nDCG (DCG divided by IDCG) of the recommended ``order`` for ``group``.

    Relevance per item comes from getHits; a DCG of 0 yields an nDCG of 0.
    """
    relevance = getHits(group,order,ratings) # Hits Series for the recommended items
    gain = GetDCG(relevance)

    # A DCG of 0 means the IDCG is 0 as well, so return 0 instead of dividing
    if gain == 0:
        return 0

    return gain/GetIDCG(relevance)

In [14]:
# Hold-out validation using 20% test data, stratified on user so each user's ratings are divided
# between the two sets in roughly the same proportion (this does not guarantee that every item
# appears in both sets). The fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts.
SPLIT_SEED = 42
train_data, test_data = train_test_split(ratings, test_size=0.2, stratify=ratings['user'], random_state=SPLIT_SEED)
recsys = trainModel(train_data)

could not load LIBBLAS: Could not find module 'libblas' (or one of its dependencies). Try using the full path with constructor syntax.
