### Download Dataset

In [None]:
!pip install kaggle
!kaggle competitions download -c recommender-system-2021-challenge-polimi

### Unzip data

In [None]:
import zipfile

# Open the downloaded archive inside a context manager so the file handle is
# released even if extraction raises, then unpack every member under ./data.
with zipfile.ZipFile("recommender-system-2021-challenge-polimi.zip") as dataFile:
    dataFile.extractall('data')

### Create dataframes

In [None]:
import pandas as pd


def load_triplet_csv(path, columns):
    """Read a three-column CSV file and give its columns readable names.

    Parameters
    ----------
    path : str
        Location of the CSV file (comma separated).
    columns : list of str
        The three names assigned to the columns, in file order.

    Returns
    -------
    pandas.DataFrame
        The file content, with the first two columns parsed as int and the
        third as float, and with `columns` as column labels.
    """
    dataframe = pd.read_csv(filepath_or_buffer=path,
                            sep=",",
                            dtype={0:int, 1:int, 2:float})
    dataframe.columns = columns
    return dataframe


# User-item interactions.
URM_path = 'data/data_train.csv'
URM_all_dataframe = load_triplet_csv(URM_path, ["UserID", "ItemID", "Interaction"])

# Item-feature tables: one file per feature family.
ICM_genre_path = 'data/data_ICM_genre.csv'
ICM_genre_all_dataframe = load_triplet_csv(ICM_genre_path, ["ItemID", "GenreID", "Match"])

ICM_subgenre_path = 'data/data_ICM_subgenre.csv'
ICM_subgenre_all_dataframe = load_triplet_csv(ICM_subgenre_path, ["ItemID", "SubgenreID", "Match"])

ICM_channel_path = 'data/data_ICM_channel.csv'
ICM_channel_all_dataframe = load_triplet_csv(ICM_channel_path, ["ItemID", "ChannelID", "Match"])

ICM_event_path = 'data/data_ICM_event.csv'
ICM_event_all_dataframe = load_triplet_csv(ICM_event_path, ["ItemID", "EpisodeID", "Match"])

# Only the last expression of a cell is rendered, so a single preview is kept.
ICM_event_all_dataframe.head(n=10)

In [None]:
# Preview the first 10 rows of the item-subgenre table.
ICM_subgenre_all_dataframe.head(n=10)

### Data statistics

In [None]:
# Summary statistics of the item-genre table.
genreID_unique = ICM_genre_all_dataframe["GenreID"].unique()
itemID_unique = ICM_genre_all_dataframe["ItemID"].unique()
n_genres = len(genreID_unique)
n_items = len(itemID_unique)
n_matches = len(ICM_genre_all_dataframe)

print ("Number of items\t {}, Number of genres\t {}".format(n_items, n_genres))
print ("Max ID items\t {}, Max Id genres\t {}\n".format(max(itemID_unique), max(genreID_unique)))
print ("Average matches per genre {:.2f}".format(n_matches/n_genres))
print ("Average matches per item {:.2f}\n".format(n_matches/n_items))

# Sparsity is the share of empty cells in the (items x genres) matrix, so the
# denominator is n_items * n_genres; `n_users` is never defined in this notebook.
print ("Sparsity {:.2f} %".format((1-float(n_matches)/(n_items*n_genres))*100))

### To COO

In [2]:
import scipy.sparse as sps

# User-rating matrix in COO format, built from (value, (row, col)) triplets:
# rows are UserIDs, columns are ItemIDs, values are the Interaction column.
URM_all = sps.coo_matrix(
    (
        URM_all_dataframe["Interaction"].to_numpy(),
        (
            URM_all_dataframe["UserID"].to_numpy(),
            URM_all_dataframe["ItemID"].to_numpy(),
        ),
    )
)

# URM_all.tocsr()

In [3]:
# Item-genre matrix in COO format: rows are ItemIDs, columns are GenreIDs,
# values come from the Match column.
ICM_genre_all = sps.coo_matrix(
    (
        ICM_genre_all_dataframe["Match"].to_numpy(),
        (
            ICM_genre_all_dataframe["ItemID"].to_numpy(),
            ICM_genre_all_dataframe["GenreID"].to_numpy(),
        ),
    )
)

# ICM_genre_all.tocsr()

In [4]:
# Item-subgenre matrix in COO format: rows are ItemIDs, columns are SubgenreIDs,
# values come from the Match column.
ICM_subgenre_all = sps.coo_matrix(
    (
        ICM_subgenre_all_dataframe["Match"].to_numpy(),
        (
            ICM_subgenre_all_dataframe["ItemID"].to_numpy(),
            ICM_subgenre_all_dataframe["SubgenreID"].to_numpy(),
        ),
    )
)

# ICM_subgenre_all.tocsr().data

In [5]:
# Item-channel matrix in COO format: rows are ItemIDs, columns are ChannelIDs,
# values come from the Match column.
ICM_channel_all = sps.coo_matrix(
    (
        ICM_channel_all_dataframe["Match"].to_numpy(),
        (
            ICM_channel_all_dataframe["ItemID"].to_numpy(),
            ICM_channel_all_dataframe["ChannelID"].to_numpy(),
        ),
    )
)

# ICM_channel_all.tocsr()

In [6]:
# Item-event matrix in COO format: rows are ItemIDs, columns are EpisodeIDs,
# values come from the Match column.
ICM_event_all = sps.coo_matrix(
    (
        ICM_event_all_dataframe["Match"].to_numpy(),
        (
            ICM_event_all_dataframe["ItemID"].to_numpy(),
            ICM_event_all_dataframe["EpisodeID"].to_numpy(),
        ),
    )
)

# ICM_event_all.tocsr()

### Validation split

In [7]:
import numpy as np
np.random.seed(1234)

# Probability that a stored interaction is assigned to the training split.
train_test_split = 0.80

n_interactions = URM_all.nnz

# One independent True/False draw per stored interaction; True -> training.
train_mask = np.random.choice([True,False], n_interactions, p=[train_test_split, 1-train_test_split])
val_mask = np.logical_not(train_mask)

# Pass the full shape explicitly: without it scipy infers the shape from the
# largest row/column index present in each subset, so the two splits could end
# up with different dimensions from URM_all (and from each other).
URM_train = sps.csr_matrix((URM_all.data[train_mask],
                            (URM_all.row[train_mask], URM_all.col[train_mask])),
                           shape=URM_all.shape)

URM_val = sps.csr_matrix((URM_all.data[val_mask],
                          (URM_all.row[val_mask], URM_all.col[val_mask])),
                         shape=URM_all.shape)

URM_val

<13650x18059 sparse matrix of type '<class 'numpy.float64'>'
	with 1057044 stored elements in Compressed Sparse Row format>

In [None]:
# Example: column indices of the validation interactions stored for one user.
user_id = 124
# Inefficient alternative (row indexing builds a new CSR matrix):
# relevant_items = URM_val[user_id].indices
# Slice the CSR `indices` array directly using the row boundaries in `indptr`.
relevant_items = URM_val.indices[URM_val.indptr[user_id]:URM_val.indptr[user_id+1]]
relevant_items

### Metrics

In [None]:
def precision(recommended_items, relevant_items):
    """Fraction of the recommended items that are relevant.

    Parameters
    ----------
    recommended_items : array-like of int
        Item ids proposed to the user.
    relevant_items : array-like of int
        Item ids the user actually interacted with in the test data.

    Returns
    -------
    float
        Number of hits divided by the number of recommended items; 0.0 when
        nothing was recommended.
    """
    # No assume_unique: a recommender may emit the same item twice (e.g. when
    # sampling with replacement), and the uniqueness shortcut is only valid
    # when both inputs are duplicate-free.
    is_relevant = np.isin(recommended_items, relevant_items)

    if len(is_relevant) == 0:
        return 0.0

    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant) # True counts as 1

    return precision_score

def recall(recommended_items, relevant_items):
    """Fraction of the relevant items that were recommended.

    Parameters
    ----------
    recommended_items : array-like of int
        Item ids proposed to the user.
    relevant_items : array-like of int
        Item ids the user actually interacted with in the test data.

    Returns
    -------
    float
        Number of hits divided by the number of relevant items; 0.0 when
        there are no relevant items.
    """
    relevant_items = np.asarray(relevant_items)

    if relevant_items.shape[0] == 0:
        return 0.0

    # No assume_unique: the recommendation list is not guaranteed to be
    # duplicate-free.
    is_relevant = np.isin(recommended_items, relevant_items)

    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]

    return recall_score

def MAP(recommended_items, relevant_items):
    """Average precision of one ranked recommendation list.

    Parameters
    ----------
    recommended_items : array-like of int
        Item ids proposed to the user, best first.
    relevant_items : array-like of int
        Item ids the user actually interacted with in the test data.

    Returns
    -------
    float
        Sum of precision@k over the positions k holding a relevant item,
        divided by min(number of relevant items, list length); 0.0 when
        either input is empty.
    """
    relevant_items = np.asarray(relevant_items)

    # No assume_unique: the recommendation list is not guaranteed to be
    # duplicate-free.
    is_relevant = np.isin(recommended_items, relevant_items)

    denominator = min(relevant_items.shape[0], is_relevant.shape[0])
    if denominator == 0:
        return 0.0

    # Cumulative number of hits divided by the position gives precision at
    # 1, 2, 3, ...; multiplying by is_relevant keeps only positions that hit.
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    map_score = np.sum(p_at_k) / denominator

    return map_score

In [None]:
# The fitted recommender object is passed as a parameter

def evaluate_algorithm(URM_test, recommender_object, at=10):
    """Print precision, recall and MAP averaged over the users with test data.

    Parameters
    ----------
    URM_test : scipy.sparse.csr_matrix
        Test interactions; read through its `indptr`/`indices` arrays, so it
        must be in CSR format.
    recommender_object : object
        Any object exposing `recommend(user_id, at=...)`.
    at : int, default 10
        Length of the recommendation list requested for each user.

    Returns
    -------
    None
        The averaged metrics are printed, not returned.
    """
    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_MAP = 0.0

    num_eval = 0

    for user_id in range(URM_test.shape[0]):

        # Column indices stored in this user's row, sliced straight from the
        # CSR internals instead of building a new one-row matrix.
        relevant_items = URM_test.indices[URM_test.indptr[user_id]:URM_test.indptr[user_id+1]]

        # Users without test interactions are skipped and do not count
        # towards the averages.
        if len(relevant_items)>0:

            recommended_items = recommender_object.recommend(user_id, at=at)
            num_eval+=1

            cumulative_precision += precision(recommended_items, relevant_items)
            cumulative_recall += recall(recommended_items, relevant_items)
            cumulative_MAP += MAP(recommended_items, relevant_items)

    # Without this guard an empty test matrix ends in a ZeroDivisionError.
    if num_eval == 0:
        print("Recommender performance cannot be computed: no user has test interactions")
        return

    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    cumulative_MAP /= num_eval

    print("Recommender performance is: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, cumulative_MAP)) 

### RandomRecommender

In [None]:
class RandomRecommender(object):
    """Baseline that recommends items drawn uniformly at random."""

    def fit(self, URM_train):
        """Record the number of items, taken from the matrix's column count.

        Parameters
        ----------
        URM_train : matrix-like
            Only `URM_train.shape[1]` is read.
        """
        self.n_items = URM_train.shape[1]


    def recommend(self, user_id, at=10):
        """Return up to `at` distinct random item ids.

        Parameters
        ----------
        user_id : int
            Unused; the recommendation does not depend on the user.
        at : int, default 10
            Requested list length, capped at the number of items.

        Returns
        -------
        numpy.ndarray
            Distinct item ids in [0, n_items).
        """
        # Sample without replacement so the list never repeats an item; the
        # cap keeps the draw valid when `at` exceeds the catalogue size.
        n_recommendations = min(at, self.n_items)
        recommended_items = np.random.choice(self.n_items, n_recommendations, replace=False)

        return recommended_items

In [None]:
# Instantiate the random baseline and fit it on the training split.
randomRecommender = RandomRecommender()
randomRecommender.fit(URM_train)

In [None]:
# Score the random baseline on the validation split.
evaluate_algorithm(URM_val, randomRecommender)

### TopPop

In [None]:
# Display 10 most popular items
item_popularity = np.ediff1d(URM_all.tocsc().indptr)
popular_items = np.argsort(item_popularity)
popular_items = np.flip(popular_items, axis = 0)

popular_items[:10]

In [None]:
class TopPopRecommender(object):
    """Non-personalised recommender ranking items by number of interactions."""

    def fit(self, URM_train):
        """Rank all items by how many interactions they have in `URM_train`.

        Parameters
        ----------
        URM_train : scipy.sparse matrix
            User-item interactions (users on rows, items on columns).
        """
        # recommend() slices indptr/indices per user, which requires CSR.
        self.URM_train = URM_train.tocsr()

        # Popularity must come from the matrix passed in, not from the
        # notebook-global URM_all: using the full data would leak held-out
        # interactions into the model (and fail outside the notebook).
        item_popularity = np.ediff1d(self.URM_train.tocsc().indptr)

        # We are not interested in sorting the popularity values themselves,
        # but in ordering the item ids according to them (most popular first).
        self.popular_items = np.argsort(item_popularity)
        self.popular_items = np.flip(self.popular_items, axis = 0)


    def recommend(self, user_id, at=10, remove_seen=True):
        """Return the `at` most popular items for one user.

        Parameters
        ----------
        user_id : int
            Row index of the user in the training matrix.
        at : int, default 10
            Length of the recommendation list.
        remove_seen : bool, default True
            When True, items the user already interacted with in the training
            matrix are dropped from this user's list only.

        Returns
        -------
        numpy.ndarray
            Item ids, most popular first.
        """
        if remove_seen:
            # Column indices of this user's row, read from the CSR internals.
            seen_items = self.URM_train.indices[self.URM_train.indptr[user_id]:self.URM_train.indptr[user_id+1]]

            unseen_items_mask = np.isin(self.popular_items, seen_items,
                                        assume_unique=True, invert = True)

            unseen_items = self.popular_items[unseen_items_mask]

            recommended_items = unseen_items[0:at]

        else:
            recommended_items = self.popular_items[0:at]


        return recommended_items


In [None]:
# Instantiate the popularity baseline and fit it on the training split.
topPopRecommender = TopPopRecommender()
topPopRecommender.fit(URM_train)

In [None]:
# Print the top-10 list produced for each of the first ten user ids.
for user_id in range(10):
    print(topPopRecommender.recommend(user_id, at=10))

In [None]:
# Score the popularity baseline on the validation split.
evaluate_algorithm(URM_val, topPopRecommender)

### Genre Similarity

In [None]:
from Recommenders.Similarity.Compute_Similarity_Python import Compute_Similarity_Python

class ItemKNNCBFRecommender(object):
    """Content-based item KNN: scores items through an item-item similarity
    matrix computed from the item content matrix (ICM)."""

    def __init__(self, URM, ICM):
        """Store the interaction and content matrices in CSR format.

        Parameters
        ----------
        URM : scipy.sparse matrix
            User-item interactions.
        ICM : scipy.sparse matrix
            Item-feature matrix.
        """
        # recommend() indexes rows and filter_seen() reads indptr/indices,
        # both of which need CSR (a COO matrix supports neither).
        self.URM = URM.tocsr()
        self.ICM = ICM.tocsr()


    def fit(self, topK=50, shrink=100, normalize = True, similarity = "cosine"):
        """Compute the similarity matrix and store it in `self.W_sparse`.

        All arguments are forwarded unchanged to Compute_Similarity_Python,
        which receives the transposed ICM.
        """
        similarity_object = Compute_Similarity_Python(self.ICM.T, shrink=shrink,
                                                  topK=topK, normalize=normalize,
                                                  similarity = similarity)

        self.W_sparse = similarity_object.compute_similarity()


    def recommend(self, user_id, at=None, exclude_seen=True):
        """Return item ids ranked by decreasing score for one user.

        Parameters
        ----------
        user_id : int
            Row index of the user in the URM.
        at : int or None, default None
            Length of the returned list; None returns the full ranking.
        exclude_seen : bool, default True
            When True, items already in the user's profile are pushed to the
            bottom of the ranking.

        Returns
        -------
        numpy.ndarray
            Item ids, highest score first.
        """
        # Scores are the product of the user's interaction row and W_sparse.
        user_profile = self.URM[user_id]
        scores = user_profile.dot(self.W_sparse).toarray().ravel()

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # Rank items by decreasing score.
        ranking = scores.argsort()[::-1]

        return ranking[:at]


    def filter_seen(self, user_id, scores):
        """Set the score of every item in the user's profile to -inf.

        `scores` is modified in place and also returned.
        """
        start_pos = self.URM.indptr[user_id]
        end_pos = self.URM.indptr[user_id+1]

        user_profile = self.URM.indices[start_pos:end_pos]

        scores[user_profile] = -np.inf

        return scores

In [None]:
# Fit the content-based KNN on the genre ICM (no shrinkage, 50 neighbours)
# and score it on the validation split.
GenreRecommender = ItemKNNCBFRecommender(URM_train, ICM_genre_all)
GenreRecommender.fit(shrink=0.0, topK=50)

evaluate_algorithm(URM_val, GenreRecommender)

In [None]:
from Evaluation.Evaluator import EvaluatorHoldout
# Framework evaluator built on the validation matrix with a single cutoff of 10.
evaluator_val = EvaluatorHoldout(URM_val, cutoff_list=[10])

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on 23/10/17

@author: Maurizio Ferrari Dacrema
"""

from Recommenders.Recommender_utils import check_matrix
from Recommenders.BaseCBFRecommender import BaseItemCBFRecommender
from Recommenders.BaseSimilarityMatrixRecommender import BaseItemSimilarityMatrixRecommender
from Recommenders.IR_feature_weighting import okapi_BM_25, TF_IDF
import numpy as np

from Recommenders.Similarity.Compute_Similarity import Compute_Similarity


class ItemKNNCBFRecommender(BaseItemCBFRecommender, BaseItemSimilarityMatrixRecommender):
    """ ItemKNN recommender"""

    RECOMMENDER_NAME = "ItemKNNCBFRecommender"

    # Accepted values of the `feature_weighting` argument of fit().
    FEATURE_WEIGHTING_VALUES = ["BM25", "TF-IDF", "none"]

    def __init__(self, URM_train, ICM_train, verbose = True):
        super(ItemKNNCBFRecommender, self).__init__(URM_train, ICM_train, verbose = verbose)



    def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True, feature_weighting = "none", ICM_bias = None, **similarity_args):
        """Compute the item similarity matrix and store it in `self.W_sparse`.

        Parameters
        ----------
        topK, shrink, similarity, normalize, **similarity_args
            Forwarded to Compute_Similarity together with the transposed ICM.
        feature_weighting : str, default "none"
            One of FEATURE_WEIGHTING_VALUES; "BM25" and "TF-IDF" re-weight the
            ICM before the similarity is computed.
        ICM_bias : number or None, default None
            When given, added to every stored value of the ICM.

        Raises
        ------
        ValueError
            If `feature_weighting` is not one of FEATURE_WEIGHTING_VALUES.
        """
        self.topK = topK
        self.shrink = shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError("Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'".format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        # NOTE(review): the bias and the weighting below overwrite
        # self.ICM_train, so fitting the same instance twice applies them twice.
        if ICM_bias is not None:
            self.ICM_train.data += ICM_bias

        if feature_weighting == "BM25":
            self.ICM_train = self.ICM_train.astype(np.float32)
            self.ICM_train = okapi_BM_25(self.ICM_train)

        elif feature_weighting == "TF-IDF":
            self.ICM_train = self.ICM_train.astype(np.float32)
            self.ICM_train = TF_IDF(self.ICM_train)


        similarity_object = Compute_Similarity(self.ICM_train.T, shrink=shrink, topK=topK, normalize=normalize, similarity = similarity, **similarity_args)

        self.W_sparse = similarity_object.compute_similarity()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')

    def fit_and_ret(self, topK=50, shrink=100, similarity='cosine', normalize=True, feature_weighting = "none", ICM_bias = None, **similarity_args):
        """Run fit() with the same arguments and return the similarity matrix.

        Returns
        -------
        The value stored in `self.W_sparse` by fit().
        """
        # Delegate instead of duplicating the body of fit().
        self.fit(topK=topK, shrink=shrink, similarity=similarity, normalize=normalize,
                 feature_weighting=feature_weighting, ICM_bias=ICM_bias, **similarity_args)
        return self.W_sparse

In [None]:
# The class defined in the notebook is used here instead of the framework one:
# from Recommenders.KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender


# Neighbourhood sizes to try; MAP_per_k collects one MAP value per entry.
x_tick = [1, 5, 10, 30, 50]
MAP_per_k = []

for topK in x_tick:
    
    # A fresh recommender per setting, fitted on the genre ICM with BM25 weighting.
    recommender = ItemKNNCBFRecommender(URM_train, ICM_genre_all.tocsr())
    similarity_matrix = recommender.fit_and_ret(shrink=0.5, topK=topK, feature_weighting = 'BM25')
    # Peek at the first stored similarity values.
    print(similarity_matrix.data[:10])
    
    result_df, _ = evaluator_val.evaluateRecommender(recommender)
    
    # Row label 10 presumably corresponds to the evaluator's cutoff - TODO confirm.
    MAP_per_k.append(result_df.loc[10]["MAP"])

In [None]:
import matplotlib.pyplot as pyplot
%matplotlib inline

# Plot the MAP values collected in MAP_per_k against the tried values in x_tick.
pyplot.plot(x_tick, MAP_per_k)
pyplot.ylabel('MAP')
pyplot.xlabel('TopK')
pyplot.show()

### Subgenre Similarity

In [8]:
from Evaluation.Evaluator import EvaluatorHoldout
# Framework evaluator built on the validation matrix with a single cutoff of 10.
evaluator_val = EvaluatorHoldout(URM_val, cutoff_list=[10])

# From here on the name ItemKNNCBFRecommender refers to the framework class,
# replacing any class of the same name defined earlier in the notebook.
from Recommenders.KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender

# Neighbourhood sizes to try; MAP_per_k collects one MAP value per entry.
x_tick = [10, 12, 14, 18, 20]
MAP_per_k = []

for topK in x_tick:
    
    # A fresh recommender per setting, fitted on the subgenre ICM without shrinkage.
    recommender = ItemKNNCBFRecommender(URM_train, ICM_subgenre_all.tocsr())
    recommender.fit(shrink=0.0, topK=topK)
    
    result_df, _ = evaluator_val.evaluateRecommender(recommender)
    
    # Row label 10 presumably corresponds to the evaluator's cutoff - TODO confirm.
    MAP_per_k.append(result_df.loc[10]["MAP"])


EvaluatorHoldout: Ignoring 13645 ( 0.0%) Users that have less than 1 test interactions
ItemKNNCBFRecommender: ICM Detected 487 ( 2.7%) items with no features.
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 18059 (100.0%), 7732.31 column/sec. Elapsed time 2.34 sec
EvaluatorHoldout: Processed 13645 (100.0%) in 6.89 sec. Users per second: 1980
ItemKNNCBFRecommender: ICM Detected 487 ( 2.7%) items with no features.
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 18059 (100.0%), 7669.86 column/sec. Elapsed time 2.35 sec
EvaluatorHoldout: Processed 13645 (100.0%) in 7.07 sec. Users per second: 1930
ItemKNNCBFRecommender: ICM Detected 487 ( 2.7%) items with no features.
Unable to load Cython Compute_Similarity, reverting to Python
Similarity column 18059 (100.0%), 7709.15 column/sec. Elapsed time 2.34 sec
EvaluatorHoldout: Processed 13645 (100.0%) in 7.21 sec. Users per second: 1893
ItemKNNCBFRecommender: ICM Detected 487 ( 2.7%

In [None]:
# Shrink tuning: topK is held at 18 while the shrink term is varied.

from Evaluation.Evaluator import EvaluatorHoldout
evaluator_val = EvaluatorHoldout(URM_val, cutoff_list=[10])

from Recommenders.KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender

# Shrink values to try; MAP_per_k collects one MAP value per entry
# (the name is kept because the plotting cell reads it).
x_tick = [0.0, 0.5, 1.5, 5.0, 50.0]
MAP_per_k = []

# The loop variable is named after what it actually holds (a shrink value).
for shrink in x_tick:

    recommender = ItemKNNCBFRecommender(URM_train, ICM_subgenre_all.tocsr())
    recommender.fit(shrink=shrink, topK=18)

    result_df, _ = evaluator_val.evaluateRecommender(recommender)

    # Row label 10 presumably corresponds to the evaluator's cutoff - TODO confirm.
    MAP_per_k.append(result_df.loc[10]["MAP"])


In [None]:
import matplotlib.pyplot as pyplot
%matplotlib inline

pyplot.plot(x_tick, MAP_per_k)
pyplot.ylabel('MAP')
pyplot.xlabel('TopK')
pyplot.show()

### Evaluation

In [None]:
import os
from datetime import datetime

# CHOOSE ALGORITHM HERE
recommender = TopPopRecommender() # <-----
K = 10

# The final model is fitted on every available interaction.
recommender.fit(URM_all.tocsr())

user_test_path = 'data/data_target_users_test.csv'
user_test_dataframe = pd.read_csv(filepath_or_buffer=user_test_path,
                                sep=",",
                                dtype={0:int})

# First column of the target file: the user ids to produce recommendations for.
subm_set = user_test_dataframe.to_numpy().T[0]


subm_res = {"user_id":[], "item_list":[]}

for user_id in subm_set:
    subm_res["user_id"].append(user_id)
    res = recommender.recommend(user_id, at=K)
    # Each list is written as space-separated item ids.
    res = ' '.join(map(str, res))
    # Print the lists of user ids below 3 as a quick sanity check.
    if user_id < 3:
        print(user_id)
        print(res)
    subm_res["item_list"].append(res)


submission = pd.DataFrame.from_dict(subm_res)

now = datetime.now() # current date and time

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('subs', exist_ok=True)

submission.to_csv('subs/submission {:%Y_%m_%d %H_%M_%S}.csv'.format(now), index=False)
