# Connect to Google Drive

In [1]:
# Mount Google Drive at /gdrive (this import only exists on Colab) and switch the
# working directory to the project folder, so the relative CSV paths used by the
# later cells resolve there.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/MyDrive/RECSYS

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/MyDrive/RECSYS


# Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
from typing import Tuple, Callable, Dict, Optional, List

# Load data

In [3]:
# Read the training interactions and the list of users to recommend for.
data_train = pd.read_csv("data_train.csv")
data_target_users_test = pd.read_csv("data_target_users_test.csv")

In [4]:
# Rename the columns positionally. This assumes the CSV has exactly three columns
# in (user, item, interaction) order — TODO confirm against the file header.
data_train.columns = ["user_id", "item_id", "Interaction"]

In [5]:
print(f"The number of interactions is {len(data_train)}")

The number of interactions is 478730


In [6]:
# Distinct user and item identifiers found in the training interactions.
userID_unique = pd.unique(data_train["user_id"])
itemID_unique = pd.unique(data_train["item_id"])

In [7]:
# Size summary: how many distinct users/items there are versus the largest raw ID.
n_users = userID_unique.size
n_items = itemID_unique.size
n_interactions = data_train.shape[0]

print(f"Number of items\t {n_items}, Number of users\t {n_users}")
print(f"Max ID items\t {itemID_unique.max()}, Max Id users\t {userID_unique.max()}\n")

Number of items	 22222, Number of users	 12638
Max ID items	 22347, Max Id users	 13024



# Remove empty indices
The maximum ID of both items and users is higher than the number of unique values, so some IDs in the range never occur (empty profiles).
We remove these empty indices by creating a new mapping from the original IDs to contiguous, zero-based indices.

# Splitting data

In [8]:
# NOTE(review): scipy.sparse is already imported as `sp` in the imports cell;
# `sps` is a second alias for the same module.
import scipy.sparse as sps

# Build the user-rating matrix directly from the raw IDs. No shape is passed, so
# scipy infers it from the largest IDs; rows/columns for IDs that never occur
# stay empty.
URM_all = sps.coo_matrix((data_train["Interaction"].values,
                          (data_train["user_id"].values, data_train["item_id"].values)))

URM_all

<13025x22348 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in COOrdinate format>

In [9]:
# tocsr() returns a new matrix and leaves URM_all in COO format, so keep the
# converted matrix under its own name instead of discarding it.
URM_all_csr = URM_all.tocsr()
URM_all_csr

<13025x22348 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [10]:
def preprocess_data(interactions: pd.DataFrame):
    """Attach contiguous, zero-based user and item indices to the interactions.

    Every distinct ``user_id`` / ``item_id`` is numbered in the order returned by
    ``unique()``, and the numbers are joined back onto the frame as the
    ``mapped_user_id`` and ``mapped_item_id`` columns.

    Prints one line for users and one for items: count, smallest ID, largest ID.

    Args:
        interactions: frame with at least ``user_id`` and ``item_id`` columns.

    Returns:
        A new frame holding the original columns plus the two mapped columns.
    """
    user_ids = interactions.user_id.unique()
    item_ids = interactions.item_id.unique()

    # Same diagnostic output as before: users first, then items.
    for ids in (user_ids, item_ids):
        print(ids.size, ids.min(), ids.max())

    user_lookup = pd.DataFrame({"mapped_user_id": np.arange(user_ids.size), "user_id": user_ids})
    item_lookup = pd.DataFrame({"mapped_item_id": np.arange(item_ids.size), "item_id": item_ids})

    with_user_index = interactions.merge(user_lookup, how="inner", on="user_id")
    return with_user_index.merge(item_lookup, how="inner", on="item_id")


In [11]:
# Add the contiguous mapped_user_id / mapped_item_id columns to the training data.
interactions = preprocess_data(data_train)

12638 1 13024
22222 1 22347


In [12]:
# Display the remapped interactions (pandas truncates the rendering of long frames).
interactions

Unnamed: 0,user_id,item_id,Interaction,mapped_user_id,mapped_item_id
0,1,7,1.0,0,0
1,2,7,1.0,1,0
2,26,7,1.0,24,0
3,36,7,1.0,34,0
4,41,7,1.0,39,0
...,...,...,...,...,...
478725,12962,20368,1.0,12579,22217
478726,12985,21058,1.0,12601,22218
478727,12989,22317,1.0,12605,22219
478728,13009,22339,1.0,12624,22220


In [13]:
def dataset_splits(interactions, num_users, num_items, validation_percentage: float, testing_percentage: float,
                   seed: int = 1234):
    """Randomly split the interactions into train / validation / test URMs.

    The split is done in two steps: the test share is carved out of all the
    interactions first, then the validation share is carved out of what is left.
    ``validation_percentage`` is therefore relative to the remainder, not to the
    full dataset.

    Both steps use the same ``seed``, so repeated calls on the same data give the
    same three matrices. (Previously only the first step was seeded, which made
    the train/validation split change on every run.)

    Args:
        interactions: frame with ``mapped_user_id``, ``mapped_item_id`` and
            ``Interaction`` columns.
        num_users: number of rows of every returned matrix.
        num_items: number of columns of every returned matrix.
        validation_percentage: ``test_size`` passed to the second split.
        testing_percentage: ``test_size`` passed to the first split.
        seed: random state used for both splits.

    Returns:
        ``(urm_train, urm_validation, urm_test)`` as CSR matrices of shape
        ``(num_users, num_items)``.
    """
    (user_ids_training, user_ids_test,
     item_ids_training, item_ids_test,
     interactions_training, interactions_test) = train_test_split(interactions.mapped_user_id,
                                                        interactions.mapped_item_id,
                                                        interactions.Interaction,
                                                        test_size=testing_percentage,
                                                        shuffle=True,
                                                        random_state=seed)

    (user_ids_training, user_ids_validation,
     item_ids_training, item_ids_validation,
     interactions_training, interactions_validation) = train_test_split(user_ids_training,
                                                              item_ids_training,
                                                              interactions_training,
                                                              test_size=validation_percentage,
                                                              shuffle=True,
                                                              random_state=seed)

    def _build_urm(values, rows, cols):
        # All three matrices share one shape so they can be added and compared.
        return sp.csr_matrix((values, (rows, cols)), shape=(num_users, num_items))

    urm_train = _build_urm(interactions_training, user_ids_training, item_ids_training)
    urm_validation = _build_urm(interactions_validation, user_ids_validation, item_ids_validation)
    urm_test = _build_urm(interactions_test, user_ids_test, item_ids_test)

    return urm_train, urm_validation, urm_test



In [14]:
# Derive the matrix dimensions from the contiguous mapped indices rather than
# hard-coding them, so the cell stays correct if the input data changes.
urm_train, urm_validation, urm_test = dataset_splits(interactions,
                                                     num_users=interactions.mapped_user_id.nunique(),
                                                     num_items=interactions.mapped_item_id.nunique(),
                                                     validation_percentage=0.10,
                                                     testing_percentage=0.20)

In [15]:
# Training split: what is left after the test and validation samples are removed.
urm_train

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 344685 stored elements in Compressed Sparse Row format>

In [16]:
# Validation split: 10% of the interactions that remain after removing the test split.
urm_validation

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 38299 stored elements in Compressed Sparse Row format>

In [17]:
# Test split: 20% of all interactions.
urm_test

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 95746 stored elements in Compressed Sparse Row format>

# Collaborative Filtering


## Item-item similarity

In [18]:
def vector_similarity(urm: sp.csc_matrix, shrink: int):
    """Compute a dense item-item shrunk cosine similarity matrix.

    For items ``i`` and ``j`` the weight is
    ``dot(col_i, col_j) / (norm_i * norm_j + shrink + 1e-6)``; the ``1e-6`` term
    keeps the denominator non-zero for empty columns when ``shrink`` is 0.
    Self-similarities (the diagonal) are set to 0.

    The conversions go through ``toarray()`` / ``np.asarray`` and a 2-D column
    slice instead of ``.A`` and integer column indexing, so the function accepts
    both scipy sparse matrices and scipy sparse arrays.

    Args:
        urm: sparse user-by-item matrix (CSC makes the column slicing cheap).
        shrink: constant added to every denominator.

    Returns:
        Dense ``(num_items, num_items)`` float array. Memory grows quadratically
        with the number of items.
    """
    # Euclidean norm of every item column.
    item_weights = np.sqrt(np.asarray(urm.power(2).sum(axis=0))).ravel()

    num_items = urm.shape[1]
    urm_t = urm.T
    weights = np.empty(shape=(num_items, num_items))
    for item_id in range(num_items):
        # Slicing (not integer indexing) always yields a 2-D (num_users, 1) column.
        item_column = urm[:, item_id:item_id + 1]
        numerator = urm_t.dot(item_column).toarray().ravel()
        denominator = item_weights[item_id] * item_weights + shrink + 1e-6

        weights[item_id] = numerator / denominator
    np.fill_diagonal(weights, 0.0)
    return weights

In [19]:
# vector_similarity slices the matrix column by column, so convert to CSC first.
urm_csc = urm_train.tocsc()
# Shrink term and slice size used for the small-slice similarity run in the next cell.
shrink = 5
slice_size = 100

In [20]:
# Run the similarity on the top-left corner only (first `slice_size` users x first
# `slice_size` items) — presumably a quick sanity check before the full fit.
vector_weights = vector_similarity(urm_csc[:slice_size,:slice_size], shrink)
vector_weights

array([[0.        , 0.13907401, 0.18614064, ..., 0.        , 0.09307032,
        0.09307032],
       [0.13907401, 0.        , 0.1010205 , ..., 0.        , 0.        ,
        0.        ],
       [0.18614064, 0.1010205 , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.14854312,
        0.14854312],
       [0.09307032, 0.        , 0.        , ..., 0.14854312, 0.        ,
        0.24999997],
       [0.09307032, 0.        , 0.        , ..., 0.14854312, 0.24999997,
        0.        ]])

# Build the recommendation system

In [21]:
class CFItemKNN(object):
    """Item-based collaborative-filtering recommender.

    ``fit`` stores an item-item weight matrix produced by a pluggable similarity
    function; ``recommend`` scores every item for a user as the product of the
    user's profile row with that matrix.
    """

    def __init__(self, shrink: int):
        self.shrink = shrink
        # Item-item weights; stays None until fit() has been called.
        self.weights = None

    def fit(self, urm_train: sp.csc_matrix, similarity_function):
        """Compute and store the item-item weights.

        Args:
            urm_train: user-by-item matrix in CSC format.
            similarity_function: callable ``(urm, shrink) -> weights``.

        Raises:
            TypeError: if ``urm_train`` is not a CSC matrix.
        """
        if not sp.isspmatrix_csc(urm_train):
            raise TypeError(f"We expected a CSC matrix, we got {type(urm_train)}")

        self.weights = similarity_function(urm_train, self.shrink)

    def recommend(self, user_id: int, urm_train: sp.csr_matrix, at: Optional[int] = None, remove_seen: bool = True):
        """Return item indices for ``user_id`` ordered by decreasing score.

        Args:
            user_id: row index of the user in ``urm_train``.
            urm_train: user-by-item matrix in CSR format.
            at: maximum number of items to return; ``None`` returns all of them.
            remove_seen: when True, items stored in the user's row are never
                returned, even if ``at`` is ``None`` or exceeds the number of
                unseen items.

        Returns:
            1-D array of item indices, best first.

        Raises:
            RuntimeError: if called before ``fit``.
            TypeError: if ``urm_train`` is not a CSR matrix.
        """
        if self.weights is None:
            raise RuntimeError("CFItemKNN.recommend() called before fit()")
        # indptr/indices below are read as CSR row pointers; any other layout
        # would silently mark the wrong items as seen.
        if not sp.isspmatrix_csr(urm_train):
            raise TypeError(f"We expected a CSR matrix, we got {type(urm_train)}")

        user_profile = urm_train[user_id]
        scores = np.asarray(user_profile.dot(self.weights)).ravel()

        if remove_seen:
            user_profile_start = urm_train.indptr[user_id]
            user_profile_end = urm_train.indptr[user_id + 1]
            seen_items = urm_train.indices[user_profile_start:user_profile_end]
            scores[seen_items] = -np.inf

        ranking = np.flip(np.argsort(scores))
        if remove_seen:
            # Seen items sort to the tail with -inf; drop them so they cannot
            # leak into the result.
            ranking = ranking[~np.isneginf(scores[ranking])]
        return ranking[:at]

In [22]:
class TopPopRecommender(object):
    """Non-personalised recommender that always proposes the most popular items."""

    def fit(self, URM_train):
        """Rank items by popularity.

        Popularity is the number of stored entries in each item column, read
        from the CSC ``indptr``.
        """
        item_popularity = np.ediff1d(URM_train.tocsc().indptr)

        # We are not interested in the popularity values themselves,
        # only in the item order they induce (most popular first).
        self.popular_items = np.flip(np.argsort(item_popularity), axis=0)

    def recommend(self, user_id, at=5, urm_train=None, remove_seen=False):
        """Return the ``at`` most popular items.

        ``urm_train`` and ``remove_seen`` are optional so this method can also be
        called with the same keywords as ``CFItemKNN.recommend``. With the
        defaults the result does not depend on ``user_id``.

        Args:
            user_id: row index of the user; only used when ``remove_seen`` is True.
            at: number of items to return.
            urm_train: user-by-item matrix; required when ``remove_seen`` is True.
            remove_seen: when True, items stored in the user's row are skipped.

        Raises:
            ValueError: if ``remove_seen`` is True but ``urm_train`` is missing.
        """
        ranking = self.popular_items

        if remove_seen:
            if urm_train is None:
                raise ValueError("urm_train is required when remove_seen=True")
            # Convert only the user's row, so any row-indexable sparse format works.
            seen_items = sp.csr_matrix(urm_train[user_id]).indices
            ranking = ranking[np.isin(ranking, seen_items, invert=True)]

        return ranking[0:at]

In [23]:
# NOTE(review): shrink=50 here differs from the `shrink = 5` variable defined for
# the slice run above and from `best_shrink = 5` used for the submission — confirm
# which value is intended.
itemknn_recommender = CFItemKNN(shrink=50)
itemknn_recommender

<__main__.CFItemKNN at 0x7f657bcb73d0>

In [24]:
# Fit on the full training split. vector_similarity allocates a dense
# (num_items x num_items) array and loops over every item, so this cell can be
# slow and memory-hungry.
itemknn_recommender.fit(urm_train.tocsc(), vector_similarity)

In [25]:
# Print the top-10 unseen items for the first ten users. Users and items are the
# mapped (contiguous) indices here, not the original IDs.
for user_id in range(10):
    print(itemknn_recommender.recommend(user_id=user_id, urm_train=urm_train, at=10, remove_seen=True))

[674  44 189 284 587 285 808   4 119 517]
[ 517  189  284  808  557   46  206  288    1 1266]
[ 842 1424 4502 1427  135  557 4506 4515  841   44]
[  44   49  517  227  189  812  284  808 1265  557]
[ 189  808  517 1266  285  950 1265    0  206  284]
[ 517  808  557  206 1265  284 1266    0  809  285]
[ 885 1970  518 1997   51 1171 1262 3327 1328 1100]
[1353  559  517  813  189 1265 3592  808 1266  285]
[11493  2806   328   677   699 19655 15913 11496 16178  5871]
[ 189  517  284   44 1266  288  808    0  452  453]


# Evaluate the recommendation system

In [26]:
def recall(recommendations: np.ndarray, relevant_items: np.ndarray) -> float:
    """Fraction of the relevant items that appear in the recommendations.

    Both arrays are treated as duplicate-free (``assume_unique=True``).

    Args:
        recommendations: 1-D array of recommended item indices.
        relevant_items: 1-D array of the user's relevant item indices.

    Returns:
        Recall in [0, 1]; 0.0 when there are no relevant items, instead of
        dividing by zero.
    """
    num_relevant = relevant_items.shape[0]
    if num_relevant == 0:
        return 0.0

    # np.isin replaces np.in1d, which is deprecated in NumPy 2.
    is_relevant = np.isin(recommendations, relevant_items, assume_unique=True)

    return float(np.sum(is_relevant) / num_relevant)


def precision(recommendations: np.ndarray, relevant_items: np.ndarray) -> float:
    """Fraction of the recommendations that are relevant.

    Both arrays are treated as duplicate-free (``assume_unique=True``).

    Args:
        recommendations: 1-D array of recommended item indices.
        relevant_items: 1-D array of the user's relevant item indices.

    Returns:
        Precision in [0, 1]; 0.0 when the recommendation list is empty, instead
        of dividing by zero.
    """
    num_recommended = recommendations.shape[0]
    if num_recommended == 0:
        return 0.0

    # np.isin replaces np.in1d, which is deprecated in NumPy 2.
    is_relevant = np.isin(recommendations, relevant_items, assume_unique=True)

    return float(np.sum(is_relevant) / num_recommended)

def mean_average_precision(recommendations: np.ndarray, relevant_items: np.ndarray) -> float:
    """Average precision of one recommendation list.

    Precision is taken at every rank that holds a relevant item, summed, and
    divided by ``min(len(relevant_items), len(recommendations))``. Both arrays
    are treated as duplicate-free (``assume_unique=True``).

    Args:
        recommendations: 1-D array of recommended item indices, best first.
        relevant_items: 1-D array of the user's relevant item indices.

    Returns:
        Average precision in [0, 1]; 0.0 when either array is empty, instead of
        dividing by zero.
    """
    normaliser = min(relevant_items.shape[0], recommendations.shape[0])
    if normaliser == 0:
        return 0.0

    # np.isin replaces np.in1d, which is deprecated in NumPy 2.
    is_relevant = np.isin(recommendations, relevant_items, assume_unique=True)

    # Cumulative sum gives the hit count at rank 1, 2, 3, ...; multiplying by
    # is_relevant keeps only the ranks that hold a relevant item.
    precision_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    return float(np.sum(precision_at_k) / normaliser)

def AP(recommended_items, relevant_items):
    """Alias of ``mean_average_precision``, kept for callers that use this name."""
    return mean_average_precision(recommended_items, relevant_items)

In [27]:
def evaluator(recommender: object, urm_train: sp.csr_matrix, urm_test: sp.csr_matrix,
              recommendation_length: int = 10):
    """Average precision, recall and MAP of ``recommender`` over the test users.

    Users with no stored entry in ``urm_test`` are skipped and counted
    separately.

    Args:
        recommender: object exposing
            ``recommend(user_id=, at=, urm_train=, remove_seen=)``.
        urm_train: user-by-item training matrix, forwarded to the recommender.
        urm_test: user-by-item matrix holding the held-out relevant items.
        recommendation_length: cutoff passed as ``at`` (defaults to 10, the
            value that used to be hard-coded).

    Returns:
        ``(precision, recall, map, num_users_evaluated, num_users_skipped)``;
        the three metrics are means over the evaluated users, or 0 if none were.
    """
    # indptr/indices are read as CSR row pointers below, so make sure both
    # matrices really are CSR (tocsr() returns the same object if they are).
    urm_train = urm_train.tocsr()
    urm_test = urm_test.tocsr()

    accum_precision = 0
    accum_recall = 0
    accum_map = 0

    num_users = urm_train.shape[0]

    num_users_evaluated = 0
    num_users_skipped = 0
    for user_id in range(num_users):
        user_profile_start = urm_test.indptr[user_id]
        user_profile_end = urm_test.indptr[user_id + 1]

        relevant_items = urm_test.indices[user_profile_start:user_profile_end]

        if relevant_items.size == 0:
            num_users_skipped += 1
            continue

        recommendations = recommender.recommend(user_id=user_id,
                                                at=recommendation_length,
                                                urm_train=urm_train,
                                                remove_seen=True)

        accum_precision += precision(recommendations, relevant_items)
        accum_recall += recall(recommendations, relevant_items)
        accum_map += mean_average_precision(recommendations, relevant_items)

        num_users_evaluated += 1

    # max(..., 1) avoids a division by zero when every user was skipped.
    denominator = max(num_users_evaluated, 1)
    accum_precision /= denominator
    accum_recall /= denominator
    accum_map /= denominator

    return accum_precision, accum_recall, accum_map, num_users_evaluated, num_users_skipped

In [28]:
# Evaluate the shrink=50 item-KNN model on the test split.
# NOTE(review): no visible cell uses urm_validation for tuning — confirm whether
# hyper-parameter selection happened elsewhere.
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = evaluator(itemknn_recommender,
                                                                                            urm_train,
                                                                                            urm_test)

In [29]:
# (precision@10, recall@10, MAP@10, users evaluated, users skipped)
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped

(0.061300781995044025, 0.10489312222895496, 0.05567696414929808, 10486, 2152)

# Prepare submission

In [40]:
# Maybe it was a mistake to also add the test set here.
# NOTE(review): best_shrink = 5 differs from the shrink=50 model evaluated above,
# and no evaluation of shrink=5 is visible in this notebook — confirm the choice.
best_shrink = 5
# Despite its name, this matrix is the sum of the train, validation AND test splits.
urm_train_validation = urm_train + urm_validation + urm_test
urm_train_validation

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [31]:
# Refit the item-KNN model on all available interactions for the final submission.
best_recommender = CFItemKNN(shrink=best_shrink)
best_recommender.fit(urm_train_validation.tocsc(), vector_similarity)

## Top popular for users without interactions

In [32]:
# Fallback model: ranks items by the number of stored interactions per item.
toppop_recommender = TopPopRecommender()
toppop_recommender.fit(urm_train_validation)

In [33]:
# Original (unmapped) IDs of the users the submission must cover.
users_to_recommend = np.array(data_target_users_test["user_id"])
len(users_to_recommend)

10882

In [34]:
# Reverse lookup: mapped item index -> original item ID.
# NOTE(review): prepare_submission builds its own local copy of this dict, so
# this global is not what that function reads.
mapping_to_item_id = dict(zip(interactions.mapped_item_id, interactions.item_id))

In [35]:
def prepare_submission(interactions: pd.DataFrame, users_to_recommend: np.ndarray, urm_train: sp.csr_matrix,
                       recommender: object, toppop_recommender: object, recommendation_length: int = 10):
    """Build the list of ``(user_id, [item_id, ...])`` rows for the submission.

    Target users that appear in ``interactions`` get recommendations from
    ``recommender``; the remaining target users get the output of
    ``toppop_recommender``. All IDs in the result are original IDs, not mapped
    indices.

    Args:
        interactions: frame with ``user_id``, ``item_id``, ``mapped_user_id`` and
            ``mapped_item_id`` columns.
        users_to_recommend: original IDs of the target users.
        urm_train: matrix forwarded to ``recommender.recommend``.
        recommender: object exposing
            ``recommend(user_id=, urm_train=, at=, remove_seen=)``.
        toppop_recommender: object exposing ``recommend(user_id, at=)``.
        recommendation_length: number of items per user, used for both
            recommenders (defaults to 10).

    Returns:
        List of ``(user_id, item_ids)`` tuples: known users first, in order of
        first appearance in ``interactions``, then the unknown users in sorted
        order.
    """
    # Target users with at least one interaction, along with their mapped index.
    known_users = interactions.loc[interactions.user_id.isin(users_to_recommend),
                                   ["user_id", "mapped_user_id"]].drop_duplicates()

    mapping_to_item_id = dict(zip(interactions.mapped_item_id, interactions.item_id))

    def _to_original_ids(recommendations):
        # Translate mapped item indices back to the original item IDs.
        return [mapping_to_item_id[item_id] for item_id in recommendations]

    submission = []
    for user_id, mapped_user_id in known_users.itertuples(index=False, name=None):
        recommendations = recommender.recommend(user_id=mapped_user_id,
                                                urm_train=urm_train,
                                                at=recommendation_length,
                                                remove_seen=True)
        submission.append((user_id, _to_original_ids(recommendations)))

    # Assign the top popular items to the target users without interactions.
    users_without_info = np.setdiff1d(users_to_recommend, known_users.user_id.to_numpy())
    for user_id in users_without_info:
        recommendations = toppop_recommender.recommend(user_id, at=recommendation_length)
        submission.append((user_id, _to_original_ids(recommendations)))

    return submission

In [36]:
# Item-KNN for users with interactions, top-popular for the rest.
submission = prepare_submission(interactions, users_to_recommend, urm_train_validation, best_recommender, toppop_recommender)

In [37]:
# Sanity check: expected to match len(users_to_recommend).
len(submission)

10882

In [38]:
def write_submission(submissions, path: str = "./submission.csv"):
    """Write the recommendations as a two-column CSV file.

    The file starts with the header ``user_id,item_list``. Every following line
    is ``<user_id>,<items>``, with the items separated by single spaces.

    Args:
        submissions: iterable of ``(user_id, items)`` pairs.
        path: destination file, overwritten if it exists. Defaults to
            ``./submission.csv``, the location that used to be hard-coded.
    """
    with open(path, "w") as f:
        f.write("user_id,item_list\n")
        f.writelines(
            f"{user_id},{' '.join(str(item) for item in items)}\n"
            for user_id, items in submissions
        )


In [39]:
# Writes ./submission.csv relative to the current working directory.
write_submission(submission)