In [21]:
import os
from typing import Tuple, Callable, Dict, Optional, List
from Utils.writeSubmission import write_submission


import numpy as np
import pandas as pd
import scipy.sparse as sp

from sklearn.model_selection import train_test_split
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Create URM

In [22]:
# Load the raw interaction log and discard the 'Impressions' column in one
# step; the cells visible in this notebook only read 'UserID', 'ItemID'
# and 'Data'.
dataset = (
    pd.read_csv('../Input/interactions_and_impressions.csv')
    .drop(columns=['Impressions'])
)

  dataset = pd.read_csv('../Input/interactions_and_impressions.csv')


In [23]:
# Distinct identifiers appearing in the interaction log; their counts are
# used later as the dimensions of the dense URM.
userIDS = dataset['UserID'].unique()
itemIDS = dataset['ItemID'].unique()

# Coordinate-format view of the log: one stored entry per interaction row,
# placed at (UserID, ItemID) and carrying the row's 'Data' value.
datasetCOO = sp.coo_matrix(
    (
        dataset["Data"].values,
        (dataset["UserID"].values, dataset["ItemID"].values),
    )
)

In [24]:
# Build the dense binary user-item matrix: a cell is set to 1 when the
# interaction log contains at least one entry for that (user, item) pair
# whose 'Data' value is 0; every other cell stays 0.
# NOTE(review): presumably Data == 0 encodes the interaction type of
# interest - confirm against the dataset description.
# The (len(userIDS), len(itemIDS)) shape assumes the IDs are contiguous and
# start at 0; an ID outside that range raises IndexError on assignment.
URM = np.zeros((len(userIDS), len(itemIDS)), dtype=int)

# A single vectorised fancy-index assignment replaces the per-entry Python
# loop: select the stored entries equal to 0 and mark their coordinates.
zero_valued = datasetCOO.data == 0
URM[datasetCOO.row[zero_valued], datasetCOO.col[zero_valued]] = 1

URM

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [25]:
# Convert the dense URM to Compressed Sparse Row format; only the non-zero
# cells are stored. Note that this rebinds `URM`, so re-running the cell
# simply re-wraps the already-sparse matrix.
URM = sp.csr_matrix(URM)
URM

<41629x24507 sparse matrix of type '<class 'numpy.int64'>'
	with 1051828 stored elements in Compressed Sparse Row format>

## Train / Validation / Test Split

In [26]:
# First hold out a test set: 80% of the interactions stay in URM_train,
# the rest go to URM_test.
# NOTE(review): no random seed is passed here - presumably the helper samples
# randomly, so the split may differ between runs; confirm and seed if needed.
URM_train, URM_test = split_train_in_two_percentage_global_sample(URM, train_percentage = 0.80)
# Then split the remaining training data 80/20 again to obtain a validation
# set, i.e. roughly 64% train / 16% validation / 20% test overall.
# This rebinds URM_train, so the cell must be run as a whole.
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_train, train_percentage = 0.80)



In [27]:
# Display the training split summary (shape, dtype, stored elements).
URM_train

<41629x24507 sparse matrix of type '<class 'numpy.float64'>'
	with 673170 stored elements in Compressed Sparse Row format>

In [28]:
# Display the validation split summary (shape, dtype, stored elements).
URM_validation

<41629x24507 sparse matrix of type '<class 'numpy.float64'>'
	with 168292 stored elements in Compressed Sparse Row format>

In [29]:
# Display the test split summary (shape, dtype, stored elements).
URM_test

<41629x24507 sparse matrix of type '<class 'numpy.float64'>'
	with 210366 stored elements in Compressed Sparse Row format>

## Cosine Similarity

In [30]:
def naive_similarity(urm: sp.csc_matrix, shrink: int):
    """Shrunk cosine similarity between all column pairs, one pair at a time.

    Reference implementation: every (i, j) pair is handled with explicit
    Python loops, so the cost grows quadratically with the number of columns.

    Args:
        urm: sparse matrix whose columns are the profiles to compare.
        shrink: constant added to the denominator of every similarity.

    Returns:
        Dense ``numpy.ndarray`` of shape ``(n_columns, n_columns)`` whose
        diagonal is forced to 0.
    """
    n_items = urm.shape[1]
    similarity = np.empty(shape=(n_items, n_items))

    for i in range(n_items):
        col_i = urm[:, i]  # profile of item i as a column vector

        for j in range(n_items):
            col_j = urm[:, j]  # profile of item j as a column vector

            # Dot product of the two profiles, extracted from the 1x1 result.
            dot_ij = col_i.T.dot(col_j).todense()[0, 0]

            norm_i = np.sqrt(np.sum(col_i.power(2)))
            norm_j = np.sqrt(np.sum(col_j.power(2)))

            # The 1e-6 term keeps the denominator non-zero even when both
            # norms and the shrink term are 0.
            similarity[i, j] = dot_ij / (norm_i * norm_j + shrink + 1e-6)

    # An item must never be its own neighbour.
    np.fill_diagonal(similarity, 0.0)
    return similarity

In [31]:
def vector_similarity(urm: sp.csc_matrix, shrink: int):
    """Shrunk cosine similarity between all column pairs, one row at a time.

    The column norms are computed once up front; each iteration then fills a
    whole row of the result with a single sparse matrix-vector product.

    Args:
        urm: sparse matrix whose columns are the profiles to compare.
        shrink: constant added to the denominator of every similarity.

    Returns:
        Dense ``numpy.ndarray`` of shape ``(n_columns, n_columns)`` whose
        diagonal is forced to 0.
    """
    n_items = urm.shape[1]

    # Euclidean norm of every column, as a flat 1-D array.
    column_norms = np.sqrt(np.sum(urm.power(2), axis=0)).A.flatten()

    items_by_users = urm.T
    similarity = np.empty(shape=(n_items, n_items))

    for current in range(n_items):
        # Dot product of the current column with every column.
        dot_products = items_by_users.dot(urm[:, current]).A.flatten()
        # The 1e-6 term keeps the denominator non-zero when norms and
        # shrink are all 0.
        similarity[current] = dot_products / (
            column_norms[current] * column_norms + shrink + 1e-6
        )

    # An item must never be its own neighbour.
    np.fill_diagonal(similarity, 0.0)
    return similarity

In [32]:
def matrix_similarity(urm: sp.csc_matrix, shrink: int):
    """Shrunk cosine similarity between all column pairs via matrix products.

    Both the numerator (all pairwise dot products) and the denominator (all
    pairwise norm products) are obtained with a single product each.

    Args:
        urm: sparse matrix whose columns are the profiles to compare.
        shrink: constant added to the denominator of every similarity.

    Returns:
        Dense square matrix of side ``n_columns`` with a zero diagonal. The
        sparse-by-dense division yields a ``numpy.matrix`` rather than a
        plain ``ndarray``.
    """
    # Column norms kept as a (1, n_columns) array so that the outer product
    # below is a plain matrix multiplication.
    column_norms = np.sqrt(np.sum(urm.power(2), axis=0)).A

    dot_products = urm.T.dot(urm)
    norm_products = column_norms.T.dot(column_norms)

    # The 1e-6 term keeps the denominator non-zero when norms and shrink
    # are all 0.
    similarity = dot_products / (norm_products + shrink + 1e-6)

    # An item must never be its own neighbour.
    np.fill_diagonal(similarity, 0.0)
    return similarity

In [33]:
# Scratch values for experimenting with the similarity functions above.
# NOTE(review): none of these three names is referenced by the later cells
# visible in this notebook (the recommender cells pass their own shrink and
# call URM_train.tocsc() again) - confirm whether they are still needed.
urm_csc = URM_train.tocsc()
shrink = 5
slice_size = 100

# Collaborative Filtering ItemKNN Recommender

In [34]:
class CFItemKNN(object):
    """Item-based collaborative-filtering KNN recommender.

    The model is an item-item weight matrix produced by a pluggable
    similarity function; a user's scores are the product of the user's
    interaction row with that matrix.
    """

    def __init__(self, shrink: int):
        """Store the shrink term; the model stays unfitted until ``fit``.

        Args:
            shrink: value forwarded to the similarity function on ``fit``.
        """
        self.shrink = shrink
        self.weights = None

    def fit(self, urm_train: sp.csc_matrix, similarity_function):
        """Compute and store the item-item weights.

        Args:
            urm_train: user-item interactions in CSC format.
            similarity_function: callable ``(urm, shrink) -> weights``
                returning a square item-item matrix.

        Raises:
            TypeError: if ``urm_train`` is not a CSC matrix.
        """
        if not sp.isspmatrix_csc(urm_train):
            raise TypeError(f"We expected a CSC matrix, we got {type(urm_train)}")

        self.weights = similarity_function(urm_train, self.shrink)

    def recommend(self, user_id: int, urm_train: sp.csr_matrix, at: Optional[int] = None, remove_seen: bool = True):
        """Rank items for one user, best first.

        Args:
            user_id: row index of the user in ``urm_train``.
            urm_train: user-item interactions; the seen-item lookup reads
                ``indptr``/``indices`` by row, so it must be in CSR format.
            at: number of items to return; ``None`` returns the full ranking.
            remove_seen: when True, items already stored in the user's row
                are pushed to the end of the ranking.

        Returns:
            1-D array of item indices sorted by decreasing score.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.weights is None:
            raise RuntimeError("CFItemKNN.recommend() called before fit()")

        user_profile = urm_train[user_id]

        # The product is an ndarray, a numpy.matrix or a sparse matrix
        # depending on the type of the weights; normalise all three to a
        # flat float ndarray. (Calling ``.A`` directly fails for ndarray
        # weights.) A float dtype is required to store -inf below.
        scores = user_profile.dot(self.weights)
        if sp.issparse(scores):
            scores = scores.toarray()
        ranking = np.asarray(scores, dtype=np.float64).flatten()

        if remove_seen:
            user_profile_start = urm_train.indptr[user_id]
            user_profile_end = urm_train.indptr[user_id + 1]

            seen_items = urm_train.indices[user_profile_start:user_profile_end]

            # -inf guarantees seen items sort after every unseen item.
            ranking[seen_items] = -np.inf

        ranking = np.flip(np.argsort(ranking))
        return ranking[:at]

In [35]:
# Instantiate the recommender used for the timing experiment.
# NOTE(review): shrink=50 here differs from the notebook-level `shrink = 5`
# defined earlier - confirm which value is intended.
itemknn_recommender = CFItemKNN(shrink=50)
itemknn_recommender

<__main__.CFItemKNN at 0x7f8838a1bac0>

In [36]:
%%time

# fit() rejects anything that is not CSC, hence the explicit conversion.
itemknn_recommender.fit(URM_train.tocsc(), matrix_similarity)

CPU times: user 3.58 s, sys: 8.07 s, total: 11.6 s
Wall time: 12.4 s


# Submission

In [37]:
# NOTE(review): no hyper-parameter search that selects 10 is visible in this
# notebook - confirm where this value comes from.
best_shrink = 10
# Merge the training and validation interactions so the final model is
# fitted on both.
urm_train_validation = URM_train + URM_validation

In [38]:
# Final model: fitted on train + validation with the chosen shrink term.
best_recommender = CFItemKNN(shrink=best_shrink)
# fit() rejects anything that is not CSC, hence the explicit conversion.
best_recommender.fit(urm_train_validation.tocsc(), matrix_similarity)

In [39]:
# Target user IDs as a plain Python list.
# NOTE(review): this list is not referenced by the later cell visible here,
# which hands the CSV path to write_submission instead - confirm it is needed.
users_to_recommend = (
    pd.read_csv("../Input/data_target_users_test.csv")['user_id'].tolist()
)

In [40]:
# The recommender was fitted on train + validation, so the same matrix must
# be supplied for the user profiles and the seen-item filter. Passing only
# URM_train would drop the validation interactions from each profile and
# allow already-seen validation items to be recommended again.
# NOTE(review): assumes write_submission forwards `urm_train` to
# recommender.recommend() - confirm against Utils.writeSubmission.
write_submission(
    recommender=best_recommender,
    urm_train=urm_train_validation,
    target_users_path="../Input/data_target_users_test.csv",
    out_path='../Output/{}_submission.csv'.format('CFItemKNN'),
)