In [1]:
import pandas as pd
import scipy as sc
import scipy.sparse as sps
import numpy as np
import time
import sys
import os
from matplotlib import pyplot
from sklearn.preprocessing import MultiLabelBinarizer

# locations of the input CSV files
train_file = '../data/train.csv'
target_playlists_file = '../data/target_playlists.csv'
tracks_file = '../data/tracks.csv'

# load every file and give its columns canonical names
train_data = pd.read_csv(train_file)
train_data.columns = ['playlist_id', 'track_id']

tracks_data = pd.read_csv(tracks_file)
tracks_data.columns = ['track_id', 'album_id', 'artist_id', 'duration_sec']

target_playlists = pd.read_csv(target_playlists_file)
target_playlists.columns = ['playlist_id']

# URM (playlists x tracks): one row per playlist, one binary column per track id seen in train
grouped_playlists = train_data.groupby('playlist_id', as_index=True).apply(lambda x: list(x['track_id']))
URM = MultiLabelBinarizer(sparse_output=True).fit_transform(grouped_playlists)
URM_csr = URM.tocsr()

# ICM (tracks x features): every attribute is one-hot encoded on its own and the
# resulting blocks are stacked side by side.
# NOTE(review): the sort by track_id is presumably a no-op because tracks.csv is
# already ordered by track_id -- confirm against the data.
artists = tracks_data.reindex(columns=['track_id', 'artist_id']).sort_values(by='track_id')
artists_list = [[artist_id] for artist_id in artists['artist_id']]
icm_artists = MultiLabelBinarizer(sparse_output=True).fit_transform(artists_list)
icm_artists_csr = icm_artists.tocsr()

albums = tracks_data.reindex(columns=['track_id', 'album_id']).sort_values(by='track_id')
albums_list = [[album_id] for album_id in albums['album_id']]
icm_albums = MultiLabelBinarizer(sparse_output=True).fit_transform(albums_list)
icm_albums_csr = icm_albums.tocsr()

durations = tracks_data.reindex(columns=['track_id', 'duration_sec']).sort_values(by='track_id')
durations_list = [[duration] for duration in durations['duration_sec']]
icm_durations = MultiLabelBinarizer(sparse_output=True).fit_transform(durations_list)
icm_durations_csr = icm_durations.tocsr()

# column order of the stacked ICM: albums, artists, durations
ICM = sps.hstack((icm_albums_csr, icm_artists_csr, icm_durations_csr))
ICM_csr = ICM.tocsr()

In [2]:
%load_ext Cython

In [3]:
%%cython
import time, sys
import numpy as np
cimport numpy as np
from cpython.array cimport array, clone
from libc.math cimport sqrt
import scipy.sparse as sps

def check_matrix(X, format='csc', dtype=np.float32):
    """
    Return X as a sparse matrix of the requested format and dtype.

    :param X:       scipy sparse matrix
    :param format:  one of 'csc', 'csr', 'coo', 'dok', 'bsr', 'dia', 'lil';
                    any other value leaves the storage format untouched
    :param dtype:   dtype the returned matrix is cast to
    :return:        X converted with the matching ``to<format>()`` method when it
                    is not already of that type, always cast with ``astype(dtype)``
    """
    known_formats = (
        ('csc', sps.csc_matrix),
        ('csr', sps.csr_matrix),
        ('coo', sps.coo_matrix),
        ('dok', sps.dok_matrix),
        ('bsr', sps.bsr_matrix),
        ('dia', sps.dia_matrix),
        ('lil', sps.lil_matrix),
    )

    for format_name, matrix_type in known_formats:
        if format == format_name:
            if not isinstance(X, matrix_type):
                # the conversion method is always named "to" + format name
                return getattr(X, 'to' + format_name)().astype(dtype)
            break

    # already in the requested format, or unknown format: only cast the dtype
    return X.astype(dtype)

    

import time, sys
import numpy as np
cimport numpy as np
from cpython.array cimport array, clone
from libc.math cimport sqrt
import scipy.sparse as sps

cdef class Compute_Similarity_Cython:
    """
    Shrunk cosine similarity between the columns of a sparse matrix.

    The matrix is stored twice, in CSR form (row -> columns) and in CSC form
    (column -> rows), so that for every column the rows containing it and, for
    each of those rows, the other columns it contains can be walked without
    any search. Only the TopK most similar columns are kept for each column,
    unless TopK is 0, in which case a dense matrix is produced.
    """

    cdef int TopK
    cdef long n_columns, n_rows
    cdef double[:] this_item_weights
    cdef int[:] this_item_weights_mask, this_item_weights_id
    cdef int this_item_weights_counter
    cdef int[:] user_to_item_row_ptr, user_to_item_cols
    cdef int[:] item_to_user_rows, item_to_user_col_ptr
    cdef double[:] user_to_item_data, item_to_user_data
    cdef double[:] sumOfSquared, sumOfSquared_to_1_minus_alpha, sumOfSquared_to_alpha
    cdef int shrink, normalize, adjusted_cosine, pearson_correlation, tanimoto_coefficient, asymmetric_cosine, dice_coefficient, tversky_coefficient
    cdef float asymmetric_alpha, tversky_alpha, tversky_beta
    cdef int use_row_weights
    cdef double[:] row_weights
    cdef double[:,:] W_dense

    def __init__(self, dataMatrix, topK = 100, shrink=5, normalize = True, row_weights = None):
        """
        Computes the cosine similarity on the columns of dataMatrix
        If it is computed on URM=|users|x|items|, pass the URM as is.
        If it is computed on ICM=|items|x|features|, pass the ICM transposed.
        :param dataMatrix:          scipy sparse matrix, it is copied and never modified
        :param topK:                number of most similar columns kept per column, 0 builds a dense matrix
        :param shrink:              shrink term added to the denominator
        :param normalize:           If True divide the dot product by the product of the norms
        :param row_weights:         Multiply the values in each row by a specified value. Array
        :raises ValueError:         if row_weights does not have one entry per row of dataMatrix

        """

        super(Compute_Similarity_Cython, self).__init__()

        self.n_columns = dataMatrix.shape[1]
        self.n_rows = dataMatrix.shape[0]
        self.shrink = shrink
        self.normalize = normalize

        # Only the plain cosine is implemented here: the asymmetric variant is never
        # enabled, therefore its auxiliary arrays are never allocated
        self.asymmetric_cosine = False

        self.TopK = min(topK, self.n_columns)
        self.this_item_weights = np.zeros(self.n_columns, dtype=np.float64)
        self.this_item_weights_id = np.zeros(self.n_columns, dtype=np.int32)
        self.this_item_weights_mask = np.zeros(self.n_columns, dtype=np.int32)
        self.this_item_weights_counter = 0

        # Copy data to avoid altering the original object
        dataMatrix = dataMatrix.copy()

        # Compute the norm of every column to be used in normalization.
        # The square root is required: the cosine divides the dot product by the
        # product of the norms, not by the product of the sums of squares.
        # The attribute keeps its historical name although it now holds the norms.
        sum_of_squared = np.array(dataMatrix.power(2).sum(axis=0), dtype=np.float64).ravel()
        self.sumOfSquared = np.sqrt(sum_of_squared)

        # Row weights do not affect the norms, they are applied while the dot
        # products are accumulated in computeItemSimilarities
        self.use_row_weights = False

        if row_weights is not None:

            if dataMatrix.shape[0] != len(row_weights):
                raise ValueError("Cosine_Similarity: provided row_weights and dataMatrix have different number of rows."
                                 "Row_weights has {} rows, dataMatrix has {}.".format(len(row_weights), dataMatrix.shape[0]))

            self.use_row_weights = True
            self.row_weights = np.array(row_weights, dtype=np.float64)

        # Row-major view: for every row, the columns it contains
        dataMatrix = check_matrix(dataMatrix, 'csr')

        self.user_to_item_row_ptr = dataMatrix.indptr
        self.user_to_item_cols = dataMatrix.indices
        self.user_to_item_data = np.array(dataMatrix.data, dtype=np.float64)

        # Column-major view: for every column, the rows that contain it
        dataMatrix = check_matrix(dataMatrix, 'csc')
        self.item_to_user_rows = dataMatrix.indices
        self.item_to_user_col_ptr = dataMatrix.indptr
        self.item_to_user_data = np.array(dataMatrix.data, dtype=np.float64)

        if self.TopK == 0:
            self.W_dense = np.zeros((self.n_columns,self.n_columns))


    cdef int[:] getUsersThatRatedItem(self, long item_id):
        # Row indices of the non-zero entries of column item_id (CSC slice)
        return self.item_to_user_rows[self.item_to_user_col_ptr[item_id]:self.item_to_user_col_ptr[item_id+1]]

    cdef int[:] getItemsRatedByUser(self, long user_id):
        # Column indices of the non-zero entries of row user_id (CSR slice)
        return self.user_to_item_cols[self.user_to_item_row_ptr[user_id]:self.user_to_item_row_ptr[user_id+1]]


    cdef computeItemSimilarities(self, long item_id_input):
        """
        Accumulate in self.this_item_weights the (row weighted) dot product between
        column item_id_input and every other column sharing at least one row with it.

        The implementation here used is:
        - Select the first item
        - Initialize a zero valued array for the similarities
        - Get the users who rated the first item
        - Loop through the users
        -- Given a user, get the items he rated (second item)
        -- Update the similarity of the items he rated

        The ids of the touched columns are listed in self.this_item_weights_id,
        self.this_item_weights_counter is their number.
        """

        cdef long user_index, user_id, item_index, item_id, item_id_second
        cdef int[:] users_that_rated_item = self.getUsersThatRatedItem(item_id_input)
        cdef int[:] items_rated_by_user
        cdef double rating_item_input, rating_item_second, row_weight

        # Clean previous item: only the entries touched by the previous call are reset
        for item_index in range(self.this_item_weights_counter):
            item_id = self.this_item_weights_id[item_index]
            self.this_item_weights_mask[item_id] = False
            self.this_item_weights[item_id] = 0.0

        self.this_item_weights_counter = 0

        # Get users that rated the items
        for user_index in range(len(users_that_rated_item)):

            user_id = users_that_rated_item[user_index]
            rating_item_input = self.item_to_user_data[self.item_to_user_col_ptr[item_id_input]+user_index]

            if self.use_row_weights:
                row_weight = self.row_weights[user_id]
            else:
                row_weight = 1.0

            # Get all items rated by that user
            items_rated_by_user = self.getItemsRatedByUser(user_id)

            for item_index in range(len(items_rated_by_user)):

                item_id_second = items_rated_by_user[item_index]

                # Do not compute the similarity on the diagonal
                if item_id_second != item_id_input:
                    # Increment similarity
                    rating_item_second = self.user_to_item_data[self.user_to_item_row_ptr[user_id]+item_index]

                    self.this_item_weights[item_id_second] += rating_item_input*rating_item_second*row_weight

                    # Update global data structure: remember the column the first time it is touched
                    if not self.this_item_weights_mask[item_id_second]:

                        self.this_item_weights_mask[item_id_second] = True
                        self.this_item_weights_id[self.this_item_weights_counter] = item_id_second
                        self.this_item_weights_counter += 1


    def compute_similarity(self, start_col=None, end_col=None):
        """
        Compute the similarity for the given dataset
        :param self:
        :param start_col: column to begin with
        :param end_col: column to stop before, end_col is excluded
        :return: dense numpy array when TopK is 0, otherwise a float32 scipy csr_matrix;
                 in both cases the shape is (n_columns, n_columns) and column j
                 holds the similarities computed for item j
        """

        cdef int print_block_size = 500
        cdef int itemIndex, innerItemIndex, item_id, local_topK
        cdef Py_ssize_t topKItemIndex
        # np.argpartition and np.argsort return arrays of dtype intp, whose width differs
        # from C long on some platforms (e.g. 64 bit Windows): use intp explicitly
        cdef np.intp_t[:] top_k_idx

        # Declare numpy data type to use vector indexing and simplify the topK selection code
        cdef np.ndarray[np.intp_t, ndim=1] top_k_partition, top_k_partition_sorting
        cdef np.ndarray[np.float64_t, ndim=1] this_item_weights_np
        cdef long processedItems = 0
        cdef long n_items_to_process

        # Data structure to incrementally build sparse matrix
        # Preinitialize max possible length
        cdef double[:] values = np.zeros((self.n_columns*self.TopK))
        cdef int[:] rows = np.zeros((self.n_columns*self.TopK,), dtype=np.int32)
        cdef int[:] cols = np.zeros((self.n_columns*self.TopK,), dtype=np.int32)
        cdef long sparse_data_pointer = 0
        cdef int start_col_local = 0, end_col_local = self.n_columns

        if start_col is not None and start_col>0 and start_col<self.n_columns:
            start_col_local = start_col

        if end_col is not None and end_col>start_col_local and end_col<self.n_columns:
            end_col_local = end_col

        n_items_to_process = end_col_local - start_col_local

        start_time = time.time()
        last_print_time = start_time

        itemIndex = start_col_local

        # Compute all similarities for each item
        while itemIndex < end_col_local:

            processedItems += 1

            if processedItems % print_block_size==0 or processedItems==n_items_to_process:

                current_time = time.time()

                # Set block size to the number of items necessary in order to print every 30 seconds.
                # The small constant protects against a zero elapsed time on coarse clocks
                itemPerSec = processedItems/(current_time-start_time + 1e-9)

                # Keep the block size at least 1 (it is used as a modulus) and within the C int range
                print_block_size = int(max(1, min(itemPerSec*30, 1e9)))

                if current_time - last_print_time > 30  or processedItems==n_items_to_process:

                    print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format(
                        processedItems, processedItems*1.0/n_items_to_process*100, itemPerSec, (current_time-start_time) / 60))

                    last_print_time = current_time

                    sys.stdout.flush()
                    sys.stderr.flush()


            # Computed similarities go in self.this_item_weights
            self.computeItemSimilarities(itemIndex)

            # Apply normalization and shrinkage, ensure denominator != 0
            if self.normalize:
                for innerItemIndex in range(self.n_columns):

                    if self.asymmetric_cosine:
                        # Never taken: asymmetric_cosine is always False in this class
                        self.this_item_weights[innerItemIndex] /= self.sumOfSquared_to_alpha[itemIndex] * self.sumOfSquared_to_1_minus_alpha[innerItemIndex]\
                                                             + self.shrink + 1e-6

                    else:
                        self.this_item_weights[innerItemIndex] /= self.sumOfSquared[itemIndex] * self.sumOfSquared[innerItemIndex]\
                                                             + self.shrink + 1e-6

            elif self.shrink != 0:
                for innerItemIndex in range(self.n_columns):
                    self.this_item_weights[innerItemIndex] /= self.shrink


            if self.TopK == 0:

                for innerItemIndex in range(self.n_columns):
                    self.W_dense[innerItemIndex,itemIndex] = self.this_item_weights[innerItemIndex]

            else:

                # Sort indices and select TopK
                # Sorting is done in three steps. Faster than plain np.argsort for higher number of items
                # because we avoid sorting elements we already know we don't care about
                # - Partition the data to extract the set of TopK items, this set is unsorted
                # - Sort only the TopK items, discarding the rest
                # - Get the original item index
                #

                this_item_weights_np = np.zeros(self.n_columns, dtype=np.float64)

                # Add weights in the same ordering as the self.this_item_weights_id data structure.
                # The sign is flipped so that an ascending sort puts the largest weights first
                for innerItemIndex in range(self.this_item_weights_counter):
                    item_id = self.this_item_weights_id[innerItemIndex]
                    this_item_weights_np[innerItemIndex] = - self.this_item_weights[item_id]


                local_topK = min([self.TopK, self.this_item_weights_counter])

                # Get the unordered set of topK items
                top_k_partition = np.argpartition(this_item_weights_np, local_topK-1)[0:local_topK]
                # Sort only the elements in the partition
                top_k_partition_sorting = np.argsort(this_item_weights_np[top_k_partition])
                # Get original index
                top_k_idx = top_k_partition[top_k_partition_sorting]


                # Incrementally build sparse matrix, do not add zeros
                for innerItemIndex in range(len(top_k_idx)):

                    topKItemIndex = top_k_idx[innerItemIndex]

                    # top_k_idx holds positions in this_item_weights_id, map them back to item ids
                    item_id = self.this_item_weights_id[topKItemIndex]

                    if self.this_item_weights[item_id] != 0.0:

                        values[sparse_data_pointer] = self.this_item_weights[item_id]
                        rows[sparse_data_pointer] = item_id
                        cols[sparse_data_pointer] = itemIndex

                        sparse_data_pointer += 1

            itemIndex += 1

        # End while on columns

        if self.TopK == 0:

            return np.array(self.W_dense)

        else:

            # Drop the unused tail of the preallocated buffers
            values = np.array(values[0:sparse_data_pointer])
            rows = np.array(rows[0:sparse_data_pointer])
            cols = np.array(cols[0:sparse_data_pointer])

            W_sparse = sps.csr_matrix((values, (rows, cols)),
                                    shape=(self.n_columns, self.n_columns),
                                    dtype=np.float32)

            return W_sparse


def cosine_common(X):
    """
    Pairwise cosine similarity between the columns of X, restricted for every
    pair of columns to the rows in which both columns have a stored value.
    :param X: scipy sparse matrix, it is converted to float32 CSC
    :return:
        result: dense float32 matrix, result[i, j] is the cosine of columns i and j
                over their common rows, 0 where there is no common row or i == j
        common: dense int32 matrix with the number of common rows of every column pair
    """

    X = check_matrix(X, 'csc')

    # typed memoryviews give C speed access to the CSC arrays
    cdef int [:] indices = X.indices
    cdef int [:] indptr = X.indptr
    cdef float [:] data = X.data

    # output matrices
    cdef int n_cols = X.shape[1]
    cdef np.ndarray[np.float32_t, ndim=2] result = np.zeros([n_cols, n_cols], dtype=np.float32)
    cdef np.ndarray[np.int32_t, ndim=2] common = np.zeros([n_cols, n_cols], dtype=np.int32)

    # loop variables, cdef statements are only allowed at function level
    cdef int current_col, second_col, n_common
    cdef int pos_i, pos_j, end_i, end_j, row_i, row_j
    cdef float ii_sum, jj_sum, ij_sum, x_i, x_j

    for current_col in range(n_cols):
        # the matrix is symmetric: visit the upper triangle only and mirror the values
        for second_col in range(current_col+1, n_cols):

            # [pos, end) delimits the stored entries of each column
            pos_i = indptr[current_col]
            end_i = indptr[current_col+1]
            pos_j = indptr[second_col]
            end_j = indptr[second_col+1]

            ij_sum = 0.0
            ii_sum = 0.0
            jj_sum = 0.0
            n_common = 0

            # merge-style scan of the two row index segments, which relies on the
            # row indices of each column being sorted: linear in the number of entries
            while pos_i < end_i and pos_j < end_j:
                row_i = indices[pos_i]
                row_j = indices[pos_j]

                if row_i < row_j:
                    pos_i += 1
                elif row_i > row_j:
                    pos_j += 1
                else:
                    # row present in both columns
                    x_i = data[pos_i]
                    x_j = data[pos_j]
                    ij_sum += x_i * x_j
                    ii_sum += x_i ** 2
                    jj_sum += x_j ** 2
                    pos_i += 1
                    pos_j += 1
                    n_common += 1

            if n_common > 0:
                result[current_col, second_col] = ij_sum / np.sqrt(ii_sum * jj_sum)
                result[second_col, current_col] = result[current_col, second_col]
                common[current_col, second_col] = n_common
                common[second_col, current_col] = n_common

    return result, common



###################################################################################################################
#########################       ARGSORT

from libc.stdlib cimport malloc, free#, qsort

# Expose the C library qsort to Cython.
# The declaration is marked "nogil" (appended at the end of the signature) so that qsort is
# treated as safe to call without the GIL; the original author notes that without it the
# comparator function pointer could not be passed.
# https://stackoverflow.com/questions/8353076/how-do-i-pass-a-pointer-to-a-c-function-in-cython
# const_void is an alias that is emitted verbatim as "const void" in the generated C code.
# NOTE(review): nmemb and size are declared as int here; confirm against the platform's
# stdlib.h prototype (the C standard uses size_t for both).
cdef extern from "stdlib.h":
    ctypedef void const_void "const void"
    void qsort(void *base, int nmemb, int size,
            int(*compar)(const_void *, const_void *)) nogil


# Node struct: one element of the array handed to qsort by argsort().
# coordinate is the position of the value in the original array, data is the
# value itself, which is the field the comparator sorts on.
ctypedef struct matrix_element_s:
    long coordinate
    double data


cdef int compare_struct_on_data(const void * a_input, const void * b_input):
    """
    qsort comparator: orders two matrix_element_s structs by their data field.
    If a.data > b.data returns >0
    If a.data < b.data returns <0
    If the two values are equal returns 0, so that the ordering reported for
    (a, b) and for (b, a) is consistent, as qsort requires.

    :param a_input: pointer to the first matrix_element_s
    :param b_input: pointer to the second matrix_element_s
    :return int: +1, -1 or 0
    """

    cdef matrix_element_s * a_casted = <matrix_element_s *> a_input
    cdef matrix_element_s * b_casted = <matrix_element_s *> b_input

    if a_casted.data > b_casted.data:
        return +1
    elif a_casted.data < b_casted.data:
        return -1
    else:
        return 0


cdef long[:] argsort(double[:] this_item_weights, int TopK):
    """
    Return the positions of the TopK largest values of this_item_weights,
    ordered from the largest value to the smallest.

    :param this_item_weights: values to rank
    :param TopK: number of positions requested; it is clamped to the range
                 [0, len(this_item_weights)] so that no element outside the
                 sorted buffer is ever read
    :return: memoryview with min(TopK, len(this_item_weights)) positions
    :raises MemoryError: if the temporary sorting buffer cannot be allocated
    """

    cdef array[long] template_zero = array('l')
    cdef array[long] result

    cdef matrix_element_s *matrix_element_array
    cdef int index, num_elements

    num_elements = len(this_item_weights)

    # Never ask for more elements than are available
    if TopK > num_elements:
        TopK = num_elements
    if TopK < 0:
        TopK = 0

    result = clone(template_zero, TopK, zero=False)

    # Nothing to sort; this also avoids malloc(0), which may legitimately return NULL
    if num_elements == 0:
        return result

    # Allocate vector that will be used for sorting
    matrix_element_array = < matrix_element_s *> malloc(num_elements * sizeof(matrix_element_s))

    if matrix_element_array == NULL:
        raise MemoryError("argsort: unable to allocate the sorting buffer")

    try:
        # Fill vector with (position, value) pairs
        for index in range(num_elements):
            matrix_element_array[index].coordinate = index
            matrix_element_array[index].data = this_item_weights[index]

        # Sort array elements on their data field
        qsort(matrix_element_array, num_elements, sizeof(matrix_element_s), compare_struct_on_data)

        # Sort is from lower to higher, therefore the elements to be considered are from len-topK to len
        for index in range(TopK):
            result[index] = matrix_element_array[num_elements - index - 1].coordinate

    finally:
        # Release the buffer even if an exception was raised above
        free(matrix_element_array)

    return result

CompileError: command 'C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\BuildTools\\VC\\Tools\\MSVC\\14.16.27023\\bin\\HostX86\\x64\\cl.exe' failed with exit status 2

In [4]:
# Evaluator

def precision(is_relevant, relevant_items):
    """
    Precision of one recommendation list.

    :param is_relevant:     boolean array, one flag per recommended item
    :param relevant_items:  not used, kept so that all metrics share one signature
    :return:                number of relevant recommendations divided by the list length
    """
    n_hits = np.sum(is_relevant, dtype=np.float32)
    n_recommended = len(is_relevant)
    return n_hits / n_recommended

def recall(is_relevant, relevant_items):
    """
    Recall of one recommendation list.

    :param is_relevant:     boolean array, one flag per recommended item
    :param relevant_items:  array with the items that should have been recommended
    :return:                number of relevant recommendations divided by the
                            number of relevant items
    """
    n_hits = np.sum(is_relevant, dtype=np.float32)
    n_relevant = relevant_items.shape[0]
    return n_hits / n_relevant

def MAP(is_relevant, relevant_items):
    """
    Average precision of one recommendation list.

    :param is_relevant:     boolean array, one flag per recommended item, in ranking order
    :param relevant_items:  array with the items that should have been recommended
    :return:                sum of precision@k over the positions k holding a relevant
                            item, divided by min(number of relevant items, list length)
    """
    n_recommended = is_relevant.shape[0]
    positions = 1 + np.arange(n_recommended)

    # running number of hits: divided by the position it gives precision at 1, at 2, at 3 ...
    hits_so_far = np.cumsum(is_relevant, dtype=np.float32)
    p_at_k = is_relevant * hits_so_far / positions

    normalizer = np.min([relevant_items.shape[0], n_recommended])
    return np.sum(p_at_k) / normalizer

def evaluate_algorithm(URM_test, target_playlists, recommender_object, alfa, beta, at=10):
    """
    Evaluate a recommender on the held-out interactions.

    For every id in target_playlists that has at least one interaction in
    URM_test, the top `at` recommendations are compared with the held-out
    items and precision, recall and MAP are averaged over those ids.

    :param URM_test:            held-out interactions, converted to CSR
    :param target_playlists:    iterable of row indices of URM_test to evaluate
    :param recommender_object:  object exposing recommend(user_id, alfa, beta, at=...)
    :param alfa:                weight forwarded to recommender_object.recommend
    :param beta:                weight forwarded to recommender_object.recommend
    :param at:                  length of the recommendation list
    :return:                    dict with the keys "precision", "recall" and "MAP";
                                all zeros when no target id has held-out items
    """

    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_MAP = 0.0

    num_eval = 0

    URM_test = sc.sparse.csr_matrix(URM_test)

    for user_id in target_playlists:

        start_pos = URM_test.indptr[user_id]
        end_pos = URM_test.indptr[user_id+1]

        # ids without held-out items cannot be scored and are skipped
        if end_pos-start_pos>0:

            relevant_items = URM_test.indices[start_pos:end_pos]

            recommended_items = recommender_object.recommend(user_id, alfa, beta, at=at)
            num_eval+=1

            is_relevant = np.isin(recommended_items, relevant_items, assume_unique=True)

            cumulative_precision += precision(is_relevant, relevant_items)
            cumulative_recall += recall(is_relevant, relevant_items)
            cumulative_MAP += MAP(is_relevant, relevant_items)

    if num_eval > 0:
        cumulative_precision /= num_eval
        cumulative_recall /= num_eval
        cumulative_MAP /= num_eval
    else:
        # Nothing was evaluated: report zeros instead of failing with a division by zero
        print("No target playlist has interactions in the test set, metrics are set to 0")

    print("Recommender performance is: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, cumulative_MAP))

    result_dict = {
        "precision": cumulative_precision,
        "recall": cumulative_recall,
        "MAP": cumulative_MAP,
    }

    return result_dict

In [5]:
# splitter
def train_test_holdout(URM_all, train_perc = 0.8):
    """
    Randomly split the interactions of URM_all into a train and a test matrix.

    Every stored interaction is assigned independently to the train set with
    probability train_perc, otherwise to the test set.

    :param URM_all:     scipy sparse matrix with all the interactions
    :param train_perc:  probability for an interaction to end up in the train set
    :return:            (URM_train, URM_test), two CSR matrices with the same
                        shape as URM_all
    """

    URM_all = URM_all.tocoo()
    numInteractions = len(URM_all.data)

    # The probabilities must be passed by keyword: the third positional argument
    # of np.random.choice is `replace`, not `p`
    train_mask = np.random.choice([True, False], numInteractions, p=[train_perc, 1-train_perc])
    test_mask = np.logical_not(train_mask)

    # The shape is given explicitly, otherwise it would be inferred from the largest
    # row/column index that survives the mask and the two matrices could differ in shape
    URM_train = sps.coo_matrix((URM_all.data[train_mask], (URM_all.row[train_mask], URM_all.col[train_mask])),
                               shape=URM_all.shape)
    URM_train = URM_train.tocsr()

    URM_test = sps.coo_matrix((URM_all.data[test_mask], (URM_all.row[test_mask], URM_all.col[test_mask])),
                              shape=URM_all.shape)
    URM_test = URM_test.tocsr()

    return URM_train, URM_test

In [6]:
class Hybrid_recommender(object):
    """
    Linear blend of three similarity based recommenders: content based (item
    similarity from the ICM), item based collaborative filtering and user based
    collaborative filtering (both from the URM).

    The three fit_* methods must be called before recommend().
    """

    def __init__(self, URM, ICM):
        """
        :param URM: interaction matrix; filter_seen reads its indptr/indices
                    as a CSR structure
        :param ICM: item content matrix, one row per item
        """
        self.URM, self.ICM = URM, ICM

    def fit_content_based(self, topK=50, shrink=100, normalize = True):
        """Build the item-item similarity from the ICM and store it in W_sparse_content_based."""
        self.W_sparse_content_based = Compute_Similarity_Cython(
            self.ICM.T, shrink=shrink, topK=topK, normalize=normalize
        ).compute_similarity()

    def fit_item_based(self, topK=50, shrink=100, normalize = True):
        """Build the item-item similarity from the URM and store it in W_sparse_item_cf."""
        self.W_sparse_item_cf = Compute_Similarity_Cython(
            self.URM, shrink=shrink, topK=topK, normalize=normalize
        ).compute_similarity()

    def fit_user_based(self, topK=50, shrink=100, normalize = True):
        """Build the user-user similarity from the transposed URM and store it in W_sparse_user_cf."""
        self.W_sparse_user_cf = Compute_Similarity_Cython(
            self.URM.T, shrink=shrink, topK=topK, normalize=normalize
        ).compute_similarity()

    def recommend(self, user_id, alfa, beta, at=None, exclude_seen=True):
        """
        Rank the items for one user.

        :param user_id:       row of the URM to recommend for
        :param alfa:          weight of the item based scores
        :param beta:          weight of the user based scores; the content based
                              scores are weighted 1 - alfa - beta
        :param at:            number of items to return, None returns the full ranking
        :param exclude_seen:  if True the items already in the user's row are moved
                              to the bottom of the ranking
        :return:              array of item indices, best first
        """
        interactions = self.URM[user_id]
        neighbour_weights = self.W_sparse_user_cf[user_id]

        # every score vector is a sparse row times a sparse matrix, flattened to 1-D
        scores_content_based = interactions.dot(self.W_sparse_content_based).toarray().ravel()
        scores_item_cf = interactions.dot(self.W_sparse_item_cf).toarray().ravel()
        scores_user_cf = neighbour_weights.dot(self.URM).toarray().ravel()

        scores = (1 - alfa - beta) * scores_content_based + alfa * scores_item_cf + beta * scores_user_cf

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # descending order of score, truncated to the requested length
        return np.argsort(scores)[::-1][:at]

    def filter_seen(self, user_id, scores):
        """
        Set to -inf, in place, the score of every item stored in the user's URM row.

        :return: the same scores array that was passed in
        """
        row_start, row_end = self.URM.indptr[user_id], self.URM.indptr[user_id+1]
        seen_items = self.URM.indices[row_start:row_end]

        scores[seen_items] = -np.inf

        return scores

In [7]:
# Repeat the random split until the train and the test matrix have the same shape
# (to avoid dimension mismatch due to the random nature of the split)
while True:
    URM_train, URM_test = train_test_holdout(URM_csr, train_perc = 0.8)
    t1 = np.shape(URM_train)
    t2 = np.shape(URM_test)
    if t1 == t2:
        break


In [8]:
# Distinct playlist ids of the target list; the tuning cell passes this array to evaluate_algorithm.
target = target_playlists["playlist_id"].unique()

In [None]:
# tuning the hybrid recommender
recommender = Hybrid_recommender(URM_train, ICM_csr)

# parameters to be tuned: cb_shrink, ib_shrink, ub_shrink, cb_topk, ib_topk, ub_topk, alfa, beta
#alfa = 0.5
#beta = 0.3
#cb_topk = 750
#ib_topk = 750
#ub_topk = 100
#cb_shrink = 3
#ib_shrink = 20
#ub_shrink = 20


cb_topk_range = [750, 1000]
ib_topk_range = [750, 1000]
ub_topk_range = [50, 100, 500]

cb_shrink_range = [5]
ib_shrink_range = [20]
ub_shrink_range = [20, 50]

alfa_range = [0.5, 0.6]
beta_range = [0.3, 0.4]

x = []
x_verbose = []
map_res = []
prec_res = []
rec_res = []
index = 1

# A similarity matrix depends only on its own (shrink, topK) pair, not on alfa, beta
# or on the parameters of the other two models. Each distinct pair is fitted once and
# reused, instead of refitting all three models at every iteration of the grid.
# The iteration order, and therefore the content of the result lists, is unchanged.
cb_similarity_cache = {}
ib_similarity_cache = {}
ub_similarity_cache = {}

for alfa in alfa_range:
    for beta in beta_range:
        for cb_shrink in cb_shrink_range:
            for ib_shrink in ib_shrink_range:
                for ub_shrink in ub_shrink_range:
                    for cb_topk in cb_topk_range:
                        for ib_topk in ib_topk_range:
                            for ub_topk in ub_topk_range:

                                cb_key = (cb_shrink, cb_topk)
                                if cb_key not in cb_similarity_cache:
                                    recommender.fit_content_based(shrink=cb_shrink, topK=cb_topk)
                                    cb_similarity_cache[cb_key] = recommender.W_sparse_content_based
                                recommender.W_sparse_content_based = cb_similarity_cache[cb_key]

                                ib_key = (ib_shrink, ib_topk)
                                if ib_key not in ib_similarity_cache:
                                    recommender.fit_item_based(shrink=ib_shrink, topK=ib_topk)
                                    ib_similarity_cache[ib_key] = recommender.W_sparse_item_cf
                                recommender.W_sparse_item_cf = ib_similarity_cache[ib_key]

                                ub_key = (ub_shrink, ub_topk)
                                if ub_key not in ub_similarity_cache:
                                    recommender.fit_user_based(shrink=ub_shrink, topK=ub_topk)
                                    ub_similarity_cache[ub_key] = recommender.W_sparse_user_cf
                                recommender.W_sparse_user_cf = ub_similarity_cache[ub_key]

                                temp_result = evaluate_algorithm(URM_test, target, recommender, alfa, beta, 10)
                                map_res.append(temp_result["MAP"])
                                prec_res.append(temp_result["precision"])
                                rec_res.append(temp_result["recall"])

                                # human readable description of this configuration and its score
                                x.append(index)
                                x_verbose.append("ITERATION " + str(index) + '\n' +
                                                 "alfa =  " + str(alfa) + '\n' + 
                                                 "beta = " + str(beta) + '\n' + 
                                                 "topk(cb,ib,ub) = " + str(cb_topk) + ',' + str(ib_topk) + ',' + str(ub_topk) + '\n' + 
                                                 "shrink(cb,ib,ub) = " + str(cb_shrink) + ',' + str(ib_shrink) + ',' + str(ub_shrink) + '\n' +
                                                 "MAP = " + str(temp_result["MAP"]) + '\n') 

                                index += 1



                                

In [None]:
# MAP obtained at every iteration of the tuning grid
fig, ax = pyplot.subplots()
ax.plot(x, map_res)
ax.set_ylabel('MAP')
ax.set_xlabel('iteration')
pyplot.show()

In [None]:
# Print the textual summary of every tuning iteration, in the order they were run
for iteration_report in x_verbose:
    print(iteration_report)

In [None]:
def initialize_output_file():
    """
    Create submission.csv in the current directory and write the CSV header.

    The file is opened in write mode, so a file left by a previous run is
    replaced instead of getting a second header and stale rows appended to it.

    :return: the open file object, the caller is responsible for closing it
    """
    file = open("submission.csv", 'w')
    file.write("playlist_id,track_ids" + '\n')
    return file

# useful to print to file with the right structure
def print_to_file(playlist, tracks, file, n_tracks=10):
    """
    Write one submission row: "<playlist>,<track> <track> ... <track>".

    :param playlist:  playlist id written before the comma
    :param tracks:    sequence of recommended track ids, best first
    :param file:      writable text file object
    :param n_tracks:  maximum number of tracks written; when fewer are available
                      all of them are written instead of failing with IndexError
    """
    selected_tracks = ' '.join(str(track) for track in tracks[:n_tracks])
    file.write(str(playlist) + ',' + selected_tracks + '\n')

In [None]:
# execution of the recommendations for submission
recommender = Hybrid_recommender(URM_csr, ICM_csr)

recommender.fit_content_based(shrink=5, topK=100)
recommender.fit_item_based(shrink=20, topK=200)
recommender.fit_user_based(shrink=20, topK=100)

alfa = 0.43
beta = 0.37

# The output file is opened only once the models are fitted, and it is closed even
# if a recommendation fails, so that no open handle is left behind
file = initialize_output_file()
try:
    for playlist_id in target_playlists["playlist_id"]:
        tracks = recommender.recommend(playlist_id, alfa, beta, 10, True)
        print_to_file(playlist_id, tracks, file)
finally:
    file.close()