In [None]:
import numpy as np
import pandas as pd 
import random
from tqdm import tqdm
import scipy.sparse as sp
import sklearn.preprocessing as sk
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer, normalize
import time
import sys
from matplotlib import pyplot as plt

In [None]:
def check_matrix(X, format='csc', dtype=np.float32):
    if format == 'csc' and not isinstance(X, sp.csc_matrix):
        return X.tocsc().astype(dtype)
    elif format == 'csr' and not isinstance(X, sp.csr_matrix):
        return X.tocsr().astype(dtype)
    elif format == 'coo' and not isinstance(X, sp.coo_matrix):
        return X.tocoo().astype(dtype)
    elif format == 'dok' and not isinstance(X, sp.dok_matrix):
        return X.todok().astype(dtype)
    elif format == 'bsr' and not isinstance(X, sp.bsr_matrix):
        return X.tobsr().astype(dtype)
    elif format == 'dia' and not isinstance(X, sp.dia_matrix):
        return X.todia().astype(dtype)
    elif format == 'lil' and not isinstance(X, sp.lil_matrix):
        return X.tolil().astype(dtype)
    else:
        return X.astype(dtype)



class Cosine_Similarity:


    def __init__(self, dataMatrix, topK=100, shrink = 0, normalize = True,
                 asymmetric_alpha = 0.5, tversky_alpha = 1.0, tversky_beta = 1.0,
                 similarity = "cosine", row_weights = None):
        """
        Computes the cosine similarity on the columns of dataMatrix
        If it is computed on URM=|users|x|items|, pass the URM as is.
        If it is computed on ICM=|items|x|features|, pass the ICM transposed.
        :param dataMatrix:
        :param topK:
        :param shrink:
        :param normalize:           If True divide the dot product by the product of the norms
        :param row_weights:         Multiply the values in each row by a specified value. Array
        :param asymmetric_alpha     Coefficient alpha for the asymmetric cosine
        :param similarity:  "cosine"        computes Cosine similarity
                            "adjusted"      computes Adjusted Cosine, removing the average of the users
                            "asymmetric"    computes Asymmetric Cosine
                            "pearson"       computes Pearson Correlation, removing the average of the items
                            "jaccard"       computes Jaccard similarity for binary interactions using Tanimoto
                            "dice"          computes Dice similarity for binary interactions
                            "tversky"       computes Tversky similarity for binary interactions
                            "tanimoto"      computes Tanimoto coefficient for binary interactions

        """
        """
        Asymmetric Cosine as described in: 
        Aiolli, F. (2013, October). Efficient top-n recommendation for very large scale binary rated datasets. In Proceedings of the 7th ACM conference on Recommender systems (pp. 273-280). ACM.
        
        """

        super(Cosine_Similarity, self).__init__()

        self.TopK = topK
        self.shrink = shrink
        self.normalize = normalize
        self.n_columns = dataMatrix.shape[1]
        self.n_rows = dataMatrix.shape[0]
        self.asymmetric_alpha = asymmetric_alpha
        self.tversky_alpha = tversky_alpha
        self.tversky_beta = tversky_beta

        self.dataMatrix = dataMatrix.copy()

        self.adjusted_cosine = False
        self.asymmetric_cosine = False
        self.pearson_correlation = False
        self.tanimoto_coefficient = False
        self.dice_coefficient = False
        self.tversky_coefficient = False

        if similarity == "adjusted":
            self.adjusted_cosine = True
        elif similarity == "asymmetric":
            self.asymmetric_cosine = True
        elif similarity == "pearson":
            self.pearson_correlation = True
        elif similarity == "jaccard" or similarity == "tanimoto":
            self.tanimoto_coefficient = True
            # Tanimoto has a specific kind of normalization
            self.normalize = False

        elif similarity == "dice":
            self.dice_coefficient = True
            self.normalize = False

        elif similarity == "tversky":
            self.tversky_coefficient = True
            self.normalize = False

        elif similarity == "cosine":
            pass
        else:
            raise ValueError("Cosine_Similarity: value for paramether 'mode' not recognized."
                             " Allowed values are: 'cosine', 'pearson', 'adjusted', 'asymmetric', 'jaccard', 'tanimoto',"
                             "dice, tversky."
                             " Passed value was '{}'".format(similarity))



        if self.TopK == 0:
            self.W_dense = np.zeros((self.n_columns, self.n_columns))


        self.use_row_weights = False

        if row_weights is not None:

            if dataMatrix.shape[0] != len(row_weights):
                raise ValueError("Cosine_Similarity: provided row_weights and dataMatrix have different number of rows."
                                 "Col_weights has {} columns, dataMatrix has {}.".format(len(row_weights), dataMatrix.shape[0]))

            self.use_row_weights = True
            self.row_weights = row_weights.copy()
            self.row_weights_diag = sps.diags(self.row_weights)

            self.dataMatrix_weighted = self.dataMatrix.T.dot(self.row_weights_diag).T






    def applyAdjustedCosine(self):
        """
        Remove from every data point the average for the corresponding row
        :return:
        """

        self.dataMatrix = check_matrix(self.dataMatrix, 'csr')


        interactionsPerRow = np.diff(self.dataMatrix.indptr)

        nonzeroRows = interactionsPerRow > 0
        sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

        rowAverage = np.zeros_like(sumPerRow)
        rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows]


        # Split in blocks to avoid duplicating the whole data structure
        start_row = 0
        end_row= 0

        blockSize = 1000


        while end_row < self.n_rows:

            end_row = min(self.n_rows, end_row + blockSize)

            self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \
                np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row])

            start_row += blockSize




    def applyPearsonCorrelation(self):
        """
        Remove from every data point the average for the corresponding column
        :return:
        """

        self.dataMatrix = check_matrix(self.dataMatrix, 'csc')


        interactionsPerCol = np.diff(self.dataMatrix.indptr)

        nonzeroCols = interactionsPerCol > 0
        sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

        colAverage = np.zeros_like(sumPerCol)
        colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols]


        # Split in blocks to avoid duplicating the whole data structure
        start_col = 0
        end_col= 0

        blockSize = 1000


        while end_col < self.n_columns:

            end_col = min(self.n_columns, end_col + blockSize)

            self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \
                np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col])

            start_col += blockSize


    def useOnlyBooleanInteractions(self):

        # Split in blocks to avoid duplicating the whole data structure
        start_pos = 0
        end_pos= 0

        blockSize = 1000


        while end_pos < len(self.dataMatrix.data):

            end_pos = min(len(self.dataMatrix.data), end_pos + blockSize)

            self.dataMatrix.data[start_pos:end_pos] = np.ones(end_pos-start_pos)

            start_pos += blockSize




    def compute_similarity(self, start_col=None, end_col=None, block_size = 100):
        """
        Compute the similarity for the given dataset
        :param self:
        :param start_col: column to begin with
        :param end_col: column to stop before, end_col is excluded
        :return:
        """

        values = []
        rows = []
        cols = []

        start_time = time.time()
        start_time_print_batch = start_time
        processedItems = 0


        if self.adjusted_cosine:
            self.applyAdjustedCosine()

        elif self.pearson_correlation:
            self.applyPearsonCorrelation()

        elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient:
            self.useOnlyBooleanInteractions()


        # We explore the matrix column-wise
        self.dataMatrix = check_matrix(self.dataMatrix, 'csc')


        # Compute sum of squared values to be used in normalization
        sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

        # Tanimoto does not require the square root to be applied
        if not (self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient):
            sumOfSquared = np.sqrt(sumOfSquared)

        if self.asymmetric_cosine:
            sumOfSquared_to_1_minus_alpha = np.power(sumOfSquared, 2 * (1 - self.asymmetric_alpha))
            sumOfSquared_to_alpha = np.power(sumOfSquared, 2 * self.asymmetric_alpha)


        self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

        start_col_local = 0
        end_col_local = self.n_columns

        if start_col is not None and start_col>0 and start_col<self.n_columns:
            start_col_local = start_col

        if end_col is not None and end_col>start_col_local and end_col<self.n_columns:
            end_col_local = end_col




        start_col_block = start_col_local

        this_block_size = 0

        # Compute all similarities for each item using vectorization
        while start_col_block < end_col_local:

            # Add previous block size
            processedItems += this_block_size

            end_col_block = min(start_col_block + block_size, end_col_local)
            this_block_size = end_col_block-start_col_block


            if time.time() - start_time_print_batch >= 30 or end_col_block==end_col_local:
                columnPerSec = processedItems / (time.time() - start_time)

                print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format(
                    processedItems, processedItems / (end_col_local - start_col_local) * 100, columnPerSec, (time.time() - start_time)/ 60))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print_batch = time.time()


            # All data points for a given item
            item_data = self.dataMatrix[:, start_col_block:end_col_block]
            item_data = item_data.toarray().squeeze()

            if self.use_row_weights:
                #item_data = np.multiply(item_data, self.row_weights)
                #item_data = item_data.T.dot(self.row_weights_diag).T
                this_block_weights = self.dataMatrix_weighted.T.dot(item_data)

            else:
                # Compute item similarities
                this_block_weights = self.dataMatrix.T.dot(item_data)



            for col_index_in_block in range(this_block_size):

                if this_block_size == 1:
                    this_column_weights = this_block_weights
                else:
                    this_column_weights = this_block_weights[:,col_index_in_block]


                columnIndex = col_index_in_block + start_col_block
                this_column_weights[columnIndex] = 0.0

                # Apply normalization and shrinkage, ensure denominator != 0
                if self.normalize:

                    if self.asymmetric_cosine:
                        denominator = sumOfSquared_to_alpha[columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6
                    else:
                        denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6

                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)


                # Apply the specific denominator for Tanimoto
                elif self.tanimoto_coefficient:
                    denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)

                elif self.dice_coefficient:
                    denominator = sumOfSquared[columnIndex] + sumOfSquared + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)

                elif self.tversky_coefficient:
                    denominator = this_column_weights + \
                                  (sumOfSquared[columnIndex] - this_column_weights)*self.tversky_alpha + \
                                  (sumOfSquared - this_column_weights)*self.tversky_beta + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)

                # If no normalization or tanimoto is selected, apply only shrink
                elif self.shrink != 0:
                    this_column_weights = this_column_weights/self.shrink


                #this_column_weights = this_column_weights.toarray().ravel()

                if self.TopK == 0:
                    self.W_dense[:, columnIndex] = this_column_weights

                else:
                    # Sort indices and select TopK
                    # Sorting is done in three steps. Faster then plain np.argsort for higher number of items
                    # - Partition the data to extract the set of relevant items
                    # - Sort only the relevant items
                    # - Get the original item index
                    relevant_items_partition = (-this_column_weights).argpartition(self.TopK-1)[0:self.TopK]
                    relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
                    top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

                    # Incrementally build sparse matrix, do not add zeros
                    notZerosMask = this_column_weights[top_k_idx] != 0.0
                    numNotZeros = np.sum(notZerosMask)

                    values.extend(this_column_weights[top_k_idx][notZerosMask])
                    rows.extend(top_k_idx[notZerosMask])
                    cols.extend(np.ones(numNotZeros) * columnIndex)





            start_col_block += block_size

        # End while on columns


        if self.TopK == 0:
            return self.W_dense

        else:

            W_sparse = sp.csr_matrix((values, (rows, cols)),
                                      shape=(self.n_columns, self.n_columns),
                                      dtype=np.float32)


            return W_sparse

In [None]:
class Utils(object):

    def __init__(self, train, tracks, target_playlists):
        self.train = train
        self.tracks = tracks
        self.target_playlists = target_playlists

    def get_target_playlists(self):
        return self.target_playlists

    @staticmethod
    def get_top_10(URM, target_playlist, row):
        my_songs = URM.indices[URM.indptr[target_playlist]:URM.indptr[target_playlist + 1]]
        row[my_songs] = -np.inf
        relevant_items_partition = (-row).argpartition(10)[0:10]
        relevant_items_partition_sorting = np.argsort(-row[relevant_items_partition])
        ranking = relevant_items_partition[relevant_items_partition_sorting]
        return ranking

    @staticmethod
    def get_similarity_normalized(matrix, normalize, knn, shrink, mode):
        if normalize == False:
            shrink = 0
        similarity = Cosine_Similarity(dataMatrix=matrix, normalize=normalize, shrink=shrink, similarity=mode, topK=knn)
        S = similarity.compute_similarity()
        return S.tocsr()

    def get_URM(self):
        grouped = self.train.groupby('playlist_id', as_index=True).apply((lambda playlist: list(playlist['track_id'])))
        URM = MultiLabelBinarizer(classes=self.tracks['track_id'].unique(), sparse_output=True).fit_transform(grouped)
        return URM

    def get_weighted_URM(self, URM):
        S = []
        cols = URM.shape[1]
        for i, row in tqdm(enumerate(URM)):
            if i in list(self.target_playlists['playlist_id'][:5000]):
                column_indexes = np.array(range(len(row.indices)))
                row_values = row.data / np.log2(column_indexes + 2)
                row_index = np.zeros((len(row.indices)), dtype=int)
                new_row = sp.csr_matrix((row_values, (row_index, row.indices[row.indices.argsort()])), shape=(1, cols))
            else:
                new_row = row
            S.append(new_row)
        return sp.vstack(S).tocsr()

    def get_UCM(self, URM):
        UCM = TfidfTransformer().fit_transform(URM.T).T
        return UCM

    def get_ICM(self):  # returns Item Content Matrix
        grouped = self.tracks.groupby('track_id', as_index=True).apply((lambda track: list(track['artist_id'])))

        ICM_artists = MultiLabelBinarizer(classes=self.tracks['artist_id'].unique(), sparse_output=True).fit_transform(
            grouped)
        ICM_artists = ICM_artists * 0.8  # best weight for the artis feature
        #ICM_artists = TfidfTransformer().fit_transform(ICM_artists.T).T

        grouped = self.tracks.groupby('track_id', as_index=True).apply((lambda track: list(track['album_id'])))
        ICM_albums = MultiLabelBinarizer(classes=self.tracks['album_id'].unique(), sparse_output=True).fit_transform(
            grouped)
        #ICM_albums = TfidfTransformer().fit_transform(ICM_albums.T).T

        ICM = sp.hstack((ICM_artists, ICM_albums))
        return ICM

    def get_itemsim_CB(self, knn, shrink, mode, normalize):
        ICM = self.get_ICM()
        return self.get_similarity_normalized(ICM.T, normalize, knn, shrink, mode)

    def get_itemsim_CF(self, URM, knn, shrink, mode, normalize):
        UCM = self.get_UCM(URM)
        return self.get_similarity_normalized(UCM, normalize, knn, shrink, mode)

    def get_usersim_CF(self, URM, knn, shrink, mode, normalize):
        # UCM = self.get_UCM(URM)
        return self.get_similarity_normalized(URM.T, normalize, knn, shrink, mode)


In [None]:
class Eval(object):

    def __init__(self, u):
        self.URM = u.get_URM()
        self.target_playlists = None
        self.target_tracks = None
        self.URM_train = None
        self.build_URM_test()

    def build_URM_test(self):
        total_users = self.URM.shape[0]
        self.URM_train = self.URM.copy().tolil()
        possibile_playlists = [i for i in range(total_users) if len(
            self.URM.indices[self.URM.indptr[i]:self.URM.indptr[i + 1]]) > 10]  # playlists with more than 10 songs

        self.target_playlists = pd.DataFrame(data=random.sample(possibile_playlists, int(0.20 * total_users)),
                                             columns=['playlist_id'])  # target playlists, 20% of total playlists
        self.target_tracks = []

        for idx in list(self.target_playlists['playlist_id']):
            target_songs = random.sample(list(self.URM.indices[self.URM.indptr[idx]:self.URM.indptr[idx + 1]]), 10)
            self.URM_train[idx, target_songs] = 0
            self.target_tracks.append(target_songs)

        self.target_tracks = np.array(self.target_tracks)
        self.URM_train = self.URM_train.tocsr()

    def get_URM_train(self):
        return self.URM_train

    def get_target_playlists(self):
        return self.target_playlists

    def get_target_tracks(self):
        return self.target_tracks

    @staticmethod
    def AP(recommended_items, relevant_items):
        relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
        p_at_k = relevant * np.cumsum(relevant, dtype=np.float32) / (1 + np.arange(relevant.shape[0]))
        map_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], relevant.shape[0]])
        return map_score

    # input has to be the URM and the dataframe returned by the recommender
    # NB: the songs in the dataframe must be a list (or ndarray), not a string!
    def MAP(self, df, relevant_items):
        print("Evaluating", flush=True)
        MAP = 0.0
        num_eval = 0

        for i in range(df.shape[0]):
            relevant = relevant_items[i]
            if len(relevant_items) > 0:
                recommended_items = df['track_ids'][i]
                num_eval += 1
                MAP += self.AP(recommended_items, relevant)

        MAP /= num_eval
        print("Recommender performance is {:.8f}".format(MAP))
        return MAP

    def result_diff(self, result_dfs):

        # load  all results form various recommenders
        # for file in files:
        #   results.append(pd.read_csv(file))

        for i, result in enumerate(result_dfs):
            for j, result_2 in enumerate(result_dfs):
                tot_diff = 0
                for row, row_2 in zip(result['track_ids'], result_2['track_ids']):
                    row, row_2 = list(row), list(row_2)
                    row = [el for el in row if el != ' ']
                    row_2 = [el for el in row_2 if el != ' ']
                    tot_diff += [1 for x, y in zip(row, row_2) if x != y].count(1)
                print('Total differences between res %d and res %d are: %d' % (i, j, tot_diff))

In [None]:
class SlimBPR(object):

    def __init__(self,
                 URM,
                 learning_rate=0.01,
                 epochs=1,
                 positive_item_regularization=1.0,
                 negative_item_regularization=1.0,
                 nnz=1):
        self.URM = URM
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.positive_item_regularization = positive_item_regularization
        self.negative_item_regularization = negative_item_regularization
        self.nnz = nnz
        self.n_playlist = self.URM.shape[0]
        self.n_track = self.URM.shape[1]

        self.similarity_matrix = sp.lil_matrix((self.n_track, self.n_playlist))

    def sample(self):

        playlist_id = np.random.choice(self.n_playlist)

        # get tracks in playlist and choose one
        tracks = self.URM[playlist_id, :].indices
        pos_track_id = np.random.choice(tracks)

        negTrackSelected = False

        while not negTrackSelected:
            neg_track_id = np.random.choice(self.n_track)
            if neg_track_id not in tracks:
                negTrackSelected = True
        return playlist_id, pos_track_id, neg_track_id

    def epochIteration(self):

        numPosInteractions = int(self.URM.nnz * self.nnz)

        # sampling without replacement
        # tqdm performs range op with progress visualization
        for num_sample in tqdm(range(numPosInteractions)):

            playlist_id, pos_track_id, neg_track_id = self.sample()

            tracks = self.URM[playlist_id, :].indices

            # Prediction
            x_i = self.similarity_matrix[pos_track_id, tracks].sum()
            x_j = self.similarity_matrix[neg_track_id, tracks].sum()

            # Gradient
            x_ij = x_i - x_j

            gradient = 1 / (1 + np.exp(x_ij))

            for i in tracks:
                # dp and dn outside for?
                dp = gradient - self.positive_item_regularization * x_i
                self.similarity_matrix[pos_track_id, i] = self.similarity_matrix[
                                                              pos_track_id, i] + self.learning_rate * dp
                dn = gradient - self.negative_item_regularization * x_j
                self.similarity_matrix[neg_track_id, i] = self.similarity_matrix[
                                                              neg_track_id, i] - self.learning_rate * dn

            self.similarity_matrix[pos_track_id, pos_track_id] = 0
            self.similarity_matrix[pos_track_id, pos_track_id] = 0

    def get_S_SLIM_BPR(self, knn):
        print('get S Slim BPR...')

        for numEpoch in range(self.epochs):
            print('Epoch: ', numEpoch)
            self.epochIteration()

        # replace with our own knn methods
        print('Keeping only knn =', knn, '...')
        similarity_matrix_csr = self.similarity_matrix.tocsr()

        for row in tqdm(range(0, similarity_matrix_csr.shape[0])):
            ordered_indices = similarity_matrix_csr[row, :].data.argsort()[:-knn]
            similarity_matrix_csr[row, :].data[ordered_indices] = 0
        sp.csr_matrix.eliminate_zeros(similarity_matrix_csr)

        return similarity_matrix_csr

In [None]:
class Item_CBR(object):

    def __init__(self, u):
        self.u = u
        self.URM = None
        self.target_playlists = None
        self.S = None

    def fit(self, URM, target_playlists, knn, shrink, mode, normalize):
        self.URM = URM
        self.target_playlists = target_playlists
        self.S = self.u.get_itemsim_CB(knn, shrink, mode, normalize)

    def recommend(self, target_playlist):
        row = self.URM[target_playlist].dot(self.S).toarray().ravel()
        return self.u.get_top_10(self.URM, target_playlist, row)

In [None]:
class Item_CFR(object):

    def __init__(self, u):
        self.u = u
        self.URM = None
        self.target_playlists = None
        self.S = None

    def fit(self, URM, target_playlists, knn, shrink, mode, normalize):
        self.URM = URM
        self.target_playlists = target_playlists
        self.S = self.u.get_itemsim_CF(self.URM, knn, shrink, mode, normalize)

    def recommend(self, target_playlist):
        row = self.URM[target_playlist].dot(self.S).toarray().ravel()
        return self.u.get_top_10(self.URM, target_playlist, row)

In [None]:
class User_CFR(object):

    def __init__(self, u):
        self.u = u
        self.URM = None
        self.target_playlists = None
        self.S = None

    def fit(self, URM, target_playlists, knn, shrink, mode, normalize):
        self.URM = URM
        self.target_playlists = target_playlists
        self.S = self.u.get_usersim_CF(self.URM, knn, shrink, mode, normalize)

    def recommend(self, target_playlist):
        row = self.S[target_playlist].dot(self.URM).toarray().ravel()
        return self.u.get_top_10(self.URM, target_playlist, row)

In [None]:
class Ensemble_item(object):

    def __init__(self, u):
        self.u = u
        self.S_CB = None
        self.S_CF = None
        self.target_playlists = None
        self.URM = None
        self.alfa = None

    def fit(self, URM, target_playlists, knn1, knn2, shrink, mode, normalize, alfa):
        self.URM = URM
        self.alfa = alfa
        self.target_playlists = target_playlists
        self.S_CF = self.u.get_itemsim_CF(self.URM, knn1, shrink, mode, normalize)
        self.S_CB = self.u.get_itemsim_CB(knn2, shrink, mode, normalize)

    def recommend(self, target_playlist):
        row_cb = self.URM[target_playlist].dot(self.S_CB)
        row_cf = self.URM[target_playlist].dot(self.S_CF)
        row = ((self.alfa*row_cb) + ((1-self.alfa)*row_cf)).toarray().ravel()
        return self.u.get_top_10(self.URM, target_playlist, row)

In [None]:
class Ensemble_cf(object):

    def __init__(self, u):
        self.u = u
        self.S_CF_I = None
        self.S_CF_U = None
        self.target_playlists = None
        self.URM = None
        self.alfa = None

    def fit(self, URM, target_playlists, knn1, knn2, shrink, mode, normalize, alfa):
        self.URM = URM
        self.alfa = alfa
        self.target_playlists = target_playlists
        self.S_CF_I = self.u.get_itemsim_CF(self.URM, knn1, shrink, mode, normalize)
        self.S_CF_U = self.u.get_usersim_CF(self.URM, knn2, shrink, mode, normalize)

    def recommend(self, target_playlist):
        row_cf_i = self.URM[target_playlist].dot(self.S_CF_I)
        row_cf_u = self.S_CF_U[target_playlist].dot(self.URM)
        row = ((self.alfa * row_cf_i) + ((1-self.alfa) * row_cf_u)).toarray().ravel()
        return self.u.get_top_10(self.URM, target_playlist, row)

In [None]:
class Hybrid(object):

    def __init__(self, u):
        self.u = u
        self.S_CB = None
        self.S_CF_item = None
        self.S_CF_user = None
        self.S_user = None
        self.S_item = None
        self.target_playlists = None
        self.URM = None
        self.weights = None

    def fit(self, URM, target_playlists, knn1, knn2, knn3, shrink, mode, normalize, weights):
        self.URM = URM
        self.weights = weights
        self.target_playlists = target_playlists
        self.S_CF_item = self.u.get_itemsim_CF(self.URM, knn1, shrink, mode, normalize)
        self.S_user = self.u.get_usersim_CF(self.URM, knn2, shrink, mode, normalize)
        self.S_CB = self.u.get_itemsim_CB(knn3, shrink, mode, normalize)
        self.S_item = (weights[0] * self.S_CF_item) + ((1 - weights[0]) * self.S_CB)

    def recommend(self, target_playlist):
        row_user = self.S_user[target_playlist].dot(self.URM)
        row_item = self.URM[target_playlist].dot(self.S_item)
        row = ((self.weights[1] * row_item) + ((1 - self.weights[1]) * row_user)).toarray().ravel()
        return self.u.get_top_10(self.URM, target_playlist, row)

In [None]:
class Ensemble_cfcb(object):
    
    def __init__(self, u):
      self.u = u
      self.S_CB = None
      self.S_CF_I = None
      self.S_CF_U = None
      self.target_playlists = None
      self.URM = None
      self.weights = None

    def fit(self, URM, target_playlists, knn1, knn2, knn3, shrink, mode, normalize, weights):
        self.URM = URM
        self.weights = weights
        self.target_playlists = target_playlists
        self.S_CF_I = self.u.get_itemsim_CF(self.URM, knn1, shrink, mode, normalize)
        self.S_CF_U = self.u.get_usersim_CF(self.URM, knn2, shrink, mode, normalize)
        self.S_CB = self.u.get_itemsim_CB(knn3, shrink, mode, normalize)

    def recommend(self, target_playlist):
        row_cb = self.URM[target_playlist].dot(self.S_CB)
        row_cf_i = self.URM[target_playlist].dot(self.S_CF_I)
        row_cf_u = self.S_CF_U[target_playlist].dot(self.URM)
        row = ((self.weights[0] * row_cf_i) + (self.weights[1] * row_cf_u) + (
                    self.weights[2] * row_cb)).toarray().ravel()
        return self.u.get_top_10(self.URM, target_playlist, row)

In [None]:
class SlimBPR(object):

    def __init__(self):
        self.URM = None
        self.target_playlists = None
        self.num_playlist_to_recommend = None
        self.Slim = None
        self.u = None

    def fit(self, URM, Slim, target_playlists, num_playlist_to_recommend,
            learning_rate, epochs, positive_item_regularization,
            negative_item_regularization, nzz, u):
        self.URM = URM
        self.target_playlists = target_playlists
        self.num_playlist_to_recommend = num_playlist_to_recommend
        self.Slim = Slim
        self.u = u

    def recommend(self, is_test):
        self.is_test = is_test

        final_result = pd.DataFrame(index=range(self.target_playlists.shape[0]), columns=('playlist_id', 'track_ids'))

        print('Predicting...', flush=True)
        for j, i in tqdm(enumerate(np.array(self.target_playlists))):
            row = self.URM[i].dot(self.Slim)

            # Make prediction
            result_tracks = self.u.get_top10_tracks(self.URM, i[0], row)
            string_rec = ' '.join(map(str, result_tracks.reshape(1, 10)[0]))
            final_result['playlist_id'][j] = int(i)
            if is_test:
                final_result['track_ids'][j] = result_tracks
            else:
                final_result['track_ids'][j] = string_rec

        return final_result

In [None]:
class Recommender(object):

    def __init__(self):
        self.train = pd.read_csv("../input/train.csv")
        self.tracks = pd.read_csv("../input/tracks.csv")
        self.target_playlists = pd.read_csv("../input/target_playlists.csv")
        self.u = Utils(self.train, self.tracks, self.target_playlists)
        self.e = Eval(self.u)
        self.URM_full = self.u.get_URM()
        self.URM_train = self.e.get_URM_train()
        
    @staticmethod
    def evaluate(recommender, is_test, target_playlists):
        final_result = pd.DataFrame(index=range(target_playlists.shape[0]), columns=('playlist_id', 'track_ids'))

        for i, target_playlist in tqdm(enumerate(np.array(target_playlists))):
            result_tracks = recommender.recommend(int(target_playlist))
            string_rec = ' '.join(map(str, result_tracks.reshape(1, 10)[0]))
            final_result['playlist_id'][i] = int(target_playlist)
            if is_test:
                final_result['track_ids'][i] = result_tracks
            else:
                final_result['track_ids'][i] = string_rec
        return final_result

    def recommend_itemCBR(self, is_test, knn=150, shrink=10, mode='cosine', normalize=True):
        rec = Item_CBR(self.u)
        if is_test:
            target_playlists = self.e.get_target_playlists()
            rec.fit(self.URM_train, target_playlists, knn, shrink, mode, normalize)
            result = self.evaluate(rec, True, target_playlists)
            return self.e.MAP(result, self.e.get_target_tracks())
        else:
            target_playlists = self.u.get_target_playlists()
            rec.fit(self.URM_full, target_playlists, knn, shrink, mode, normalize)
            result = self.evaluate(rec, False, target_playlists)
            result.to_csv("predictions/item_cbr.csv", index=False)

    def recommend_itemCFR(self, is_test, knn=250, shrink=10, mode='cosine', normalize=True):
        rec = Item_CFR(self.u)
        if is_test:
            target_playlists = self.e.get_target_playlists()
            rec.fit(self.URM_train, target_playlists, knn, shrink, mode, normalize)
            result = self.evaluate(rec, True, target_playlists)
            return self.e.MAP(result, self.e.get_target_tracks())
        else:
            target_playlists = self.u.get_target_playlists()
            rec.fit(self.URM_full, target_playlists, knn, shrink, mode, normalize)
            result = self.evaluate(rec, False, target_playlists)
            result.to_csv("predictions/item_cfr.csv", index=False)

    def recommend_userCFR(self, is_test, knn=250, shrink=10, mode='cosine', normalize=True):
        rec = User_CFR(self.u)
        if is_test:
            target_playlists = self.e.get_target_playlists()
            rec.fit(self.URM_train, target_playlists, knn, shrink, mode, normalize)
            result = self.evaluate(rec, True, target_playlists)
            return self.e.MAP(result, self.e.get_target_tracks())
        else:
            target_playlists = self.u.get_target_playlists()
            rec.fit(self.URM_full, target_playlists, knn, shrink, mode, normalize)
            result = self.evaluate(rec, False, target_playlists)
            result.to_csv("predictions/user_cfr1.csv", index=False)

    def recommend_ensemble_item(self, is_test, alfa=0.6, knn1=250, knn2=250, shrink=10, mode='cosine', normalize=True):
        rec = Ensemble_item(self.u)
        if is_test:
            target_playlists = self.e.get_target_playlists()
            rec.fit(self.URM_train, target_playlists, knn1, knn2, shrink, mode, normalize, alfa)
            result = self.evaluate(rec, True, target_playlists)
            self.e.MAP(result, self.e.get_target_tracks())
        else:
            target_playlists = self.u.get_target_playlists()
            rec.fit(self.URM_full, target_playlists, knn1, knn2, shrink, mode, normalize, alfa)
            result = self.evaluate(rec, False, target_playlists)
            result.to_csv("predictions/ensemble_item.csv", index=False)

    def recommend_ensemble_cf(self, is_test, alfa=0.6, knn1=250, knn2=250, shrink=10, mode='cosine', normalize=True):
        rec = Ensemble_cf(self.u)
        if is_test:
            target_playlists = self.e.get_target_playlists()
            rec.fit(self.URM_train, target_playlists, knn1, knn2, shrink, mode, normalize, alfa)
            result = self.evaluate(rec, True, target_playlists)
            self.e.MAP(result, self.e.get_target_tracks())
        else:
            target_playlists = self.u.get_target_playlists()
            rec.fit(self.URM_full, target_playlists, knn1, knn2, shrink, mode, normalize, alfa)
            result = self.evaluate(rec, False, target_playlists)
            result.to_csv("predictions/ensemble_cf.csv", index=False)

    def recommend_ensemble_cfcb(self, is_test, weights=[0.6, 0.4, 0.5], knn1=250, knn2=250, knn3=200, shrink=10,
                                mode='cosine', normalize=True):
        rec = Ensemble_cfcb(self.u)
        if is_test:
            target_playlists = self.e.get_target_playlists()
            rec.fit(self.URM_train, target_playlists, knn1, knn2, knn3, shrink, mode, normalize, weights)
            result = self.evaluate(rec, True, target_playlists)
            self.e.MAP(result, self.e.get_target_tracks())
        else:
            target_playlists = self.u.get_target_playlists()
            rec.fit(self.URM_full, target_playlists, knn1, knn2, knn3, shrink, mode, normalize, weights)
            result = self.evaluate(rec, False, target_playlists)
            result.to_csv("predictions/ensemble_cfcb.csv", index=False)

    def recommend_hybrid(self, is_test, weights=[0.6, 0.7], knn1=250, knn2=250, knn3=200, shrink=10, mode='cosine',
                         normalize=True):
        rec = Hybrid(self.u)
        if is_test:
            target_playlists = self.e.get_target_playlists()
            rec.fit(self.URM_train, target_playlists, knn1, knn2, knn3, shrink, mode, normalize, weights)
            result = self.evaluate(rec, True, target_playlists)
            self.e.MAP(result, self.e.get_target_tracks())
        else:
            target_playlists = self.u.get_target_playlists()
            rec.fit(self.URM_full, target_playlists, knn1, knn2, knn3, shrink, mode, normalize, weights)
            result = self.evaluate(rec, False, target_playlists)
            result.to_csv("predictions/hybrid.csv", index=False)

    def recommend_slimBPR(self, is_test, knn=100):
        rec = SlimBPR()
        if is_test:
            BPR_gen = SlimBPR_utils(self.URM_train)
            S_bpr = BPR_gen.get_S_SLIM_BPR(knn)
            target_playlists = self.e.get_target_playlists()
            rec.fit(self.URM_train, S_bpr, target_playlists, 10000,
                    learning_rate=0.1, epochs=1, positive_item_regularization=1.0,
                    negative_item_regularization=1.0, nzz=1, u=self.u)
            result = rec.recommend(True)
            self.e.MAP(result, self.e.get_target_tracks())
        else:
            BPR_gen = SlimBPR_utils(self.URM_full)
            S_bpr = BPR_gen.get_S_SLIM_BPR(knn)
            target = self.u.get_target_playlists()
            rec.fit(self.URM_full, S_bpr, target, 10000,
                    learning_rate=0.1, epochs=1, positive_item_regularization=1.0,
                    negative_item_regularization=1.0, nzz=1, u=self.u)
            result = rec.recommend(False)
            result.to_csv("predictions/slimBPR.csv", index=False)

    def recommend_ensemble_cfcb_SlimBPR(self, is_test, weights=[0.6, 0.5, 0.5, 0.6], knn1=150, knn2=150, knn3=200,
                                        knn4=800, shrink=10, mode='cosine', normalize=True):
        rec = Ensemble_cfcb_sbpr(self.u)
        if is_test:
            BPR_gen = SlimBPR_utils(self.URM_train)
            S_bpr = BPR_gen.get_S_SLIM_BPR(knn4)
            target_playlists = self.e.get_target_playlists()
            rec.fit(self.URM_train, S_bpr, target_playlists, knn1, knn2, knn3, shrink, mode, normalize, weights)
            result = self.evaluate(rec, True, target_playlists)
            self.e.MAP(result, self.e.get_target_tracks())
        else:
            BPR_gen = SlimBPR_utils(self.URM_full)
            S_bpr = BPR_gen.get_S_SLIM_BPR(knn4)
            target_playlists = self.e.get_target_playlists()
            rec.fit(self.URM_train, S_bpr, target_playlists, knn1, knn2, knn3, shrink, mode, normalize, weights)
            result = self.evaluate(rec, False, target_playlists)
            result.to_csv("predictions/ensemble_cfcb_bpr.csv", index=False)

In [None]:
results = []
for i in range(15):
    rec = Recommender()
    results.append(rec.recommend_itemCBR(True))
print(np.array(results).mean())

In [None]:
np.array(results).mean()

In [None]:
0.04440846754886828


In [None]:
run = Recommender()

In [51]:
URM = run.u.get_URM()

In [96]:
train_perc = 0.8
target_playlists = np.random.choice(URM.shape[0], 10000, replace=False)
target_tracks = []
URM_all = URM[target_playlists]
URM_seq = URM_all[:5000].tolil()
print(URM_seq.nnz)
URM_random = URM_all[5000:].tocoo()

118137


In [97]:
numInteractions = URM_random.nnz

train_mask = np.random.choice([True,False], numInteractions, [train_perc, 1-train_perc])
test_mask = np.logical_not(train_mask)

URM_train = sp.coo_matrix((URM_random.data[train_mask], (URM_random.row[train_mask], URM_random.col[train_mask])),shape=(5000,URM_random.shape[1]))
URM_train = URM_train.tocsr()

URM_test = sp.coo_matrix((URM_random.data[test_mask], (URM_random.row[test_mask], URM_random.col[test_mask])),shape=(5000,URM_random.shape[1]))
URM_test = URM_test.tocsr()

for idx in range(URM_seq.shape[0]):
    length = int(len(URM_seq[idx].indices)*0.2)
    target_songs = URM_seq[idx].indices[-length:]
    URM_seq[idx, target_songs] = 0
    target_tracks.append(target_songs)

for idx in range(URM_test.shape[0]):
    target_songs = URM_test[idx].indices
    target_tracks.append(target_songs)

target_tracks = np.array(target_tracks)
URM_final = sp.vstack([URM_seq,URM_train])

In [101]:
target_tracks

array([array([13980, 15771,  5141, 12436, 11661, 14724,  2306], dtype=int32),
       array([ 7465, 16275,  8843,  6784, 20480], dtype=int32),
       array([9122, 4098, 9121], dtype=int32), ...,
       array([  371,  3199,  3445,  3816,  4148,  4347,  6343,  7410,  7518,
        8284,  8526, 12197, 13747, 14949, 16622, 17042, 19970, 20189],
      dtype=int32),
       array([ 1534,  1746,  2037,  2067,  2506,  2705,  4480,  5611,  5685,
        5837,  6583,  6788,  7964,  8822,  8956,  9394,  9757,  9941,
       10251, 10264, 12119, 13866, 13971, 14088, 16312, 18947, 19812,
       20082, 20327], dtype=int32),
       array([ 3244, 11512, 13442, 13908, 18910, 20331], dtype=int32)],
      dtype=object)

In [98]:
URM_final

<10000x20635 sparse matrix of type '<class 'numpy.int64'>'
	with 178574 stored elements in Compressed Sparse Row format>

In [99]:
URM_train

<5000x20635 sparse matrix of type '<class 'numpy.int64'>'
	with 60437 stored elements in Compressed Sparse Row format>

In [100]:
URM_seq

<5000x20635 sparse matrix of type '<class 'numpy.int64'>'
	with 118137 stored elements in Compressed Sparse Row format>