# Similarity Ensemble

In [9]:
from src.utils.loader import *
from scipy.sparse import *
import numpy as np
from src.utils.feature_weighting import *
from src.utils.matrix_utils import compute_cosine, top_k_filtering, yadistance
from src.utils.BaseRecommender import BaseRecommender


class ContentBasedFiltering(BaseRecommender):

    """
    Good conf: tag aggr 3,10; tfidf l1 norm over all matrix
    MAP@5  0.11772497678137457 with 10 shrinkage,
                                    100 k_filtering and other as before
    MAP@5  0.12039006297936491 urm weight 0.7
    MAP@5  0.12109109578826009 without playcount and duration

    Current best:
    CBF (album 1.0, artists 1.0, no duration/playcount)
        + URM 0.8
        + TOP-55 (TFIDF (tags 1.0))
        MAP@5 0.11897304011860126
        Public leaderboard: 0.09616


    """

    def __init__(self, shrinkage=10, k_filtering=100):
        # final matrix of predictions
        self.R_hat = None

        # for keeping reference between playlist and row index
        self.pl_id_list = []
        # for keeping reference between tracks and column index
        self.tr_id_list = []

        self.shrinkage = shrinkage
        self.k_filtering = k_filtering

    def fit(self, urm, target_playlist, target_tracks, dataset, topK_tag=55):
        """
        urm: user rating matrix
        target playlist is a list of playlist id
        target_tracks is a list of track id
        shrinkage: shrinkage factor for significance weighting
        S = ICM' ICM
        R = URM S
        In between eliminate useless row of URM and useless cols of S
        """
        # initialization

        self.pl_id_list = list(target_playlist)
        self.tr_id_list = list(target_tracks)
        self.dataset = dataset
        S = None
        urm = urm.tocsr()
        print("CBF started")
        # get ICM from dataset, assume it already cleaned
        icm = dataset.build_icm()

        # Build the tag matrix, apply TFIDF
        print("Build tags matrix and apply TFIDF...")
        icm_tag = dataset.build_tags_matrix()
        #tag_weight = np.array(urm.dot(icm_tag.transpose()).sum(axis=0)).squeeze()
        #print(tag_weight[tag_weight>5].shape)
        #icm_tag = icm_tag[tag_weight>5].multiply(tag_weight[tag_weight>5])
        tags = applyTFIDF(icm_tag, norm='l1')
        #tags = icm_tag[tag_weight>5]
        # Before stacking tags with the rest of the ICM, we keep only
        # the top K tags for each item. This way we try to reduce the
        # natural noise added by such sparse features.
        tags = top_k_filtering(tags.transpose(), topK=topK_tag).transpose()
        #tags.data = tags.data / np.max(tags.data)
        #newvalue= (max'-min')/(max-min)*(value-max)+max'

        # User augmented UCM
        # print("Building User augmented ICM")
        # ucm = dataset.build_ucm()
        # ua_icm = user_augmented_icm(urm, ucm)
        # ua_icm = top_k_filtering(ua_icm.transpose(), topK=55).transpose()

        # stack all
        icm = vstack([icm, tags, urm * 1], format='csr')
        # icm = vstack([icm, tags, applyTFIDF(urm)], format='csr')

        S = compute_cosine(icm.transpose()[[dataset.get_track_index_from_id(x)
                                            for x in self.tr_id_list]],
                       icm,
                       k_filtering=self.k_filtering,
                       shrinkage=self.shrinkage,
                       n_threads=4,
                       chunksize=1000)
        s_norm = S.sum(axis=1)

        # Normalize S
        S = S.multiply(csr_matrix(np.reciprocal(s_norm)))
        print("Similarity matrix ready!")

        # Keep only the target playlists in the URM
        urm_cleaned = urm[[dataset.get_playlist_index_from_id(x)
                           for x in self.pl_id_list]]
        self.S = S.transpose()

        # Compute ratings
        R_hat = urm_cleaned.dot(S.transpose().tocsc()).tocsr()
        print("R_hat done")

        # Remove the entries in R_hat that are already present in the URM
        urm_cleaned = urm_cleaned[:, [dataset.get_track_index_from_id(x)
                                      for x in self.tr_id_list]]
        R_hat[urm_cleaned.nonzero()] = 0
        R_hat.eliminate_zeros()

        print("Shape of final matrix: ", R_hat.shape)
        self.R_hat = R_hat

    def getW(self):
        """
        Returns the similary matrix with dimensions I x I
        S is IxT
        """
        return self.S.tocsr()

    def predict(self, at=5):
        """
        returns a dictionary of
        'pl_id': ['tr_1', 'tr_at'] for each playlist in target playlist
        """
        recs = {}
        for i in range(0, self.R_hat.shape[0]):
            pl_id = self.pl_id_list[i]
            pl_row = self.R_hat.data[self.R_hat.indptr[i]:
                                     self.R_hat.indptr[i + 1]]
            # get top 5 indeces. argsort, flip and get first at-1 items
            sorted_row_idx = np.flip(pl_row.argsort(), axis=0)[0:at]
            track_cols = [self.R_hat.indices[self.R_hat.indptr[i] + x]
                          for x in sorted_row_idx]
            tracks_ids = [self.tr_id_list[x] for x in track_cols]
            recs[pl_id] = tracks_ids
        return recs

    def getR_hat(self):
        return self.R_hat

    def get_model(self):
        """
        Returns the complete R_hat
        """
        return self.R_hat.copy()


def applyTFIDF(matrix, norm='l1'):
    from sklearn.feature_extraction.text import TfidfTransformer
    transformer = TfidfTransformer(norm=norm, use_idf=True,
                                   smooth_idf=True, sublinear_tf=False)
    tfidf = transformer.fit_transform(matrix.transpose())
    return tfidf.transpose()


def produceCsv():
    # export csv
    dataset = Dataset(load_tags=True,
                      filter_tag=False,
                      weight_tag=False)
    dataset.set_track_attr_weights_2(1.0, 1.0, 0.0, 0.0, 0.0,
                                     1.0, 1.0, 0.0, 0.0)
    cbf_exporter = ContentBasedFiltering()
    urm = dataset.build_train_matrix()
    tg_playlist = list(dataset.target_playlists.keys())
    tg_tracks = list(dataset.target_tracks.keys())
    # Train the model with the best shrinkage found in cross-validation
    cbf_exporter.fit(urm,
                     tg_playlist,
                     tg_tracks,
                     dataset)
    recs = cbf_exporter.predict()
    with open('submission_cbf.csv', mode='w', newline='') as out:
        fieldnames = ['playlist_id', 'track_ids']
        writer = csv.DictWriter(out, fieldnames=fieldnames, delimiter=',')
        writer.writeheader()
        for k in tg_playlist:
            track_ids = ''
            for r in recs[k]:
                track_ids = track_ids + r + ' '
            writer.writerow({'playlist_id': k,
                             'track_ids': track_ids[:-1]})


def evaluateMap():
    from src.utils.evaluator import Evaluator
    dataset = Dataset(load_tags=True,
                      filter_tag=False,
                      weight_tag=False)
    dataset.set_track_attr_weights_2(1.0, 1.0, 0.0, 0.0, 0.0,
                                     1.0, 1.0, 0.0, 0.0)
    # seed = 0xcafebabe
    # print("Evaluating with initial seed: {}".format(seed))
    ev = Evaluator(seed=False)
    ev.cross_validation(5, dataset.train_final.copy())
    cbf = ContentBasedFiltering()
    for i in range(0, 5):
        urm, tg_tracks, tg_playlist = ev.get_fold(dataset)
        cbf.fit(urm,
                list(tg_playlist),
                list(tg_tracks),
                dataset)
        recs = cbf.predict()
        ev.evaluate_fold(recs)

    map_at_five = ev.get_mean_map()
    print("MAP@5 ", map_at_five)


def crossValidation():
    from src.utils.evaluator import Evaluator
    pass

# Item Based Similarity

In [16]:
def ibs(urm, dataset):
    urm_sim = applyTFIDF(urm)
    #urm_sim = top_k_filtering(urm_sim.transpose()).transpose()
    S = compute_cosine(urm_sim.transpose()[[dataset.get_track_index_from_id(x)
                                    for x in tg_tracks]],
                       urm_sim,
                       k_filtering=100,
                       shrinkage=10,
                       n_threads=4,
                       chunksize=1000)
    s_norm = S.sum(axis=1)
    s_norm[s_norm==0] = 1
    # Normalize S
    S = S.multiply(csr_matrix(np.reciprocal(s_norm)))
    print("Similarity matrix ready!")
    return S.transpose()

# User Based Similarity

In [15]:
def ubs(urm, ucm, dataset):
    ucm = dataset.build_ucm()
    iucm = ucm.dot(urm)
    iucm = applyTFIDF(iucm)
    iucm = top_k_filtering(iucm.transpose(), 100).transpose()
    S = compute_cosine(iucm.transpose()[[dataset.get_track_index_from_id(x)
                                                for x in tg_tracks]],
                           iucm,
                           k_filtering=100,
                           shrinkage=10,
                           n_threads=4,
                           chunksize=1000)
    s_norm = S.sum(axis=1)
    s_norm[s_norm==0] = 1
    print("S done")
    # Normalize S
    S = S.multiply(csr_matrix(np.reciprocal(s_norm)))
    print("Similarity matrix ready!")
    return S.transpose()

# Only Content Based

In [17]:
def icm_sim(icm, dataset):
    icm_tag = dataset.build_tags_matrix()
    #tag_weight = np.array(urm.dot(icm_tag.transpose()).sum(axis=0)).squeeze()
    #print(tag_weight[tag_weight>5].shape)
    #icm_tag = icm_tag[tag_weight>5].multiply(tag_weight[tag_weight>5])
    tags = applyTFIDF(icm_tag, norm='l1')
    #tags = icm_tag[tag_weight>5]
    # Before stacking tags with the rest of the ICM, we keep only
    # the top K tags for each item. This way we try to reduce the
    # natural noise added by such sparse features.
    tags = top_k_filtering(tags.transpose(), topK=55).transpose()
    #tags.data = tags.data / np.max(tags.data)
    #newvalue= (max'-min')/(max-min)*(value-max)+max'

    # User augmented UCM
    # print("Building User augmented ICM")
    # ucm = dataset.build_ucm()
    # ua_icm = user_augmented_icm(urm, ucm)
    # ua_icm = top_k_filtering(ua_icm.transpose(), topK=55).transpose()

    # stack all
    icm = vstack([icm, tags*0.4], format='csr')
    #icm = applyTFIDF(icm)
    #icm = top_k_filtering(icm.transpose(), topK=200).transpose()
    # icm = vstack([icm, tags, applyTFIDF(urm)], format='csr')

    S = compute_cosine(icm.transpose()[[dataset.get_track_index_from_id(x)
                                        for x in tg_tracks]],
                   icm,
                   k_filtering=100,
                   shrinkage=10,
                   n_threads=4,
                   chunksize=1000)
    s_norm = S.sum(axis=1)
    s_norm[s_norm==0] = 1
    print("S done")
    # Normalize S
    S = S.multiply(csr_matrix(np.reciprocal(s_norm)))
    print("Similarity matrix ready!")
    return S.transpose()

# MF-Based Similarity

In [2]:
from src.CBF.CBF_MF import ContentBasedFiltering as CBF
from src.MF.MF_BPR.MF_BPR import MF_BPR
cbf = CBF()
    
cbf.fit(urm, tg_playlist,
        tg_tracks,
        dataset)

# get R_hat
R_hat_aug = cbf.getR_hat()
print(R_hat_aug.nnz)

CBF started
Build tags matrix and apply TFIDF...
Running 4 workers...
[ 7554 ] Building cosine similarity matrix for [0, 1000)...
[ 7555 ] Building cosine similarity matrix for [25000, 26000)...
[ 7556 ] Building cosine similarity matrix for [50000, 51000)...
[ 7554 ] Building cosine similarity matrix for [1000, 2000)...
[ 7555 ] Building cosine similarity matrix for [26000, 27000)...
[ 7557 ] Building cosine similarity matrix for [75000, 76000)...
[ 7554 ] Building cosine similarity matrix for [2000, 3000)...
[ 7556 ] Building cosine similarity matrix for [51000, 52000)...
[ 7555 ] Building cosine similarity matrix for [27000, 28000)...
[ 7554 ] Building cosine similarity matrix for [3000, 4000)...
[ 7556 ] Building cosine similarity matrix for [52000, 53000)...
[ 7557 ] Building cosine similarity matrix for [76000, 77000)...
[ 7555 ] Building cosine similarity matrix for [28000, 29000)...
[ 7554 ] Building cosine similarity matrix for [4000, 5000)...
[ 7556 ] Building cosine similari



Shape of final matrix:  (57561, 100000)
1231858


In [5]:
mf = MF_BPR()

In [6]:
# MAP@5: 0.08256503053607782 with 500 factors after 10 epochs
# MAP@5: 0.08594586443489391 with 500 factors afetr 4 epochs no_components=500, epoch_multiplier=2, l_rate=1e-2
mf.fit(R_hat_aug, dataset, list(tg_playlist), list(tg_tracks), n_epochs=1, no_components=500, epoch_multiplier=6, l_rate=1e-2, use_icm=False)
recs = mf.predict_dot_custom(urm)
ev.evaluate_fold(recs)

# MAP@5: 0.09407901681369218 with neighborhood
# MAP@5: 0.09854105406016736 with neighborhood after 4 epochs
recs = mf.predict_knn_custom(urm)
ev.evaluate_fold(recs)

S_mf = mf.S

Processed 5000000 ( 67.65% ) in 493.97 seconds. Sample per second: 10122
Processed 7391148 ( 100.00% ) in 239.09 seconds. Sample per second: 10096
Training finished
R_hat done
MAP@5: 0.07459871516467634
Computing dot product for chunk [0, 1000)...
Computing dot product for chunk [1000, 2000)...
Computing dot product for chunk [2000, 3000)...
Computing dot product for chunk [3000, 4000)...
Computing dot product for chunk [4000, 5000)...
Computing dot product for chunk [5000, 6000)...
Computing dot product for chunk [6000, 7000)...
Computing dot product for chunk [7000, 8000)...
Computing dot product for chunk [8000, 9000)...
Computing dot product for chunk [9000, 10000)...
Computing dot product for chunk [10000, 11000)...
Computing dot product for chunk [11000, 12000)...
Computing dot product for chunk [12000, 13000)...
Computing dot product for chunk [13000, 14000)...
Computing dot product for chunk [14000, 15000)...
Computing dot product for chunk [15000, 16000)...
Computing dot produ

AttributeError: 'MF_BPR' object has no attribute 'S'

# SLIM Similarity

In [33]:
from src.ML.BPRSLIM import BPRSLIM


dataset.set_track_attr_weights_2(1, 1, 0, 0, 0, 0, 0, 0, 0)
#ds.set_track_attr_weights(1, 1, 0, 0, 1)
print('Building the ICM...')
icm = dataset.build_icm()

icm_tag = dataset.build_tags_matrix()

tags = applyTFIDF(icm_tag, norm='l1')

# Before stacking tags with the rest of the ICM, we keep only
# the top K tags for each item. This way we try to reduce the
# natural noise added by such sparse features.
tags = top_k_filtering(tags.transpose(), topK=55).transpose()
tags.data = np.ones_like(tags.data)

# stack all
icm = vstack([icm, tags], format='csr')

recommender = BPRSLIM(epochs=50,
                      epochMultiplier=1.0,
                      sgd_mode='rmsprop',
                      learning_rate=5e-02,
                      topK=300,
                      urmSamplingChances=1 / 5,
                      icmSamplingChances=4 / 5)
recommender.set_evaluation_every(10, ev)
recommender.fit(urm.tocsr(),
                icm.tocsr(),
                tg_playlist,
                tg_tracks,
                dataset)

S_bprslim = recommender.getParameters()
S_bprslim = S_bprslim[:,[dataset.get_track_index_from_id(x) for x in tg_tracks]]

# keep only 100 elements
S_bprslim = top_k_filtering(S_bprslim.transpose(), 100).transpose()

# normalize
s_norm = S_bprslim.sum(axis=0)
s_norm[s_norm==0] = 1
print("S done")

S_bprslim = S_bprslim.multiply(csr_matrix(np.reciprocal(s_norm)))


Building the ICM...
Running fit process.
Processed 500000 ( 29.85% ) in 41.19 seconds. Sample per second: 12138
Processed 1000000 ( 59.70% ) in 11.96 seconds. Sample per second: 18883
Processed 1500000 ( 89.54% ) in 10.11 seconds. Sample per second: 24150
Processed 1675142 ( 100.00% ) in 3.38 seconds. Sample per second: 25621
Return S matrix to python caller...
Epoch 0 of 50 complete in 1.30 minutes
Processed 500000 ( 29.85% ) in 8.65 seconds. Sample per second: 57799
Processed 1000000 ( 59.70% ) in 6.16 seconds. Sample per second: 70643
Processed 1500000 ( 89.54% ) in 5.76 seconds. Sample per second: 75919
Processed 1675142 ( 100.00% ) in 2.73 seconds. Sample per second: 77080
Return S matrix to python caller...
Epoch 1 of 50 complete in 0.70 minutes
Processed 500000 ( 29.85% ) in 6.97 seconds. Sample per second: 71728
Processed 1000000 ( 59.70% ) in 6.63 seconds. Sample per second: 79182
Processed 1500000 ( 89.54% ) in 6.52 seconds. Sample per second: 81005
Processed 1675142 ( 100.00



MAP@5: 0.06927693583653736
Epoch 10 of 50 complete in 1.13 minutes
Processed 500000 ( 29.85% ) in 8.20 seconds. Sample per second: 60957
Processed 1000000 ( 59.70% ) in 7.12 seconds. Sample per second: 66158
Processed 1500000 ( 89.54% ) in 6.61 seconds. Sample per second: 69399
Processed 1675142 ( 100.00% ) in 2.97 seconds. Sample per second: 69882
Return S matrix to python caller...
Epoch 11 of 50 complete in 0.92 minutes
Processed 500000 ( 29.85% ) in 8.85 seconds. Sample per second: 56529
Processed 1000000 ( 59.70% ) in 7.13 seconds. Sample per second: 66081
Processed 1500000 ( 89.54% ) in 6.49 seconds. Sample per second: 69786
Processed 1675142 ( 100.00% ) in 2.82 seconds. Sample per second: 70329
Return S matrix to python caller...
Epoch 12 of 50 complete in 0.91 minutes
Processed 500000 ( 29.85% ) in 8.74 seconds. Sample per second: 57182
Processed 1000000 ( 59.70% ) in 7.25 seconds. Sample per second: 65579
Processed 1500000 ( 89.54% ) in 6.59 seconds. Sample per second: 69490
P

Processed 1000000 ( 59.70% ) in 6.96 seconds. Sample per second: 52739
Processed 1500000 ( 89.54% ) in 8.27 seconds. Sample per second: 57089
Processed 1675142 ( 100.00% ) in 2.92 seconds. Sample per second: 57913
Return S matrix to python caller...
Epoch 33 of 50 complete in 1.13 minutes
Processed 500000 ( 29.85% ) in 10.01 seconds. Sample per second: 49957
Processed 1000000 ( 59.70% ) in 8.09 seconds. Sample per second: 55277
Processed 1500000 ( 89.54% ) in 7.79 seconds. Sample per second: 58173
Processed 1675142 ( 100.00% ) in 3.03 seconds. Sample per second: 59754
Return S matrix to python caller...
Epoch 34 of 50 complete in 1.05 minutes
Processed 500000 ( 29.85% ) in 9.37 seconds. Sample per second: 53384
Processed 1000000 ( 59.70% ) in 8.58 seconds. Sample per second: 56879
Processed 1500000 ( 89.54% ) in 7.88 seconds. Sample per second: 60285
Processed 1675142 ( 100.00% ) in 3.46 seconds. Sample per second: 61008
Return S matrix to python caller...
Epoch 35 of 50 complete in 1.

# Initialization

In [32]:
from src.utils.evaluator import Evaluator
from src.utils.loader import *
print("started")
print("hey")
dataset = Dataset(load_tags=True,
                  filter_tag=False,
                  weight_tag=False)
dataset.set_track_attr_weights_2(1.5, 1.6, 0, 0, 0.0,
                                 1.0, 0, 0.0, 0.0)
# seed = 0xcafebabe
# print("Evaluating with initial seed: {}".format(seed))
ev = Evaluator(seed=False)
ev.cross_validation(5, dataset.train_final.copy())
urm, tg_tracks, tg_playlist = ev.get_fold(dataset)
print("Done")

started
hey
File found, retrieving urm from it.
Load from file takes 0.25 seconds
Done


In [24]:
print("Start exporting")
dataset = Dataset(load_tags=True,
                  filter_tag=False,
                  weight_tag=False)
dataset.set_track_attr_weights_2(1.5, 1.6, 0, 0, 0.0,
                                 1.0, 0, 0.0, 0.0)
# seed = 0xcafebabe
# print("Evaluating with initial seed: {}".format(seed))
urm = dataset.build_train_matrix()
tg_playlist = list(dataset.target_playlists.keys())
tg_tracks = list(dataset.target_tracks.keys())

Start exporting
File found, retrieving urm from it.
Load from file takes 0.44 seconds


# Ensemble

In [25]:
print("ok")
cbf = ContentBasedFiltering()
cbf.fit(urm,
        list(tg_playlist),
        list(tg_tracks),
        dataset, 
        topK_tag=55)
#recs = cbf.predict()
#ev.evaluate_fold(recs)
S_cbf_full = cbf.S

ok
CBF started
Build tags matrix and apply TFIDF...
Running 4 workers...
[ 896 ] Building cosine similarity matrix for [0, 1000)...
[ 897 ] Building cosine similarity matrix for [8048, 9048)...
[ 898 ] Building cosine similarity matrix for [16096, 17096)...
[ 899 ] Building cosine similarity matrix for [24144, 25144)...
[ 896 ] Building cosine similarity matrix for [1000, 2000)...
[ 897 ] Building cosine similarity matrix for [9048, 10048)...
[ 898 ] Building cosine similarity matrix for [17096, 18096)...
[ 896 ] Building cosine similarity matrix for [2000, 3000)...
[ 899 ] Building cosine similarity matrix for [25144, 26144)...
[ 897 ] Building cosine similarity matrix for [10048, 11048)...
[ 898 ] Building cosine similarity matrix for [18096, 19096)...
[ 896 ] Building cosine similarity matrix for [3000, 4000)...
[ 899 ] Building cosine similarity matrix for [26144, 27144)...
[ 897 ] Building cosine similarity matrix for [11048, 12048)...
[ 898 ] Building cosine similarity matrix for

In [26]:
dataset.set_playlist_attr_weights(1, 1, 1, 0, 0)
ucm = dataset.build_ucm()
S_user = ubs(urm, ucm, dataset)

Running 4 workers...
[ 901 ] Building cosine similarity matrix for [0, 1000)...
[ 902 ] Building cosine similarity matrix for [8048, 9048)...
[ 903 ] Building cosine similarity matrix for [16096, 17096)...
[ 904 ] Building cosine similarity matrix for [24144, 25144)...
[ 901 ] Building cosine similarity matrix for [1000, 2000)...
[ 902 ] Building cosine similarity matrix for [9048, 10048)...
[ 903 ] Building cosine similarity matrix for [17096, 18096)...
[ 904 ] Building cosine similarity matrix for [25144, 26144)...
[ 901 ] Building cosine similarity matrix for [2000, 3000)...
[ 902 ] Building cosine similarity matrix for [10048, 11048)...
[ 903 ] Building cosine similarity matrix for [18096, 19096)...
[ 904 ] Building cosine similarity matrix for [26144, 27144)...
[ 901 ] Building cosine similarity matrix for [3000, 4000)...
[ 902 ] Building cosine similarity matrix for [11048, 12048)...
[ 903 ] Building cosine similarity matrix for [19096, 20096)...
[ 904 ] Building cosine similarit

In [27]:
S_urm = ibs(urm, dataset)

Running 4 workers...
[ 914 ] Building cosine similarity matrix for [0, 1000)...
[ 915 ] Building cosine similarity matrix for [8048, 9048)...
[ 916 ] Building cosine similarity matrix for [16096, 17096)...
[ 917 ] Building cosine similarity matrix for [24144, 25144)...
[ 914 ] Building cosine similarity matrix for [1000, 2000)...
[ 915 ] Building cosine similarity matrix for [9048, 10048)...
[ 914 ] Building cosine similarity matrix for [2000, 3000)...
[ 915 ] Building cosine similarity matrix for [10048, 11048)...
[ 916 ] Building cosine similarity matrix for [17096, 18096)...
[ 914 ] Building cosine similarity matrix for [3000, 4000)...
[ 915 ] Building cosine similarity matrix for [11048, 12048)...
[ 916 ] Building cosine similarity matrix for [18096, 19096)...
[ 917 ] Building cosine similarity matrix for [25144, 26144)...
[ 914 ] Building cosine similarity matrix for [4000, 5000)...
[ 915 ] Building cosine similarity matrix for [12048, 13048)...
[ 917 ] Building cosine similarity 

In [28]:
dataset.set_track_attr_weights_2(1.5, 1.6, 0, 0, 0.0,
                                 1.0, 0, 0.0, 0.0)
S_icm = icm_sim(dataset.build_icm(), dataset)

[ 919 ] Building cosine similarity matrix for [0, 1000)...
Running 4 workers...
[ 920 ] Building cosine similarity matrix for [8048, 9048)...
[ 921 ] Building cosine similarity matrix for [16096, 17096)...
[ 922 ] Building cosine similarity matrix for [24144, 25144)...
[ 919 ] Building cosine similarity matrix for [1000, 2000)...
[ 920 ] Building cosine similarity matrix for [9048, 10048)...
[ 921 ] Building cosine similarity matrix for [17096, 18096)...
[ 922 ] Building cosine similarity matrix for [25144, 26144)...
[ 919 ] Building cosine similarity matrix for [2000, 3000)...
[ 921 ] Building cosine similarity matrix for [18096, 19096)...
[ 920 ] Building cosine similarity matrix for [10048, 11048)...
[ 922 ] Building cosine similarity matrix for [26144, 27144)...
[ 919 ] Building cosine similarity matrix for [3000, 4000)...
[ 921 ] Building cosine similarity matrix for [19096, 20096)...
[ 920 ] Building cosine similarity matrix for [11048, 12048)...
[ 922 ] Building cosine similarit

In [29]:
# Best params 0.53119031862654198, 0.8622528471283929, 0.449568357270693, 0.84725181341512967]
# [0.24995989924853496, 0.35681721890423695, 0.12389358733625891, 0.78110404084094665]
# weights = [0.53, 0.86, 0.45, 0.85]
S = S_icm * 0.24995989924853496 + S_urm * 0.35681721890423695 + S_user * 0.12389358733625891 + S_cbf_full * 0.78110404084094665
print("Similarity matrix ready!")

s_norm = S.sum(axis=0)
s_norm[s_norm==0] = 1
# Normalize S
S = S.multiply(csr_matrix(np.reciprocal(s_norm)))

# Keep only the target playlists in the URM
urm_cleaned = urm[[dataset.get_playlist_index_from_id(x)
                   for x in tg_playlist]]

# Compute ratings
R_hat = urm_cleaned.dot(S.tocsc()).tocsr()
print("R_hat done")

# Remove the entries in R_hat that are already present in the URM
urm_cleaned = urm_cleaned[:, [dataset.get_track_index_from_id(x)
                              for x in tg_tracks]]
R_hat[urm_cleaned.nonzero()] = 0
R_hat.eliminate_zeros()

print("Shape of final matrix: ", R_hat.shape)
recs = predict(R_hat, list(tg_playlist), list(tg_tracks))
#ev.evaluate_fold(recs)

Similarity matrix ready!
R_hat done
Shape of final matrix:  (10000, 32195)


In [22]:
def predict(R_hat, pl_id_list, tr_id_list, at=5):
        """
        returns a dictionary of
        'pl_id': ['tr_1', 'tr_at'] for each playlist in target playlist
        """
        recs = {}
        for i in range(0, R_hat.shape[0]):
            pl_id = pl_id_list[i]
            pl_row = R_hat.data[R_hat.indptr[i]:
                                     R_hat.indptr[i + 1]]
            # get top 5 indeces. argsort, flip and get first at-1 items
            sorted_row_idx = np.flip(pl_row.argsort(), axis=0)[0:at]
            track_cols = [R_hat.indices[R_hat.indptr[i] + x]
                          for x in sorted_row_idx]
            tracks_ids = [tr_id_list[x] for x in track_cols]
            recs[pl_id] = tracks_ids
        return recs

In [30]:
with open('submission_ensemble.csv', mode='w', newline='') as out:
    fieldnames = ['playlist_id', 'track_ids']
    writer = csv.DictWriter(out, fieldnames=fieldnames, delimiter=',')
    writer.writeheader()
    for k in tg_playlist:
        track_ids = ''
        for r in recs[k]:
            track_ids = track_ids + r + ' '
        writer.writerow({'playlist_id': k,
                         'track_ids': track_ids[:-1]})

# Bayesian opt

In [22]:
def mixS(params):
    global S_icm, S_user, S_urm, S_cbf_full, urm, tg_playlist, evaluator
    print(params)

    S = S_icm * params[0] + S_urm * params[1] + S_user * params[2] + S_cbf_full * params[3]
    print("Similarity matrix ready!")

    s_norm = S.sum(axis=0)
    s_norm[s_norm==0] = 1
    # Normalize S
    S= S.multiply(csr_matrix(np.reciprocal(s_norm)))
    print("Similarity matrix ready!")

    # Keep only the target playlists in the URM
    urm_cleaned = urm[[dataset.get_playlist_index_from_id(x)
                       for x in tg_playlist]]

    # Compute ratings
    R_hat = urm_cleaned.dot(S.tocsc()).tocsr()
    print("R_hat done")

    # Remove the entries in R_hat that are already present in the URM
    urm_cleaned = urm_cleaned[:, [dataset.get_track_index_from_id(x)
                                  for x in tg_tracks]]
    R_hat[urm_cleaned.nonzero()] = 0
    R_hat.eliminate_zeros()

    print("Shape of final matrix: ", R_hat.shape)
    recs = predict(R_hat, list(tg_playlist), list(tg_tracks))
    map_at_five = ev.evaluate_fold(recs)
    return -map_at_five

In [24]:
from skopt import forest_minimize
from skopt.space import Integer, Real
space = [Real(0.0, 1.0),  # ICM
         Real(0.0, 1.0),  # URM
         Real(0.0, 1.0),  # USER
         Real(0.0, 1.0),  # CBF_FULL
             ]
x0 = [1, 0, 0, 0]
x1 = [0, 1, 0, 0]
x2 = [0, 0, 1, 0]
x3 = [0, 0, 0, 1]
x4 = [0.53119031862654198, 0.8622528471283929, 0.449568357270693, 0.84725181341512967]
x0s = [x0, x1, x2, x3, x4]
res = forest_minimize(mixS, space, x0=x0s, verbose=True, n_random_starts=50, n_calls=1000, n_jobs=-1)
print('Maximimum p@k found: {:6.5f}'.format(-res.fun))
print('Optimal parameters:')
params = ['ICM', 'URM', 'USER', 'FULL']
for (p, x_) in zip(params, res.x):
    print('{}: {}'.format(p, x_))

Iteration No: 1 started. Evaluating function at provided point.
[1, 0, 0, 0]
Similarity matrix ready!
Similarity matrix ready!
R_hat done
Shape of final matrix:  (5526, 20730)
MAP@5: 0.09656894679696001
[0, 1, 0, 0]
Similarity matrix ready!
Similarity matrix ready!
R_hat done
Shape of final matrix:  (5526, 20730)
MAP@5: 0.08265773917239722
[0, 0, 1, 0]
Similarity matrix ready!
Similarity matrix ready!
R_hat done
Shape of final matrix:  (5526, 20730)
MAP@5: 0.0711895282905054
[0, 0, 0, 1]
Similarity matrix ready!
Similarity matrix ready!
R_hat done
Shape of final matrix:  (5526, 20730)
MAP@5: 0.10914706237181827
[0.531190318626542, 0.8622528471283929, 0.449568357270693, 0.8472518134151297]
Similarity matrix ready!
Similarity matrix ready!
R_hat done
Shape of final matrix:  (5526, 20730)
MAP@5: 0.11102002654119943
Iteration No: 1 ended. Evaluation done at provided point.
Time taken: 147.7587
Function value obtained: -0.1110
Current minimum: -0.1110
Iteration No: 2 started. Evaluating fun

Similarity matrix ready!
Similarity matrix ready!
R_hat done
Shape of final matrix:  (5526, 20730)
MAP@5: 0.1100307636626856
Iteration No: 20 ended. Evaluation done at random point.
Time taken: 30.0728
Function value obtained: -0.1100
Current minimum: -0.1112
Iteration No: 21 started. Evaluating function at random point.
[0.20960644552278335, 0.98174549019420865, 0.34502655875138782, 0.50129713374349893]
Similarity matrix ready!
Similarity matrix ready!
R_hat done
Shape of final matrix:  (5526, 20730)
MAP@5: 0.10766859693569845
Iteration No: 21 ended. Evaluation done at random point.
Time taken: 32.1869
Function value obtained: -0.1077
Current minimum: -0.1112
Iteration No: 22 started. Evaluating function at random point.
[0.2014585341804665, 0.48495266638890444, 0.95904980481268454, 0.09675223558044535]
Similarity matrix ready!
Similarity matrix ready!
R_hat done
Shape of final matrix:  (5526, 20730)
MAP@5: 0.10442031608155433
Iteration No: 22 ended. Evaluation done at random point.
T

R_hat done
Shape of final matrix:  (5526, 20730)
MAP@5: 0.10988056460369192
Iteration No: 40 ended. Evaluation done at random point.
Time taken: 29.8054
Function value obtained: -0.1099
Current minimum: -0.1114
Iteration No: 41 started. Evaluating function at random point.
[0.73307302615189085, 0.80614410332001873, 0.43994953292918149, 0.97830718622784762]
Similarity matrix ready!
Similarity matrix ready!
R_hat done
Shape of final matrix:  (5526, 20730)
MAP@5: 0.11102967788635568
Iteration No: 41 ended. Evaluation done at random point.
Time taken: 30.0286
Function value obtained: -0.1110
Current minimum: -0.1114
Iteration No: 42 started. Evaluating function at random point.
[0.18019860611998875, 0.91618732181741314, 0.60513908377904835, 0.66025155227094046]
Similarity matrix ready!
Similarity matrix ready!
R_hat done
Shape of final matrix:  (5526, 20730)
MAP@5: 0.10750452406804233
Iteration No: 42 ended. Evaluation done at random point.
Time taken: 29.9681
Function value obtained: -0.1

KeyboardInterrupt: 