In [2]:
import pandas as pd
import scipy as sc
import numpy as np
import time
import sys
from sklearn.preprocessing import MultiLabelBinarizer

# locations of the input CSV files
train_file = '../data/train.csv'
target_playlists_file = '../data/target_playlists.csv'
tracks_file = '../data/tracks.csv'


def _read_with_columns(csv_path, column_names):
    """Read ``csv_path`` with pandas and overwrite its header with ``column_names``."""
    frame = pd.read_csv(csv_path)
    frame.columns = column_names
    return frame


# load every file under the column names used by the rest of the notebook
train_data = _read_with_columns(train_file, ['playlist_id', 'track_id'])

tracks_data = _read_with_columns(
    tracks_file, ['track_id', 'album_id', 'artist_id', 'duration_sec'])

target_playlists = _read_with_columns(target_playlists_file, ['playlist_id'])

# building the URM matrix
# One list of track ids per playlist. Selecting the 'track_id' column before
# apply() keeps the grouping column out of the applied function, which avoids
# the pandas deprecation warning raised by DataFrameGroupBy.apply on grouping
# columns and skips building a sub-DataFrame per group.
grouped_playlists = train_data.groupby('playlist_id')['track_id'].apply(list)
# Rows follow the sorted playlist ids produced by groupby; columns follow the
# sorted set of track ids seen in the training data (MultiLabelBinarizer.classes_).
URM = MultiLabelBinarizer(sparse_output=True).fit_transform(grouped_playlists)
URM_csr = URM.tocsr()

# building the ICM matrix
def _one_hot_track_feature(tracks, feature):
    """One-hot encode a single track attribute.

    Parameters
    ----------
    tracks : pandas.DataFrame
        Frame holding at least the columns ``track_id`` and ``feature``.
    feature : str
        Name of the attribute column to encode.

    Returns
    -------
    tuple
        ``(frame, labels, matrix)`` where ``frame`` is the two-column view
        sorted by ``track_id``, ``labels`` is the list of single-element label
        lists fed to the binarizer, and ``matrix`` is the sparse one-hot
        matrix with one row per track and one column per distinct value.
    """
    # Sorting is kept even if the file is already ordered: it is what ties
    # row i of the matrix to the i-th smallest track_id.
    frame = tracks.reindex(columns=['track_id', feature]).sort_values(by='track_id')
    labels = [[value] for value in frame[feature]]
    matrix = MultiLabelBinarizer(sparse_output=True).fit_transform(labels)
    return frame, labels, matrix


artists, artists_list, icm_artists = _one_hot_track_feature(tracks_data, 'artist_id')
icm_artists_csr = icm_artists.tocsr()

albums, albums_list, icm_albums = _one_hot_track_feature(tracks_data, 'album_id')
icm_albums_csr = icm_albums.tocsr()

# NOTE(review): every distinct duration value becomes its own column here;
# confirm this is intended rather than bucketing the durations first.
durations, durations_list, icm_durations = _one_hot_track_feature(tracks_data, 'duration_sec')
icm_durations_csr = icm_durations.tocsr()

# Feature blocks are stacked side by side: albums, then artists, then durations.
# NOTE(review): assumes the tracks file lists exactly the tracks that appear in
# the training data, so ICM rows line up with URM columns -- TODO confirm.
ICM = sc.sparse.hstack((icm_albums_csr, icm_artists_csr, icm_durations_csr))
ICM_csr = ICM.tocsr()

In [42]:
def initialize_output_file():
    """Create ``submission.csv`` in the working directory and write its header.

    The file is opened in write mode so that re-running the notebook starts
    from an empty file instead of appending a second header (and a second set
    of rows) to the output of a previous run.

    Returns
    -------
    file object
        The open, writable file positioned right after the header line.
        The caller is responsible for closing it.
    """
    file = open("submission.csv", 'w')
    file.write("playlist_id,track_ids" + '\n')
    return file

# useful to print to file with the right structure
def print_to_file(playlist, tracks, file, max_tracks=10):
    """Write one submission row: ``<playlist>,<track> <track> ...``.

    Parameters
    ----------
    playlist : any
        Playlist identifier; written via ``str()``.
    tracks : sequence
        Recommended track identifiers, best first. Only the first
        ``max_tracks`` entries are written.
    file : writable text file object
        Destination of the row.
    max_tracks : int, optional
        Maximum number of tracks written on the row (default 10).

    A sequence shorter than ``max_tracks`` is written as-is rather than
    raising ``IndexError`` half-way through the row, and the row is emitted
    with a single ``write`` so a failure cannot leave a partial line behind.
    """
    row_tracks = ' '.join(str(track) for track in tracks[:max_tracks])
    file.write(str(playlist) + ',' + row_tracks + '\n')

In [108]:
class Recommender(object):
    """Hybrid recommender blending content-based and item-collaborative scores.

    ``fit`` builds two similarity matrices (one from the ICM, one from the
    URM); ``recommend`` multiplies a user's interaction row by each of them
    and mixes the two score vectors with the weight ``alfa``.
    """

    def __init__(self, URM, ICM):
        """Store the interaction matrix (URM) and the item-content matrix (ICM).

        The URM is converted to CSR because ``recommend`` slices it by row and
        ``filter_seen`` reads its ``indptr``/``indices`` arrays directly.
        """
        self.URM = sc.sparse.csr_matrix(URM)
        self.ICM = ICM

    def fit(self, topK=50, shrink=100, normalize=True, similarity="tversky"):
        """Compute the content-based and item-CF similarity matrices.

        All four arguments are forwarded unchanged to
        ``Compute_Similarity_Python``, which must already be in scope when
        this method is called. The shape of each resulting matrix is printed.
        """
        similarity_object_content_based = Compute_Similarity_Python(
            self.ICM.T, shrink=shrink, topK=topK,
            normalize=normalize, similarity=similarity)

        similarity_object_item_cf = Compute_Similarity_Python(
            self.URM, shrink=shrink, topK=topK,
            normalize=normalize, similarity=similarity)

        # A user-CF similarity (built from self.URM.T) used to be instantiated
        # here as well, but it was never computed nor used for scoring, so it
        # has been dropped.
        self.W_sparse_content_based = similarity_object_content_based.compute_similarity()
        print(np.shape(self.W_sparse_content_based))

        self.W_sparse_item_cf = similarity_object_item_cf.compute_similarity()
        print(np.shape(self.W_sparse_item_cf))

    def recommend(self, user_id, alfa, at=None, exclude_seen=True):
        """Return item indices ranked by descending hybrid score.

        Parameters
        ----------
        user_id : int
            Row index of the user (playlist) in the URM.
        alfa : float
            Weight of the content-based scores; the item-CF scores are
            weighted by ``1 - alfa``.
        at : int or None, optional
            Number of items to return; ``None`` returns the full ranking.
        exclude_seen : bool, optional
            When true, items already present in the user's URM row are pushed
            to the bottom of the ranking.

        ``fit`` must have been called first, otherwise the similarity
        attributes do not exist.
        """
        # compute the scores using the dot product
        user_profile = self.URM[user_id]

        scores_content_based = user_profile.dot(self.W_sparse_content_based).toarray().ravel()
        scores_item_cf = user_profile.dot(self.W_sparse_item_cf).toarray().ravel()

        scores = alfa * scores_content_based + (1 - alfa) * scores_item_cf

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # rank items, best score first
        ranking = scores.argsort()[::-1]

        return ranking[:at]

    def filter_seen(self, user_id, scores):
        """Set the score of every item in the user's URM row to ``-inf``.

        ``scores`` is modified in place and also returned.
        """
        start_pos = self.URM.indptr[user_id]
        end_pos = self.URM.indptr[user_id + 1]

        # column indices of the items stored in this user's CSR row
        seen_items = self.URM.indices[start_pos:end_pos]

        scores[seen_items] = -np.inf

        return scores

In [106]:
def precision(is_relevant, relevant_items):
    """Share of the recommended items that are relevant.

    ``is_relevant`` is the per-recommendation hit mask; ``relevant_items`` is
    not used by the computation and is only accepted so that all metric
    functions share the same signature.
    """
    hit_count = np.sum(is_relevant, dtype=np.float32)
    recommended_count = len(is_relevant)
    return hit_count / recommended_count



def recall(is_relevant, relevant_items):
    """Share of the relevant items that were recommended.

    ``is_relevant`` is the per-recommendation hit mask and ``relevant_items``
    the array of held-out items; only its length (``shape[0]``) is used.
    """
    hit_count = np.sum(is_relevant, dtype=np.float32)
    relevant_count = relevant_items.shape[0]
    return hit_count / relevant_count



def MAP(is_relevant, relevant_items):
    """Average precision of one ranked recommendation list.

    Precision is taken at every position holding a relevant item, summed, and
    divided by the smaller of the number of relevant items and the number of
    recommendations.
    """
    list_length = is_relevant.shape[0]
    positions = np.arange(1, list_length + 1)

    # running hit count; multiplying by the mask keeps it only at hit positions
    hits_so_far = np.cumsum(is_relevant, dtype=np.float32)
    precision_at_hits = is_relevant * hits_so_far / positions

    normaliser = np.min([relevant_items.shape[0], list_length])
    return np.sum(precision_at_hits) / normaliser



def evaluate_algorithm(URM_test, recommender_object, alfa, at=10):
    """Average precision, recall and MAP of a recommender over the test users.

    Parameters
    ----------
    URM_test : sparse matrix or array-like
        Held-out interactions; converted to CSR internally.
    recommender_object : object
        Anything exposing ``recommend(user_id, alfa, at=...)`` that returns
        an array of item indices.
    alfa : float
        Forwarded unchanged to ``recommender_object.recommend``.
    at : int, optional
        Length of the recommendation list requested per user (default 10).

    Returns
    -------
    dict
        Keys ``"precision"``, ``"recall"`` and ``"MAP"``, each averaged over
        the users that have at least one interaction in ``URM_test``.

    Raises
    ------
    ValueError
        If no user has any test interaction, so there is nothing to average.
    """
    URM_test = sc.sparse.csr_matrix(URM_test)
    n_users = URM_test.shape[0]

    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_MAP = 0.0
    num_eval = 0

    for user_id in range(n_users):

        if user_id % 10000 == 0:
            print("Evaluated user {} of {}".format(user_id, n_users))

        start_pos = URM_test.indptr[user_id]
        end_pos = URM_test.indptr[user_id + 1]

        # users without held-out items cannot be scored and are skipped
        if end_pos - start_pos > 0:

            relevant_items = URM_test.indices[start_pos:end_pos]

            recommended_items = recommender_object.recommend(user_id, alfa, at=at)
            num_eval += 1

            # np.isin replaces the deprecated np.in1d; both inputs hold
            # distinct item indices, hence assume_unique=True.
            is_relevant = np.isin(recommended_items, relevant_items, assume_unique=True)

            cumulative_precision += precision(is_relevant, relevant_items)
            cumulative_recall += recall(is_relevant, relevant_items)
            cumulative_MAP += MAP(is_relevant, relevant_items)

    if num_eval == 0:
        raise ValueError("URM_test contains no interactions: nothing to evaluate")

    cumulative_precision /= num_eval
    cumulative_recall /= num_eval
    cumulative_MAP /= num_eval

    print("Recommender performance is: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, cumulative_MAP))

    result_dict = {
        "precision": cumulative_precision,
        "recall": cumulative_recall,
        "MAP": cumulative_MAP,
    }

    return result_dict

In [101]:
from data_splitter import train_test_holdout


# Split the interactions into a training and a test matrix; the shapes printed
# below show that both keep the full (playlists x tracks) shape of URM_csr.
# NOTE(review): presumably a random hold-out with ~80% of interactions in the
# training part -- no seed is passed here, confirm whether the split is reproducible.
URM_train, URM_test = train_test_holdout(URM_csr, train_perc = 0.8)


from Compute_Similarity_Python import Compute_Similarity_Python


In [98]:
# shape of the full interaction matrix
URM_csr.shape

(50446, 20635)

In [102]:
# shape of the training split
URM_train.shape

(50446, 20635)

In [103]:
# shape of the test split
URM_test.shape

(50446, 20635)

In [109]:
# train the hybrid recommender on the training split only
recommender = Recommender(URM=URM_train, ICM=ICM_csr)
recommender.fit(topK=200, shrink=5)

Similarity column 20600 ( 100 % ), 1931.32 column/sec, elapsed time 0.18 min
(20635, 20635)
Similarity column 20600 ( 100 % ), 860.26 column/sec, elapsed time 0.40 min
(20635, 20635)


In [110]:
# offline evaluation with a content-based weight of 0.1 (item-CF weight 0.9)
evaluate_algorithm(URM_test, recommender, alfa=0.1)

Evaluated user 0 of 50446
Evaluated user 10000 of 50446
Evaluated user 20000 of 50446
Evaluated user 30000 of 50446
Evaluated user 40000 of 50446
Evaluated user 50000 of 50446
Recommender performance is: Precision = 0.1706, Recall = 0.1550, MAP = 0.1241


{'precision': 0.1705933769046123,
 'recall': 0.1549582518579134,
 'MAP': 0.1241464879705452}

In [None]:
# evaluate_algorithm requires the blend weight `alfa`; calling it without one
# raises TypeError.
# NOTE(review): 0.1 mirrors the value used in the evaluation cell above --
# change it here to try a different content/collaborative blend.
evaluate_algorithm(URM_test, recommender, alfa=0.1)

In [44]:
# execution of the recommendations
# Recommender.recommend takes (user_id, alfa, at, exclude_seen). The previous
# positional call `recommend(playlist_id, 10, True)` therefore bound alfa=10
# and at=True, which returns a single track and makes print_to_file fail.
# NOTE(review): 0.1 is the blend weight used in the evaluation cell; confirm
# it is the one wanted for the submission.
SUBMISSION_ALFA = 0.1
SUBMISSION_LENGTH = 10

# NOTE(review): `recommender` was fitted on URM_train only; consider refitting
# on the full URM_csr before producing the final submission.
# NOTE(review): playlist_id is used directly as a URM row index -- assumes the
# playlist ids are exactly 0..n_playlists-1; TODO confirm.
with initialize_output_file() as file:  # closed even if a recommendation fails
    for playlist_id in target_playlists['playlist_id']:
        tracks = recommender.recommend(playlist_id, SUBMISSION_ALFA,
                                       at=SUBMISSION_LENGTH, exclude_seen=True)
        print_to_file(playlist_id, tracks, file)