In [1]:
import pandas as pd
import scipy as sc
import numpy as np
import sys
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# locations of the input CSV files, relative to the notebook's working directory
tracks_file = '../data/tracks.csv'
target_playlists_file = '../data/target_playlists.csv'
train_file = '../data/train.csv'

In [3]:
# load the three CSV files first ...
train_data = pd.read_csv(train_file)
tracks_data = pd.read_csv(tracks_file)
target_playlists = pd.read_csv(target_playlists_file)

# ... then replace their headers with the column names used by the later cells
# (assigning .columns raises if a file has a different number of columns)
train_data.columns = ['playlist_id', 'track_id']
tracks_data.columns = ['track_id', 'album_id', 'artist_id', 'duration_sec']
target_playlists.columns = ['playlist_id']

In [4]:
# building the URM matrix
# One list of track ids per playlist. The track column is selected *before*
# aggregating, so apply() only ever sees the track ids and never the grouping
# column (passing the grouping column to DataFrameGroupBy.apply is deprecated in
# recent pandas, and the per-group lambda was needlessly slow).
grouped_playlists = train_data.groupby('playlist_id')['track_id'].apply(list)

# Binary playlist x track matrix, one row per entry of grouped_playlists.
# NOTE(review): column j is the j-th label learned by the binarizer, not
# necessarily track id j; later cells compare column indices with track ids, which
# presumably relies on track ids being contiguous from 0 -- TODO confirm.
URM = MultiLabelBinarizer(sparse_output=True).fit_transform(grouped_playlists)
URM_csr = URM.tocsr()

In [5]:
# playlist x playlist similarity: the URM multiplied by its own transpose
similarity_matrix = URM_csr @ URM_csr.T

In [7]:
def initialize_output_file(path="submission.csv"):
    """Create the submission file and write its CSV header.

    The file is opened in write mode, so any previous content is discarded:
    re-running the notebook yields a fresh file with exactly one header line
    instead of appending a second header and new rows after the old ones.

    Parameters
    ----------
    path : str, optional
        Where to create the file (default ``"submission.csv"``).

    Returns
    -------
    file object
        The open, writable text file. The caller is responsible for closing it.
    """
    file = open(path, 'w')
    file.write("playlist_id,track_ids" + '\n')
    return file

In [8]:
# useful to print to file with the right structure
def print_to_file(playlist, tracks, file, max_tracks=10):
    """Write one submission row: the playlist id, a comma, then the track ids
    separated by single spaces, terminated by a newline.

    Parameters
    ----------
    playlist : any
        Playlist identifier; written via ``str()``.
    tracks : sequence
        Recommended track ids, best first. Only the first ``max_tracks`` are
        written; a shorter (even empty) sequence is written as-is instead of
        raising ``IndexError``.
    file : file-like
        Open text stream exposing ``write``.
    max_tracks : int, optional
        Maximum number of track ids per row (default 10).
    """
    row = ' '.join(str(track) for track in tracks[:max_tracks])
    file.write(str(playlist) + ',' + row + '\n')

In [6]:
def recommend(playlist_id, playlists, n_recommendations=10):
    """Recommend tracks for a playlist by borrowing from its most similar playlists.

    The other playlists are visited in decreasing order of their score in row
    ``playlist_id`` of the module-level ``similarity_matrix``. Their tracks are
    collected, in order, until ``n_recommendations`` tracks have been gathered.

    Parameters
    ----------
    playlist_id : int
        Label of the target playlist in ``playlists``. It is also used as the
        row number in ``similarity_matrix``, and the ranked row positions are
        used as labels in ``playlists`` -- this assumes playlist labels coincide
        with matrix row positions (TODO confirm for the dataset in use).
    playlists : pandas.Series
        Maps a playlist label to the list of its track ids.
    n_recommendations : int, optional
        Maximum number of tracks to return (default 10).

    Returns
    -------
    list
        Distinct track ids, none of which is already in the target playlist.
        Shorter than ``n_recommendations`` only when every other playlist has
        been exhausted.
    """
    # Tracks that must not be recommended: those the target playlist already
    # has, plus (as the loop proceeds) those already picked, so that no track
    # appears twice in the output. A set keeps each membership test O(1).
    excluded = set(playlists.loc[playlist_id])

    similarity_row = similarity_matrix[playlist_id, :].toarray().ravel()
    # argsort is ascending; reverse it to visit the most similar playlists first.
    similarity_ranking = np.argsort(similarity_row)[::-1]

    to_be_recommended = []
    for pos in similarity_ranking:
        if len(to_be_recommended) >= n_recommendations:
            break
        # Skip the target explicitly rather than assuming it is ranked first:
        # another playlist can tie with the target's self-similarity, and the
        # order among tied scores is arbitrary.
        if pos == playlist_id:
            continue
        for track in playlists.loc[pos]:
            if len(to_be_recommended) >= n_recommendations:
                break
            if track not in excluded:
                excluded.add(track)
                to_be_recommended.append(track)

    return to_be_recommended
    

In [10]:
# execution of the recommendations
# The `with` block closes (and therefore flushes) the submission file even when
# recommend() or print_to_file() raises part-way through the loop.
with initialize_output_file() as file:
    for playlist_id in target_playlists['playlist_id']:
        to_be_recommended = recommend(playlist_id, grouped_playlists)
        print_to_file(playlist_id, to_be_recommended, file)

In [7]:
def precision(is_relevant, relevant_items):
    """Fraction of the recommended items that are relevant.

    is_relevant    -- boolean flags, one per recommended item
    relevant_items -- not used by this function (recall and MAP take the same
                      two arguments)
    """
    hits = np.sum(is_relevant, dtype=np.float32)
    n_recommended = len(is_relevant)
    return hits / n_recommended



def recall(is_relevant, relevant_items):
    """Fraction of the relevant items that were recommended.

    is_relevant    -- boolean flags, one per recommended item
    relevant_items -- array of the relevant item ids; only its length is used
    """
    hits = np.sum(is_relevant, dtype=np.float32)
    n_relevant = relevant_items.shape[0]
    return hits / n_relevant



def MAP(is_relevant, relevant_items):
    """Average precision of a single ranked recommendation list.

    is_relevant    -- boolean array, one flag per recommended item, in rank order
    relevant_items -- array of the relevant item ids; only its length is used

    Precision-at-k is summed over the ranks k that hold a relevant item, then
    divided by the smaller of the two list lengths.
    """
    n_recommended = is_relevant.shape[0]
    ranks = 1 + np.arange(n_recommended)

    # running count of relevant items seen up to each rank
    hits_so_far = np.cumsum(is_relevant, dtype=np.float32)

    # multiplying by the flags zeroes the ranks holding a non-relevant item
    precision_at_hits = is_relevant * hits_so_far / ranks

    normaliser = np.min([relevant_items.shape[0], n_recommended])
    return np.sum(precision_at_hits) / normaliser



def evaluate_algorithm(URM_test):
    """Average precision, recall and MAP of ``recommend`` over the test users.

    For every row of ``URM_test`` that stores at least one interaction, the
    output of ``recommend(user_id, grouped_playlists)`` is compared with the
    column indices stored in that row. Progress is printed every 10000 rows and
    a summary line is printed at the end.

    Parameters
    ----------
    URM_test : scipy.sparse CSR matrix
        Held-out interactions, one row per user/playlist (``indptr`` and
        ``indices`` are read directly).

    Returns
    -------
    dict
        Keys ``"precision"``, ``"recall"`` and ``"MAP"``, each averaged over the
        evaluated rows; all three are 0.0 when no row has test interactions.

    NOTE(review): ``recommend`` works from ``similarity_matrix`` and
    ``grouped_playlists``, which in this notebook are built from the full
    interaction data before the hold-out split. Held-out tracks are therefore
    still part of the target playlist and are filtered out as "already known",
    which would explain near-zero scores. Confirm, and evaluate a recommender
    built from the training split only.

    NOTE(review): relevant items are column indices of ``URM_test`` while
    ``recommend`` returns track ids taken from the playlists; the comparison is
    only meaningful if the two coincide -- TODO confirm.
    """
    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_MAP = 0.0

    num_eval = 0

    n_users = URM_test.shape[0]

    for user_id in range(n_users):

        if user_id % 10000 == 0:
            print("Evaluated user {} of {}".format(user_id, n_users))

        start_pos = URM_test.indptr[user_id]
        end_pos = URM_test.indptr[user_id+1]

        # only rows with at least one held-out interaction can be scored
        if end_pos - start_pos > 0:

            relevant_items = URM_test.indices[start_pos:end_pos]

            recommended_items = recommend(user_id, grouped_playlists)
            num_eval += 1

            # An empty recommendation list scores zero on every metric; skipping
            # the metric calls avoids a 0/0 that would turn the averages into NaN.
            if len(recommended_items) == 0:
                continue

            # No assume_unique: the recommendation list is not guaranteed to be
            # free of duplicates, and the flag gives wrong answers in that case.
            is_relevant = np.isin(recommended_items, relevant_items)

            cumulative_precision += precision(is_relevant, relevant_items)
            cumulative_recall += recall(is_relevant, relevant_items)
            cumulative_MAP += MAP(is_relevant, relevant_items)

    # Guard against a test matrix with no interactions at all.
    if num_eval > 0:
        cumulative_precision /= num_eval
        cumulative_recall /= num_eval
        cumulative_MAP /= num_eval

    print("Recommender performance is: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, cumulative_MAP))

    result_dict = {
        "precision": cumulative_precision,
        "recall": cumulative_recall,
        "MAP": cumulative_MAP,
    }

    return result_dict

In [9]:
from data_splitter import train_test_holdout

# hold-out split of the full URM, with train_perc passed as 0.8
# NOTE(review): URM_train is not referenced by any other visible cell; the
# similarity matrix above is computed from URM_csr, not from URM_train.
URM_train, URM_test = train_test_holdout(URM_csr, train_perc=0.8)

In [10]:
import time

# wall-clock timing of one full evaluation pass over the test split
start_time = time.time()
evaluate_algorithm(URM_test)
elapsed_seconds = time.time() - start_time

print("Time required {:.2f} min".format(elapsed_seconds / 60))

Evaluated user 0 of 50446
Evaluated user 10000 of 50446
Evaluated user 20000 of 50446
Evaluated user 30000 of 50446
Evaluated user 40000 of 50446
Evaluated user 50000 of 50446
Recommender performance is: Precision = 0.0002, Recall = 0.0001, MAP = 0.0002
Time required 0.64 min
