In [46]:
import random
import time
import math
import sys
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd


In [1]:
#Some Utility functions for accessing the files from MSD

def song_to_count(if_str):
    """Count how many lines of a triplet file mention each song.

    Every line of ``if_str`` must consist of exactly three tab-separated
    fields; only the middle field (the song) is used. A line with a
    different number of fields raises ``ValueError``.

    Returns:
        dict mapping song -> number of lines in which it appears.
    """
    counts = {}
    with open(if_str, "r") as triplets:
        for raw in triplets:
            _, song_id, _ = raw.strip().split('\t')
            counts[song_id] = counts.get(song_id, 0) + 1
    return counts


def user_to_count(if_str):
    """Count how many lines of a triplet file belong to each user.

    Every line of ``if_str`` must consist of exactly three tab-separated
    fields; only the first field (the user) is used. A line with a
    different number of fields raises ``ValueError``.

    Returns:
        dict mapping user -> number of lines in which it appears.
    """
    counts = {}
    with open(if_str, "r") as triplets:
        for raw in triplets:
            user_id, _, _ = raw.strip().split('\t')
            counts[user_id] = counts.get(user_id, 0) + 1
    return counts


def sort_dict_dec(d):
    """Return the keys of ``d`` as a list ordered by descending value.

    The sort is stable, so keys with equal values keep the order in which
    ``d`` iterates over them.
    """
    ranked = list(d.keys())
    ranked.sort(key=d.__getitem__, reverse=True)
    return ranked


def song_to_users(if_str, set_users=None, ratio=1.0):
    """Map each song in a triplet file to the set of users on its lines.

    Args:
        if_str: path to a file of tab-separated ``user, song, <third>`` lines.
        set_users: optional container of users; when it is truthy, lines
            whose user is not in it are ignored. ``None`` (or an empty
            container) keeps every user.
        ratio: a line is considered only when a fresh ``random.random()``
            draw is strictly below this value, i.e. lines are subsampled.

    Returns:
        dict mapping song -> set of users.
    """
    listeners = {}
    with open(if_str, "r") as triplets:
        for raw in triplets:
            # Exactly one random draw per line, taken before the line is parsed.
            if random.random() >= ratio:
                continue
            user, song, _ = raw.strip().split('\t')
            if set_users and user not in set_users:
                continue
            listeners.setdefault(song, set()).add(user)
    return listeners


def user_to_songs(if_str):
    """Map each user in a triplet file to the set of songs on their lines.

    Every line of ``if_str`` must hold exactly three tab-separated fields
    (user, song, and a third field that is ignored).

    Returns:
        dict mapping user -> set of songs.
    """
    history = {}
    with open(if_str, "r") as triplets:
        for raw in triplets:
            user, song, _ = raw.strip().split('\t')
            history.setdefault(user, set()).add(song)
    return history


def load_unique_tracks(if_str):
    """Read a ``<SEP>``-delimited file into a list of 4-tuples.

    Each line must split on the literal ``<SEP>`` into exactly four fields;
    otherwise ``ValueError`` is raised. Tuples are returned in file order
    with the fields in the order they appear on the line.
    """
    def _parse(raw):
        # Unpacking enforces the four-field shape of every record.
        first, second, third, fourth = raw.strip().split('<SEP>')
        return (first, second, third, fourth)

    with open(if_str, "r") as tracks_file:
        return [_parse(raw) for raw in tracks_file]


def load_users(if_str):
    """Read one user id per line from ``if_str``.

    Surrounding whitespace (including the trailing newline) is stripped
    from every line.

    Returns:
        list of str, in file order. A concrete list is returned rather than
        a lazy ``map`` object: under Python 3 a ``map`` can be consumed only
        once and supports neither ``len()`` nor indexing. Callers that wrap
        the result in ``list(...)`` keep working unchanged.
    """
    with open(if_str, "r") as f:
        return [line.strip() for line in f]


def song_to_idx(if_str):
    """Load a two-column, single-space-separated file as a dict.

    Each line must split on ``' '`` into exactly two fields, otherwise
    ``ValueError`` is raised. The first field becomes the key and the second
    the value; both are kept as strings.
    """
    with open(if_str, "r") as index_file:
        pairs = (raw.strip().split(' ') for raw in index_file.readlines())
        return {key: value for key, value in pairs}


def unique_users(if_str):
    """Collect the distinct users appearing in a triplet file.

    Every line of ``if_str`` must hold exactly three tab-separated fields;
    the first one is taken as the user.

    Returns:
        set of users.
    """
    seen = set()
    with open(if_str, "r") as triplets:
        for raw in triplets:
            user, _song, _third = raw.strip().split('\t')
            # set.add is a no-op for members already present.
            seen.add(user)
    return seen

def save_recommendations(r, songs_file, ofile):
    """Write recommendation lists to ``ofile`` as song indices.

    Args:
        r: iterable of recommendation lists (one per user), each an iterable
            of songs.
        songs_file: path of the song -> index file read by ``song_to_idx``.
        ofile: output path; it is overwritten. One line is written per
            recommendation list, holding the space-separated indices.

    Raises:
        KeyError: if a recommended song is missing from ``songs_file``.
    """
    print("Loading song indices from " + songs_file)
    s2i = song_to_idx(songs_file)
    print("Saving recommendations")
    # The context manager closes the file even if a lookup fails mid-way.
    with open(ofile, "w") as f:
        for r_songs in r:
            f.write(" ".join(s2i[s] for s in r_songs) + "\n")
    print("Ok.")

def songToSongName():
    """Placeholder that is not implemented yet; it does nothing."""
    return None

In [50]:
#Utility functions for evaluation

def sort_dict_dec_util(d):
    """Return a copy of ``d`` whose keys are ordered by descending ``len(value)``.

    The sort is stable, so entries with equally long values keep the order
    in which ``d`` iterates over them. The values themselves are shared with
    ``d``, not copied.
    """
    by_size = sorted(d, key=lambda key: len(d[key]), reverse=True)
    return {key: d[key] for key in by_size}

def user_to_songs_util(if_str):
    """Map each user in a triplet file to their set of songs, largest first.

    Lines of ``if_str`` must hold exactly three tab-separated fields (user,
    song, and an ignored third field). The resulting dict is passed through
    ``sort_dict_dec_util``, so users are ordered by descending number of
    distinct songs.
    """
    history = {}
    with open(if_str, "r") as triplets:
        for raw in triplets:
            user, song, _ = raw.strip().split('\t')
            history.setdefault(user, set()).add(song)
    return sort_dict_dec_util(history)

def AP_mod(l_rec, user_given_songs, tau):
    """Average precision of one recommendation list.

    Unlike ``AP``, the whole of ``l_rec`` is scanned (it is not cut off at
    ``tau``); ``tau`` only caps the normalising denominator.

    Args:
        l_rec: ranked iterable of recommended songs, best first.
        user_given_songs: container of the songs considered relevant.
        tau: cap on the number of relevant songs used for normalisation.

    Returns:
        Sum of precision-at-hit over all hits, divided by
        ``min(len(user_given_songs), tau)``. Returns 0.0 when that
        denominator is not positive (no relevant songs, or ``tau <= 0``)
        instead of raising ``ZeroDivisionError``.
    """
    # Named n_relevant rather than `np` so the numpy alias is not shadowed.
    n_relevant = len(user_given_songs)
    denom = min(n_relevant, tau)
    if denom <= 0:
        return 0.0

    hits = 0.0
    ap = 0.0
    for rank, s in enumerate(l_rec, start=1):
        if s in user_given_songs:
            hits += 1.0
            ap += hits / rank  # precision at this hit's rank
    return ap / denom

def mAP_mod(first_500, l_rec_songs, u2s_testing, tau):
    """Mean of ``AP_mod`` over a list of users.

    Args:
        first_500: sequence of users to evaluate.
        l_rec_songs: mapping user -> ranked recommendation list.
        u2s_testing: mapping user -> container of relevant songs.
        tau: passed through to ``AP_mod``.

    Returns:
        Sum of the per-user scores divided by ``len(first_500)``. Users
        absent from ``u2s_testing`` contribute nothing to the sum but still
        count in the denominator. Returns 0.0 for an empty user list instead
        of raising ``ZeroDivisionError``.

    Raises:
        KeyError: if a user present in ``u2s_testing`` has no entry in
            ``l_rec_songs``.
    """
    n_users = len(first_500)
    if n_users == 0:
        return 0.0

    mapr = 0
    for user in first_500:
        if user not in u2s_testing:
            continue
        mapr += AP_mod(l_rec_songs[user], u2s_testing[user], tau)
    return mapr/n_users


In [51]:
#Predictors, Recommenders and mAP

def AP(l_rec, sMu, tau):
    """Average precision of one recommendation list, truncated at ``tau``.

    Args:
        l_rec: ranked iterable of recommended songs, best first.
        sMu: container of the songs considered relevant for the user.
        tau: only the first ``tau`` recommendations are inspected.

    Returns:
        Sum of precision-at-hit over the hits within the first ``tau``
        positions, divided by ``min(len(sMu), tau)``. Returns 0.0 when that
        denominator is not positive (no relevant songs, or ``tau <= 0``)
        instead of raising ``ZeroDivisionError``.
    """
    # Named n_relevant rather than `np` so the numpy alias is not shadowed.
    n_relevant = len(sMu)
    denom = min(n_relevant, tau)
    if denom <= 0:
        return 0.0

    hits = 0.0
    mapr_user = 0.0
    for j, s in enumerate(l_rec):
        if j >= tau:
            break
        if s in sMu:
            hits += 1.0
            mapr_user += hits/(j+1)  # precision at this hit's rank
    return mapr_user / denom


def mAP(l_users, l_rec_songs, u2s, tau):
    """Mean of ``AP`` over users, with recommendations given positionally.

    Args:
        l_users: sequence of users; ``l_users[i]`` owns ``l_rec_songs[i]``.
        l_rec_songs: iterable of ranked recommendation lists, aligned with
            ``l_users``.
        u2s: mapping user -> container of relevant songs.
        tau: truncation level passed through to ``AP``.

    Returns:
        Sum of the per-user scores divided by ``len(l_users)``. Users absent
        from ``u2s`` contribute nothing to the sum but still count in the
        denominator. Returns 0.0 for an empty user list instead of raising
        ``ZeroDivisionError``.
    """
    n_users = len(l_users)
    if n_users == 0:
        return 0.0

    mapr = 0
    for i, l_rec in enumerate(l_rec_songs):
        user = l_users[i]
        if user not in u2s:
            continue
        mapr += AP(l_rec, u2s[user], tau)
    return mapr/n_users



#Predictors
class Pred:
    """Base class for predictors; subclasses override ``Score``."""

    def __init__(self):
        """Nothing to initialise; subclasses call this explicitly."""
        return None

    def Score(self, user_songs,  all_songs):
        """Return a song -> score dict; the base implementation scores nothing."""
        return dict()

#ItemBased Predictor
class PredSI(Pred):
    """Item-based predictor: scores songs by user overlap with a user's songs.

    Attributes set by the constructor:
        s2u_tr: mapping song -> set of users.
        A: exponent that splits the normalisation between the two songs.
        Q: power applied to every pairwise similarity before summing.
    """

    def __init__(self, _s2u_tr, _A=0, _Q=1):
        Pred.__init__(self)
        self.s2u_tr, self.Q, self.A = _s2u_tr, _Q, _A

    def printAandQ(self):
        """Print the predictor's A and Q parameters."""
        print("PredSI(A=%f,Q=%f)" % (self.A, self.Q))

    def Match(self, song1, song2):
        """Similarity of two songs that are both keys of ``s2u_tr``.

        Computed as ``|U1 & U2| / (|U1|**A * |U2|**(1-A))`` where ``Ui`` is
        the user set of ``song_i``; 0.0 when the sets do not intersect.
        """
        users1 = self.s2u_tr[song1]
        users2 = self.s2u_tr[song2]
        shared = float(len(users1 & users2))
        if shared <= 0:
            return 0.0
        norm = math.pow(len(users1), self.A) * \
            math.pow(len(users2), (1.0 - self.A))
        return shared / norm

    def Score(self, user_songs,  all_songs):
        """Score every song in ``all_songs`` against the user's songs.

        A candidate's score is the sum over ``user_songs`` of
        ``Match(candidate, user_song) ** Q``. Songs on either side that are
        not keys of ``s2u_tr`` contribute nothing, so a candidate missing
        from ``s2u_tr`` scores 0.0.

        Returns:
            dict mapping each song of ``all_songs`` to its float score.
        """
        scores = {}
        for candidate in all_songs:
            total = 0.0
            if candidate in self.s2u_tr:
                for owned in user_songs:
                    if owned in self.s2u_tr:
                        similarity = self.Match(candidate, owned)
                        total += math.pow(similarity, self.Q)
            scores[candidate] = total
        return scores

#UserBased Similarity Model
class PredSU(Pred):
    """User-based predictor: scores songs via overlap with other users.

    Attributes set by the constructor:
        u2s_tr: mapping user -> set of songs.
        A: exponent that splits the normalisation between the two users.
        Q: power applied to every user-user weight.
    """

    def __init__(self, _u2s_tr, _A=0, _Q=1):
        Pred.__init__(self)
        self.u2s_tr, self.Q, self.A = _u2s_tr, _Q, _A

    def printAandQ(self):
        """Print the predictor's A and Q parameters."""
        print("PredSU(A=%f,Q=%f)" % (self.A, self.Q))

    def Score(self, user1_songs,  all_songs):
        """Score songs by summing the weights of the stored users who have them.

        For each stored user the weight is
        ``(|S1 & S2| / (|S1|**A * |S2|**(1-A))) ** Q`` where ``S1`` is
        ``user1_songs`` and ``S2`` that user's songs, or 0.0 when the two
        sets are disjoint. The weight is added to every song of that user,
        so songs of non-overlapping users still appear with score 0.0.
        ``all_songs`` is not used.

        Returns:
            dict mapping song -> accumulated float weight.
        """
        scores = {}
        for user2_songs in self.u2s_tr.values():
            weight = float(len(user2_songs & user1_songs))
            if weight > 0:
                norm = math.pow(len(user1_songs), self.A) * \
                    (math.pow(len(user2_songs), (1.0 - self.A)))
                weight = math.pow(weight / norm, self.Q)
            for song in user2_songs:
                scores[song] = scores[song] + weight if song in scores else weight
        return scores


def fl():
    """Flush standard output so progress messages show up immediately."""
    out = sys.stdout
    out.flush()
#Recommenders
class Reco:
    """Base recommender holding the candidate songs and a list of predictors.

    Attributes:
        predictors: predictors registered through ``Add``, in call order.
        all_songs: the candidate songs handed to the constructor.
        tau: number of songs to recommend; starts at 50.
    """

    def __init__(self, _all_songs):
        self.all_songs = _all_songs
        self.tau = 50
        self.predictors = list()

    def Add(self, p):
        """Register predictor ``p`` after those already added."""
        self.predictors += [p]

#Recommender to randomly recommend songs from the selected list.
class SReco(Reco):
    """Recommender that mixes its predictors' rankings at random.

    Attributes:
        Gamma: one selection probability per registered predictor, in the
            order the predictors were added. Must be set by the caller
            before recommending.
    """

    def __init__(self, _all_songs):
        Reco.__init__(self, _all_songs)
        self.Gamma = []

    def RandomIndex(self, n, distr):
        """Draw an index in ``range(n)`` with probabilities ``distr``.

        Falls back to index 0 when the draw is not covered by the first
        ``n`` probabilities (e.g. they sum to less than 1).
        """
        r = random.random()
        for i in range(n):
            if r < distr[i]:
                return i
            r -= distr[i]
        return 0

    def RandomRec(self, songs_sorted, distr):
        """Merge the per-predictor rankings into at most ``tau`` distinct songs.

        Args:
            songs_sorted: one ranked candidate list per predictor.
            distr: selection probability of each predictor.

        Returns:
            List of distinct songs. It holds fewer than ``tau`` entries when
            the candidate lists run out first; previously that situation
            raised ``IndexError``.
        """
        nPreds = len(self.predictors)
        r = []
        ii = [0]*nPreds  # next unread position in each predictor's list
        while len(r) < self.tau:
            remaining = [k for k in range(nPreds)
                         if ii[k] < len(songs_sorted[k])]
            if not remaining:
                # Every list is exhausted: return what has been collected.
                break
            pi = self.RandomIndex(nPreds, distr)
            if ii[pi] >= len(songs_sorted[pi]):
                # The drawn predictor has nothing left; use the first one
                # that does, so the loop always makes progress.
                pi = remaining[0]
            s = songs_sorted[pi][ii[pi]]
            if s not in r:
                r.append(s)
            ii[pi] += 1
        return r

    def RecommendToUser(self, user, u2s_v):
        """Recommend up to ``tau`` songs that ``user`` does not already have.

        Args:
            user: the user to recommend to.
            u2s_v: mapping user -> container of songs the user already has.
                A user missing from it is treated as having no songs and is
                served from ``all_songs`` in its given order; previously
                this case raised ``KeyError``.
        """
        user_known = user in u2s_v
        # Empty tuple: nothing gets filtered out for an unknown user.
        known_songs = u2s_v[user] if user_known else ()

        songs_sorted = []
        for p in self.predictors:
            if user_known:
                ssongs = sort_dict_dec(p.Score(known_songs, self.all_songs))
            else:
                ssongs = list(self.all_songs)

            # Keep the tau best candidates the user does not have yet.
            cleaned_songs = []
            for x in ssongs:
                if len(cleaned_songs) >= self.tau:
                    break
                if x not in known_songs:
                    cleaned_songs.append(x)

            songs_sorted.append(cleaned_songs)

        return self.RandomRec(songs_sorted, self.Gamma)

    def RecommendToUsers(self, l_users, u2s_v):
        """Run ``RecommendToUser`` for each user, printing progress.

        A progress line and the elapsed / per-user time are printed for
        every 10th user.

        Returns:
            List of recommendation lists, aligned with ``l_users``.
        """
        sti = time.time()
        rec4users = []
        for i, u in enumerate(l_users):
            report = not (i+1) % 10
            if report:
                if u in u2s_v:
                    print("%d] %s w/ %d songs " %
                          (i+1, l_users[i], len(u2s_v[u])))
                else:
                    print("%d] %s w/ 0 songs" % (i+1, l_users[i]))
            fl()
            rec4users.append(self.RecommendToUser(u, u2s_v))
            cti = time.time()-sti
            if report:
                print(" tot secs: %f (%f)" % (cti, cti/(i+1)))
                fl()
        return rec4users


In [52]:
#Storing Testing and Training Data
# The hidden triplets serve as the evaluation reference, the visible ones as
# the per-user history. Both dicts come back ordered by descending number of
# songs per user (see user_to_songs_util).
u2s_testing = user_to_songs_util('year1_test_triplets_hidden.txt')
u2s_training = user_to_songs_util('year1_test_triplets_visible.txt')

# Users in the order of u2s_training, and the first 500 of them.
users = list(u2s_training)
first_500 = users[0:500]


Song-Based Recommender System

In [8]:
# Song-based pipeline: build song -> users co-occurrence data, create an
# item-item similarity predictor and recommend songs to the first 100 entries
# of `users` (a global defined in another cell).
sys.stdout.flush()

# TRIPLETS
# NOTE(review): f_tr (used below to build the similarity data) is the *hidden*
# triplets file, the same file name the data-loading cell reads into
# u2s_testing, which the mAP cell evaluates against. Presumably this leaks
# the evaluation listens into the model -- confirm the intended split.
f_tr = "year1_test_triplets_hidden.txt"
f_tev = "year1_test_triplets_visible.txt"

print('loading users in %s'%"kaggle_users.txt")
sys.stdout.flush()
# NOTE(review): users_v is not read again in this cell; recommendations are
# produced for `users`, not users_v.
users_v = list(load_users("kaggle_users.txt"))

print('default ordering by popularity')
sys.stdout.flush()
# Songs ordered by descending number of lines they appear on in f_tr.
songs_ordered=sort_dict_dec(song_to_count(f_tr))

print("loading unique users indexes")
# Assign every distinct user of f_tr an integer index.
uu = unique_users(f_tr)
u2i = {}
for i,u in enumerate(uu):
    u2i[u]=i

print('song to users on %s'%f_tr)
s2u_tr=song_to_users(f_tr)

print("converting users to indices")
# Replace the user id strings in each song's user set by their integer index
# (presumably to reduce memory use; set sizes and overlaps are unchanged).
for s in s2u_tr:
    s_set = set()
    for u in s2u_tr[s]:
        s_set.add(u2i[u])
    s2u_tr[s]=s_set

del u2i

print('user to songs on %s'%f_tev) 
# u2s_ev: each user's songs from the visible file; passed to the recommender.
u2s_ev=user_to_songs(f_tev)     
# NOTE(review): u2s_tr is not read again in this cell.
u2s_tr=user_to_songs(f_tr)     

print('Creating predictor..')

# Predictor parameters (PredSI's A and Q) and the recommendation list length.
alpha = 0.15
q_value = 3
NumOfRecommendations = 30

pr=PredSI(s2u_tr, alpha, q_value)       #Song Based Prediction
print('Item-Item Similarity Based Predictor Created.')


print('Creating recommender..')
cp = SReco(songs_ordered)
cp.Add(pr)
# A single predictor, always selected.
cp.Gamma=[1.0]
cp.tau = NumOfRecommendations
# r[i] is the recommendation list for users[i].
r=cp.RecommendToUsers(users[:100],u2s_ev)

loading users in kaggle_users.txt
default ordering by popularity
loading unique users indexes
song to users on year1_test_triplets_hidden.txt
converting users to indices
user to songs on year1_test_triplets_visible.txt
Creating predictor..
Item-Item Similarity Based Predictor Created.
Creating recommender..
10] 992fea335363e87217e1b052fd7380b3fd7e35b4 w/ 51 songs 
 tot secs: 272.768773 (27.276877)
20] 7a1da2cfab87398a137441a85aee84793a376302 w/ 49 songs 
 tot secs: 495.571044 (24.778552)
30] 597c85c79b8dde655ec3fb8a7c41e3427d7f716f w/ 48 songs 
 tot secs: 727.916728 (24.263891)
40] dfadfaa39598cf760125439346d903ea0467e7fa w/ 48 songs 
 tot secs: 965.832902 (24.145823)
50] 3233c598c50f0ddbd351504e773cd51de79691db w/ 47 songs 
 tot secs: 1217.400358 (24.348007)
60] 970fafbf218e788e97784de3d681f7486fe5376a w/ 47 songs 
 tot secs: 1439.621354 (23.993689)
70] 126a1a844437af98ac07350e1e3dbec9681d5be4 w/ 46 songs 
 tot secs: 1741.685360 (24.881219)
80] c51db436e58a0d44d8d6c171182254eaf0d136a4

In [45]:
# Key each song-based recommendation list by its user. This relies on r[i]
# belonging to users[i], i.e. on `r` having been produced from a prefix of
# `users` in the same order.
l_rec_songs_dic = dict()
for i,rlist in enumerate(r):
    l_rec_songs_dic[users[i]] = rlist

In [48]:
# Load the track listing; fields are separated by the literal text '<SEP>'.
# A multi-character separator is only supported by pandas' python parser, so
# the engine is named explicitly instead of relying on the fallback, which
# emits a ParserWarning.
temp = pd.read_csv(
    'unique_tracks.txt', sep='<SEP>', header=None, engine='python')
temp.columns = ['track_id', 'song_id', 'artist_name', 'song_title']

# song_id -> song_title lookup. When a song_id occurs on several rows the
# last row wins, exactly as with a row-by-row assignment loop.
song_id_to_title = dict(zip(temp['song_id'], temp['song_title']))

  temp = pd.read_csv(


In [55]:
#3 Recommendations for all the users using SongBased Similarity
# Show the titles of each user's three highest-ranked recommendations.
for user, top_songs in l_rec_songs_dic.items():
    print(f'Songs for User[{user}]:')
    for rsong in top_songs[:3]:
        print(song_id_to_title[rsong])

Songs for User[7d90be8dfdbde170f036ce8a4b915440137cb11c]:
One More Reason
Ne-Ne Na-Na Na-Na Nu-Nu
Polish Girls
Songs for User[016a24e91a72c159a5048ab1b9b2ba5ce761b526]:
Chic tu chic
The Black Crow Knows
Black Cherry
Songs for User[03ad93fdb01506ce205f4708decf8e4b1ae90fff]:
Due Mondi
Quello Che Le Donne Non Dicono
Gli Angeli (Italian Mainstream Mix)
Songs for User[0f8308935bcbb9a1e04ebb7c4d41c037e5f23b90]:
Slow Crows Over
Houria
Real Life (feat. B Real)
Songs for User[2e424b28bff1f62a2dae22a918f09f9c30c06d1b]:
Thousand Mile Wish
A Little Miracle
Spank Me Baby
Songs for User[d30e18323f15426c3cdc8585252ed34459916f51]:
Coldest Night Of The Year
If She's Near
Room 108
Songs for User[316110734d8da7478cc33237458814f770a9eb7a]:
Promise (featuring Chris Cornell)
Please Play This Song On The Radio
You Will Lose Faith
Songs for User[7e27789eae69bc946c51833c0b833a49c58ed9ed]:
Physical Overdrive
On A Saturday Nite
Drip
Songs for User[9769ebffc543fa42b58daf6f52f8816abf5f408e]:
Te Hablaré
Como Se O M

In [56]:
# Mean average precision of the song-based recommendations for the first 100
# users, scored against u2s_testing.
# NOTE(review): 100 and 30 repeat the user slice and NumOfRecommendations used
# when the recommendations were generated; keep them in sync.
mAP_SongBased = mAP_mod(users[:100],l_rec_songs_dic,u2s_testing,30)

In [18]:
# Display the song-based mAP score.
print(mAP_SongBased)

0.041726901853522964


User-Based Model

In [26]:
# User-based pipeline: create a user-user similarity predictor, recommend
# songs to the first 100 entries of `users`, save the result and print its
# mAP. `users`, `u2s_training` and `u2s_testing` are globals from another cell.
sys.stdout.flush()

# TRIPLETS
f_tr = "year1_test_triplets_hidden.txt"
f_tev = "year1_test_triplets_visible.txt"

print('loading users in %s' % "kaggle_users.txt")
sys.stdout.flush()
# NOTE(review): users_v is not read again in this cell.
users_v = list(load_users("kaggle_users.txt"))

print('default ordering by popularity')
sys.stdout.flush()
# Songs ordered by descending number of lines they appear on in f_tr; used as
# the recommender's candidate list.
songs_ordered = sort_dict_dec(song_to_count(f_tr))

print("loading unique users indexes")
# Assign every distinct user of f_tr an integer index.
uu = unique_users(f_tr)
u2i = {}
for i, u in enumerate(uu):
    u2i[u] = i

print('song to users on %s' % f_tr)
# NOTE(review): s2u_tr (and the index conversion below) is not used by the
# user-based predictor created in this cell, which only receives
# u2s_training. This block looks carried over from the song-based cell --
# confirm before removing.
s2u_tr = song_to_users(f_tr)

print("converting users to indices")
for s in s2u_tr:
    s_set = set()
    for u in s2u_tr[s]:
        s_set.add(u2i[u])
    s2u_tr[s] = s_set

del u2i

print('user to songs on %s' % f_tev)
# u2s_ev: each user's songs from the visible file; passed to the recommender.
u2s_ev = user_to_songs(f_tev)
# NOTE(review): u2s_tr is not read again in this cell.
u2s_tr = user_to_songs(f_tr)

print('Creating predictor..')

# Predictor parameters (PredSU's A and Q) and the recommendation list length.
alpha = 0.15
q_value = 3
NumOfRecommendations = 30

UB_pred = PredSU(u2s_training, alpha, q_value)  # User Based Prediction
print('User-User Similarity Based Predictor Created.')


print('Creating recommender..')
UB_Reco = SReco(songs_ordered)
UB_Reco.Add(UB_pred)
# A single predictor, always selected.
UB_Reco.Gamma = [1.0]
UB_Reco.tau = NumOfRecommendations
# r2[i] is the recommendation list for users[i].
r2 = UB_Reco.RecommendToUsers(users[:100], u2s_ev)
print(len(r2))
save_recommendations(r2, "kaggle_songs.txt", "UserBasedResults.txt")

# Key each recommendation list by its user for the mAP computation.
l_rec_songs_dic2 = dict()
for i, rlist in enumerate(r2):
    l_rec_songs_dic2[users[i]] = rlist

# NOTE(review): the literal 30 repeats NumOfRecommendations; keep in sync.
print(mAP_mod(users[:100],l_rec_songs_dic2,u2s_testing,30))

loading users in kaggle_users.txt
default ordering by popularity
loading unique users indexes
song to users on year1_test_triplets_hidden.txt
converting users to indices
user to songs on year1_test_triplets_visible.txt
Creating predictor..
User-User Similarity Based Predictor Created.
Creating recommender..
10] 992fea335363e87217e1b052fd7380b3fd7e35b4 w/ 51 songs 
 tot secs: 6.640214 (0.664021)
20] 7a1da2cfab87398a137441a85aee84793a376302 w/ 49 songs 
 tot secs: 13.126846 (0.656342)
30] 597c85c79b8dde655ec3fb8a7c41e3427d7f716f w/ 48 songs 
 tot secs: 19.547867 (0.651596)
40] dfadfaa39598cf760125439346d903ea0467e7fa w/ 48 songs 
 tot secs: 25.867864 (0.646697)
50] 3233c598c50f0ddbd351504e773cd51de79691db w/ 47 songs 
 tot secs: 32.110866 (0.642217)
60] 970fafbf218e788e97784de3d681f7486fe5376a w/ 47 songs 
 tot secs: 38.516567 (0.641943)
70] 126a1a844437af98ac07350e1e3dbec9681d5be4 w/ 46 songs 
 tot secs: 44.860546 (0.640865)
80] c51db436e58a0d44d8d6c171182254eaf0d136a4 w/ 46 songs 
 tot

In [57]:
#3 Recommendations for all the users using UserBased Similarity
# Show the titles of each user's three highest-ranked recommendations.
for user, top_songs in l_rec_songs_dic2.items():
    print(f'Songs for User[{user}]:')
    for rsong in top_songs[:3]:
        print(song_id_to_title[rsong])


Songs for User[7d90be8dfdbde170f036ce8a4b915440137cb11c]:
Vim (Album Version)
O.N.E. (XXXChange Remix)
(The Symphony Of) Blase'
Songs for User[016a24e91a72c159a5048ab1b9b2ba5ce761b526]:
Sehr kosmisch
Undo
Secrets
Songs for User[03ad93fdb01506ce205f4708decf8e4b1ae90fff]:
Undo
You're The One
Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile)
Songs for User[0f8308935bcbb9a1e04ebb7c4d41c037e5f23b90]:
Undo
You're The One
Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile)
Songs for User[2e424b28bff1f62a2dae22a918f09f9c30c06d1b]:
Catch You Baby (Steve Pitron & Max Sanna Radio Edit)
I Get Joy
The Invisible Man
Songs for User[d30e18323f15426c3cdc8585252ed34459916f51]:
You're The One
Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile)
Revelry
Songs for User[316110734d8da7478cc33237458814f770a9eb7a]:
Undo
Sehr kosmisch
Revelry
Songs for User[7e27789eae69bc946c51833c0b833a49c58ed9ed]:
Jezebel
Hang On To Your Love
Your Love Is King
Songs for User[97