# ABOUT

Million Musical Tweets Dataset: http://www.cp.jku.at/datasets/MMTD/

Music Brainz: https://musicbrainz.org/doc/MusicBrainz_Database/Download

Number of unique artists: 24673

Number of unique users: 214741

Number of unique tweets: 1074713

Number of unique tracks: 133228

ALS Implicit Collaborative Filtering: https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe

Artist Gender: 
1 = Male
2 = Female
3 = Group 
4 = Other 
5 = No Info

Artist Type: 
1 = Person 
2 = Group
3 = Other 
4 = Character 
5 = Choir 
6 = No Info

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
import random
import math
import implicit
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree
from statistics import mean

In [2]:
path = "/Users/MAC/Desktop/MIP 2019/Code/"
data = pd.read_csv(path+"data.csv", header = 0, sep=',', low_memory=False)
artist_gender = pd.read_csv(path+"gender_bias.csv", header = 0, sep=',')
artist_type = pd.read_csv(path + "artist_type.csv", header = 0, sep=',')
ttime = pd.read_csv(path+"ttime.csv", header = 0, sep=',')

In [3]:
#----------------------------------
# BUILDING 2D RECOMMENDER SYSTEM 
#----------------------------------
alpha_val = 15
users = list(np.sort(data.user_id.unique()))
artists = list(data.artists_name.unique())
songs = list(data.track_title.unique())

data['usercatcode'] = data['user_id'].astype("category").cat.codes
data['artistcatcode'] = data['artist_id'].astype("category").cat.codes
data['ttime'] = data['tweet_time'].astype("category").cat.codes
data['songcatcode'] = data.track_title.astype('category', songs).cat.codes 
# Build train and test dataset
train, test = train_test_split(data, test_size=0.4)

In [4]:
#----------------------------------
# ARTISTS RECOMMENDATION MODEL 
#----------------------------------
artist_train = train.groupby(['user_id', 'artists_name', 'artistcatcode', 'usercatcode'])['tweet_id'].count().reset_index(name='tweet_count')
artist_data = data.groupby(['user_id', 'artists_name', 'artistcatcode', 'usercatcode'])['tweet_id'].count().reset_index(name='tweet_count')

# Create sparse matrices
sparse_artist_user = sparse.csr_matrix((artist_train['tweet_count'], (artist_train['artistcatcode'], artist_train['usercatcode'])))
sparse_user_artist = sparse.csr_matrix((artist_data['tweet_count'], (artist_data['usercatcode'], artist_data['artistcatcode'])))

# Calculate the confidence by multiplying it by our alpha value.
userartist_conf = (sparse_artist_user * alpha_val).astype('double')

# Initialize the als model_artist and fit it using the sparse artist-user matrix
model_artist = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)
model_artist.fit(userartist_conf) 

# Caculate the sparsity of an item-user matrix
def sparsity(matrix):
    matrix_size = matrix.shape[0]*matrix.shape[1] # Number of possible interactions in the matrix
    num_purchases = len(matrix.nonzero()[0]) # Number of items interacted with
    sparsity = 100*(1 - (num_purchases/matrix_size))
    return sparsity

100%|██████████| 50.0/50 [00:28<00:00,  1.05s/it]


In [5]:
train_neighbor = pd.merge(train, ttime, on='user_id').drop(['tweet_id', 'track_id', 'track_title', 'artist_id', 'artists_name', 'artist_type', 'artist_area', 'artist_gender','tweet_month', 'tweet_year', 'tweet_time', 'tweet_day', 'usercatcode','artistcatcode', 'ttime', 'songcatcode', 'Total'], 1).drop_duplicates().set_index('user_id')
test_neighbor = ttime.drop('Total',1).drop_duplicates().set_index('user_id')
tree = KDTree(train_neighbor.values, leaf_size=80)
dist, ind = tree.query(test_neighbor, k = 20)

In [36]:
#-----------------------------------------------
# IMPLICIT COLLABORATIVE FILTERING SYSTEM
#-----------------------------------------------

# FIND SIMILAR ARTISTS

def similar_artists_func(artist, N = 10): 
    testartist = data.artistcatcode.loc[data.artists_name == artist].iloc[0]

    # Use implicit to get similar items.
    similar = model_artist.similar_items(testartist, N)
    
    # Print the names of our most similar artists
    print('{} similar artists to {}:\n'.format(N, artist))
    max_similarity = similar[0][1]
    for i in range(N):
        idx, score = similar[i]
        percentage = (score / max_similarity) * 100
        name = data.artists_name.loc[data.artistcatcode == idx].iloc[0]
        print(i+1, name, "- {:.2f}%".format(percentage))

# CREATE RECOMMENDATIONS FOR USER
       
def cf_recsys(user_id, N = 15):
    testuser = data.usercatcode.loc[data.user_id == user_id].iloc[0]
    recommended = model_artist.recommend(testuser, sparse_user_artist, N)
    recommendation = dict()
    for idx, score in recommended:
        aname = data.artists_name.loc[data.artistcatcode == idx].iloc[0]
        recommendation[aname] = score.round(3)
    recommendation = sorted(recommendation.items(), key=lambda x: x[1], reverse=True)
    return recommendation

In [7]:
#------------------
# HYBRID SYSTEMS
#------------------

def hybrid_recsys(user_id, N=15):
        # Create recommendations for user with user_id 161262801
    testuser = data.usercatcode.loc[data.user_id == user_id].iloc[0]
        # Use the implicit recommender.
    recommended = model_artist.recommend(testuser, sparse_user_artist, N)
        # Get artist names from ids
    recommendation = dict()
    for idx, score in recommended:
            #get artist name
        aname = data.artists_name.loc[data.artistcatcode == idx].iloc[0]
            #get gender bias weight for that artist
        agender = data.artist_gender.loc[data.artistcatcode == idx].iloc[0]
        ag_bias = artist_gender[agender].loc[artist_gender.user_id == user_id].iloc[0]
            #get type bias weight for that artist
        atype = data.artist_type.loc[data.artistcatcode == idx].iloc[0]
        atype_bias = artist_type[atype].loc[artist_type.user_id == user_id].iloc[0]
            #update the final score
        fscore = score*(1 + ag_bias + atype_bias)
            #append results to lists
        recommendation[aname] = fscore.round(3)
            #print out the results
    recommendation = sorted(recommendation.items(), key=lambda x: x[1], reverse=True)[:N]
    return recommendation

In [8]:
#--------------------------
# CONTEXT-AWARE SYSTEMS
#--------------------------
def context_recsys(user_id, N=15):
        #build neighborlist
    ix = test_neighbor.index.get_loc(user_id)
    n_id = test_neighbor.index[ind[ix]].values
    sim_score = 1 - dist[ix]
    neighborlist = list(zip(n_id, sim_score))
        #build recommendation for each neighbor
    recommendation = dict()
    a_counter = dict()
    for neighbor, sim in neighborlist:
        hybrid = hybrid_recsys(neighbor)
        for artist, score in hybrid: 
            if neighbor == user_id: 
                fscore = score * 3
            else: 
                fscore = score * (1 + sim)
            if artist in recommendation.keys():
                a_counter[artist] =+ 1
                recommendation[artist] = max(recommendation[artist], fscore)
            else: 
                a_counter[artist] = 1
                recommendation[artist] = fscore.round(3)
    for artist in recommendation.keys(): 
        recommendation[artist] = recommendation[artist] + math.log(a_counter[artist])
    recommendation = sorted(recommendation.items(), key=lambda x: x[1], reverse=True)[:N]
    return recommendation 

In [9]:
#------------------
# EVALUATION
#------------------
def measure(user, system):
    recommend_artist = [i[0] for i in system(user)]
    real_artist = set(data[data['user_id'] == user]['artists_name'])
    match = real_artist.intersection(recommend_artist)
    precision = len(match)/len(recommend_artist) 
    recall = len(match)/len(real_artist)
    if precision + recall == 0:
        f1_score = 0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)
    return [precision, recall, f1_score]

def evaluate(user):
    d = dict()
    d['user_id'] = user
    d['CF System'] = measure(user, cf_recsys)
    d['Hybrid System'] = measure(user, hybrid_recsys)
    d['Context System'] = measure(user, context_recsys)
    return d

In [18]:
for u in train.user_id.unique():
    if len(train[train.user_id == u].artist_id) >= 15:
        res.append(evaluate(u))

In [19]:
f_context =[]
f_hybrid = []
f_cf = []
p_context = []
p_hybrid = []
p_cf = []
r_context = []
r_hybrid = []
r_cf = []
for item in res:
    p_cf.append(item['CF System'][0])
    p_hybrid.append(item['Hybrid System'][0])
    p_context.append(item['Context System'][0])
    r_cf.append(item['CF System'][1])
    r_hybrid.append(item['Hybrid System'][1])
    r_context.append(item['Context System'][1])
    f_cf.append(item['CF System'][2])
    f_hybrid.append(item['Hybrid System'][2])
    f_context.append(item['Context System'][2])

In [20]:
reslst = [p_cf, p_hybrid, p_context, r_cf, r_hybrid, r_context, f_cf, f_hybrid, f_context]
finalresult = {}
finalresult['Precision'] = [mean(p_cf), mean(p_hybrid), mean(p_context)]
finalresult['Recall'] = [mean(r_cf), mean(r_hybrid), mean(r_context)]
finalresult['F-measure'] = [mean(f_cf), mean(f_hybrid), mean(f_context)]
finalresult = pd.DataFrame.from_dict(finalresult, orient='index')
finalresult.columns=['CF System', 'Hybrid System', 'Context System']

In [21]:
finalresult

Unnamed: 0,CF System,Hybrid System,Context System
Precision,0.003163,0.003163,0.106881
Recall,0.001887,0.001887,0.0604
F-measure,0.002242,0.002242,0.071867


In [40]:
similar_artists_func('Katy Perry', 15)

15 similar artists to Katy Perry:

1 Katy Perry - 100.00%
2 Tsukasa - 96.65%
3 Juliette Lewis - 95.92%
4 Mikroboy - 95.92%
5 Gene Kelly - 95.92%
6 Super Danger Casper - 95.84%
7 Jahanam - 89.88%
8 Deutschland sucht den Superstar - 88.80%
9 Keremcem - 80.83%
10 Esmeray - 77.00%
11 Simone - 73.45%
12 Milton Banana Trio - 72.11%
13 Jessie J - 71.95%
14 Annie Little - 71.89%
15 Frank Joshua - 70.78%
