# ABOUT

Million Musical Tweets Dataset: http://www.cp.jku.at/datasets/MMTD/

Music Brainz: https://musicbrainz.org/doc/MusicBrainz_Database/Download

Number of unique artists: 24673

Number of unique users: 214741

Number of unique tweets: 1074713

Number of unique tracks: 133228

ALS Implicit Collaborative Filtering: https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe

Artist Gender: 
1 = Male
2 = Female
3 = Group 
4 = Other 
5 = No Info

Artist Type: 
1 = Person 
2 = Group
3 = Other 
4 = Character 
5 = Choir 
6 = No Info

In [1]:
import numpy as np
import pandas as pd
import sys
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn.preprocessing import MinMaxScaler
import implicit
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("/Users/MAC/Desktop/MIP 2019/Code/data.csv", header = 0, sep=',', low_memory=False)
artist_gender = pd.read_csv("/Users/MAC/Desktop/MIP 2019/Code/gender_bias.csv", header = 0, sep=',')
artist_type = pd.read_csv("/Users/MAC/Desktop/MIP 2019/Code/artist_type.csv", header = 0, sep=',')
ttime = pd.read_csv("/Users/MAC/Desktop/MIP 2019/Code/ttime.csv", header = 0, sep=',')

In [3]:
#### BUILDING 2D RECOMMENDER SYSTEM ###
alpha_val = 15
users = list(np.sort(data.user_id.unique()))
artists = list(data.artists_name.unique())
songs = list(data.track_title.unique())

data['usercatcode'] = data['user_id'].astype("category").cat.codes
data['artistcatcode'] = data['artist_id'].astype("category").cat.codes
data['ttime'] = data['tweet_time'].astype("category").cat.codes
data['songcatcode'] = data.track_title.astype('category', songs).cat.codes 
# Build train and test dataset
train, test = train_test_split(data, test_size=0.2)

In [4]:
#----------------------------------
# ARTISTS RECOMMENDATION MODEL 
#----------------------------------
artist_train = train.groupby(['user_id', 'artists_name', 'artistcatcode', 'usercatcode'])['tweet_id'].count().reset_index(name='tweet_count')
artist_data = data.groupby(['user_id', 'artists_name', 'artistcatcode', 'usercatcode'])['tweet_id'].count().reset_index(name='tweet_count')

# Create sparse matrices
sparse_artist_user = sparse.csr_matrix((artist_train['tweet_count'], (artist_train['artistcatcode'], artist_train['usercatcode'])))
sparse_user_artist = sparse.csr_matrix((artist_data['tweet_count'], (artist_data['usercatcode'], artist_data['artistcatcode'])))

# Calculate the confidence by multiplying it by our alpha value.
userartist_conf = (sparse_artist_user * alpha_val).astype('double')

# Initialize the als model_artist and fit it using the sparse artist-user matrix
model_artist = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)
model_artist.fit(userartist_conf) 

100%|██████████| 50.0/50 [00:42<00:00,  1.46it/s]


In [10]:
# Caculate the sparsity of an item-user matrix
def sparsity(matrix):
    matrix_size = matrix.shape[0]*matrix.shape[1] # Number of possible interactions in the matrix
    num_purchases = len(matrix.nonzero()[0]) # Number of items interacted with
    sparsity = 100*(1 - (num_purchases/matrix_size))
    return sparsity

In [11]:
#-----------------------------------------------
# FIND SIMILAR ARTISTS
#-----------------------------------------------
def similar_artists_func(name=random.choice(artists), n_similar = 10): 
    testartist = data.artistcatcode.loc[data.artists_name == name].iloc[0]

    # Use implicit to get similar items.
    similar = model_artist.similar_items(testartist, n_similar)
    
    # Print the names of our most similar artists
    print('{} similar artists to {}:'.format(n_similar, name))
    max_similarity = similar[0][1]
    for item in similar:
        idx, score = item
        percentage = (score / max_similarity) * 100
        name = data.artists_name.loc[data.artistcatcode == idx].iloc[0]
        print(name, "- with a similarity of {:.2f}%".format(percentage))

In [12]:
#-----------------------------------------------
# CREATE RECOMMENDATIONS FOR USER
#-----------------------------------------------
def recommend_artists_func(user_id = random.choice(users), N=10):
        # Create recommendations for user with user_id 161262801
    testuser = data.usercatcode.loc[data.user_id == user_id].iloc[0]
        # Use the implicit recommender.
    recommended = model_artist.recommend(testuser, sparse_user_artist, len(artists))
        # Get artist names from ids
    recommend_artist = []
    artist_score = []
    for idx, score in recommended:
            #get artist name
        aname = data.artists_name.loc[data.artistcatcode == idx].iloc[0]
            #get gender bias weight for that artist
        agender = data.artist_gender.loc[data.artistcatcode == idx].iloc[0]
        ag_bias = artist_gender[agender].loc[artist_gender.user_id == user_id].iloc[0]
            #get type bias weight for that artist
        atype = data.artist_type.loc[data.artistcatcode == idx].iloc[0]
        atype_bias = artist_type[atype].loc[artist_type.user_id == user_id].iloc[0]
            #update the final score
        score = score*(1 + ag_bias + atype_bias)
            #append results to lists
        recommend_artist.append(aname)
        artist_score.append(score.round(2))
            #print out the results
    print('{} artists recommendations for user {}:'.format(N, user_id), '\n')
    for i in range(N):
        print(i+1, recommend_artist[i], artist_score[i])

In [13]:
# FOR COMPARISON: 
def recommend_artists_func1(user_id = random.choice(users), N = 10):
    testuser = data.usercatcode.loc[data.user_id == user_id].iloc[0]
    recommended = model_artist.recommend(testuser, sparse_user_artist)
    recommend_artist = []
    artist_score = []
    for idx, score in recommended:
        aname = data.artists_name.loc[data.artistcatcode == idx].iloc[0]
        recommend_artist.append(aname)
        artist_score.append(score.round(2))
    print('{} artists recommendations for user {}:'.format(N, user_id), '\n')
    for i in range(N):
        print(i+1, recommend_artist[i], artist_score[i])

In [14]:
testuser = random.choice(test.user_id.unique())
recommend_artists_func(testuser)
print('Results before adding weight', '\n')
recommend_artists_func1(testuser)

10 artists recommendations for user 200110988: 

1 Rihanna 1.32
2 Lady Gaga 1.25
3 Adele 1.24
4 Maroon 5 0.41
5 Bruno Mars 0.81
6 Jessie J 1.18
7 Britney Spears 0.81
8 Taylor Swift 0.81
9 Paramore 0.25
10 Kelly Clarkson 0.75
Results before adding weight 

10 artists recommendations for user 200110988: 

1 Rihanna 0.44
2 Lady Gaga 0.42
3 Adele 0.41
4 Maroon 5 0.41
5 Bruno Mars 0.41
6 Jessie J 0.39
7 Britney Spears 0.27
8 Taylor Swift 0.27
9 Paramore 0.25
10 Kelly Clarkson 0.25


In [15]:
data.loc[data.user_id == testuser]

Unnamed: 0,tweet_id,user_id,track_id,track_title,artist_id,artists_name,artist_type,artist_area,artist_gender,tweet_month,tweet_year,tweet_time,tweet_day,usercatcode,artistcatcode,ttime,songcatcode
174659,139727322549981184,200110988,4098232,The One That Got Away,228395,Katy Perry,1,222,2,11,2011,Afternoon,weekday,96800,11418,0,77798
921060,261746215492915200,200110988,10477333,Crying Out Your Name,255764,Loreen,1,202,2,10,2012,Morning,weekday,96800,12839,2,16310
