# About my dataset: two major datasets merged into a single file, mydata.txt

Million Musical Tweets Dataset: http://www.cp.jku.at/datasets/MMTD/

MusicBrainz: https://musicbrainz.org/doc/MusicBrainz_Database/Download

Number of unique artists: 24673

Number of unique users: 214741

Number of unique tweets: 1074713

Number of unique tracks: 133228

ALS Implicit Collaborative Filtering: https://medium.com/radon-dev/als-implicit-collaborative-filtering-5ed653ba39fe

In [2]:
import numpy as np
import pandas as pd
import sys
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn.preprocessing import MinMaxScaler
import implicit
from sklearn.model_selection import train_test_split

In [3]:
# Read the merged tweet/track table (comma-separated, first row is the header)
# and discard exact duplicate rows in the same step.
raw_path = "/Users/MAC/Desktop/MIP 2019/Code/mydata.txt"
data = pd.read_csv(raw_path, sep=',', header=0, low_memory=False).drop_duplicates()

In [4]:
#### BUILDING 2D RECOMMENDER SYSTEM ###
# Scaling factor applied to the raw tweet counts to obtain confidence values.
alpha_val = 15

# Lookup lists; later cells draw random default artists/songs/users from them.
users = list(np.sort(data.user_id.unique()))
artists = list(data.artists_name.unique())
songs = list(data.track_title.unique())

# Dense integer codes, used as row/column indices of the sparse matrices.
data['usercatcode'] = data['user_id'].astype("category").cat.codes
data['artistcatcode'] = data['artist_id'].astype("category").cat.codes
data['ttime'] = data['tweet_time'].astype("category").cat.codes
# The second positional parameter of `Series.astype` is `copy`, not a category
# list, so passing `songs` there never influenced the codes and may be rejected
# by newer pandas versions. The plain categorical cast is what was effectively
# computed.
data['songcatcode'] = data['track_title'].astype('category').cat.codes

# Build train and test dataset.
# A fixed seed keeps the split identical across "Restart & Run All".
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [7]:
#----------------------------------
# ARTISTS RECOMMENDATION MODEL
#----------------------------------
# Tweets per (user, artist) pair: once on the training split, once on all data.
artist_keys = ['user_id', 'artists_name', 'artistcatcode', 'usercatcode']
artist_train = train.groupby(artist_keys)['tweet_id'].count().reset_index(name='tweet_count')
artist_data = data.groupby(artist_keys)['tweet_id'].count().reset_index(name='tweet_count')

# The category codes were assigned on the full dataset, so the matrix
# dimensions must come from the full dataset too. Without an explicit shape,
# scipy infers it from the largest code present, and the training matrix can
# end up smaller than the full one when the highest-coded user/artist falls
# entirely into the test split.
n_users_total = int(data['usercatcode'].max()) + 1
n_artists_total = int(data['artistcatcode'].max()) + 1

# Create sparse matrices
# artist x user counts from the training split (what the model is fitted on)
sparse_artist_user = sparse.csr_matrix(
    (artist_train['tweet_count'], (artist_train['artistcatcode'], artist_train['usercatcode'])),
    shape=(n_artists_total, n_users_total),
)
# user x artist counts from the full data (passed to `recommend`)
sparse_user_artist = sparse.csr_matrix(
    (artist_data['tweet_count'], (artist_data['usercatcode'], artist_data['artistcatcode'])),
    shape=(n_users_total, n_artists_total),
)

# Calculate the confidence by multiplying it by our alpha value.
userartist_conf = (sparse_artist_user * alpha_val).astype('double')

# Initialize the als model_artist and fit it using the sparse artist-user matrix
model_artist = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)
model_artist.fit(userartist_conf)

100%|██████████| 50.0/50 [00:27<00:00,  2.08it/s]


In [8]:
#---------------------
# FIND SIMILAR ARTISTS
#---------------------
n_similar = 20
# Find the n most similar to an artist

def similar_artists_func(name=None):
    """Print the `n_similar` artists closest to `name` according to `model_artist`.

    Parameters
    ----------
    name : str, optional
        Artist name as it appears in `data.artists_name`. When omitted, a
        random artist is drawn from `artists` on every call.

    Raises
    ------
    IndexError
        If `name` does not occur in `data.artists_name`.
    """
    # A default of `random.choice(artists)` in the signature is evaluated only
    # once, when the function is defined, so every argument-less call would
    # reuse the same artist. Drawing here gives a fresh one per call.
    if name is None:
        name = random.choice(artists)

    testartist = data.artistcatcode.loc[data.artists_name == name].iloc[0]

    # Use implicit to get similar items as (artist code, score) pairs.
    similar = model_artist.similar_items(testartist, n_similar)

    # Print the names of our most similar artists
    print('{} similar artists to {}:'.format(n_similar, name))
    # Scores are reported relative to the first returned entry (= 100%).
    max_similarity = similar[0][1]
    for idx, score in similar:
        percentage = (score / max_similarity) * 100
        print(data.artists_name.loc[data.artistcatcode == idx].iloc[0], "- with a similarity of {:.2f}%".format(percentage))

#-----------------------------------------------
# CREATE RECOMMENDATIONS FOR USER
#-----------------------------------------------

def recommend_artists_func(user_id):
    """Print the artists `model_artist` recommends for one user.

    Parameters
    ----------
    user_id
        A value from `data.user_id`.

    Raises
    ------
    IndexError
        If `user_id` does not occur in `data.user_id`.
    """
    # Translate the raw user id into the matrix row index.
    testuser = data.usercatcode.loc[data.user_id == user_id].iloc[0]
    # Use the implicit recommender; it yields (artist code, score) pairs.
    recommended = model_artist.recommend(testuser, sparse_user_artist)
    # Get artist names from ids
    recommend_artist = []
    artist_score = []
    for idx, score in recommended:
        recommend_artist.append(data.artists_name.loc[data.artistcatcode == idx].iloc[0])
        artist_score.append(score * 100)
    # Report the number of recommendations actually returned; the global
    # `n_similar` belongs to the similarity search and did not match it.
    print('{} artists recommendations for user {}: \n{}'.format(len(recommend_artist), user_id, recommend_artist), '\n')
    # Iterate over what was returned instead of a fixed range(10), which raised
    # IndexError whenever fewer than ten recommendations were available.
    for rank, (artist, pct) in enumerate(zip(recommend_artist, artist_score), start=1):
        print('{}. {}, with a score of {}%'.format(rank, artist, round(float(pct), 2)))

In [12]:
# Sanity check: list the neighbours of one specific, hand-picked artist.
seed_artist = 'Avenged Sevenfold'
similar_artists_func(seed_artist)

20 similar artists to Avenged Sevenfold:
Avenged Sevenfold - with a similarity of 100.00%
Fake ID - with a similarity of 96.05%
Amnesys - with a similarity of 94.03%
Slipknot - with a similarity of 85.35%
skankfunk - with a similarity of 83.82%
Edinéia Macedo - with a similarity of 83.79%
Juno Reactor - with a similarity of 80.29%
Inside Out - with a similarity of 77.95%
she - with a similarity of 76.93%
Vida - with a similarity of 74.51%
Bullet for My Valentine - with a similarity of 73.76%
Falak - with a similarity of 73.59%
System of a Down - with a similarity of 72.65%
Dark Lotus - with a similarity of 72.36%
Jay Gordon - with a similarity of 72.36%
Forgotten - with a similarity of 71.55%
Aereogramme - with a similarity of 71.51%
Dim Chris - with a similarity of 71.18%
Erase the Grey - with a similarity of 69.65%
Bass-T - with a similarity of 68.85%


In [9]:
# Show artist recommendations for one user drawn at random from the dataset.
sampled_user_id = random.choice(data.user_id.unique())
recommend_artists_func(sampled_user_id)

20 artists recommendations for user 58088831: 
['Avenged Sevenfold', 'Slipknot', 'System of a Down', 'Bullet for My Valentine', 'Metallica', 'Evanescence', 'Asking Alexandria', 'Skrillex', 'Bring Me the Horizon', 'Future'] 

1. Avenged Sevenfold, with a score of 0.21%
2. Slipknot, with a score of 0.2%
3. System of a Down, with a score of 0.14%
4. Bullet for My Valentine, with a score of 0.11%
5. Metallica, with a score of 0.1%
6. Evanescence, with a score of 0.1%
7. Asking Alexandria, with a score of 0.1%
8. Skrillex, with a score of 0.08%
9. Bring Me the Horizon, with a score of 0.08%
10. Future, with a score of 0.08%


In [14]:
# ----------------------------
# SONGS RECOMMENDATION MODEL
#----------------------------

# Get the associated row and column indices
# The second positional parameter of `Series.astype` is `copy`, not a category
# list, so `songs` must not be passed there. The plain categorical cast yields
# the codes that were effectively computed.
data['songcatcode'] = data['track_title'].astype('category').cat.codes

# Tweets per (user, song) pair, most-tweeted pairs first.
# Note: despite its name, `song_test` is aggregated over the full dataset.
song_keys = ['user_id', 'track_title', 'usercatcode', 'songcatcode']
song_train = (
    train.groupby(song_keys)['tweet_id'].count()
    .reset_index(name='tweet_count')
    .sort_values(by='tweet_count', ascending=False)
)
song_test = (
    data.groupby(song_keys)['tweet_id'].count()
    .reset_index(name='tweet_count')
    .sort_values(by='tweet_count', ascending=False)
)

# Codes were assigned on the full dataset, so take the matrix dimensions from
# it as well. Letting scipy infer the shape from the training split alone can
# produce a matrix smaller than the full one.
n_song_users = int(data['usercatcode'].max()) + 1
n_songs_total = int(data['songcatcode'].max()) + 1

# Create sparse matrices
# song x user counts from the training split (what the model is fitted on)
sparse_song_user = sparse.csr_matrix(
    (song_train['tweet_count'], (song_train['songcatcode'], song_train['usercatcode'])),
    shape=(n_songs_total, n_song_users),
)
# user x song counts from the full data (passed to `recommend`)
sparse_user_song = sparse.csr_matrix(
    (song_test['tweet_count'], (song_test['usercatcode'], song_test['songcatcode'])),
    shape=(n_song_users, n_songs_total),
)

# Calculate the confidence by multiplying it by our alpha value.
usersong_conf = (sparse_song_user * alpha_val).astype('double')

# Initialize the als model and fit it using the sparse song-user matrix
model_song = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)
model_song.fit(usersong_conf)

100%|██████████| 50.0/50 [00:30<00:00,  1.75it/s]


In [13]:
#---------------------
# FIND SIMILAR SONGS
#---------------------
n_similar = 10
# Find the 10 most similar to a song
def similar_songs(name=None):
    """Print the `n_similar` songs closest to `name` according to `model_song`.

    Parameters
    ----------
    name : str, optional
        Track title as it appears in `song_test.track_title`. When omitted, a
        random title is drawn from `songs` on every call.

    Raises
    ------
    IndexError
        If `name` does not occur in `song_test.track_title`.
    """
    # A default of `random.choice(songs)` in the signature is evaluated only
    # once, at definition time, so argument-less calls would always use the
    # same song. Drawing here gives a fresh one per call.
    if name is None:
        name = random.choice(songs)

    testsong = song_test.songcatcode.loc[song_test.track_title == name].iloc[0]
    # Use implicit to get similar items as (song code, score) pairs.
    similar = model_song.similar_items(testsong, n_similar)
    # Print the titles of our most similar songs
    print('{} similar songs to song "{}":'.format(n_similar, name))
    # Scores are reported relative to the first returned entry (= 100%).
    max_similarity = similar[0][1]
    for idx, score in similar:
        percentage = (score / max_similarity) * 100
        print(song_test.track_title.loc[song_test.songcatcode == idx].iloc[0], "- with a similarity of {:.2f}%".format(percentage))
        
#-----------------------------------------------
# CREATE SONGS RECOMMENDATIONS FOR USER
#-----------------------------------------------       
def recommend_songs(user_id=None):
    """Print the songs `model_song` recommends for one user.

    Parameters
    ----------
    user_id : optional
        A value from `song_test.user_id`. When omitted, a random user is drawn
        from `users` on every call.

    Raises
    ------
    IndexError
        If `user_id` does not occur in `song_test.user_id`.
    """
    # A default of `random.choice(users)` in the signature is evaluated only
    # once, at definition time; draw per call instead.
    if user_id is None:
        user_id = random.choice(users)

    # Translate the raw user id into the matrix row index. The original mask
    # referenced an undefined name (`usersong`), so every call raised
    # NameError; the lookup table used throughout this cell is `song_test`.
    testuser = song_test.usercatcode.loc[song_test.user_id == user_id].iloc[0]

    # Use the implicit recommender; it yields (song code, score) pairs.
    recommended = model_song.recommend(testuser, sparse_user_song)

    # Get song titles from ids
    titles = [
        song_test.track_title.loc[song_test.songcatcode == idx].iloc[0]
        for idx, score in recommended
    ]
    # Report the number of recommendations actually returned instead of the
    # unrelated global `n_similar`.
    print('{} songs recommendations for user {}'.format(len(titles), user_id))
    for title in titles:
        print(title)