In [15]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

#------------------
# LOAD THE DATASET
#------------------

# Read the lastfm CSV (path is relative to the notebook's working directory).
data = pd.read_csv('./raw_data/lastfm.csv')

# Create a new dataframe without the user ids.
# `columns=` is used instead of a positional axis: since pandas 2.0 every
# argument of `drop` after `labels` is keyword-only, so `drop('user', 1)`
# raises a TypeError there.
data_items = data.drop(columns='user')

In [1]:
# Load the song metadata JSON into a DataFrame (path is relative to the
# notebook's working directory).
song_meta = pd.read_json('./raw_data/song_meta.json')

In [3]:
# Add the two genre-basket columns row by row into a new `total_gn` column.
# NOTE(review): presumably both columns hold Python lists, so `+` concatenates
# the two lists for each song -- confirm against the JSON schema.
song_meta['total_gn'] = song_meta['song_gn_dtl_gnr_basket'] + song_meta['song_gn_gnr_basket']

In [4]:
# Keep only the song id and its combined genre codes, then explode the
# list-like `total_gn` column so every (song, genre code) pair gets its own
# row; the original row label is repeated for each code of the same song.
df = song_meta[['id','total_gn']].explode('total_gn')
# Bare expression: displays the exploded frame as the cell output.
df

Unnamed: 0,id,total_gn
0,0,GN0901
0,0,GN0900
1,1,GN1601
1,1,GN1606
1,1,GN1600
...,...,...
707987,707987,GN1801
707987,707987,GN1800
707988,707988,GN0601
707988,707988,GN0604


In [13]:
# One-hot encode the genre codes, then collapse the exploded rows back to one
# row per original row label: a cell is truthy when that row carries the genre.
# `groupby(level=0, sort=False).max()` replaces `.max(level=0)`: the `level`
# argument of DataFrame reductions was deprecated in pandas 1.3 and removed in
# 2.0. `sort=False` keeps the row order the old call produced.
# NOTE(review): this rebinds `data_items`, which the load cell also assigns
# from the lastfm data. The later `'beyonce'` lookup only works on the lastfm
# matrix -- confirm which cell is meant to feed the similarity step.
data_items = pd.get_dummies(df['total_gn']).groupby(level=0, sort=False).max()

In [16]:

#------------------------
# ITEM-ITEM CALCULATIONS
#------------------------

# First step: rescale every row of data_items to a unit vector.

# Row length (Euclidean norm) = sqrt(x^2 + y^2 + z^2 + ...)
magnitude = np.sqrt(np.square(data_items).sum(axis='columns'))

# Unit vector = every entry of a row divided by that row's length.
data_items = data_items.div(magnitude, axis=0)

def calculate_similarity(data_items):
    """Return the pairwise cosine similarity between the columns of a frame.

    Parameters
    ----------
    data_items : pandas.DataFrame
        Matrix whose columns are the items to compare.

    Returns
    -------
    pandas.DataFrame
        Square frame labelled on both axes by ``data_items.columns``; the
        entry at (a, b) is the cosine similarity between columns a and b.
    """
    item_labels = data_items.columns
    # Transpose so each item becomes a row: cosine_similarity compares rows.
    items_as_rows = sparse.csr_matrix(data_items).T
    return pd.DataFrame(
        cosine_similarity(items_as_rows),
        index=item_labels,
        columns=item_labels,
    )

# Build the item-item similarity matrix from the normalized data_items frame.
data_matrix = calculate_similarity(data_items)

# Let's get the 11 highest-similarity entries in the 'beyonce' row
# (the artist itself ranks first with similarity 1).
print(data_matrix.loc['beyonce'].nlargest(11))

beyonce               1.000000
the pussycat dolls    0.351871
rihanna               0.334919
christina aguilera    0.308388
alicia keys           0.297264
justin timberlake     0.279726
britney spears        0.269557
leona lewis           0.259237
maria mena            0.248751
kelly clarkson        0.245713
nelly furtado         0.230789
Name: beyonce, dtype: float64


In [19]:

#------------------------
# USER-ITEM CALCULATIONS
#------------------------

# Construct a new dataframe with the 10 closest neighbours (most similar)
# for each artist. Row i holds the labels of the 10 highest-similarity
# entries of column i of data_matrix, best first.
# The `.ix` indexer used here originally was removed in pandas 1.0; each use
# is replaced by the explicit `.iloc` (positional) or `.loc` (label) form that
# `.ix` resolved to for these indexes.
data_neighbours = pd.DataFrame(index=data_matrix.columns, columns=range(1,11))
for i in range(len(data_matrix.columns)):
    closest = data_matrix.iloc[:, i].sort_values(ascending=False).iloc[:10]
    data_neighbours.iloc[i, :10] = closest.index.to_numpy()

user = 5985
# Row label of the first row in `data` whose `user` column equals the id.
user_index = data[data.user == user].index.tolist()[0]

# Get the artists the user has played.
# Label-based lookup: data_items is derived from `data`, so it is expected to
# share its row labels.
known_user_likes = data_items.loc[user_index]
known_user_likes = known_user_likes[known_user_likes > 0].index.values

# Construct the neighbourhood from the most similar items to the
# ones our user has already liked.
most_similar_to_likes = data_neighbours.loc[known_user_likes]
similar_list = most_similar_to_likes.values.tolist()
# Flatten the per-like neighbour lists and de-duplicate them.
similar_list = list({item for sublist in similar_list for item in sublist})
neighbourhood = data_matrix.loc[similar_list, similar_list]

# A user vector containing only the neighbourhood items and
# the known user likes.
user_vector = data_items.loc[user_index, similar_list]

# Calculate the score: similarity-weighted sum of the user's values,
# divided by each item's total similarity within the neighbourhood.
score = neighbourhood.dot(user_vector).div(neighbourhood.sum(axis=1))

# Drop the known likes.
score = score.drop(known_user_likes)

print(known_user_likes)
print(score.nlargest(20))

['bob dylan' 'the cure']
joy division           0.087840
the smiths             0.087004
the rolling stones     0.084162
david bowie            0.081768
tom waits              0.075365
belle and sebastian    0.070918
eric clapton           0.069710
misfits                0.069017
the beatles            0.067101
elliott smith          0.067058
ramones                0.064136
jimi hendrix           0.060558
depeche mode           0.057806
johnny cash            0.055420
the doors              0.047377
dtype: float64
