In [195]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics.pairwise import cosine_similarity

In [182]:

plays = pd.read_csv('data/user_artists.dat', sep='\t')
artists = pd.read_csv('data/artists.dat', sep='\t', usecols=['id','name'])

# Merge artist and user pref data
ap = pd.merge(artists, plays, how="inner", left_on="id", right_on="artistID")
ap = ap.rename(columns={"weight": "playCount"})

# Group artist by name
artist_rank = ap.groupby(['name']) \
    .agg({'userID' : 'count', 'playCount' : 'sum'}) \
    .rename(columns={"userID" : 'totalUsers', "playCount" : "totalPlays"}) \
    .sort_values(['totalPlays'], ascending=False)

artist_rank['avgPlays'] = artist_rank['totalPlays'] / artist_rank['totalUsers']
print(artist_rank)

                    totalUsers  totalPlays     avgPlays
name                                                   
Britney Spears             522     2393140  4584.559387
Depeche Mode               282     1301308  4614.567376
Lady Gaga                  611     1291387  2113.563011
Christina Aguilera         407     1058405  2600.503686
Paramore                   399      963449  2414.659148
...                        ...         ...          ...
Morris                       1           1     1.000000
Eddie Kendricks              1           1     1.000000
Excess Pressure              1           1     1.000000
My Mine                      1           1     1.000000
A.M. Architect               1           1     1.000000

[17632 rows x 3 columns]


In [183]:
# Merge into ap matrix
ap = ap.join(artist_rank, on="name", how="inner") \
    .sort_values(['playCount'], ascending=False)
# Preprocessing
pc = ap.playCount
play_count_scaled = (pc - pc.min()) / (pc.max() - pc.min())
ap = ap.assign(playCountScaled=play_count_scaled)

# Build a user-artist rating matrix 
ratings_df = ap.pivot(index='userID', columns='artistID', values='playCountScaled')
ratings = ratings_df.fillna(0).values

# Show sparsity
density = float(len(ratings.nonzero()[0])) / (ratings.shape[0] * ratings.shape[1]) * 100
print("density: %.2f" % density)

density: 0.28


In [184]:
from scipy.sparse import csr_matrix, csc_matrix, coo_matrix

# Build a sparse matrix
X = csr_matrix(ratings)

n_users, n_items = ratings_df.shape
print("rating matrix shape", ratings_df.shape)

user_ids = ratings_df.index.values
artist_names = ap.sort_values("artistID")["name"].unique()

rating matrix shape (1892, 17632)


In [185]:
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm.cross_validation import random_train_test_split
from lightfm.data import Dataset

# Build data references + train test
Xcoo = X.tocoo()
data = Dataset()
data.fit(np.arange(n_users), np.arange(n_items))
interactions, weights = data.build_interactions(zip(Xcoo.row, Xcoo.col, Xcoo.data)) 
train, test = random_train_test_split(interactions, random_state=42)

# Ignore that (weight seems to be ignored...)
#train = train_.tocsr()
#test = test_.tocsr()
#train[train==1] = X[train==1]
#test[test==1] = X[test==1]

# To be completed...

In [186]:
# Train
model = LightFM(learning_rate=0.05, loss='warp', random_state=42)
model.fit(train, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7fb8b370abb0>

In [187]:
# Evaluate
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10, train_interactions=train).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test, train_interactions=train).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.38, test 0.13.
AUC: train 0.96, test 0.86.


In [188]:
# Predict
scores = model.predict(0, np.arange(n_items))
top_items = artist_names[np.argsort(-scores)]
print(top_items)

['Depeche Mode' 'Madonna' 'New Order' ... 'Anata' 'Wayne Marshall'
 'Tokyo Gakuso']


In [189]:
model2 = LightFM(learning_rate=0.08, loss='warp', random_state=42)
model2.fit(train, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x7fb8a48550d0>

#### Try some parametter and get a board with better

In [190]:
# Evaluate
trp = precision_at_k(model, train, k=10).mean()
tep = precision_at_k(model, test, k=10, train_interactions=train).mean()

tra = auc_score(model, train).mean()
tea = auc_score(model, test, train_interactions=train).mean()

dico = {'Loss':'Warp', 'K':10, 'Learning_rate':0.08, 'Train Precision':trp, 'Test Precision':tep, 'Train Auc':tra,'Test Auc':tea}

tab = pd.DataFrame(dico.items(), columns=['Parameter', 'Values'])

In [191]:
tab

Unnamed: 0,Parameter,Values
0,Loss,Warp
1,K,10
2,Learning_rate,0.08
3,Train Precision,0.377754
4,Test Precision,0.131981
5,Train Auc,0.964242
6,Test Auc,0.856067


In [192]:
import time

def scoring():
    
#    learning_rate = [0.05, 0.08, 0.10]
#    losslist = ['logistic', 'bpr', 'warp', 'warp-kos']
#    klist = [5, 7, 10]
    learning_rate = [0.05, 0.08, 0.10]
    losslist = ['bpr', 'warp', 'logistic', 'warp-kos']
    klist = [5, 7, 9, 11, 13]
    results = []
    
    for x in learning_rate:
        for y in losslist:
            for z in klist:
            
                model = LightFM(learning_rate=x, loss = y)
                t1 = time.process_time()
                model.fit(train, epochs=10, num_threads=2)
                t2 = time.process_time()
                t = t2 - t1
                trainPrecision = precision_at_k(model, train, k=z).mean()
                testPrecision = precision_at_k(model, test, k=z, train_interactions=train).mean()

                trainAUC = auc_score(model, train).mean()
                testAUC = auc_score(model, test, train_interactions=train).mean()

                dicttemp = {}
                dicttemp = {'Time:':t, 'K':z, 'Name':y, 'Learning Rate':x, 'Train Precision':trainPrecision, 'Train AUC':trainAUC, 'Test Precision':testPrecision, "Train AUC":trainAUC, "Test AUC":testAUC}

                results.append(dicttemp)
            
    results = pd.DataFrame(results)

    return results

In [None]:
scoring()

In [193]:
def recommend(user):
    scores = model.predict(user, np.arange(n_items))
    top_items = artist_names[np.argsort(-scores)]
    reco = pd.DataFrame(top_items[:10], columns=['Recommandations'])
    return reco

In [194]:
recommend(1)

Unnamed: 0,Recommandations
0,Autechre
1,Bola
2,Tim Hecker
3,Solar Fields
4,Madlib
5,Bong-Ra
6,NoMeansNo
7,Ef
8,Red House Painters
9,Danny Norbury


In [91]:
#  get_ground_truth qui renvoie les artistes ecoutés par un utilisateur par ordre décroissant du playCountScaled

In [None]:
def get_ground_truth():
    