In [1]:
import scipy as sp
import numpy as np
import pandas as pd
import time
import lightfm.evaluation
from lightfm import LightFM
from implicit.als import AlternatingLeastSquares
import sklearn.metrics
import math
import random

In [2]:
data = pd.read_table("/mnt/data/lastfm/360k/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv", 
                         usecols=[0, 2, 3], 
                         names=['user', 'artist', 'plays'],na_filter=False)
data['user'] = data['user'].astype("category")
data['artist'] = data['artist'].astype("category")
plays = sp.sparse.coo_matrix((data['plays'].astype(float), 
                   (data['user'].cat.codes,
                    data['artist'].cat.codes)), dtype = np.double)

In [3]:
plays = plays.astype(np.double)

In [13]:
small_data = pd.read_csv('/mnt/data/lastfm/2k/user_artists.dat', sep = '\t')
plays = sp.sparse.coo_matrix((small_data['weight'].astype(float), 
                                             (small_data['userID'], 
                                              small_data['artistID'])), dtype = np.double)

In [4]:
def split_train_test(plays, train_rate = 0.8):
    user_index = range(plays.shape[0])
    train = plays.copy().tolil()
    test = sp.sparse.lil_matrix(plays.shape)
    
    min_rows = int(1/(1 - train_rate))
    for uindex in user_index:
        rows = plays.getrow(uindex).indices
        if len(rows) <= min_rows:
            continue
        testindics = np.random.choice(plays.getrow(uindex).indices, 
                                        size=int(len(rows) * (1 - train_rate)), 
                                        replace=False)
        train[uindex, testindics] = 0.
        test[uindex, testindics] = plays[uindex, testindics]
    
    train = train.tocsr()
    train.eliminate_zeros()
    return train, test.tocsr()

def train_pair_wise_model_and_evaluate(train, test = None, factors = 50, epochs = 10, learning_rate = 0.05, loss = 'bpr', eva = True):
    tic = time.time()
    model = LightFM(no_components = factors, learning_rate=learning_rate, loss=loss)
    model.fit(train, epochs=epochs, num_threads = 2)
    toc = time.time()
    print("LightFM training cost %.2f seconds" % (toc - tic))
    
    if test is not None and eva:
        eva_test = lightfm.evaluation.auc_score(model, test, num_threads = 2)
        print("User auc mean = %.2f, std = %.2f (on testing dataset)" % (eva_test.mean(), eva_test.std()))
    
    return model

In [5]:
train_mat, test_mat = split_train_test(plays.tocsr())

In [None]:
model = train_pair_wise_model_and_evaluate(train_mat, test = test_mat)

In [17]:
def generate_hot_item_list(plays, top = 1000):
    item_indexs, item_counts = np.unique(plays.col, return_counts = True)
    items_played_count = filter(lambda item_pair: item_pair[1] > 10, list(zip(item_indexs, item_counts)))
    
    return sorted(list(items_played_count), key = lambda i:i[1], reverse = True)[: top]

为一个指定的用户产生负样本。产生的方式是：

1. 从热门Item中去掉用户已收听的Item
2. 按照热门程度加权采样

In [35]:
def weighted_sampling(sequence, k):
    """
    parameters:
    
    sequence -- list-like [(item1, weight1), ...]
    
    k -- number of selected items
    
    return:
    
    list that selected.
    """
    
    weighted_list = []
    for elements in sequence:
        weighted_list.append((elements[0], random.expovariate(elements[1])))
    
    return sorted(weighted_list, key = lambda x : x[1])[:k]

In [25]:
def generate_negative_samples(uindex, plays, hot_items, negative_count = 5):
    history = set(plays.getrow(2).indices)
    candidates = []
    for (item, weight) in hot_items:
        if item in history:
            continue
        candidates.append((item, weight))
    if negative_count > len(candidates):
        negative_count = len(candidates)
    return weighted_sampling(candidates, negative_count)

#negative_samples = generate_negative_samples(2, plays.tocsr(), hot_items, 50)

In [35]:
def evaluate_point_wise_model(model, plays, test, num_test_users = -1):
    hot_items = generate_hot_item_list(test_mat.tocoo())
    user_indexes = range(plays.shape[0])
    aucs = []
    if num_test_users > 0:
        user_indexes = np.random.choice(user_indexes, num_test_users)
    for uindex in user_indexes:
        positive_samples = test_mat.tocsr().getrow(uindex).indices
        negative_samples = generate_negative_samples(uindex, plays.tocsr(), hot_items, len(positive_samples))
        if len(negative_samples) == 0:
            continue
        negative_samples, weight = zip(*negative_samples)
        negative_samples = np.array(negative_samples)
        user_factor = model.user_factors[uindex].reshape((1, model.factors))
        user_samples = np.concatenate((positive_samples,  negative_samples), axis = 0).astype(np.int64)
        user_feedback = np.concatenate((np.full(len(positive_samples), 1), np.full(len(negative_samples), 0)), axis = 0)
        item_factors = model.item_factors[user_samples]
        scores = np.dot(user_factor, item_factors.transpose()).reshape(len(user_feedback))
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(user_feedback, scores, pos_label=1)
        auc = sklearn.metrics.auc(fpr, tpr)
        aucs.append(auc)
    return np.array(aucs)

In [6]:
def train_point_wise_model_and_evaluate(train, plays = None, test = None, factors = 100, epochs = 100, learning_rate = 0.05,num_test_users = -1, eva = True):
    tic = time.time()
    model = AlternatingLeastSquares(factors = factors, iterations = epochs)
    model.fit(train.transpose())
    toc = time.time()
    print("ALS training cost %.2f seconds" % (toc - tic))
    
    if eva:
        eva_test = evaluate_point_wise_model(model, plays, test, num_test_users)
        print("User auc mean = %.2f, std = %.2f (on testing dataset)" % (eva_test.mean(), eva_test.std()))
    return model

In [36]:
model2 = train_point_wise_model_and_evaluate(train_mat, plays, test_mat, factors = 100, epochs = 10, num_test_users = 1000)



ALS training cost 5.71 seconds
User auc mean = 0.67, std = 0.16 (on testing dataset)


In [7]:
model1 = train_pair_wise_model_and_evaluate(train_mat, test = None, factors = 100, epochs = 10, eva = False)

LightFM training cost 634.01 seconds


In [8]:
model2 = train_point_wise_model_and_evaluate(train_mat, factors = 100, epochs = 10, eva = False)



ALS training cost 119.72 seconds


In [67]:
class Recommender(object):
    def __init__(self, models = {}, plays = None, artists = None):
        self.models = models
        self.plays = plays
        self.artists = artists
        self.artistsDict = None
        if artists is not None:
            index, names = zip(*list(enumerate(self.artists)))
            self.artistsDict = dict(zip(names, index))
        
    def recommend(self, userid, modelname = 'bpr', top = 10, with_history = True):
        if modelname not in self.models:
            return []
        
        recommend_list = []
        if modelname == 'bpr':
            recommend_list = self._recommend_with_bpr(userid, top)
        elif modelname == 'als':
            recommend_list = self._recommend_with_als(userid, top)
        
        return self._output_more(userid, None, recommend_list, with_history)
    
    def similar_items(self, artist_name, top = 10):
        if artist_name not in self.artistsDict:
            return {}
        itemid = self.artistsDict[artist_name]
        model = self.models['als']
        similar_items = model.similar_items(itemid, top) 
        return self._output_more(None, itemid, similar_items, False)
    
    def _recommend_with_bpr(self, userid, top):
        """
        compute recommendation for user
        """
        model = self.models['bpr']
        items = np.array(range(plays.shape[1]))
        scores = model.predict(userid, items)
        sorted_items = sorted(zip(items, scores), key = lambda x : x[1], reverse = True)       
        """
        filter the items the user has consumed. 
        """
        history = set(self.plays.getrow(userid).indices)
        recommendations = []
        for item in sorted_items:
            if item[0] in history:
                continue
            recommendations.append(item)
            if len(recommendations) >= top:
                break
        return recommendations
    
    def _recommend_with_als(self, userid, top):
        model = self.models['als']
        return model.recommend(userid, self.plays, N = top)
    
    def _output_more(self, userid, itemid, item_list, with_history):
        userinfo = []
        output_iteminfo = []
        input_iteminfo = []
        if userid and with_history:
            userinfo = self._output_user_more_info(userid)
        if item_list:
            output_iteminfo = self._output_items_more_info(item_list)
        if itemid:
            input_iteminfo = self._output_items_more_info([(itemid, 1)])
        return {'user': userinfo, 'item':input_iteminfo, 'items': output_iteminfo}
    
    def _output_user_more_info(self, userid, sort = False, top = -1):
        history = self.artists[self.plays.getrow(userid).indices]
        playcount = self.plays.getrow(userid).data
        
        if not sort:
            return list(zip(history, playcount))[: top]
        else:
            return sorted(list(zip(history, playcount)), key = lambda item: item[1], reverse = True)[: top]
    
    def _output_items_more_info(self, items):
        itemids, scores = zip(*items)
        iteminfo = self.artists[list(itemids)]
        return list(zip(iteminfo, scores))

In [68]:
recommender = Recommender({'bpr': model1, 'als':model2}, plays.tocsr(), data.artist.cat.categories)

In [69]:
recommendation1 = recommender.recommend(173031, modelname='bpr', top = 20, with_history = False)
recommendation2 = recommender.recommend(173031, modelname='als', top = 20, with_history = False)
recommender._output_user_more_info(173031, sort = True)

[('周杰倫', 2122.0),
 ('陳奕迅', 1181.0),
 ('glenn gould', 673.0),
 ('楊丞琳', 536.0),
 ('wolfgang amadeus mozart', 492.0),
 ('aly & aj', 452.0),
 ('secondhand serenade', 425.0),
 ('avril lavigne', 424.0),
 ('s.h.e', 417.0),
 ('johann sebastian bach', 304.0),
 ('張敬軒', 291.0),
 ('kevin kern', 223.0),
 ('倉木麻衣', 223.0),
 ('張韶涵', 203.0),
 ('mitsuko uchida', 197.0),
 ('鄧麗欣', 194.0),
 ('x japan', 193.0),
 ('academy st. martins in the fields', 180.0),
 ('garnet crow', 160.0),
 ('michelle branch', 153.0),
 ('troy and gabriella', 150.0),
 ('jesse mccartney', 148.0),
 ('céline dion', 144.0),
 ('gil shaham and goran sollscher', 143.0),
 ('hide', 136.0),
 ('westlife', 136.0),
 ('孫燕姿', 128.0),
 ('jason mraz', 126.0),
 ('andy mckee', 123.0),
 ('oku hanako', 122.0),
 ('high school musical 2', 113.0),
 ('zard', 113.0),
 ('sara bareilles', 101.0),
 ('backstreet boys', 99.0),
 ('the corrs', 99.0),
 ('kelly sweet', 98.0),
 ('三枝夕夏 in db', 97.0),
 ('glay', 92.0),
 ('david garrett', 90.0),
 ('タイナカサチ', 87.0),
 ('jame

In [61]:
recommendation1

{'item': [],
 'items': [('josh groban', 2.541165828704834),
  ('michael w. smith', 2.5229959487915039),
  ('hillsong', 2.4939250946044922),
  ('宇多田ヒカル', 2.3995833396911621),
  ('hayley westenra', 2.369992733001709),
  ('angela aki', 2.3347458839416504),
  ('casting crowns', 2.3013358116149902),
  ('hillsong united', 2.2782249450683594),
  ('boa', 2.2732632160186768),
  ('steven curtis chapman', 2.2700684070587158),
  ('rebecca st. james', 2.2616958618164062),
  ('kokia', 2.2322888374328613),
  ('barlowgirl', 2.2148218154907227),
  ('do as infinity', 2.213282585144043),
  ('f.i.r.', 2.1844463348388672),
  ('corrinne may', 2.1044127941131592),
  ('chris tomlin', 2.0857734680175781),
  ('celtic woman', 2.0772864818572998),
  ('depapepe', 2.0735812187194824),
  ('ayaka', 2.0611968040466309)],
 'user': []}

In [62]:
recommendation2

{'item': [],
 'items': [('taylor swift', 1.186539712906344),
  ('frédéric chopin', 1.1863112931343351),
  ('colbie caillat', 1.0978262200222491),
  ('jonas brothers', 1.0811056577548976),
  ('bruno coulais', 1.0621494330528296),
  ('boa', 1.0488158944365353),
  ('bryan adams', 1.0277970165423405),
  ('daniel powter', 1.0220579888574337),
  ('yann tiersen', 1.0143708728515772),
  ('f.i.r.', 1.012305501265341),
  ('yiruma', 1.003527371248728),
  ('hilary duff', 0.99238201965409678),
  ('mandy moore', 0.97962069749414749),
  ('natasha bedingfield', 0.97652199359550906),
  ('simple plan', 0.95801904189552334),
  ('daughtry', 0.95771285384471316),
  ('bz', 0.93582329901255945),
  ('mariah carey', 0.93090378201808766),
  ('angela aki', 0.92184519678959209),
  ('claude debussy', 0.9190989429349703)],
 'user': []}

In [56]:
similar_items = recommender.similar_items('周杰倫', 10)

In [57]:
similar_items

{'item': [('周杰倫', 1)],
 'items': [('周杰倫', 0.99999999999999989),
  ('王力宏', 0.88290674032689487),
  ('陶喆', 0.85771472510708613),
  ('南拳媽媽', 0.85302933173781792),
  ('陳奕迅', 0.85081279073917004),
  ('林俊傑', 0.8493482632159628),
  ('孫燕姿', 0.83399343224087041),
  ('張惠妹', 0.82906432515041484),
  ('方大同', 0.82485675783986334),
  ('五月天', 0.82292313630632996)],
 'user': []}