In [1]:
import codecs
import numpy as np
import pandas as pd

In [2]:
inter_file = "./data/ml-100k/u.data"

In [3]:
data = pd.read_csv(inter_file, delimiter="\t", header=None)
data.columns = ["user_id", "item_id", "rating", "timestamp"]
data.shape

(100000, 4)

In [4]:
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
from lightfm.data import Dataset



In [6]:
dataset = Dataset()

In [7]:
user_array = np.sort(data["user_id"].unique())
item_array = np.sort(data["item_id"].unique())

In [8]:
dataset.fit(users=user_array, items=item_array)

In [9]:
num_users, num_items = dataset.interactions_shape()
num_users, num_items

(943, 1682)

In [10]:
%%time
interactions, weights = dataset.build_interactions([tuple(data[["user_id", "item_id", "rating"]].iloc[i, :]) for i in range(data.shape[0])])

Wall time: 2min 55s


In [11]:
interactions, weights

(<943x1682 sparse matrix of type '<class 'numpy.int32'>'
 	with 100000 stored elements in COOrdinate format>,
 <943x1682 sparse matrix of type '<class 'numpy.float32'>'
 	with 100000 stored elements in COOrdinate format>)

In [12]:
interactions.toarray(), weights.toarray()

(array([[1, 1, 1, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0]]), array([[5., 3., 4., ..., 0., 0., 0.],
        [4., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [5., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 5., 0., ..., 0., 0., 0.]], dtype=float32))

In [13]:
from lightfm import LightFM

In [14]:
model = LightFM(loss='warp')

In [15]:
model.fit(interactions)

<lightfm.lightfm.LightFM at 0x2960460e3c8>

In [16]:
model.item_embeddings

array([[-0.33354998, -0.62071806, -0.5468567 , ..., -0.7395678 ,
         0.5305297 ,  0.3459575 ],
       [-0.28881025, -0.33182573, -0.3384615 , ..., -0.4324158 ,
         0.17360955,  0.547681  ],
       [ 0.16587932, -0.21641761, -0.09041465, ..., -0.03192124,
         0.20462288,  0.13234344],
       ...,
       [ 0.14718033,  0.38212344,  0.3142625 , ...,  0.37996686,
        -0.32440943, -0.34812057],
       [ 0.24773902,  0.1682003 ,  0.25004494, ...,  0.28033793,
        -0.30692485, -0.20862225],
       [ 0.18793812,  0.31124133,  0.33497822, ...,  0.2797843 ,
        -0.33666873, -0.21062063]], dtype=float32)

In [17]:
model_weights = LightFM(loss='warp')
model_weights.fit(weights)
model_weights.item_embeddings

array([[ 0.71564746,  0.02465005, -0.66182977, ...,  0.09512775,
        -0.13642725,  0.05216855],
       [ 0.4488553 , -0.11205254, -0.2598676 , ...,  0.25055864,
        -0.03122658, -0.30726355],
       [ 0.06885144, -0.05553475, -0.09711617, ...,  0.24756578,
        -0.06330757,  0.08833592],
       ...,
       [-0.3319892 ,  0.09444311,  0.39592975, ..., -0.14360082,
         0.26195526, -0.02470076],
       [-0.23962982,  0.1656944 ,  0.36857685, ...,  0.00669156,
         0.21060328,  0.05325682],
       [-0.38051814,  0.17242694,  0.28419927, ..., -0.10607666,
         0.08893774, -0.1112946 ]], dtype=float32)

In [18]:
from lightfm.datasets import fetch_movielens
movielens = fetch_movielens()
train, test = movielens['train'], movielens['test']

In [19]:
model = LightFM(no_components=10, loss="warp")

In [20]:
model.fit(train, epochs=10)

<lightfm.lightfm.LightFM at 0x2960460e588>

In [21]:
item_index0 = np.arange(test.shape[1])[np.not_equal(test.tocsr()[0, :].toarray(), 0).ravel()]
item_index0

array([ 19,  32,  60, 116, 154, 159, 170, 188, 201, 264])

In [22]:
test.tocsr()[0, :].toarray().ravel()[np.not_equal(test.tocsr()[0, :].toarray(), 0).ravel()]

array([4, 4, 4, 3, 2, 4, 5, 3, 5, 4], dtype=int32)

In [23]:
model.predict(0, item_index0)

array([-3.89246869, -3.63163996, -4.02253771, -2.02001739, -3.03168797,
       -4.0731883 , -4.02320051, -3.68930364, -1.61427879, -2.43675303])

In [24]:
from lightfm.evaluation import auc_score, precision_at_k

In [25]:
model_auc = LightFM(no_components=10, loss="bpr")
model_top = LightFM(no_components=10, loss="warp")
model_auc.fit(train, epochs=10)
model_top.fit(train, epochs=10)

<lightfm.lightfm.LightFM at 0x2960521ab00>

In [26]:
auc_train_precision = auc_score(model_auc, train)
auc_train_precision

array([0.7686888 , 0.9528905 , 0.9282106 , 0.9007365 , 0.87563145,
       0.8408634 , 0.8459599 , 0.8133522 , 0.80748504, 0.8394044 ,
       0.7693873 , 0.83540654, 0.79497045, 0.8580332 , 0.9331756 ,
       0.8726059 , 0.9649105 , 0.90237606, 0.92583734, 0.94250226,
       0.78079134, 0.8408635 , 0.85553724, 0.86445767, 0.8788904 ,
       0.95166016, 0.93405324, 0.8357457 , 0.86230904, 0.81873316,
       0.8698439 , 0.97039926, 0.9794879 , 0.92888755, 0.98144376,
       0.9306818 , 0.9495735 , 0.8378835 , 0.9880739 , 0.9771634 ,
       0.94022924, 0.8818725 , 0.81856525, 0.90145934, 0.95239466,
       0.9586646 , 0.9870827 , 0.8811172 , 0.74358714, 0.95580685,
       0.9591188 , 0.85334057, 0.9641092 , 0.9622172 , 0.9927643 ,
       0.9020776 , 0.884182  , 0.8250704 , 0.8085817 , 0.8962127 ,
       0.9861271 , 0.86122996, 0.90163285, 0.8459648 , 0.9078607 ,
       0.9634004 , 0.9698554 , 0.951498  , 0.8774878 , 0.8440236 ,
       0.904949  , 0.889222  , 0.86744434, 0.97275585, 0.89797

In [27]:
auc_train_precision = auc_score(model_auc, train).mean()
auc_test_precision = auc_score(model_auc, test).mean()
auc_train_precision, auc_test_precision

(0.8913147, 0.8552065)

In [28]:
top_train_precision = precision_at_k(model_top, train, k=10).mean()
top_test_precision = precision_at_k(model_top, test, k=10).mean()
top_train_precision, top_test_precision

(0.61124074, 0.10965005)

In [29]:
from lightfm.datasets import fetch_stackexchange

data = fetch_stackexchange('crossvalidated',
                           test_set_fraction=0.1,
                           indicator_features=False,
                           tag_features=True)

In [30]:
train, test = data['train'], data['test']
train, test

(<3221x72360 sparse matrix of type '<class 'numpy.float32'>'
 	with 57830 stored elements in COOrdinate format>,
 <3221x72360 sparse matrix of type '<class 'numpy.float32'>'
 	with 4307 stored elements in COOrdinate format>)

In [31]:
item_features = data["item_features"]
tag_labels = data['item_feature_labels']

In [32]:
tag_labels

array(['bayesian', 'prior', 'elicitation', ..., 'events', 'mutlivariate',
       'sample-variance'], dtype='<U50')

In [33]:
NUM_THREADS = 2
NUM_COMPONENTS = 30
NUM_EPOCHS = 3
ITEM_ALPHA = 1e-6

In [34]:
model = LightFM(loss='bpr', item_alpha=ITEM_ALPHA, no_components=NUM_COMPONENTS)
model = model.fit(train, item_features=item_features, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

In [35]:
train_auc = auc_score(model, train, item_features=item_features, num_threads=NUM_THREADS).mean()
test_auc = auc_score(model, test, item_features=item_features, num_threads=NUM_THREADS).mean()
train_auc, test_auc

(0.8142137, 0.71949655)

In [36]:
user_count = np.array(train.sum(axis=0)).ravel()
np.sum(user_count == 0)

33827

In [37]:
t = model.predict(item_ids=np.ones(train.shape[0]) * (train.shape[1] - 1), user_ids=np.arange(train.shape[0]), item_features=item_features)

In [38]:
t.argsort()

array([ 145,  159, 1689, ..., 2149, 1878,  375], dtype=int64)

In [39]:
def get_similar_tags(model, tag_id):
    # Define similarity as the cosine of the angle
    # between the tag latent vectors

    # Normalize the vectors to unit length
    tag_embeddings = (model.item_embeddings.T
                      / np.linalg.norm(model.item_embeddings, axis=1)).T

    query_embedding = tag_embeddings[tag_id]
    similarity = np.dot(tag_embeddings, query_embedding)
    most_similar = np.argsort(-similarity)[1:4]

    return most_similar


for tag in (u'bayesian', u'regression', u'survival'):
    tag_id = tag_labels.tolist().index(tag)
    print('Most similar tags for %s: %s' % (tag_labels[tag_id],
                                            tag_labels[get_similar_tags(model, tag_id)]))

Most similar tags for bayesian: ['mcmc' 'bayes' 'prior']
Most similar tags for regression: ['multiple-regression' 'multicollinearity' 'bic']
Most similar tags for survival: ['kaplan-meier' 'cox-model' 'logrank']
