In [145]:
import numpy as np
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.datasets import fetch_movielens

In [146]:
data = fetch_movielens(min_rating=5.0, indicator_features = True, genre_features = True)

In [147]:
data_train = data['train']
data_test = data['test']
data_item_features = data['item_features']

In [148]:
### Baseline: no features

In [149]:
model = LightFM(loss='warp')
%time model.fit(data_train, epochs=30, num_threads=2)
print("Train precision: %.5f" % precision_at_k(model, data_train, k=20).mean())
print("Test precision: %.5f" % precision_at_k(model, data_test, k=20).mean())

CPU times: user 678 ms, sys: 7.96 ms, total: 686 ms
Wall time: 344 ms
Train precision: 0.28505
Test precision: 0.03600


In [150]:
### Adding Item features

In [151]:
model_item_features = LightFM(loss='warp')
%time model_item_features.fit(data_train, item_features= data_item_features, epochs=30, num_threads=2)
print("Train precision: %.5f" % precision_at_k(model_item_features, data_train, item_features= data_item_features, k=20).mean())
print("Test precision: %.5f" % precision_at_k(model_item_features, data_test, item_features= data_item_features, k=20).mean())

CPU times: user 989 ms, sys: 14 µs, total: 989 ms
Wall time: 497 ms
Train precision: 0.30115
Test precision: 0.03573


In [152]:
isbetter = precision_at_k(model, data_test, k=20).mean() < precision_at_k(model_item_features, data_test, item_features= data_item_features, k=20).mean()
print("Is the model with feature better than the baseline? %s" % isbetter)

Is the model with feature better than the baseline? False


In [153]:
### Separate from unknown and known user to test if cold start effect is mitigated 
# unknown user: user that do not have interaction in train
# known user: the opposite

In [154]:
import pandas as pd
data_train_df = pd.DataFrame(data_train.toarray())

In [155]:
unknown_users = data_train_df[(data_train_df==0).all(axis=1)]
unknown_users = list(unknown_users.index.values)

In [156]:
known_users = data_train_df[(data_train_df==5).any(axis=1)]
known_users = list(known_users.index.values)

In [157]:
data_test_df = pd.DataFrame(data_test.toarray())
data_test_df_known_users = data_test_df[data_test_df.index.isin(known_users)]
data_test_df_known_users = data_test_df_known_users.reindex(list(range(data_train.shape[0])), fill_value=0.0)

In [158]:
data_test_df_unknown_users = data_test_df[data_test_df.index.isin(unknown_users)]
data_test_df_unknown_users = data_test_df_unknown_users.reindex(list(range(data_train.shape[0])), fill_value=0.0)

In [159]:
### Baseline: no features
print("Test unknown precision: %.5f" % precision_at_k(model, coo_matrix(data_test_df_unknown_users), k=20).mean())
print("Test known precision: %.5f" % precision_at_k(model, coo_matrix(data_test_df_unknown_users), k=20).mean())

Test unknown precision: 0.02667
Test known precision: 0.02667


In [160]:
### Adding Item features
model_item_features = LightFM(loss='warp')
%time model_item_features.fit(data_train, item_features= data_item_features, epochs=30, num_threads=2)
print("Train precision: %.5f" % precision_at_k(model_item_features, data_train, item_features= coo_matrix(data_item_features), k=20).mean())
print("Test unknown precision: %.5f" % precision_at_k(model_item_features, coo_matrix(data_test_df_unknown_users), item_features= coo_matrix(data_item_features), k=20).mean())
print("Test known precision: %.5f" % precision_at_k(model_item_features, coo_matrix(data_test_df_unknown_users), item_features= coo_matrix(data_item_features), k=20).mean())

CPU times: user 1 s, sys: 200 µs, total: 1 s
Wall time: 504 ms
Train precision: 0.29682
Test unknown precision: 0.02333
Test known precision: 0.02333


In [161]:
isbetter = precision_at_k(model, coo_matrix(data_test_df_unknown_users), k=20).mean() < precision_at_k(model_item_features, coo_matrix(data_test_df_unknown_users), item_features= coo_matrix(data_item_features), k=20).mean()
print("Is the model with feature better than the baseline for unknown users? %s" % isbetter)

Is the model with feature better than the baseline for unknown users? False


In [162]:
### Adding User features

In [163]:
users_features = pd.read_csv('u.user', delimiter='|', names= ['age', 'sex', 'profession', 'zipcode'])
users_features_dummies = pd.get_dummies(users_features)

In [164]:
import scipy.sparse as sp

eye = sp.eye(data_train.shape[0], data_train.shape[0]).tocsr()
user_features_matrix = sp.hstack((eye, users_features_dummies))
user_features_matrix = user_features_matrix.tocsr().astype(np.float32)
user_features_matrix_coo = coo_matrix(user_features_matrix)

In [165]:
# Showing the size of all the matrix not to get lost

In [166]:
print("users x items: " + repr(data_train))
print("known users x items: " + repr(coo_matrix(data_test_df_known_users)))
print("unknown users x items: " + repr(coo_matrix(data_test_df_unknown_users)))
print("items x features_items: " + repr(coo_matrix(data_item_features)))
print("users x features_users: " + repr(user_features_matrix_coo))

users x items: <943x1682 sparse matrix of type '<class 'numpy.float32'>'
	with 19048 stored elements in COOrdinate format>
known users x items: <943x1682 sparse matrix of type '<class 'numpy.float64'>'
	with 2128 stored elements in COOrdinate format>
unknown users x items: <943x1682 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in COOrdinate format>
items x features_items: <1682x1701 sparse matrix of type '<class 'numpy.float32'>'
	with 4575 stored elements in COOrdinate format>
users x features_users: <943x1762 sparse matrix of type '<class 'numpy.float32'>'
	with 4715 stored elements in COOrdinate format>


In [167]:
model_item_users_features = LightFM(loss='warp')
%time model_item_users_features.fit(data_train, item_features= coo_matrix(data_item_features), user_features=user_features_matrix_coo, epochs=30, num_threads=2)
print("Train precision: %.5f" % precision_at_k(model_item_users_features, data_train, item_features= coo_matrix(data_item_features), user_features=user_features_matrix_coo, k=20).mean())
print("Test unknown precision: %.5f" % precision_at_k(model_item_users_features, coo_matrix(data_test_df_unknown_users), item_features= coo_matrix(data_item_features), user_features=user_features_matrix_coo, k=20).mean())
print("Test known precision: %.5f" % precision_at_k(model_item_users_features, coo_matrix(data_test_df_unknown_users), item_features= coo_matrix(data_item_features), user_features=user_features_matrix_coo, k=20).mean())

CPU times: user 1.19 s, sys: 7.97 ms, total: 1.19 s
Wall time: 599 ms
Train precision: 0.12519
Test unknown precision: 0.00667
Test known precision: 0.00667


In [171]:
isbetter = precision_at_k(model, coo_matrix(data_test_df_unknown_users), k=20).mean() < precision_at_k(model_item_users_features, coo_matrix(data_test_df_unknown_users), item_features= coo_matrix(data_item_features), user_features=user_features_matrix_coo, k=20).mean()
print("Is the model with feature user-item better than the baseline for unknown users? %s" % isbetter)

Is the model with feature user-item better than the baseline for unknown users? False


In [175]:
# Showing some films for unknown users
def sample_recommendation(model, data, user_id):
    
    print(users_features[user_id-1: user_id])
    n_users, n_items = data['train'].shape
    
    known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]
    scores = model.predict(user_id, np.arange(n_items))
    top_items = data['item_labels'][np.argsort(-scores)]
    
    print("     Known positives:")
    for x in known_positives[:3]:
        print("        %s" % x)

    print("     Recommended:")
    for x in top_items[:3]:
        print("        %s" % x)

In [176]:
sample_recommendation(model, data, 899) 
sample_recommendation(model_item_users_features, data, 899) 

     age sex profession zipcode
899   32   M      other   55116
     Known positives:
     Recommended:
        English Patient, The (1996)
        Star Wars (1977)
        Titanic (1997)
     age sex profession zipcode
899   32   M      other   55116
     Known positives:
     Recommended:
        Titanic (1997)
        Fargo (1996)
        Pulp Fiction (1994)
