# Задание к занятию «Гибридные рекомендатльные системы»


Описание задания:

Что делать

1. Возьмите ml-latest, предсказывайте оценку не ниже 4

2. Постройте lightFM: коллаборативные фичи + теги

3. Сравните время работы и качество (со случаем без тегов).

## lightFM: коллаборативные фичи

In [1]:
import pandas as pd
ratings = pd.read_csv("../ml-latest/ratings.csv")
movies = pd.read_csv("../ml-latest/movies.csv")
tags = pd.read_csv("../ml-latest/tags.csv",usecols=['movieId','tag'])

In [2]:
tags.fillna(value='nan', inplace=True)

In [3]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753170 entries, 0 to 753169
Data columns (total 2 columns):
movieId    753170 non-null int64
tag        753170 non-null object
dtypes: int64(1), object(1)
memory usage: 11.5+ MB


In [4]:
tags['tag'] = tags['tag'].str.upper()
tags['tag_id'] = tags['tag'].astype('category').cat.codes.copy()


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [6]:
from scipy.sparse import coo_matrix
import numpy as np

user_item_matrix = coo_matrix(
    ((ratings["rating"] >= 4).astype(np.float32),
    (ratings["userId"], ratings["movieId"])),
    shape=(
        ratings["userId"].unique().max() + 1,
        movies["movieId"].unique().max() + 1
    )
)
user_item_matrix.eliminate_zeros()

In [7]:
user_item_matrix

<270897x176280 sparse matrix of type '<class 'numpy.float32'>'
	with 12981742 stored elements in COOrdinate format>

In [8]:
import numpy as np

# делим разреженную матрицу на обучающую и тестовую
total_len = user_item_matrix.data.size
train_len = int(total_len * 0.8)
all_indices = np.arange(total_len)
np.random.seed(42)
train_indices = np.random.choice(all_indices, train_len, replace=False)
train_mask = np.in1d(all_indices, train_indices)

In [9]:
from scipy.sparse import coo_matrix

def get_masked(arr, mask):
    return coo_matrix(
        (
            [np.float32(item) for item in arr.data[mask]],
            (arr.row[mask], arr.col[mask])
        ),
        arr.shape
    )

In [10]:
train = get_masked(user_item_matrix, train_mask)
test = get_masked(user_item_matrix, ~train_mask)

In [11]:
from lightfm import LightFM

fm = LightFM()

In [12]:
%%time
fm.fit(
    interactions=train,
    epochs=100,
    num_threads=4,
    verbose=True
)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75
Epoch 76
Epoch 77
Epoch 78
Epoch 79
Epoch 80
Epoch 81
Epoch 82
Epoch 83
Epoch 84
Epoch 85
Epoch 86
Epoch 87
Epoch 88
Epoch 89
Epoch 90
Epoch 91
Epoch 92
Epoch 93
Epoch 94
Epoch 95
Epoch 96
Epoch 97
Epoch 98
Epoch 99
CPU times: user 28min 33s, sys: 2.33 s, total: 28min 36s
Wall time: 8min 16s


<lightfm.lightfm.LightFM at 0x7fde9ed37898>

In [13]:
fm.get_params()

{'loss': 'logistic',
 'learning_schedule': 'adagrad',
 'no_components': 10,
 'learning_rate': 0.05,
 'k': 5,
 'n': 10,
 'rho': 0.95,
 'epsilon': 1e-06,
 'max_sampled': 10,
 'item_alpha': 0.0,
 'user_alpha': 0.0,
 'random_state': <mtrand.RandomState at 0x7fdea14fa480>}

In [14]:
%%time
from lightfm.evaluation import reciprocal_rank

rr = reciprocal_rank(
    model=fm,
    test_interactions=test,
    train_interactions=train,
    num_threads=4
)

CPU times: user 1h 16min 37s, sys: 2.07 s, total: 1h 16min 39s
Wall time: 19min 33s


In [15]:
rr.mean()

0.22469811

In [16]:
user_factors = fm.get_user_representations()
print(user_factors)

(array([ 0.        ,  0.40534738,  0.21969379, ...,  0.59263498,
        0.07978027,  0.97354531], dtype=float32), array([[ 0.04183123, -0.0217846 ,  0.02738476, ..., -0.0416659 ,
         0.04672469,  0.00624839],
       [-0.03302612, -0.10198953, -0.07643329, ..., -0.06150615,
        -0.02338957, -0.12917863],
       [ 0.02100091, -0.06511462, -0.02798225, ...,  0.04232505,
         0.00254494, -0.06460346],
       ..., 
       [-0.17308868, -0.13930814, -0.21077126, ..., -0.01122911,
        -0.00934649, -0.18670601],
       [ 0.04174485,  0.01459875,  0.01734498, ..., -0.01305523,
         0.0275433 ,  0.02772813],
       [-0.04968318, -0.01804409, -0.11948996, ...,  0.02698112,
         0.01468431, -0.05958324]], dtype=float32))


In [17]:
item_factors = fm.get_item_representations()
print(item_factors)

(array([ 0.        ,  9.31035233,  7.69351721, ...,  0.        ,
        0.        ,  0.        ], dtype=float32), array([[ 0.03760988,  0.0270666 ,  0.00455319, ...,  0.01898395,
         0.04481839, -0.00725883],
       [-0.24212019, -0.4651542 , -0.36825463, ..., -0.00238148,
        -0.01485588, -0.68477839],
       [-0.20801087, -0.45415413, -0.44284818, ..., -0.06824353,
        -0.02801028, -0.69064969],
       ..., 
       [-0.02904287,  0.02026233,  0.00994778, ..., -0.03073787,
        -0.04166988,  0.01597854],
       [-0.04065033,  0.01923469, -0.0044485 , ..., -0.03325109,
        -0.02751566,  0.00147619],
       [ 0.0138132 , -0.00676864, -0.00542541, ..., -0.00936656,
         0.00853478,  0.04592822]], dtype=float32))


In [18]:
print(len(user_factors[0]))
print(user_factors[1].shape)

270897
(270897, 10)


In [19]:
print(len(item_factors[0]))
print(item_factors[1].shape)

176280
(176280, 10)


## lightFM: коллаборативные фичи + теги

Теги добавим в item_features.
Теоретически, их можно было бы добавить и как фичи пользователя

In [20]:
tags.head()

Unnamed: 0,movieId,tag,tag_id
0,318,NARRATED,28984
1,4306,DREAMWORKS,12796
2,89302,ENGLAND,13798
3,89302,ESPIONAGE,14113
4,89302,JAZZ,21895


In [21]:
user_item_matrix.shape

(270897, 176280)

In [22]:
from scipy.sparse import identity, hstack

item_feature_matrix = hstack([
    coo_matrix(
        (np.ones(tags.count()[0], dtype=np.float32),
        (tags["movieId"], tags["tag_id"])),
        shape=(user_item_matrix.shape[1], tags["tag_id"].unique().size)
    ),
    identity(user_item_matrix.shape[1])
])
item_feature_matrix.shape

(176280, 224233)

In [23]:
%%time
fm.fit(
    interactions=train,
    epochs=100,
    item_features=item_feature_matrix,
    num_threads=4,
    verbose=True
)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75
Epoch 76
Epoch 77
Epoch 78
Epoch 79
Epoch 80
Epoch 81
Epoch 82
Epoch 83
Epoch 84
Epoch 85
Epoch 86
Epoch 87
Epoch 88
Epoch 89
Epoch 90
Epoch 91
Epoch 92
Epoch 93
Epoch 94
Epoch 95
Epoch 96
Epoch 97
Epoch 98
Epoch 99
CPU times: user 8h 34min 36s, sys: 20.1 s, total: 8h 34min 56s
Wall time: 2h 12min 37s


<lightfm.lightfm.LightFM at 0x7fde9ed37898>

In [24]:
new_item_factors = fm.get_item_representations()
print(new_item_factors)

(array([  1.79897659e-02,   3.90410435e-07,   3.90410435e-07, ...,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00], dtype=float32), array([[ 0.01948255,  0.03034267,  0.02950883, ...,  0.04659824,
         0.0317934 , -0.01827358],
       [-0.02675867, -0.02772346,  0.02540882, ..., -0.02590871,
        -0.03519502, -0.00696808],
       [-0.03636106,  0.01241641, -0.00843739, ...,  0.00909514,
         0.01788808, -0.02646129],
       ..., 
       [-0.02371242, -0.00462511,  0.04717118, ..., -0.00732731,
        -0.0491414 ,  0.01826487],
       [ 0.01801646,  0.02623988, -0.04440532, ...,  0.00337404,
         0.02599613,  0.04141225],
       [-0.0187111 ,  0.0378853 ,  0.01825489, ...,  0.04437953,
        -0.00762609, -0.0404805 ]], dtype=float32))


In [25]:
%%time
from lightfm.evaluation import reciprocal_rank

rr = reciprocal_rank(
    model=fm,
    test_interactions=test,
    train_interactions=train,
    item_features=item_feature_matrix,
    num_threads=4
)

CPU times: user 1h 54min 55s, sys: 3.76 s, total: 1h 54min 59s
Wall time: 29min 29s


In [26]:
rr.mean()

0.17495015

## Вывод:

In [27]:
tags["tag_id"].unique().size

47953

Добавление тегов в качестве контентных фичей не улучшило качество модели - reciprocal_rank уменьшился.
Добавление 48к признаков очень сильно увеличило время работы 