В этом задании мы найдем похожие фильмы и пользователей по алгоритму ALS, реализуем подсчет метрики NDCG и исследуем влияние размерности скрытых представлений на работу алгоритма.

Загрузим данные и модели из семинара:

**Важно: не изменяйте код до задания 1!**

In [None]:
import zipfile
from collections import defaultdict, Counter
import datetime

from scipy import linalg
import numpy as np

In [None]:
# Download the MovieLens 1M archive; it is saved as ml-1m.zip in the working directory.
!wget http://files.grouplens.org/datasets/movielens/ml-1m.zip

--2021-05-18 06:19:41--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2021-05-18 06:19:41 (18.4 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]



In [None]:
# Load the MovieLens 1M archive into three in-memory lookups.
movies = {}  # MovieID -> {"Title": str, "Genres": list of str}
users = {}  # UserID -> profile fields other than the id
ratings = defaultdict(list)  # UserID -> [(MovieID, Rating, rating time), ...]

with zipfile.ZipFile("ml-1m.zip", "r") as archive:
    # movies.dat rows: MovieID::Title::Genres, genres separated by "|"
    with archive.open("ml-1m/movies.dat") as movies_file:
        for raw in movies_file:
            movie_id, title, genres = raw.decode('iso-8859-1').strip().split("::")
            movie_key = int(movie_id)
            movies[movie_key] = {"Title": title, "Genres": genres.split("|")}

    # users.dat rows: UserID::Gender::Age::Occupation::Zip-code
    with archive.open("ml-1m/users.dat") as users_file:
        fields = ["UserID", "Gender", "Age", "Occupation", "Zip-code"]
        for raw in users_file:
            values = raw.decode('iso-8859-1').strip().split("::")
            # Everything after the id becomes the profile; only Occupation is
            # converted to int, the other fields stay strings.
            profile = dict(zip(fields[1:], values[1:]))
            profile["Occupation"] = int(profile["Occupation"])
            users[int(values[0])] = profile

    # ratings.dat rows: UserID::MovieID::Rating::Timestamp
    with archive.open("ml-1m/ratings.dat") as ratings_file:
        for raw in ratings_file:
            user_id, movie_id, stars, stamp = raw.decode('iso-8859-1').strip().split("::")
            user_key = int(user_id)
            movie_key = int(movie_id)
            score = int(stars)
            rated_at = datetime.datetime.fromtimestamp(int(stamp))
            ratings[user_key].append((movie_key, score, rated_at))

In [None]:
# Chronological split: the rating time at the 80% position of all sorted
# rating times is the cut-off; ratings at or before it go to train, later
# ones to test.
times = sorted(stamp for history in ratings.values() for _, _, stamp in history)
threshold_time = times[int(0.8 * len(times))]

train, test = [], []
for user_id, user_ratings in ratings.items():
    for movie_id, stars, stamp in user_ratings:
        # Ratings are rescaled by 1/5 before being stored.
        sample = (user_id, movie_id, stars / 5.0)
        if stamp <= threshold_time:
            train.append(sample)
        else:
            test.append(sample)

print("ratings in train:", len(train))
print("ratings in test:", len(test))

ratings in train: 800168
ratings in test: 200041


In [None]:
# Group the (user, item, score) triples: per user for both splits,
# per item for the train split only.
train_by_user = defaultdict(list)
train_by_item = defaultdict(list)
for user, item, score in train:
    train_by_user[user].append((item, score))
    train_by_item[item].append((user, score))

test_by_user = defaultdict(list)
for user, item, score in test:
    test_by_user[user].append((item, score))

# The factor matrices are indexed directly by these ids, so each needs
# max(id) + 1 rows.
n_users = max(user for user, _, _ in train) + 1
n_items = max(item for _, item, _ in train) + 1

In [None]:
# ALS implementation from the seminar
# Fix the seed so the random initialisation below is reproducible.
np.random.seed(0)
LATENT_SIZE = 10  # number of columns in p and q
N_ITER = 20  # number of passes made by the training loop

# regularization coefficients (lambda_p is used in compute_p, lambda_q in compute_q)
lambda_p = 0.2
lambda_q = 0.001

# latent representations: one row per user id / item id, values drawn from [0, 0.1)
# NOTE: p must be drawn before q to keep the same random initialisation.
p = 0.1 * np.random.random((n_users, LATENT_SIZE))
q = 0.1 * np.random.random((n_items, LATENT_SIZE))


def compute_p(p, q, train_by_user, reg=None):
    """Re-fit every user's latent vector with the item factors held fixed.

    For each user the regularized normal equations
    ``(reg * n_rated * I + Q^T Q) x = Q^T r`` are solved, where ``Q`` stacks
    the rows of ``q`` for the items the user rated and ``r`` holds the
    matching scores.

    Args:
        p: 2-D array of user factors; the rows of users present in
            ``train_by_user`` are overwritten in place.
        q: 2-D array of item factors, indexed by item id.
        train_by_user: mapping ``user -> [(item, score), ...]``.
        reg: regularization coefficient. ``None`` (the default) falls back
            to the module-level ``lambda_p``.

    Returns:
        The same ``p`` array, updated in place.
    """
    strength = lambda_p if reg is None else reg
    # Take the latent size from q itself instead of the global LATENT_SIZE so
    # the function keeps working when factors of another width are passed in.
    identity = np.eye(q.shape[1])
    for u, rated in train_by_user.items():
        rated_items = [i for i, _ in rated]
        rated_scores = np.array([r for _, r in rated])
        Q = q[rated_items, :]
        lhs = strength * len(rated_items) * identity + Q.T.dot(Q)
        p[u, :] = np.linalg.solve(lhs, Q.T.dot(rated_scores))
    return p

def compute_q(p, q, train_by_item, reg=None):
    """Re-fit every item's latent vector with the user factors held fixed.

    For each item the regularized normal equations
    ``(reg * n_raters * I + P^T P) x = P^T r`` are solved, where ``P`` stacks
    the rows of ``p`` for the users who rated the item and ``r`` holds the
    matching scores.

    Args:
        p: 2-D array of user factors, indexed by user id.
        q: 2-D array of item factors; the rows of items present in
            ``train_by_item`` are overwritten in place.
        train_by_item: mapping ``item -> [(user, score), ...]``.
        reg: regularization coefficient. ``None`` (the default) falls back
            to the module-level ``lambda_q``.

    Returns:
        The same ``q`` array, updated in place.
    """
    strength = lambda_q if reg is None else reg
    # Take the latent size from p itself instead of the global LATENT_SIZE so
    # the function keeps working when factors of another width are passed in.
    identity = np.eye(p.shape[1])
    for i, rated in train_by_item.items():
        rated_users = [j for j, _ in rated]
        rated_scores = np.array([s for _, s in rated])
        P = p[rated_users, :]
        lhs = strength * len(rated_users) * identity + P.T.dot(P)
        q[i, :] = np.linalg.solve(lhs, P.T.dot(rated_scores))
    return q

def train_error_mse(predictions):
    """Return the mean squared error of ``predictions`` on the train triples.

    ``predictions`` is indexed as ``predictions[user, item]``; the
    ``(user, item, score)`` triples are read from the module-level ``train``.
    """
    squared_errors = []
    for user, item, target in train:
        residual = predictions[user, item] - target
        squared_errors.append(residual ** 2)
    return np.mean(squared_errors)

def test_error_mse(predictions):
    """Return the mean squared error of ``predictions`` on the test triples.

    ``predictions`` is indexed as ``predictions[user, item]``; the
    ``(user, item, score)`` triples are read from the module-level ``test``.
    """
    squared_errors = []
    for user, item, target in test:
        residual = predictions[user, item] - target
        squared_errors.append(residual ** 2)
    return np.mean(squared_errors)


# Alternate the two least-squares updates N_ITER times and report the
# train / test MSE after every pass.
# The loop variable is named `epoch` because `iter` would shadow the builtin
# for every later cell of the notebook.
for epoch in range(N_ITER):
    p = compute_p(p, q, train_by_user)
    q = compute_q(p, q, train_by_item)

    # Score matrix indexed as predictions[user, item].
    predictions = p.dot(q.T)

    print(epoch, train_error_mse(predictions), test_error_mse(predictions))

0 0.03425406699095001 0.16161048497212951
1 0.030645740984182004 0.15155084906221652
2 0.02704533432715112 0.1438473404049406
3 0.025813288873051222 0.13697314498990507
4 0.025347613143060384 0.13077566964080353
5 0.025096380135403478 0.12524794035311046
6 0.024934047526840687 0.1203100891656011
7 0.024820279964542055 0.11587970123247354
8 0.024737480905353878 0.11188957847429631
9 0.02467735003476034 0.10828592317903525
10 0.024634483994446357 0.10502502426863121
11 0.024604361404763436 0.1020701490855293
12 0.024583346331205878 0.09938950190571309
13 0.0245687550997932 0.09695506282023518
14 0.024558698531058906 0.09474199207447905
15 0.024551877533063884 0.09272824318660156
16 0.02454739123798564 0.09089423607528803
17 0.02454460512475215 0.08922255977615282
18 0.02454306682449279 0.08769769701279079
19 0.024542448316282727 0.08630578168734003


## Задание 1

Для фильма "Star Wars: Episode V - The Empire Strikes Back (1980)" найдите 3 самых похожих фильма: 
* посчитайте скалярное произведение его эмбеддинга с остальными фильмами;
* найдите максимальные значения - они будут соответствовать ближайшим фильмам;
* вычислите значение id_top1+id_top2+id_top3.

Для решения задания вам пригодится словарь со всеми фильмами `movies`

In [None]:
# Look up the target movie by its MovieID.
movies[1196]

{'Genres': ['Action', 'Adventure', 'Drama', 'Sci-Fi', 'War'],
 'Title': 'Star Wars: Episode V - The Empire Strikes Back (1980)'}

In [None]:
# Count the MovieIDs smaller than 1196.
# NOTE: this count is not a row index of q; q rows are addressed by MovieID.
len([movie_id for movie_id in movies if movie_id < 1196])

1178

In [None]:
# MovieID stored at position 1178 of the `movies` keys.
list(movies)[1178]

1196

In [None]:
# Embedding of MovieID 1196. compute_q writes row q[i] for MovieID i, so the
# row index is the id itself, not the position of the id among movies' keys.
# NOTE: the recorded output was produced from another row; re-run to refresh.
q[1196]

array([0.16272855, 1.26614514, 0.60872235, 0.69415775, 1.03401425,
       1.31589285, 1.53875851, 1.4302255 , 1.29870737, 1.13913421])

In [None]:
# Dot product of movie 1196's embedding with every row of q.
# q rows are addressed by MovieID, hence q[1196].
scalarmult = q[1196].dot(q.T)

In [None]:
# Same similarities written as a matrix-vector product.
# q rows are addressed by MovieID, hence q[1196].
scalarmult1 = q.dot(q[1196])

In [None]:
# Sanity check: there is one similarity score per row of q.
len(scalarmult)

3953

In [None]:
# Top-3 movies most similar to MovieID 1196 by dot product of embeddings.
# q rows are addressed by MovieID, so the query row is q[QUERY_MOVIE_ID].
# The scores are computed here so this cell does not depend on which row an
# earlier cell used.
QUERY_MOVIE_ID = 1196
similarity = q.dot(q[QUERY_MOVIE_ID])

# The task compares the movie with the *other* movies, so drop the movie
# itself. -inf (unlike 0) can never win, even if every score is negative.
similarity[QUERY_MOVIE_ID] = -np.inf

# A stable sort keeps the lowest index first among equal scores.
top3 = np.argsort(-similarity, kind="stable")[:3]
for movie_id in top3:
    print(movie_id)

# Not named `sum`: that would shadow the builtin for every later cell.
id_sum = int(top3.sum())
print(id_sum)
# NOTE: the recorded output was produced from another query row; re-run to refresh.

1420
1471
557
3448


## Задание 2

Для пользователя с ID=5472:

* Найдите самого похожего, аналогично предыдущему заданию;
* Определите количество фильмов, просмотренных обоими пользователями.

In [None]:
# Profile of the query user, looked up by UserID.
users[5472]

{'Age': '35', 'Gender': 'M', 'Occupation': 1, 'Zip-code': '27909'}

In [None]:
# Dot product of user 5472's embedding with every row of p.
# compute_p writes row p[u] for UserID u, so the row index is the id itself
# (5472), not id - 1.
QUERY_USER_ID = 5472
scalar_mult_user = list(p[QUERY_USER_ID].dot(p.transpose()))

# Exclude the user from their own neighbours. -inf (unlike 0) can never win
# an argmax, even if every other score is negative.
scalar_mult_user[QUERY_USER_ID] = -np.inf

In [None]:
# Display the per-user similarity scores.
scalar_mult_user

In [None]:
# Sanity check: there is one similarity score per row of p.
len(scalar_mult_user)

6041

In [None]:
# Index of the largest similarity score, i.e. the row of p closest to the query.
# NOTE(review): the recorded output may be stale; re-run after changing the cells above.
np.argmax(scalar_mult_user)

5471

In [None]:
# Profile of user 5670, the neighbour id used in the next cell.
users[5670]

{'Age': '18', 'Gender': 'M', 'Occupation': 4, 'Zip-code': '48109'}

In [None]:
# Number of movies watched by BOTH users: the size of the overlap of their
# MovieID sets. Adding the two list lengths would count every movie either
# user rated, shared or not.
# NOTE(review): 5670 is hard-coded from an earlier run; confirm it against
# the argmax cell. The recorded outputs (389, and 342 for neighbour 5471)
# come from the old sum of list lengths and must be regenerated.
NEIGHBOUR_ID = 5670
seen_by_query = {movie_id for movie_id, _, _ in ratings[5472]}
seen_by_neighbour = {movie_id for movie_id, _, _ in ratings[NEIGHBOUR_ID]}
len(seen_by_query & seen_by_neighbour)

389

## Задание 3

На лекции была рассмотрена метрика для измерения качества работы рекомендательной системы NDCG. Вам необходимо реализовать подсчет DCG и NDCG и вывести значения из клетки ниже; ответ округлите до тысячных.

In [None]:
def DCG_k(ratings_list, k, exponential_gain=False):
    """Return the discounted cumulative gain of the first ``k`` ratings.

    ``DCG@k = sum_{i=1..k} gain(rel_i) / log2(i + 1)``, where ``rel_i`` is
    the rating at 1-based position ``i`` in the order given.

    Args:
        ratings_list: ratings in the order the items are shown
            (list or ``np.array`` of shape ``(n_items,)``).
        k: number of leading positions to score; values larger than the
            list length use the whole list.
        exponential_gain: if true, use ``2 ** rel - 1`` as the gain instead
            of ``rel``.
            NOTE(review): the linear default matches scikit-learn's
            ``dcg_score``; confirm which gain the lecture uses.

    Returns:
        The DCG as a float; ``0.0`` for an empty list or ``k == 0``.

    Raises:
        ValueError: if ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")
    relevance = np.asarray(ratings_list, dtype=float)[:k]
    if relevance.size == 0:
        return 0.0
    gains = np.exp2(relevance) - 1.0 if exponential_gain else relevance
    # Positions 1..n are discounted by log2(2)..log2(n + 1).
    discounts = np.log2(np.arange(2, relevance.size + 2))
    # np.sum rather than the builtin, which a notebook cell may have rebound.
    return float(np.sum(gains / discounts))


def NDCG_k(r, k, exponential_gain=False):
    """Return DCG@k of ``r`` divided by the DCG@k of its ideal ordering.

    Args:
        r: ratings in the order the items are shown
            (list or ``np.array`` of shape ``(n_items,)``).
        k: number of leading positions to score.
        exponential_gain: forwarded to ``DCG_k``.

    Returns:
        A float in ``[0, 1]`` for non-negative ratings; ``0.0`` when the
        ideal DCG is zero (nothing relevant to rank).
    """
    ideal = DCG_k(sorted(r, reverse=True), k, exponential_gain)
    if ideal == 0.0:
        return 0.0
    return DCG_k(r, k, exponential_gain) / ideal

NDCG_k([5, 5, 4, 5, 2, 4, 5, 3, 5, 5, 2, 3, 0, 0, 1, 2, 2, 3, 0], 5)

In [None]:
from sklearn.metrics import dcg_score, ndcg_score

In [None]:
# Cross-check NDCG@5 with scikit-learn.
# ndcg_score(y_true, y_score) ranks the items by y_score and reads the gains
# from y_true. The ratings are therefore the true relevances, and the order
# in which they are shown is encoded as strictly decreasing scores.
shown_ratings = [5, 5, 4, 5, 2, 4, 5, 3, 5, 5, 2, 3, 0, 0, 1, 2, 2, 3, 0]
y_true = np.asarray([shown_ratings])
print(y_true)
y_score = np.asarray([np.arange(len(shown_ratings), 0, -1)])
ndcg_score(y_true, y_score, k=5)
# NOTE: the recorded output came from swapped arguments; re-run to refresh.

[[5 5 5 5 5 5 4 4 3 3 3 2 2 2 2 1 0 0 0]]


0.8333333333333333