# **Милютина Лилия Александровна**

In [1]:
import numpy as np
import pandas as pd
import math
from collections import defaultdict

from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split

from keras import backend as K
from keras.layers import Activation, Dense, Input, Subtract
from keras.models import Model

import warnings
warnings.filterwarnings("ignore")

# Метрики

In [2]:
# MAP

def calculate_MAP(n, true, pred):
    m = len(true)
    pred = pred[:n]
    
    correct = 0
    precision = []
    for i, el in enumerate(pred):
        if el in true:
            correct += 1
            
        pr = correct / (i + 1)
        precision.append(pr)
    result = np.sum(precision) / m
    return result

In [3]:
# MRR

def calculate_MRR(true, pred):
    
    result = []
    u = len(true)
    for i, user_recom in enumerate(pred):
        
        rank = 0
        for j, el in enumerate(user_recom):
            if el in true[i]:
                rank = j
                break
        result.append(1 / (j+1))
    return np.sum(result) / n

In [4]:
# nDCG

def calculate_nDCG(n, true, pred):
    pred = pred[:n]
    idcg = 0
    dcg = 0
    for k in range(len(pred)):
        idcg_cur = 1 / (np.log2(k+1) + 1)
        idcg += idcg_cur
        
        if pred[k] in true:
            rel = 1 * idcg_cur
        else:
            rel = 0
        
        dcg += rel
    
    return dcg / idcg

**Посмотрим на простых примерах, что метрики работают корректно**

In [5]:
true = ["a", "b", "c"]
pred = ["a", "b", "c"]
n = 3

map_metric = calculate_MAP(n, true, pred)
ngcg_metric = calculate_nDCG(n, true, pred)
mrr_metric = calculate_MRR([true], [pred])

print(f'MAP: {map_metric}, MRR: {mrr_metric}, nDCG: {ngcg_metric}')

MAP: 1.0, MRR: 0.3333333333333333, nDCG: 1.0


In [6]:
true = ["a", "b", "c"]
pred = ["k", "b", "c"]
n = 3

map_metric = calculate_MAP(n, true, pred)
ngcg_metric = calculate_nDCG(n, true, pred)
mrr_metric = calculate_MRR([true], [pred])

print(f'MAP: {map_metric}, MRR: {mrr_metric}, nDCG: {ngcg_metric}')

MAP: 0.38888888888888884, MRR: 0.16666666666666666, nDCG: 0.47001695300989266


In [7]:
true = ["a", "b", "c"]
pred = ["k", "l", "c"]
n = 3

map_metric = calculate_MAP(n, true, pred)
ngcg_metric = calculate_nDCG(n, true, pred)
mrr_metric = calculate_MRR([true], [pred])

print(f'MAP: {map_metric}, MRR: {mrr_metric}, nDCG: {ngcg_metric}')

MAP: 0.1111111111111111, MRR: 0.1111111111111111, nDCG: 0.205025429514839


*Значения метрик совпадают с теоретически рассчитанными*

# **LambdaRanker**

Строится полносвязная нейронная сетка, которая производит listwise ранжирование, оптимизируя nGCG метрику

In [60]:
class LambdaRanker(object):

    def __init__(self, input_size, hidden_layer_sizes=(100,), activation=('relu',), solver='adam', n=None):
        self.model = self._build_model(input_size, hidden_layer_sizes, activation)
        self.model.compile(optimizer=solver, loss='binary_crossentropy')
        self.n = n

    @staticmethod
    def _build_model(input_shape, hidden_layer_sizes, activation):
        """
        Строим сетку.
        """
        
        # Добавляем полносвязные слои с нелинейностями
        hidden_layers = []
        for i in range(len(hidden_layer_sizes)):
            name = str(activation[i]) + '_layer' + str(i)
            hidden_layers.append(Dense(hidden_layer_sizes[i], activation=activation[i], name=name))
                                 
        h0 = Dense(1, activation='linear', name='Identity_layer')
        input1 = Input(shape=(input_shape,), name='Input_layer1')
        input2 = Input(shape=(input_shape,), name='Input_layer2')
        
        # Пропускаем через сеть
        x1 = input1
        x2 = input2
        for i in range(len(hidden_layer_sizes)):
            x1 = hidden_layers[i](x1)
            x2 = hidden_layers[i](x2)
                                 
        x1 = h0(x1)
        x2 = h0(x2)
                                 
        subtracted = Subtract(name='Subtract_layer')([x1, x2])
        out = Activation('sigmoid', name='Activation_layer')(subtracted)
        
        # Собираем итоговую модель
        model = Model(inputs=[input1, input2], outputs=out)
        return model

    def _fetch_qid_data(self, y, qid):
                                 
        """
        Извлекаем индексы, релевантные запросы, метрики.
        """
        qid_unique, qid2indices, qid_inverse_indices = np.unique(qid, return_index=True, return_inverse=True)
                                 
        # Получим релевантные запросы.
        qid2rel = [[] for _ in range(len(qid_unique))]
        for i, qid_unique_index in enumerate(qid_inverse_indices):
            qid2rel[qid_unique_index].append(y[i])
        
        # Посчитаем метрики
        qid2dcg = [self._CalcDCG(qid2rel[i]) for i in range(len(qid_unique))]
        qid2idcg = [self._CalcDCG(sorted(qid2rel[i], reverse=True)) for i in range(len(qid_unique))]
        return qid2indices, qid2rel, qid2idcg, qid2dcg


    def fit(self, X, y, qid, batch_size=None, epochs=1, verbose=1, validation_split=0.0):                  
        """
        Фитим модель.
        """
                                 
        X1_trans, X2_trans, y_trans, weight = self._transform_pairwise(X, y, qid)
        self.model.fit([X1_trans, X2_trans], y_trans, sample_weight=weight, batch_size=batch_size, epochs=epochs,
                       verbose=verbose, validation_split=validation_split)
        return self.evaluate(X, y, qid)

    def predict(self, X):
        """
        Делаем предсказания.
        """
        ranker_output = K.function([self.model.layers[0].input], [self.model.layers[-3].get_output_at(0)])
        return ranker_output([X])[0].ravel()

    def evaluate(self, X, y, qid):
        """
        Считаем метрику nDCG.
        """
        y_pred = self.predict(X)
        tmp = np.array(np.hstack([y.reshape(-1, 1), y_pred.reshape(-1, 1), qid.reshape(-1, 1)]))
        tmp = tmp[np.lexsort((-tmp[:, 1], tmp[:, 2]))]
        y_sorted = tmp[:, 0]
        qid_sorted = tmp[:, 2]
        ndcg = self._EvalNDCG(y_sorted, qid_sorted)
        return ndcg
            
    @staticmethod
    def _CalcDCG(labels):
        """
        Считаем DCG.
        """
        
        sumdcg = 0.0
        for i in range(len(labels)):
            rel = labels[i]
            if rel != 0:
                sumdcg += ((2 ** rel) - 1) / math.log2(i + 2)
        return sumdcg

    def _EvalNDCG(self, y, qid):
        """
        Метод для расчета nDCG на предикте.
        """
        _, _, qid2idcg, qid2dcg = self._fetch_qid_data(y, qid)
        sumndcg = 0
        count = 0.0
        for qid_unique_idx in range(len(qid2idcg)):
            count += 1
            if qid2idcg[qid_unique_idx] == 0:
                continue
            idcg = qid2idcg[qid_unique_idx]
            dcg = qid2dcg[qid_unique_idx]
            sumndcg += dcg / idcg
        return sumndcg / count

    def _transform_pairwise(self, X, y, qid):
        """
        Разбиваем на пары для ранжирования.
        """
        qid2indices, qid2rel, qid2idcg, _ = self._fetch_qid_data(y, qid)
        X1 = []
        X2 = []
        weight = []
        Y = []
        for qid_unique_idx in range(len(qid2indices)):
            if qid2idcg[qid_unique_idx] == 0:
                continue
            IDCG = 1.0 / qid2idcg[qid_unique_idx]
            rel_list = qid2rel[qid_unique_idx]
            qid_start_idx = qid2indices[qid_unique_idx]
            for pos_idx in range(len(rel_list)):
                for neg_idx in range(len(rel_list)):
                    if rel_list[pos_idx] <= rel_list[neg_idx]:
                        continue
                    # calculate lambda
                    pos_loginv = 1.0 / math.log2(pos_idx + 2)
                    neg_loginv = 1.0 / math.log2(neg_idx + 2)
                    pos_label = int(rel_list[pos_idx])
                    neg_label = int(rel_list[neg_idx])
                    original = ((1 << pos_label) - 1) * pos_loginv + ((1 << neg_label) - 1) * neg_loginv
                    changed = ((1 << neg_label) - 1) * pos_loginv + ((1 << pos_label) - 1) * neg_loginv
                    delta = (original - changed) * IDCG
                    if delta < 0:
                        delta = -delta
                    # balanced class
                    if 1 != (-1) ** (qid_unique_idx + pos_idx + neg_idx):
                        X1.append(X[qid_start_idx + pos_idx])
                        X2.append(X[qid_start_idx + neg_idx])
                        weight.append(delta)
                        Y.append(1)
                    else:
                        X1.append(X[qid_start_idx + neg_idx])
                        X2.append(X[qid_start_idx + pos_idx])
                        weight.append(delta)
                        Y.append(0)
        return np.asarray(X1), np.asarray(X2), np.asarray(Y), np.asarray(weight)


# Тестирование на MQ2007 и MQ2008

**Тестирование на датасете MQ2007**

In [9]:
X_train, y_train, query_train = load_svmlight_file('mq2007/train.txt', query_id=True)
X_val, y_val, query_val  = load_svmlight_file('mq2007/vali.txt', query_id=True)

X_train = np.array(X_train.todense(), dtype=np.float32)
X_val = np.array(X_val.todense(), dtype=np.float32)

*Посмотрим на данные*

In [10]:
X_train

array([[0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      ],
       [0.03131 , 0.666667, 0.5     , ..., 0.333333, 0.448276, 0.      ],
       [0.078682, 0.166667, 0.5     , ..., 0.833333, 0.678161, 0.      ],
       ...,
       [0.762295, 0.      , 0.      , ..., 0.5     , 0.686275, 0.      ],
       [0.02459 , 0.      , 0.      , ..., 0.5     , 0.352941, 0.      ],
       [0.663934, 0.      , 0.      , ..., 0.5     , 0.431373, 0.      ]],
      dtype=float32)

In [11]:
np.unique(y_train)

array([0., 1., 2.])

In [12]:
np.unique(query_train)

array([  10,   15,   33, ..., 5989, 5996, 6000])

In [13]:
# Обучим модель и сделаем предсказания

n = 5
ranker = LambdaRanker(input_size=X_train.shape[1], hidden_layer_sizes=(16,8,), activation=('relu', 'relu',), solver='adam', n=n)
ngcg_train = ranker.fit(X_train, y_train, query_train, epochs=5)
y_pred = ranker.predict(X_train)
y_val_pred = ranker.predict(X_val)
ngcg_val = ranker.evaluate(X_val, y_val, query_val)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [14]:
print('Metrics on train:')
map_metric = calculate_MAP(n, y_train, y_pred)
mrr_metric = calculate_MRR([y_train], [y_pred])
print(f"nGCG@5: {round(ngcg_train, 2)}")

print('Metrics on validation:')
map_metric = calculate_MAP(n, y_val, y_val_pred)
mrr_metric = calculate_MRR([y_val], [y_val_pred])
print(f"nGCG@5: {round(ngcg_train, 2)}")

Metrics on train:
nGCG@5: 0.42
Metrics on validation:
nGCG@5: 0.42


**Тестирование на датасете MQ2008**

In [15]:
X_train, y_train, query_train = load_svmlight_file('mq2008/train.txt', query_id=True)
X_val, y_val, query_val  = load_svmlight_file('mq2008/vali.txt', query_id=True)

X_train = np.array(X_train.todense(), dtype=np.float32)
X_val = np.array(X_val.todense(), dtype=np.float32)

In [16]:
# Обучим модель и сделаем предсказания

ranker = LambdaRanker(input_size=X_train.shape[1], hidden_layer_sizes=(16,8,), activation=('relu', 'relu',), solver='adam', n=n)
ngcg_train = ranker.fit(X_train, y_train, query_train, epochs=5)
y_pred = ranker.predict(X_train)
y_val_pred = ranker.predict(X_val)
ngcg_val = ranker.evaluate(X_val, y_val, query_val)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
print('Metrics on train:')
map_metric = calculate_MAP(n, y_train, y_pred)
mrr_metric = calculate_MRR([y_train], [y_pred])
print(f"nGCG@5: {round(ngcg_train, 2)}")

print('Metrics on validation:')
map_metric = calculate_MAP(n, y_val, y_val_pred)
mrr_metric = calculate_MRR([y_val], [y_val_pred])
print(f"nGCG@5: {round(ngcg_train, 2)}")

Metrics on train:
nGCG@5: 0.47
Metrics on validation:
nGCG@5: 0.47


# **Тестирование на датасете MovieLens**

In [72]:
# Читаем данные
ratings = pd.read_csv("movielens/ratings.csv")
ratings = ratings.iloc[:1000, :]
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [73]:
movies = pd.read_csv("movielens/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [74]:
links = pd.read_csv("movielens/links.csv")
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [75]:
all_info = pd.merge(ratings, links, on="movieId")

In [76]:
all_info

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,1,4.0,964982703,114709,862.0
1,5,1,4.0,847434962,114709,862.0
2,7,1,4.5,1106635946,114709,862.0
3,1,3,4.0,964981247,113228,15602.0
4,6,3,5.0,845554296,113228,15602.0
...,...,...,...,...,...,...
995,7,8972,2.5,1111552650,368891,2059.0
996,7,8984,4.0,1106636101,349903,163.0
997,7,27741,3.0,1107474020,351817,12496.0
998,7,30812,3.5,1113353083,338751,2567.0


In [77]:
# Разбиваем на train и test

trainset, testset = train_test_split(all_info, test_size=0.3)

In [78]:
# Собираем query как id user-ов

query_train = np.array(trainset["userId"])
query_val = np.array(testset["userId"])

In [79]:
# Собираем y из rating: 5 - 0, 4 - 1, 3, 2, 1 - 2

def get_y_from_dataset(dataset):
    y = []
    for el in np.array(dataset["rating"]):
        if el == 5:
            y.append(0)
        elif el == 4:
            y.append(1)
        else:
            y.append(2)
        
    return np.array(y)

y_train = get_y_from_dataset(trainset)
y_val = get_y_from_dataset(testset)

In [80]:
# Собираем матрицу признаков X (признаки movies)

X_train = np.array(trainset[["imdbId", "tmdbId"]])
X_val = np.array(testset[["imdbId", "tmdbId"]])

In [82]:
# Обучим модель и сделаем предсказания

ranker = LambdaRanker(input_size=X_train.shape[1], hidden_layer_sizes=(16,8,), activation=('relu', 'relu',), solver='adam', n=5)
ngcg_train = ranker.fit(X_train, y_train, query_train, epochs=5)
y_pred = ranker.predict(X_train)
y_val_pred = ranker.predict(X_val)
ngcg_val = ranker.evaluate(X_val, y_val, query_val)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [83]:
print(f"nGCG@5 on train: {round(ngcg_train, 2)},\nnDCG@5 on validation: {round(ngcg_val, 2)}")

nGCG@5 on train: 0.81,
nDCG@5 on validation: 0.8


**Посмотрим на предсказания**

In [84]:
testset["y_pred"] = y_val_pred

In [85]:
# посмотрим на топ-5 для каждого пользователя

n = 5
r_pred_lambda_rank = testset.groupby(["userId"]).apply(lambda x: x.nlargest(n, "y_pred"))[["userId", "movieId", "rating"]]

r_pred_lambda_rank.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,userId,movieId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,286,1,2644,4.0
1,110,1,923,5.0
1,219,1,2018,5.0
1,111,1,940,5.0
1,119,1,1029,5.0


*Ранжирование выглядит разумно*

*Результаты кажутся очень даже хорошими. Посмотрим на SVD*

# **Сравнение с SVD**

In [86]:
trainset

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
643,5,153,3.0,847434802,112462,414.0
584,4,3079,3.0,964778197,178737,10399.0
956,7,5445,4.0,1106636707,181689,180.0
829,6,434,4.0,845553200,106582,9350.0
708,6,92,4.0,845555454,117002,9095.0
...,...,...,...,...,...,...
240,1,2139,5.0,964982791,84649,11704.0
287,1,2648,4.0,964983414,21884,3035.0
421,3,26409,4.5,1306463993,78062,98851.0
382,2,106782,5.0,1445714966,993846,106646.0


In [87]:
# Приведем матрицу рейтингов к удобной форме.

def convert_matrix(df, fillna=0):
    res = df.pivot(
        index='userId',
        columns='movieId',
        values='rating'
    ).fillna(fillna)

    return res

trainset = convert_matrix(trainset)
testset = convert_matrix(testset)


for column in trainset.columns:
    if column not in testset.columns:
        testset[column] = 0
        
for column in testset.columns:
    if column not in trainset.columns:
        trainset[column] = 0

In [88]:
trainset

movieId,1,2,3,4,5,6,10,11,15,16,...,48516,60756,70946,71535,74458,77455,89774,91658,114060,115713
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,0.0,4.0,0.0,3.0,5.0,4.0,3.0,4.0,4.0,4.0,...,0,0,0,0,0,0,0,0,0,0
7,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


**Возьмем реализацию SVD из прошлой домашней работы**

In [89]:
class SVD:
    def __init__(self, optimization_method='sgd', n_factors=10, n_epochs=10, lr_pq=1e-3, 
                 lr_bias=1e-6, biased=True, std_init=1e-1, reg_alpha=1e-2, verbose=True):
        
        if optimization_method not in ['sgd', 'als']:
            print('Incorrect name of optimization method')
            
        self.optimization_method = optimization_method
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.lr_pq = lr_pq
        self.lr_bias = lr_bias
        self.biased = biased
        self.std_init = std_init
        self.reg_alpha = reg_alpha
        self.verbose = verbose
        
        self.trainset = None
        self.bu = {}
        self.bi = {}
        self.pu = {}
        self.qi = {}
        
    def fit(self, trainset):
        """
        Find pu, qi, mu, bu, bi: 
        r' = mu + bu + bi + pu * qi.
        SGD or ALS can be used for optimization.
        """
        self.trainset = trainset
            
        if self.optimization_method == 'sgd':
            self.sgd()
        else:
            
            df = self.trainset.replace(0, np.NaN)
            self.mu = df.mean(skipna=True, axis=1).mean(axis=0)
            self.biased = False
            
            X, y = self.als()
            X = X.toarray()
            y = y.toarray().T
            
            for u in range(self.trainset.shape[0]):
                u_id = int(self.trainset.index[u])
                for f in range(self.n_factors):
                    self.pu[u_id, f] = X[u, f]
                
            for i in range(self.trainset.shape[1]):
                i_id = int(self.trainset.columns[i])
                for f in range(self.n_factors):
                    self.qi[i_id, f] = y.T[i, f]
                
    def sgd(self):
        """
        Stochastic gradient descent.
        """
        
        if not self.biased:
            self.mu = 0
        else:
            df = self.trainset.replace(0, np.NaN)
            self.mu = df.mean(skipna=True, axis=1).mean(axis=0)
        
        bu = np.zeros(self.trainset.shape[0])
        bi = np.zeros(self.trainset.shape[1])
        pu = np.random.normal(self.mu, self.std_init, (self.trainset.shape[0], self.n_factors))
        qi = np.random.normal(self.mu, self.std_init, (self.trainset.shape[1], self.n_factors))

        for current_epoch in range(self.n_epochs):
            if self.verbose:
                print (f'Processing epoch {current_epoch+1} / {self.n_epochs}')
            for u in range(self.trainset.shape[0]):
                for i in range(self.trainset.shape[1]):
                    r = self.trainset.iloc[u, i]

                    dot = 0
                    for f in range(self.n_factors):
                        dot += qi[i, f] * pu[u, f]
                    err = r - self.mu - dot - bu[u] - bi[i]
                    
                    # update biases
                    if self.biased:
                        bu[u] += self.lr_bias * (err - self.reg_alpha *  bu[u])
                        bi[i] += self.lr_bias * (err - self.reg_alpha *  bi[i])

                    for f in range(self.n_factors):
                        pu[u, f] += self.lr_pq * (err * pu[u, f] - self.reg_alpha * qi[i, f]) 
                        qi[i, f] += self.lr_pq * (err * qi[i, f] - self.reg_alpha * pu[u, f])
      
        for u in range(self.trainset.shape[0]):
            u_id = int(self.trainset.index[u])
            self.bu[u_id] = bu[u]
            for f in range(self.n_factors):
                self.pu[u_id, f] = pu[u, f]
                
        for i in range(self.trainset.shape[1]):
            i_id = int(self.trainset.columns[i])
            self.bi[i_id] = bi[i]
            for f in range(self.n_factors):
                self.qi[i_id, f] = qi[i, f]
                
    def als(self):
        """
        Alternative least squares.
        """
        
        confidence = self.trainset
    
        X = scipy.sparse.csr_matrix(np.random.normal(0, size = (self.trainset.shape[0], self.n_factors)))
        Y = scipy.sparse.csr_matrix(np.random.normal(0, size = (self.trainset.shape[1], self.n_factors)))
        X_I = scipy.sparse.eye(self.trainset.shape[0])
        Y_I = scipy.sparse.eye(self.trainset.shape[1])
        I = scipy.sparse.eye(self.n_factors)
        lI = self.reg_alpha * I
        
        for i in range(self.n_epochs):
            
            if self.verbose:
                print (f'Processing epoch {i+1} / {self.n_epochs}')

            yTy = Y.T.dot(Y)
            xTx = X.T.dot(X)

            for u in range(self.trainset.shape[0]):
                
                u_row = confidence.iloc[u,:].values
                p_u = u_row.copy()
                p_u[p_u != 0] = 1.0

                CuI = scipy.sparse.diags(u_row)
                Cu = CuI + Y_I
                
                yT_CuI_y = Y.T.dot(CuI).dot(Y)
                yT_Cu_pu = Y.T.dot(Cu).dot(p_u.T)
                X[u] = scipy.sparse.linalg.spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)

    
            for i in range(self.trainset.shape[1]):

                i_row = confidence.iloc[:,i].T.values
                p_i = i_row.copy()
                p_i[p_i != 0] = 1.0


                CiI = scipy.sparse.diags(i_row)
                Ci = CiI + X_I

                xT_CiI_x = X.T.dot(CiI).dot(X)
                xT_Ci_pi = X.T.dot(Ci).dot(p_i.T)
                Y[i] = scipy.sparse.linalg.spsolve(xTx + xT_CiI_x + lI, xT_Ci_pi)

        return X, Y
        
    def predict(self, testset):
        """
        Get predicted rating matrix.
        """
        preds = []
        for u in range(testset.shape[0]):
            u_id = int(self.trainset.index[u])
            known_user = u_id in self.trainset.index
            
            pred = []
            for i in range(testset.shape[1]):
                i_id = int(self.trainset.columns[i])
                known_item = i_id in self.trainset.columns
                
                if testset.iloc[u, i] == 0:
                    pred.append(0)
                    continue
                    
                if self.biased:
                    est = self.mu

                    if known_user:
                        est += self.bu[u_id]

                    if known_item:
                        est += self.bi[i_id]

                    if known_user and known_item:
                        for f in range(self.n_factors):
                            est += self.qi[i_id, f] * self.pu[u_id, f]

                else:
                    est = 0
                    if known_user and known_item:
                        for f in range(self.n_factors):
                            est += self.qi[i_id, f] * self.pu[u_id, f]
                    else:
                        print('User and item are unknown.')
                pred.append(est)
            preds.append(pred)
        preds = pd.DataFrame(preds)
        return preds

In [90]:
model = SVD('sgd', verbose=False)
model.fit(trainset)
r_pred = model.predict(testset)

In [91]:
r_pred.index = testset.index
r_pred.columns = testset.columns

In [92]:
r_pred

movieId,3,7,8,13,22,24,34,36,39,43,...,80489,80906,86345,91529,99114,106782,109487,112552,122882,131724
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,0.0,0.0,0.0,0.0,0.0,0.0,3.425335,0.0,3.412993,0.0,...,0,0,0,0,0,0,0,0,0,0
6,3.431458,3.435189,3.427826,3.422389,3.411711,3.40084,0.0,3.396947,0.0,3.388942,...,0,0,0,0,0,0,0,0,0,0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


**Отранжируем и возьмем топ-n**

In [93]:
n = 5

recoms_svd = defaultdict(list)
for i in range(r_pred.shape[0]):
    user_id = r_pred.index[i]
    r_pred_sort = r_pred.iloc[i, :].sort_values(ascending=False).iloc[:n]
    recoms_svd[user_id] = r_pred_sort.index.to_list()

In [94]:
recoms_svd

defaultdict(list,
            {1: [47, 110, 216, 316, 333],
             2: [318, 89774, 114060, 48516, 46970],
             3: [688, 849, 2080, 1371, 3949],
             4: [45, 47, 162, 235, 190],
             5: [34, 39, 58, 266, 300],
             6: [7, 3, 8, 13, 22],
             7: [58, 165, 150, 466, 1923]})

In [95]:
recoms_lambda_rank = defaultdict(list)
for i in range(r_pred_lambda_rank.shape[0]):
    
    recoms_lambda_rank[r_pred_lambda_rank.iloc[i, :]["userId"]].append(r_pred_lambda_rank.iloc[i, :]["movieId"])

In [96]:
recoms_lambda_rank

defaultdict(list,
            {1.0: [2644.0, 923.0, 2018.0, 940.0, 1029.0],
             2.0: [318.0, 6874.0, 8798.0, 48516.0, 46970.0],
             3.0: [720.0, 2080.0, 1371.0, 849.0, 70946.0],
             4.0: [919.0, 912.0, 899.0, 1086.0, 2019.0],
             5.0: [590.0, 589.0, 515.0, 266.0, 608.0],
             6.0: [304.0, 979.0, 592.0, 597.0, 593.0],
             7.0: [750.0, 2529.0, 8207.0, 1208.0, 1210.0]})

In [97]:
len_of_common = []
for key in recoms_svd.keys():
    common = set(recoms_svd[key]) & set(recoms_lambda_rank[key])
    len_of_common.append(len(common))
    print(f'UserId: {int(key)}, Число одинаковых рекомендаций: {len(common)}')

print('------')
print(f'Среднее по пользователям число одинаковых рекомендаций: {np.mean(len_of_common)}')

UserId: 1, Число одинаковых рекомендаций: 0
UserId: 2, Число одинаковых рекомендаций: 3
UserId: 3, Число одинаковых рекомендаций: 3
UserId: 4, Число одинаковых рекомендаций: 0
UserId: 5, Число одинаковых рекомендаций: 1
UserId: 6, Число одинаковых рекомендаций: 0
UserId: 7, Число одинаковых рекомендаций: 0
------
Среднее по пользователям число одинаковых рекомендаций: 1.0


*Рекомендации отличаются и довольно сильно, но посмотрим на сравнение качества рекомендаций*

Под *качеством* будем понимать то, насколько хорошие мы фильмы порекомендовали в смысле того, что это рекомендации с лучшими оценками, которые выставил пользователь. Таким образом, не важно какие именно фильмы мы порекомендовали, важно, насколько хорошо мы угадали рейтинг (и выбрали максимальные соответственно), который поставит пользователь этому фильму.

In [98]:
# посмотрим на рейтинги фильмов, которые порекомендовали пользователям двумя алгоритмами

r_test_top_svd = []

for key in recoms_svd.keys():
    r_test_temp = []
    for el in recoms_svd[key]:
        r_test_temp.append(testset.loc[int(key), int(el)])
    r_test_top_svd.append(r_test_temp)
    
r_test_top_lambda_rank = []

for key in recoms_lambda_rank.keys():
    r_test_temp = []
    for el in recoms_lambda_rank[key]:
        r_test_temp.append(testset.loc[int(key), int(el)])
    r_test_top_lambda_rank.append(r_test_temp)

In [99]:
r_test_top_svd

[[5.0, 4.0, 5.0, 3.0, 5.0],
 [3.0, 5.0, 2.0, 4.0, 4.0],
 [0.5, 5.0, 0.5, 3.0, 0.5],
 [3.0, 2.0, 5.0, 2.0, 2.0],
 [4.0, 3.0, 5.0, 1.0, 3.0],
 [4.0, 5.0, 3.0, 3.0, 5.0],
 [3.0, 4.0, 4.5, 5.0, 1.5]]

In [100]:
r_test_top_lambda_rank

[[4.0, 5.0, 5.0, 5.0, 5.0],
 [3.0, 4.0, 3.5, 4.0, 4.0],
 [0.5, 0.5, 3.0, 5.0, 5.0],
 [5.0, 5.0, 4.0, 5.0, 2.0],
 [5.0, 3.0, 3.0, 1.0, 3.0],
 [4.0, 3.0, 3.0, 4.0, 4.0],
 [4.0, 5.0, 4.5, 4.0, 4.0]]

In [101]:
np.array(r_test_top_lambda_rank) - np.array(r_test_top_svd)

array([[-1. ,  1. ,  0. ,  2. ,  0. ],
       [ 0. , -1. ,  1.5,  0. ,  0. ],
       [ 0. , -4.5,  2.5,  2. ,  4.5],
       [ 2. ,  3. , -1. ,  3. ,  0. ],
       [ 1. ,  0. , -2. ,  0. ,  0. ],
       [ 0. , -2. ,  0. ,  1. , -1. ],
       [ 1. ,  1. ,  0. , -1. ,  2.5]])

In [102]:
# посмотрим на среднюю разницу

np.mean(np.array(r_test_top_lambda_rank) - np.array(r_test_top_svd))

0.4142857142857143

**Lambda rank в среднем рекомендует фильмы более релевантные**

# Выводы

- Реализован алгоритм LambdaRank и продемонстрирована его работа на классических датасетов
- Проведено сравнение метода LambdaRank и SVD
- Показано, что LambdaRank дает лучше релевантные рекомендации, нежели чем классический алгоритм SVD
- Было очень познавательно покопаться в реализации LambdaRank, заодно потренить навыки конструирования и обучения нейросеток:)