In [1]:
import ast
import json
import os
import sys
from collections import defaultdict, Counter

import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
import scipy.sparse as scsp
import scipy.stats as sps
import seaborn as sns
import tqdm
from IPython.display import display, clear_output
from joblib import Parallel, delayed
from sklearn.metrics import ndcg_score, dcg_score, roc_auc_score, average_precision_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm_notebook

sns.set()

In [2]:
# Labels AS_0 ... AS_9 used both as row identifiers and as rating keys.
ases = list(map("AS_{}".format, range(10)))
ases

['AS_0',
 'AS_1',
 'AS_2',
 'AS_3',
 'AS_4',
 'AS_5',
 'AS_6',
 'AS_7',
 'AS_8',
 'AS_9']

In [3]:
# Hand-written toy data: for every rater label, a mapping
# "rated label -> binary rating" (1 or 0). Labels missing from an inner
# mapping simply have no rating from that rater.
ratings = {
    'AS_0': {'AS_0': 1, 'AS_1': 0, 'AS_3': 0, 'AS_6': 1, 'AS_9': 0},
    'AS_1': {'AS_1': 1, 'AS_2': 0, 'AS_3': 1, 'AS_4': 0, 'AS_5': 1, 'AS_7': 1, 'AS_9': 1},
    'AS_2': {'AS_0': 1, 'AS_1': 0, 'AS_2': 1, 'AS_3': 0, 'AS_4': 1, 'AS_6': 1, 'AS_8': 1},
    'AS_3': {'AS_0': 1, 'AS_1': 1, 'AS_3': 1, 'AS_7': 1, 'AS_8': 0, 'AS_9': 1},
    'AS_4': {'AS_2': 1, 'AS_4': 1},
    'AS_5': {'AS_0': 1, 'AS_1': 1, 'AS_3': 1, 'AS_5': 1, 'AS_6': 0},
    'AS_6': {'AS_1': 0, 'AS_2': 1, 'AS_4': 1, 'AS_6': 1},
    'AS_7': {'AS_0': 0, 'AS_1': 1, 'AS_7': 1},
    'AS_8': {'AS_0': 1, 'AS_3': 0, 'AS_5': 1, 'AS_6': 1, 'AS_8': 1},
    'AS_9': {'AS_0': 1, 'AS_1': 1, 'AS_2': 1, 'AS_4': 1, 'AS_6': 0, 'AS_8': 1, 'AS_9': 1},
}

In [4]:
# Assemble one row per label. Each row's ratings are looked up by label so the
# two columns stay aligned even if `ratings` was filled in a different order
# than `ases`; a label without an entry raises KeyError instead of silently
# shifting rows.
df = pd.DataFrame({"AS": ases,
                   "Ratings": [ratings[label] for label in ases]})

In [5]:
# Show the assembled table (one row per label, ratings kept as dicts).
df

Unnamed: 0,AS,Ratings
0,AS_0,"{'AS_0': 1, 'AS_1': 0, 'AS_3': 0, 'AS_6': 1, '..."
1,AS_1,"{'AS_1': 1, 'AS_2': 0, 'AS_3': 1, 'AS_4': 0, '..."
2,AS_2,"{'AS_0': 1, 'AS_1': 0, 'AS_2': 1, 'AS_3': 0, '..."
3,AS_3,"{'AS_0': 1, 'AS_1': 1, 'AS_3': 1, 'AS_7': 1, '..."
4,AS_4,"{'AS_2': 1, 'AS_4': 1}"
5,AS_5,"{'AS_0': 1, 'AS_1': 1, 'AS_3': 1, 'AS_5': 1, '..."
6,AS_6,"{'AS_1': 0, 'AS_2': 1, 'AS_4': 1, 'AS_6': 1}"
7,AS_7,"{'AS_0': 0, 'AS_1': 1, 'AS_7': 1}"
8,AS_8,"{'AS_0': 1, 'AS_3': 0, 'AS_5': 1, 'AS_6': 1, '..."
9,AS_9,"{'AS_0': 1, 'AS_1': 1, 'AS_2': 1, 'AS_4': 1, '..."


In [6]:
# Persist the table to 'as_ratings.csv'; index=False leaves the row index out of the file.
df.to_csv('as_ratings.csv', index=False)

In [7]:
# Reload the table from the CSV file; from here on `df` is the reloaded frame.
# NOTE: the cell that builds the per-item interaction lists parses `Ratings`
# values from text, i.e. after this reload they are handled as strings.
df = pd.read_csv('as_ratings.csv')

In [8]:
# Quick look at the first rows of the reloaded table.
df.head()

Unnamed: 0,AS,Ratings
0,AS_0,"{'AS_0': 1, 'AS_1': 0, 'AS_3': 0, 'AS_6': 1, '..."
1,AS_1,"{'AS_1': 1, 'AS_2': 0, 'AS_3': 1, 'AS_4': 0, '..."
2,AS_2,"{'AS_0': 1, 'AS_1': 0, 'AS_2': 1, 'AS_3': 0, '..."
3,AS_3,"{'AS_0': 1, 'AS_1': 1, 'AS_3': 1, 'AS_7': 1, '..."
4,AS_4,"{'AS_2': 1, 'AS_4': 1}"


In [9]:
# Fit a label encoder on the AS column; it is used below to map labels to
# integer ids and back.
encoder = LabelEncoder()
encoder = encoder.fit(df['AS'])

In [10]:
# Lookup table from label to the integer id assigned by `encoder`.
all_items = df['AS']
indices = encoder.transform(all_items)
item_to_id = {label: encoded_id for label, encoded_id in zip(all_items, indices)}

In [11]:
# Inspect the label -> id mapping.
item_to_id

{'AS_0': 0,
 'AS_1': 1,
 'AS_2': 2,
 'AS_3': 3,
 'AS_4': 4,
 'AS_5': 5,
 'AS_6': 6,
 'AS_7': 7,
 'AS_8': 8,
 'AS_9': 9}

# SLIM

Пусть $R \in \mathbb{R}^{|U| \times |I|}$ &mdash; бинарная матрица взаимодействий пользователей и айтемов. Тогда SLIM оптимизирует

$$\begin{aligned}
L &= \frac{1}{2} \Vert R - RW \Vert_F^2 + \frac{\beta}{2} \Vert W \Vert_F^2 + \lambda \Vert W \Vert_1 \rightarrow \min_W,\\
&\text{s.t.} \quad \forall i, j \colon \; W_{ij} \geq 0, \quad W_{ii} = 0.
\end{aligned}$$
  
Матрица $W$ &mdash; матрица попарных айтемных "взаимосвязей", а $\beta$ и $\lambda$ &mdash; коэффициенты L2- и L1-регуляризации соответственно.

In [12]:
# Build the per-item interaction index: item_ratings_ind[item_id] collects
# (user_id, rating) pairs from every row whose ratings mention that item.
item_ratings_ind = [numba.typed.List() for _ in range(len(encoder.classes_))]
user_ids = encoder.transform(df['AS'])

for user_id, items_with_ratings in tqdm_notebook(zip(user_ids, df['Ratings']),
                                                 total=len(df)):
    # `Ratings` holds the textual Python repr of a dict (single-quoted keys).
    # ast.literal_eval parses that repr directly and only accepts literals,
    # unlike swapping quotes and feeding the result to a JSON parser.
    parsed_ratings = ast.literal_eval(items_with_ratings)
    # Iterating the dict directly also copes with a row that has no ratings.
    for item_name, rating in parsed_ratings.items():
        item_ratings_ind[item_to_id[item_name]].append((user_id, rating))

# Wrap the per-item lists in a typed list so the njit-compiled fitter can take it.
item_ratings_ind_nb = numba.typed.List(item_ratings_ind)

  0%|          | 0/10 [00:00<?, ?it/s]

In [13]:
# item_ratings_ind = [[] for _ in range(len(encoder.classes_))]
# user_ids = encoder.transform(df['AS'])

# for user_id, items_with_ratings in tqdm_notebook(zip(user_ids, df['Ratings']),
#                                                  total=len(df)):

#     items_with_ratings = eval(items_with_ratings)
#     for key, value in items_with_ratings.items():
#         pass
#     # print(items_with_ratings)

In [14]:
@numba.njit()
def fit_one_item(item_ratings, j, n_iter=20, l2_reg=1.0, l1_reg=2.0):
    """
    Optimizes a single column of the matrix W.

    * item_ratings -- list of per-item interaction lists;
      item_ratings[j] is the list of (user, rating) interactions of item j
    * j -- index of the item whose weights are being fitted
    * n_iter -- number of optimization passes over all items
    * l1_reg, l2_reg -- L1 and L2 regularization coefficients

    Returns a dict: item -> weight, containing every weight that is > 0
    """
    n_items = len(item_ratings)
    per_item_positives = []
    # Users that have any interaction with item j, regardless of the rating value.
    item_interactions = set()
    for user, rating in item_ratings[j]:
        item_interactions.add(user)
    # For every item i: users who rated i with 1 AND also interacted with item j.
    for i in range(n_items):
        positives = set()
        for user, rating in item_ratings[i]:
            if rating == 1 and user in item_interactions:
                positives.add(user)
        per_item_positives.append(positives)

    w = np.zeros(n_items)
    # Indices whose weight exceeded 1e-5 at some point; entries are only added,
    # never removed (an index whose weight later becomes 0 contributes 0 below).
    non_zero_items = set()
    for _ in range(n_iter):
        # One pass updates every coordinate k in turn, using the current w.
        for k in range(n_items):
            # w[j] is never updated, so it stays at its initial 0.
            if k == j:
                continue

            # Overlap of positive users between j and k, shifted by the L1 penalty.
            score = len(per_item_positives[j] & per_item_positives[k]) - l1_reg
            # Subtract what the other tracked items already account for.
            for i in non_zero_items:
                if i == k:
                    continue
                score -= w[i] * len(per_item_positives[i] & per_item_positives[k])
                # Weights and overlap sizes are non-negative, so the score can
                # only go further down; it will be clamped to 0 below anyway.
                if score < 0:
                    break
            score /= len(per_item_positives[k]) + l2_reg

            # Keep the weight non-negative.
            score = max(score, 0.0)

            w[k] = score
            if w[k] > 1e-5:
                non_zero_items.add(k)

    # Collect the strictly positive weights into the result mapping.
    non_zero_elements = {}
    for i, value in enumerate(w):
        assert value >= 0.0
        if value > 0:
            non_zero_elements[i] = value

    return non_zero_elements

In [15]:
def get_item_meta(item_id):
    """Translate an encoded item id back to its original label via ``encoder``."""
    decoded = encoder.inverse_transform([item_id])
    return decoded[0]


def visualize_top(item_ratings_ind_nb, j, top=10):
    """
    Fit the weights for item ``j`` and display its strongest neighbours.

    * item_ratings_ind_nb -- per-item interaction lists, passed to ``fit_one_item``
    * j -- encoded id of the anchor item
    * top -- maximum number of neighbours to show

    Displays a table whose first column is named after the anchor item and
    lists the neighbour labels, with their weights in the ``score`` column,
    sorted by weight in descending order. If no neighbour has a non-zero
    weight, a short message is printed instead of a table. Returns None.
    """
    weights = fit_one_item(item_ratings_ind_nb, j)
    anchor_item = get_item_meta(j)
    sorted_items = sorted(weights.items(), key=lambda x: x[1], reverse=True)[:top]

    # With nothing to show, unpacking zip(*[]) into two names would raise
    # ValueError, so report the empty result explicitly.
    if not sorted_items:
        print(f"{anchor_item}: no items with non-zero weight")
        return

    item_ids, scores = zip(*sorted_items)
    items = [get_item_meta(item_id) for item_id in item_ids]

    with pd.option_context('display.max_colwidth', 100):
        display(pd.DataFrame({
            anchor_item: items,
            'score': scores
        }))

In [16]:
# Show the top neighbours for every item. A failure for one item must not stop
# the loop, but it is reported instead of being dropped silently; catching
# Exception (not a bare except) lets KeyboardInterrupt still stop the cell.
for i in range(len(encoder.classes_)):
    try:
        visualize_top(item_ratings_ind_nb, i)
    except Exception as err:
        print(f"{encoder.classes_[i]}: could not visualize ({type(err).__name__}: {err})")

Unnamed: 0,AS_0,score
0,AS_6,0.196429
1,AS_1,0.178571
2,AS_8,0.107143


Unnamed: 0,AS_1,score
0,AS_7,0.135135
1,AS_3,0.094595
2,AS_9,0.094595
3,AS_0,0.081081


Unnamed: 0,AS_2,score
0,AS_4,0.4


Unnamed: 0,AS_3,score
0,AS_1,0.25


Unnamed: 0,AS_4,score
0,AS_2,0.4


Unnamed: 0,AS_6,score
0,AS_0,0.166667


Unnamed: 0,AS_7,score
0,AS_1,0.25


Unnamed: 0,AS_8,score
0,AS_0,0.2


Unnamed: 0,AS_9,score
0,AS_1,0.25
