In [None]:
!pip install implicit==0.7.2 requests==2.32.3 rectools[lightfm,nmslib]==0.12.0 pandas==2.2.3 numpy==1.26.4 scipy==1.12.0 hnswlib



In [None]:
import os
import time
import threadpoolctl
import requests

import numpy as np
import pandas as pd
import zipfile as zf
import typing as tp

from pathlib import Path

from tqdm.auto import tqdm

from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking

from lightfm import LightFM

from rectools import Columns
from rectools.metrics import MAP, Precision, Recall, calc_metrics
from rectools.dataset import Dataset
from rectools.models import (
    PopularModel,
    ImplicitALSWrapperModel,
    LightFMWrapperModel,
    model_from_config,
    load_model,
    model_from_params
)

import nmslib
import hnswlib

import matplotlib.pyplot as plt
import seaborn as sns


# For implicit ALS: restrict BLAS to a single thread.
# NOTE(review): numpy is already imported above, so the environment variable is
# presumably read too late to affect the BLAS library loaded in this process —
# the threadpoolctl call below looks like the setting that actually applies; confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"
threadpoolctl.threadpool_limits(1, "blas")

<threadpoolctl.threadpool_limits at 0x7ca15abdd7d0>

In [None]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
%%time

# Download the KION dataset archive (URL pinned to a specific commit), unpack it
# while skipping the macOS metadata folder, delete the archive and list the
# working directory to confirm that `data_original/` is present.
!wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O kion_train.zip
!unzip -o kion_train.zip -x '__MACOSX/*'
!rm kion_train.zip
!ls -lah .

Archive:  kion_train.zip
  inflating: data_original/interactions.csv  
  inflating: data_original/users.csv  
  inflating: data_original/items.csv  
total 20K
drwxr-xr-x 1 root root 4.0K Mar 25 13:11 .
drwxr-xr-x 1 root root 4.0K Mar 25 12:48 ..
drwxr-xr-x 4 root root 4.0K Mar 21 13:32 .config
drwxr-xr-x 2 root root 4.0K Mar 25 13:11 data_original
drwxr-xr-x 1 root root 4.0K Mar 21 13:33 sample_data
CPU times: user 37.6 ms, sys: 14.4 ms, total: 52 ms
Wall time: 7.53 s


In [None]:
# Directory (relative to the working directory) that holds the unpacked CSV files
DATA_PATH = Path("data_original")

# LOAD DATA

In [None]:
%%time
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

CPU times: user 5.58 s, sys: 880 ms, total: 6.46 s
Wall time: 11.2 s


# Preprocess

In [None]:
# Point the rectools datetime column name at the dataset's own `last_watch_dt` column,
# so the code below can keep referring to it as `Columns.Datetime`.
# NOTE(review): this rebinds an attribute on the library's `Columns` class for the whole session.
Columns.Datetime = 'last_watch_dt'

In [None]:
# Discard rows whose date string is not exactly 10 characters long (i.e. not shaped like YYYY-MM-DD)
bad_date_rows = interactions[Columns.Datetime].str.len() != 10
interactions.drop(index=interactions.loc[bad_date_rows].index, inplace=True)

In [None]:
# Parse the date strings into datetimes using the explicit '%Y-%m-%d' format
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

In [None]:
# Most recent interaction date; the train/test split below is defined relative to it
max_date = interactions[Columns.Datetime].max()

In [None]:
# Interaction weight: 3 when `watched_pct` is above 10, otherwise 1
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [None]:
# Time-based split: interactions dated on or after (max_date - 7 days) form the
# test set, everything earlier is used for training.
split_date = max_date - pd.Timedelta(days=7)
watch_dates = interactions[Columns.Datetime]

train = interactions.loc[watch_dates < split_date].copy()
test = interactions.loc[watch_dates >= split_date].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [None]:
# Remove, in place, train interactions with `total_dur` below 300
# (presumably seconds — TODO confirm the unit against the dataset description)
train.drop(train.query("total_dur < 300").index, inplace=True)

In [None]:
# Filter cold users out of the test set: first collect the users that appear
# in test but have no interactions in train
cold_users = set(test[Columns.User]) - set(train[Columns.User])

In [None]:
# Drop, in place, every test interaction that belongs to a cold user
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

# Prepare features

## User features

In [None]:
users.isnull().sum()

Unnamed: 0,0
user_id,0
age,14095
income,14776
sex,13831
kids_flg,0


In [None]:
# Replace missing values (age, income and sex have nulls, see the counts above)
# with the explicit category 'Unknown'; the frame is modified in place
users.fillna('Unknown', inplace=True)

In [None]:
users.nunique()

Unnamed: 0,0
user_id,840197
age,7
income,7
sex,3
kids_flg,2


In [None]:
# Keep only the users that have at least one interaction in train
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [None]:
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0
...,...,...,...,...,...
840184,529394,age_25_34,income_40_60,Ж,0
840186,80113,age_25_34,income_40_60,Ж,0
840188,312839,age_65_inf,income_60_90,Ж,0
840189,191349,age_45_54,income_40_60,М,1


In [None]:
# Reshape the wide users table into a long (id, value, feature) table:
# one block of rows per categorical feature, stacked on top of each other.
user_features_frames = [
    users[[Columns.User, feature]]
    .rename(columns={Columns.User: "id", feature: "value"})
    .assign(feature=feature)
    for feature in ["sex", "age", "income"]
]
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [None]:
# Sanity check: a single user should have one row per feature (sex, age, income)
user_features.query("id == 973171")

Unnamed: 0,id,value,feature
0,973171,М,sex
0,973171,age_25_34,age
0,973171,income_60_90,income


# Item features

In [None]:
items.isnull().sum()

Unnamed: 0,0
item_id,0
content_type,0
title,0
title_orig,4745
release_year,98
genres,0
countries,37
for_kids,15397
age_rating,2
studios,14898


In [None]:
# Keep only the items that occur in the train interactions
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [None]:
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


In [None]:
items.nunique()

Unnamed: 0,0
item_id,14019
content_type,2
title,13454
title_orig,9724
release_year,104
genres,2559
countries,666
for_kids,2
age_rating,6
studios,38


### Genre

In [None]:
# Turn the comma-separated `genres` string into a list column, then flatten it
# so that every (item, genre) pair becomes its own row in (id, value, feature) form.
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = (
    items[["item_id", "genre"]]
    .explode("genre")
    .rename(columns={"item_id": "id", "genre": "value"})
    .assign(feature="genre")
)
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [None]:
genre_feature["value"].value_counts()

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
драмы,4923
комедии,3479
зарубежные,3055
мелодрамы,2533
триллеры,2297
...,...
токшоу,2
красота и здоровье,2
передачи,1
образование,1


### Content

In [None]:
# Content type (film / series) as a categorical item feature in (id, value, feature) form
content_feature = (
    items[[Columns.Item, "content_type"]]
    .rename(columns={Columns.Item: "id", "content_type": "value"})
    .assign(feature="content_type")
)

In [None]:
content_feature

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


In [None]:
content_feature['value'].value_counts()

Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
film,10662
series,3357


In [None]:
# Stack the genre and content-type tables into one long item-feature table
item_features = pd.concat((genre_feature, content_feature))

In [None]:
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


# Metrics

In [None]:
# Metric classes to evaluate, keyed by the display name used in the results table
metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
}

# Instantiate every metric at each cut-off; keys look like 'Recall@10'
# NOTE(review): `metric_name`, `metric` and `k` stay bound at notebook scope after
# these loops — later cells should not rely on those leftover values.
metrics = {}
for metric_name, metric in metrics_name.items():
    for k in [1, 3, 5, 10]:
        metrics[f'{metric_name}@{k}'] = metric(k=k)

In [None]:
metrics

{'Precision@1': Precision(k=1, debias_config=None, r_precision=False),
 'Precision@3': Precision(k=3, debias_config=None, r_precision=False),
 'Precision@5': Precision(k=5, debias_config=None, r_precision=False),
 'Precision@10': Precision(k=10, debias_config=None, r_precision=False),
 'Recall@1': Recall(k=1, debias_config=None),
 'Recall@3': Recall(k=3, debias_config=None),
 'Recall@5': Recall(k=5, debias_config=None),
 'Recall@10': Recall(k=10, debias_config=None),
 'MAP@1': MAP(k=1, debias_config=None, divide_by_k=False),
 'MAP@3': MAP(k=3, debias_config=None, divide_by_k=False),
 'MAP@5': MAP(k=5, debias_config=None, divide_by_k=False),
 'MAP@10': MAP(k=10, debias_config=None, divide_by_k=False)}

# Models

In [None]:
K_RECOS = 10  # number of recommendations requested per user
RANDOM_STATE = 42  # seed passed to the ALS and LightFM models
NUM_THREADS = 16  # thread count passed to the ALS and LightFM models
N_FACTORS = (32,)  # embedding sizes to try (one model per value)
N_EPOCHS = 1  # LightFM: training epochs
USER_ALPHA = 0  # LightFM: user_alpha (0 here)
ITEM_ALPHA = 0  # LightFM: item_alpha (0 here)
LEARNING_RATE = 0.05  # LightFM: learning rate

In [None]:
# Registry of models to compare, keyed by display name; starts with the popularity baseline
models = {
    'popular': PopularModel(),
}

In [None]:
# Register one ALS wrapper per (feature-fitting mode, embedding size) combination.
# Keys follow the pattern "<name>_<n_factors>_<fit_features_together>".
implicit_models = {
    'ALS': AlternatingLeastSquares,
}
for implicit_name, implicit_model in implicit_models.items():
    for is_fitting_features in (True, False):
        for n_factors in N_FACTORS:
            als_estimator = implicit_model(
                factors=n_factors,
                random_state=RANDOM_STATE,
                num_threads=NUM_THREADS,
            )
            model_key = f"{implicit_name}_{n_factors}_{is_fitting_features}"
            models[model_key] = ImplicitALSWrapperModel(
                model=als_estimator,
                fit_features_together=is_fitting_features,
            )

In [None]:
# Register one LightFM wrapper per (loss, embedding size) combination.
# Keys follow the pattern "LightFM_<loss>_<n_factors>".
lightfm_losses = ('logistic', 'bpr', 'warp')

for loss in lightfm_losses:
    for n_factors in N_FACTORS:
        lightfm_estimator = LightFM(
            no_components=n_factors,
            loss=loss,
            learning_rate=LEARNING_RATE,
            user_alpha=USER_ALPHA,
            item_alpha=ITEM_ALPHA,
            random_state=RANDOM_STATE,
        )
        models[f"LightFM_{loss}_{n_factors}"] = LightFMWrapperModel(
            lightfm_estimator,
            epochs=N_EPOCHS,
            num_threads=NUM_THREADS,
        )

In [None]:
models

{'popular': <rectools.models.popular.PopularModel at 0x7ca155b467d0>,
 'ALS_32_True': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7ca155b4d410>,
 'ALS_32_False': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7ca155b4cad0>,
 'LightFM_logistic_32': <rectools.models.lightfm.LightFMWrapperModel at 0x7ca15c766cd0>,
 'LightFM_bpr_32': <rectools.models.lightfm.LightFMWrapperModel at 0x7ca166b15290>,
 'LightFM_warp_32': <rectools.models.lightfm.LightFMWrapperModel at 0x7ca155b462d0>}

In [None]:
%%time
# Build the rectools dataset from the train interactions plus the long-format
# user and item feature tables; all listed features are treated as categorical.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 1.9 s, sys: 298 ms, total: 2.2 s
Wall time: 2.24 s


In [None]:
# Unique users of the (warm-only) test set — the users to generate recommendations for
TEST_USERS = test[Columns.User].unique()

In [None]:
%%time
results = []
for model_name, model in tqdm(models.items()):
    print(f"Fitting model {model_name}...")
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)

  0%|          | 0/6 [00:00<?, ?it/s]

Fitting model popular...
Fitting model ALS_32_True...
Fitting model ALS_32_False...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Fitting model LightFM_logistic_32...
Fitting model LightFM_bpr_32...
Fitting model LightFM_warp_32...
CPU times: user 12min 8s, sys: 20 s, total: 12min 28s
Wall time: 7min 57s


In [None]:
# Metrics-by-model table: one row per metric, one column per model.
# Indexing by 'model' *before* transposing keeps the metric values numeric;
# transposing with the string column still in place would coerce every cell
# to object dtype.
df_quality = pd.DataFrame(results).set_index('model').T

In [None]:
df_quality.style.highlight_max(color='lightgreen', axis=1)

model,popular,ALS_32_True,ALS_32_False,LightFM_logistic_32,LightFM_bpr_32,LightFM_warp_32
Precision@1,0.069368,0.080866,0.06095,0.000363,0.035352,0.082353
Precision@3,0.059233,0.059856,0.049863,0.000313,0.020252,0.062384
Precision@5,0.051035,0.047234,0.042768,0.000191,0.014343,0.050709
Precision@10,0.032803,0.031225,0.031436,0.000208,0.009088,0.034107
Recall@1,0.035863,0.041728,0.030516,0.000116,0.02014,0.041858
Recall@3,0.08808,0.088478,0.072278,0.000301,0.033149,0.091943
Recall@5,0.124184,0.113661,0.100516,0.000306,0.038254,0.121526
Recall@10,0.15607,0.145367,0.140669,0.000686,0.046368,0.158
MAP@1,0.035863,0.041728,0.030516,0.000116,0.02014,0.041858
MAP@3,0.059216,0.063046,0.049124,0.000208,0.02609,0.064588


# Approximate Nearest Neighbors

# HNSW algorithm parameters

## Search parameters:
* ```ef``` - the size of the dynamic list for the nearest neighbors (used during the search). Higher ```ef```
leads to more accurate but slower search. ```ef``` cannot be set lower than the number of queried nearest neighbors
```k```. The value of ```ef``` can be anything between ```k``` and the size of the dataset.
* ```k``` number of nearest neighbors to be returned as the result.
The ```knn_query``` function returns two numpy arrays, containing labels and distances to the k found nearest
elements for the queries. Note that if the algorithm is not able to find ```k``` neighbors for all of the queries
(this can be due to problems with the graph or ```k``` > size of the dataset), an exception is thrown.


## Construction parameters:
* ```M``` - the number of bi-directional links created for every new element during construction. Reasonable range for ```M```
is 2-100. Higher ```M``` work better on datasets with high intrinsic dimensionality and/or high recall, while low ```M``` work
better for datasets with low intrinsic dimensionality and/or low recalls. The parameter also determines the algorithm's memory
consumption, which is roughly ```M * 8-10``` bytes per stored element.  
As an example for ```dim```=4 random vectors optimal ```M``` for search is somewhere around 6, while for high dimensional datasets
(word embeddings, good face descriptors), higher ```M``` are required (e.g. ```M```=48-64) for optimal performance at high recall.
The range ```M```=12-48 is fine for most use cases. When ```M``` is changed one has to update the other parameters.
Nonetheless, ef and ef_construction parameters can be roughly estimated by assuming that ```M```*```ef_{construction}``` is
a constant.

* ```ef_construction``` - the parameter has the same meaning as ```ef```, but controls the index_time/index_accuracy. Bigger
ef_construction leads to longer construction, but better index quality. At some point, increasing ef_construction does
not improve the quality of the index. One way to check if the selection of ef_construction was ok is to measure a recall
for M nearest neighbor search when ```ef``` = ```ef_construction```: if the recall is lower than 0.9, then there is room
for improvement.
* ```num_elements``` - defines the maximum number of elements in the index. The index can be extended by saving/loading (the ```load_index```
function has a parameter which defines the new maximum number of elements).

Еще источники:
- [Nmslib Docs](https://github.com/nmslib/nmslib/blob/master/manual/methods.md)
- [Pinecone Vector Indexes](https://www.pinecone.io/learn/vector-indexes/)

<img src="https://d33wubrfki0l68.cloudfront.net/4c635fabb268a4af60109a506300a2dfda612063/d2535/images/similarity-search-indexes17.jpg">

<img src="https://d33wubrfki0l68.cloudfront.net/96d80cd46c2d12df99c044c860a8a5fb00cf6376/d59ca/images/similarity-search-indexes18.jpg">

In [None]:
# Select the model for the ANN experiments explicitly instead of relying on
# whatever the training loop left bound to `model` (its last iteration was the
# LightFM model with WARP loss).
model = models[f"LightFM_warp_{N_FACTORS[-1]}"]
model

<rectools.models.lightfm.LightFMWrapperModel at 0x7ca155b462d0>

In [None]:
# Extract the user and item embedding matrices of `model` for this dataset.
# NOTE(review): `model` is whatever the name is bound to at this point of the
# notebook — confirm it is the model intended for the ANN section.
user_embeddings, item_embeddings = model.get_vectors(dataset)

In [None]:
user_embeddings.shape, item_embeddings.shape

((756562, 34), (14019, 34))

In [None]:
def augment_inner_product(factors):
    """Append one extra column that equalises the L2 norm of all rows.

    For a row ``x`` the added component is ``sqrt(max_norm**2 - ||x||**2)``,
    where ``max_norm`` is the largest row norm in ``factors``; every augmented
    row therefore has norm ``max_norm``.

    Parameters
    ----------
    factors : 2-D array-like, one vector per row.

    Returns
    -------
    tuple
        ``(max_norm, augmented_factors)`` where ``augmented_factors`` has one
        more column than ``factors``.
    """
    row_norms = np.linalg.norm(factors, axis=1)
    max_norm = row_norms.max()

    # Remaining "norm budget" of each row, as a column vector
    padding = np.sqrt(max_norm ** 2 - row_norms ** 2)[:, np.newaxis]
    augmented_factors = np.concatenate((factors, padding), axis=1)
    return max_norm, augmented_factors

In [None]:
# Augment the item embeddings with the norm-equalising extra column (dimension grows by one)
print('pre shape: ', item_embeddings.shape)
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)
augmented_item_embeddings.shape

pre shape:  (14019, 34)


(14019, 35)

In [None]:
# Queries get a zero in the extra dimension, so their inner products with the
# augmented item vectors are unchanged.
zero_column = np.zeros((len(user_embeddings), 1))
augmented_user_embeddings = np.concatenate((user_embeddings, zero_column), axis=1)
augmented_user_embeddings.shape

(756562, 35)

In [None]:
user_id = 30

In [None]:
user_embeddings[user_id]

array([-2.25424530e+02,  1.00000000e+00, -1.42826691e-01,  3.81425917e-02,
       -2.81122863e-01, -3.98500189e-02, -2.48058885e-03, -2.67318606e-01,
        2.12787285e-01, -3.56602550e-01, -1.00381352e-01, -1.71458930e-01,
        3.15686464e-01, -1.27490610e-03, -4.51651871e-01, -3.14176306e-02,
       -2.47271284e-01, -2.85945565e-01, -3.23614061e-01, -1.77833930e-01,
       -2.03510121e-01, -1.41103148e-01, -2.14366063e-01,  3.51128876e-01,
       -2.10072726e-01,  1.70982882e-01, -3.44153382e-02,  3.77270997e-01,
       -4.01926935e-01, -2.21559197e-01, -1.19094200e-01, -1.91293061e-01,
       -2.90782541e-01,  3.07248175e-01])

In [None]:
augmented_user_embeddings[user_id]

array([-2.25424530e+02,  1.00000000e+00, -1.42826691e-01,  3.81425917e-02,
       -2.81122863e-01, -3.98500189e-02, -2.48058885e-03, -2.67318606e-01,
        2.12787285e-01, -3.56602550e-01, -1.00381352e-01, -1.71458930e-01,
        3.15686464e-01, -1.27490610e-03, -4.51651871e-01, -3.14176306e-02,
       -2.47271284e-01, -2.85945565e-01, -3.23614061e-01, -1.77833930e-01,
       -2.03510121e-01, -1.41103148e-01, -2.14366063e-01,  3.51128876e-01,
       -2.10072726e-01,  1.70982882e-01, -3.44153382e-02,  3.77270997e-01,
       -4.01926935e-01, -2.21559197e-01, -1.19094200e-01, -1.91293061e-01,
       -2.90782541e-01,  3.07248175e-01,  0.00000000e+00])

In [None]:
item_id = 0

In [None]:
item_embeddings[item_id]

array([ 1.        ,  1.55703056,  0.28764731, -0.84088397, -0.16164654,
        0.51847982,  0.07491209,  0.12415464,  0.17356083, -0.90181524,
       -0.6619159 ,  0.03611821,  0.49875396,  0.88175726, -0.07119629,
       -0.26309633, -0.58569324,  0.18361798, -0.40840062,  0.05773047,
       -1.25995374,  0.02868177,  0.15860614,  0.29673213, -0.52789551,
       -0.03979107, -0.91377938,  0.7699008 , -0.03022068, -0.20016402,
        0.15744451, -0.16348104, -0.33135971, -0.20557144])

In [None]:
augmented_item_embeddings[item_id]

array([ 1.        ,  1.55703056,  0.28764731, -0.84088397, -0.16164654,
        0.51847982,  0.07491209,  0.12415464,  0.17356083, -0.90181524,
       -0.6619159 ,  0.03611821,  0.49875396,  0.88175726, -0.07119629,
       -0.26309633, -0.58569324,  0.18361798, -0.40840062,  0.05773047,
       -1.25995374,  0.02868177,  0.15860614,  0.29673213, -0.52789551,
       -0.03979107, -0.91377938,  0.7699008 , -0.03022068, -0.20016402,
        0.15744451, -0.16348104, -0.33135971, -0.20557144,  4.5170697 ])

In [None]:
# Set index parameters
# These are the most important ones
M = 48      # HNSW `M`: number of bi-directional links created per element
efC = 100   # HNSW `ef_construction`: index build time / index quality trade-off

num_threads = 4
# NOTE(review): this dict is only printed here; the cell that calls `createIndex`
# defines its own `index_time_params` without the 'post' entry.
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}


In [None]:
# Number of neighbors
K=10

In [None]:
# Space name should correspond to the space name
# used for brute-force search
space_name='negdotprod'

In [None]:
# Initialize the library, specify the space, the type of the vector and add data points
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR)
index.addDataPointBatch(augmented_item_embeddings)

14019

In [None]:
index

<nmslib.FloatIndex method='hnsw' space='negdotprod' at 0x91cd270>

In [None]:
# Create an index and measure the wall-clock build time
start = time.time()
# Rebinds `index_time_params` — this is the dict actually passed to createIndex
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params)
end = time.time()
print('Index-time parameters', index_time_params)
print('Indexing time = %f' % (end-start))

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100}
Indexing time = 0.981389


In [None]:
# Setting query-time parameters
# efS is the HNSW search-time `ef`: higher values give a more accurate but slower search
efS = 100
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 100}


In [None]:
# Query batch: the first 1000 augmented user vectors
query_matrix = augmented_user_embeddings[:1000, :]

In [None]:
# Querying: run the whole batch through the index and time it.
# `nbrs` holds one (ids, distances) pair of arrays per query row.
query_qty = query_matrix.shape[0]
start = time.time()
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.time()
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty))

kNN time total=0.027742 (sec), per query=0.000028 (sec), per query adjusted for thread number=0.000111 (sec)


In [None]:
nbrs[0]

(array([ 43,  32,  19,  31,  62, 166, 121, 100,  69,  36], dtype=int32),
 array([198.3718 , 198.6821 , 198.86887, 199.12776, 199.1423 , 199.42725,
        199.45709, 199.45741, 199.49812, 199.51248], dtype=float32))

In [None]:
nbrs[0][1]

array([198.3718 , 198.6821 , 198.86887, 199.12776, 199.1423 , 199.42725,
       199.45709, 199.45741, 199.49812, 199.51248], dtype=float32)

In [None]:
def recommend_all(query_factors, index_factors, topn=10):
    """Exact (brute-force) top-N maximum inner product search.

    Scores every query row against every index row with one dense matrix
    product and returns, for each query, the ``topn`` best-scoring index rows
    ordered from the highest to the lowest score.

    Parameters
    ----------
    query_factors : array-like of shape (n_queries, dim)
        Query vectors, one per row.
    index_factors : array-like of shape (n_items, dim)
        Candidate vectors, one per row.
    topn : int, default 10
        Number of results per query. Values larger than ``n_items`` are
        clamped to ``n_items`` instead of failing inside ``argpartition``.

    Returns
    -------
    labels : ndarray of shape (n_queries, min(topn, n_items))
        Row positions in ``index_factors``, best first.
    distances : ndarray of the same shape
        The corresponding raw inner products (larger is better).

    Raises
    ------
    ValueError
        If ``topn`` is smaller than 1.
    """
    if topn < 1:
        raise ValueError(f"topn must be a positive integer, got {topn}")

    query_factors = np.asarray(query_factors)
    index_factors = np.asarray(index_factors)

    output = query_factors.dot(index_factors.T)
    n_queries, n_items = output.shape

    # Never ask for more neighbours than there are items
    topn = min(topn, n_items)
    if topn == 0:
        return np.empty((n_queries, 0), dtype=np.intp), np.empty((n_queries, 0), dtype=output.dtype)

    # Unordered top-N candidates per row (linear-time selection) ...
    candidates = np.argpartition(output, -topn, axis=1)[:, -topn:]
    candidate_scores = np.take_along_axis(output, candidates, axis=1)

    # ... then sort only those N candidates by descending score
    order = np.argsort(candidate_scores, axis=1)[:, ::-1]
    labels = np.take_along_axis(candidates, order, axis=1)
    distances = np.take_along_axis(output, labels, axis=1)
    return labels, distances

In [None]:
recommend_all(user_embeddings[[0], :], item_embeddings)

(array([[ 43,  32,  19,  31,  62, 166, 121, 100,  69,  36]]),
 array([[-198.37178268, -198.68211821, -198.86887223, -199.12776854,
         -199.14229232, -199.42725192, -199.45708235, -199.45741433,
         -199.49812068, -199.51249354]]))

In [None]:
item_embeddings[:1000, :].shape, user_embeddings.shape

((1000, 34), (756562, 34))

In [None]:
query_matrix_not_augmented = user_embeddings[:1000, :]

In [None]:
%%timeit
recommend_all(query_matrix_not_augmented, item_embeddings)

246 ms ± 5.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit
index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)

26.8 ms ± 670 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
%%time
max_elements, dim = augmented_item_embeddings.shape
hnsw = hnswlib.Index("ip", dim) # possible options for space are l2, cosine or ip

# Initing index - the maximum number of elements should be known beforehand
hnsw.init_index(max_elements, M, efC)

# Element insertion (can be called several times)
hnsw.add_items(augmented_item_embeddings)

CPU times: user 1.26 s, sys: 3.08 ms, total: 1.27 s
Wall time: 662 ms


In [None]:
# Controlling the recall by setting ef, should always be > k
hnsw.set_ef(efS)

In [None]:
# Use the notebook-level constant K (number of neighbours), not the lowercase `k`
# that merely happens to be left over from an earlier loop.
label, distance = hnsw.knn_query(query_matrix, k=K)

In [None]:
label

array([[  43,   32,   19, ...,  100,   69,   36],
       [  31,    8,   62, ...,   43,   19,  149],
       [  31,   19,  121, ...,  268,   29, 1276],
       ...,
       [  19,   31,   32, ...,  268,  258,  173],
       [  43,   19,   32, ...,  164,  487,   75],
       [  32,   43,   31, ...,  100,  120,  173]], dtype=uint64)

In [None]:
1 - distance

array([[-198.37178  , -198.68208  , -198.86885  , ..., -199.45741  ,
        -199.49811  , -199.51247  ],
       [-199.48941  , -199.54874  , -199.55763  , ..., -199.92894  ,
        -199.93971  , -199.99533  ],
       [-185.1278   , -185.35916  , -186.02507  , ..., -186.26729  ,
        -186.28326  , -186.29094  ],
       ...,
       [-220.15611  , -220.19745  , -220.46617  , ..., -220.7164   ,
        -220.72238  , -220.85843  ],
       [-200.35648  , -200.53137  , -200.62827  , ..., -201.05673  ,
        -201.09016  , -201.11523  ],
       [   4.5450006,    4.3846407,    4.3406014, ...,    3.7275565,
           3.6817563,    3.6548834]], dtype=float32)

In [None]:
item_embeddings[8867].dot(user_embeddings[0])

-205.38771253205275

In [None]:
labels, distances = recommend_all(user_embeddings[:1000, :], item_embeddings)
print(labels)
print(distances)

[[  43   32   19 ...  100   69   36]
 [  31    8   62 ...   43   19  149]
 [  31   19  121 ...  268   29 1276]
 ...
 [  19   31   32 ...  268  258  173]
 [  43   19   32 ...  164  487   75]
 [  32   43   31 ...  100  120  173]]
[[-198.37178268 -198.68211821 -198.86887223 ... -199.45741433
  -199.49812068 -199.51249354]
 [-199.48941872 -199.54872038 -199.55762579 ... -199.92894088
  -199.93971236 -199.99530661]
 [-185.12779137 -185.35915297 -186.02506024 ... -186.26727922
  -186.28327549 -186.29092958]
 ...
 [-220.15610514 -220.19745336 -220.46616457 ... -220.71641241
  -220.72240113 -220.85841215]
 [-200.35646878 -200.53134919 -200.62826774 ... -201.0567136
  -201.09017803 -201.11520724]
 [   4.54500083    4.38464034    4.34060154 ...    3.72755665
     3.68175597    3.65488408]]
