In [1]:
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, MeanInvUserFreq, Serendipity, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

import optuna
from optuna.samplers import TPESampler

# Data

In [4]:
interactions = pd.read_csv("../data/kion_train/interactions.csv")
users = pd.read_csv("../data/kion_train/users.csv")
items = pd.read_csv("../data/kion_train/items.csv")

In [5]:
Columns.Datetime = 'last_watch_dt'

In [6]:
def display_df(df):
    return pd.concat([df.head(), df.tail()])

In [7]:
display_df(interactions)

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0
5476250,319709,4436,2021-08-15,3921,45.0


# Prepare features

## Users

In [8]:
display_df(users)

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
840192,339025,age_65_inf,income_0_20,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1
840194,251008,,,,0
840195,590706,,,Ж,0
840196,166555,age_65_inf,income_20_40,Ж,0


In [9]:
users.isnull().sum()

user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [10]:
users.dtypes

user_id      int64
age         object
income      object
sex         object
kids_flg     int64
dtype: object

In [11]:
users.fillna("Unknown", inplace=True)

In [12]:
users.nunique()

user_id     840197
age              7
income           7
sex              3
kids_flg         2
dtype: int64

In [13]:
users.isnull().sum()

user_id     0
age         0
income      0
sex         0
kids_flg    0
dtype: int64

In [14]:
features = {
    "users": ["age", "income", "sex"]
}

## Items 

In [15]:
display_df(items)

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."
15958,6443,series,Полярный круг,Arctic Circle,2018.0,"драмы, триллеры, криминал","Финляндия, Германия",,16.0,,Ханну Салонен,"Иина Куустонен, Максимилиан Брюкнер, Пихла Вии...","Во время погони за браконьерами по лесу, сотру...","убийство, вирус, расследование преступления, н..."
15959,2367,series,Надежда,,2020.0,"драмы, боевики",Россия,0.0,18.0,,Елена Хазанова,"Виктория Исакова, Александр Кузьмин, Алексей М...",Оригинальный киносериал от создателей «Бывших»...,"Надежда, 2020, Россия"
15960,10632,series,Сговор,Hassel,2017.0,"драмы, триллеры, криминал",Россия,0.0,18.0,,"Эшреф Рейбрук, Амир Камдин, Эрик Эгер","Ола Рапас, Алиетт Офейм, Уильма Лиден, Шанти Р...",Криминальная драма по мотивам романов о шведск...,"Сговор, 2017, Россия"
15961,4538,series,Среди камней,Darklands,2019.0,"драмы, спорт, криминал",Россия,0.0,18.0,,"Марк О’Коннор, Конор МакМахон","Дэйн Уайт О’Хара, Томас Кэйн-Бирн, Джудит Родд...",Семнадцатилетний Дэмиен мечтает вырваться за п...,"Среди, камней, 2019, Россия"
15962,3206,series,Гоша,,2019.0,комедии,Россия,0.0,16.0,,Михаил Миронов,"Мкртыч Арзуманян, Виктория Рунцова","Добродушный Гоша не может выйти из дома, чтобы...","Гоша, 2019, Россия"


In [16]:
items.isnull().sum()

item_id             0
content_type        0
title               0
title_orig       4745
release_year       98
genres              0
countries          37
for_kids        15397
age_rating          2
studios         14898
directors        1509
actors           2619
description         2
keywords          423
dtype: int64

In [17]:
items.dtypes

item_id           int64
content_type     object
title            object
title_orig       object
release_year    float64
genres           object
countries        object
for_kids        float64
age_rating      float64
studios          object
directors        object
actors           object
description      object
keywords         object
dtype: object

In [18]:
items.nunique()

item_id         15963
content_type        2
title           15293
title_orig      10575
release_year      105
genres           2771
countries         687
for_kids            2
age_rating          6
studios            38
directors        7973
actors          12996
description     15619
keywords        15492
dtype: int64

In [19]:
items["countries"].fillna("Unknown", inplace=True)

In [20]:
items["genres"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
items["countries"] = items["countries"].str.lower().str.replace(", ", ",", regex=False).str.split(",")

In [21]:
features.update(
    {
        "items": ["content_type", "genres", "countries"]
    }
)

**Фичи**, которые **будут использоваться** для users-ов и item-ов соответственно

In [22]:
features

{'users': ['age', 'income', 'sex'],
 'items': ['content_type', 'genres', 'countries']}

In [23]:
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

In [24]:
max_date = interactions[Columns.Datetime].max()
max_date

Timestamp('2021-08-22 00:00:00')

In [25]:
# Бинаризация процента просмотренного на 10 частей
interactions[Columns.Weight] = pd.qcut(
    interactions["watched_pct"], 10, duplicates = "drop"
).astype("category").cat.codes

# Create CV

В соответствии с ДЗ-3 **валидацию** сделаем по **7 дней**

In [26]:
def create_data_range(
    last_date: pd.Timestamp, 
    n_folds: int = 7, 
    unit: str = "W", 
    n_units: int = 1, 
    show: bool = True,
):
    periods = n_folds + 1
    freq = f"{n_units}{unit}"
    
    start_date = last_date - pd.Timedelta(n_folds * n_units + n_units, unit=unit)  
    
    date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
    
    if show:
        print(
            f"start_date: {start_date}\n"
            f"last_date: {last_date}\n"
            f"periods: {periods}\n"
            f"freq: {freq}\n"
            f"Test fold borders: {date_range.values.astype('datetime64[D]')}\n"
        )
        
    return date_range

Создадим **три фолда** в силу ограниченности по времени

In [27]:
CONFIG_CV = {
    "cv_v1": {
        "n_folds": 3,
        "unit": "W",
        "n_units": 1,
    }, 
}

In [28]:
date_range = create_data_range(
    max_date, 
    n_folds=CONFIG_CV["cv_v1"]["n_folds"], 
    unit=CONFIG_CV["cv_v1"]["unit"], 
    n_units=CONFIG_CV["cv_v1"]["n_units"]
)

start_date: 2021-07-25 00:00:00
last_date: 2021-08-22 00:00:00
periods: 4
freq: 1W
Test fold borders: ['2021-07-25' '2021-08-01' '2021-08-08' '2021-08-15']



In [29]:
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [30]:
from rectools.dataset import Interactions
print(f"Real number of folds: {cv.get_n_splits(Interactions(interactions))}")

Real number of folds: 3


# Create Metrics

In [31]:
metrics = {
    "prec@10": Precision(k=10),
    "recall@10": Recall(k=10),
    "MAP@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
    "serendipity": Serendipity(k=10),
}

metrics

{'prec@10': Precision(k=10),
 'recall@10': Recall(k=10),
 'MAP@10': MAP(k=10, divide_by_k=False),
 'novelty': MeanInvUserFreq(k=10),
 'serendipity': Serendipity(k=10)}

# Models

**Модель на основе популряного возьмем из ДЗ-3**. 
Холодным юзерам будет возвращаться популярное

In [32]:
# hack from https://gist.github.com/crcrpar/c6d6fdf8112280654884353d6e68f6bb
suggested_params = {}


def create_features_dataset(col, train, df_value, features):
    """
        Create feature dataset for users / items
    """
    df_value = df_value.loc[df_value[col].isin(train[col])].copy()
    
    features_frames = []
    for feature in features:           
        feature_frame = df_value.reindex(columns=[col, feature])
        if feature in ["genres", "countries"]:
            feature_frame = feature_frame.explode(feature)
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        features_frames.append(feature_frame)
    
    return pd.concat(features_frames)


def cross_val(model, cv, interactions_df, user_df, item_df, features, k_recos):
    """
        Cross validation for models
    """
    results = list()
    fold_iterator = cv.split(Interactions(interactions), collect_fold_stats=True)
    
    for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
        print(f"\n==================== Fold {i_fold}")

        train = interactions.iloc[train_ids].copy()
        test = interactions.iloc[test_ids][Columns.UserItem].copy()
        
        train.drop(train.query("total_dur < 300").index, inplace=True)
        cold_users = set(test[Columns.User]) - set(train[Columns.User])
        test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)
        
        user_features = create_features_dataset(Columns.User, train, user_df, features["users"])
        item_features = create_features_dataset(Columns.Item, train, item_df, features["items"])
        
        dataset = Dataset.construct(
            interactions_df=train,
            user_features_df=user_features,
            cat_user_features=features["users"],
            item_features_df=item_features,
            cat_item_features=features["items"]
        )
        
        user_test = test[Columns.User].unique() 
        catalog = train[Columns.Item].unique()
        
        model.fit(dataset)

        recos = model.recommend(
            users=user_test,
            dataset=dataset,
            k=k_recos,
            filter_viewed=True,
        )
        
        metric_values = calc_metrics(metrics, recos, test, train, catalog)
        results.append(metric_values) 
    
    df_result = pd.DataFrame(results)
    
    return df_result.mean(axis=0).tolist()


def objective_als(trial, config, cv, interactions_df, user_df, item_df, features, metrics):
    """
    Function for select hyperparameters for ALS model
    """
    MODEL = "ALS"
    
    n_factors = trial.suggest_categorical("n_factors", config["N_FACTORS"])
    
    is_fitting_features = trial.suggest_categorical(
        "is_fitting_features", 
        config["is_fitting_features"]
    )

    if f"{MODEL}_{n_factors}_{is_fitting_features}" in suggested_params:
        suggested_params[f"{MODEL}_{n_factors}_{is_fitting_features}"] += 1
        raise optuna.exceptions.TrialPruned()
    else:
        suggested_params[f"{MODEL}_{n_factors}_{is_fitting_features}"] = 1
    
    model = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=n_factors, 
            random_state=config["RANDOM_STATE"], 
            num_threads=config["NUM_THREADS"],
        ), 
        fit_features_together=is_fitting_features
    )
    
    trial.set_user_attr("n_factors", n_factors)
    trial.set_user_attr("is_fitting_features", is_fitting_features)
    
    print(f"***RUN***\nmodel: {MODEL}\nN_FACTORS: {n_factors}\nis_fitting_features: {is_fitting_features}\n")
        
    return cross_val(model, cv, interactions_df, user_df, item_df, features, k_recos=config["K_RECOS"])


def objective_lfm(trial, config, cv, interactions_df, user_df, item_df, features, metrics):
    """
    Function for select hyperparameters for LightFM model
    """
    MODEL = "LightFM"
    
    n_factors = trial.suggest_categorical("n_factors", config["N_FACTORS"])
    losses = trial.suggest_categorical("losses", config["lightfm_losses"])
    user_alpha = trial.suggest_categorical("user_alpha", config["USER_ALPHA"])
    item_alpha = trial.suggest_categorical("item_alpha", config["ITEM_ALPHA"])
    
    if f"{MODEL}_{n_factors}_{losses}_{user_alpha}_{item_alpha}" in suggested_params:
        suggested_params[f"{MODEL}_{n_factors}_{losses}_{user_alpha}_{item_alpha}"] += 1
        raise optuna.exceptions.TrialPruned()
    else:
        suggested_params[f"{MODEL}_{n_factors}_{losses}_{user_alpha}_{item_alpha}"] = 1
    
    model = LightFMWrapperModel(
        model=LightFM(
            no_components=n_factors, 
            loss=losses, 
            random_state=config["RANDOM_STATE"], 
            learning_rate=config["LEARNING_RATE"], 
            user_alpha=user_alpha,
            item_alpha=item_alpha,
        ),
        epochs=config["N_EPOCHS"],
        num_threads=config["NUM_THREADS"],
    )
    
    trial.set_user_attr("n_factors", n_factors)
    trial.set_user_attr("losses", losses)
    trial.set_user_attr("user_alpha", user_alpha)
    trial.set_user_attr("losses", item_alpha)
    
    print(f"***RUN***model: {MODEL}\nN_FACTORS: {n_factors}\nloss: {losses}\nuser_alpha: {user_alpha}\nitem_alpha: {item_alpha}\n")

    return cross_val(model, cv, interactions_df, user_df, item_df, features, k_recos=config["K_RECOS"])

In [49]:
CONFIG_MODELS = {
    "K_RECOS": 10,
    "RANDOM_STATE": 42,
    "NUM_THREADS": 16,
    "N_FACTORS": (16, 32, ),
    "is_fitting_features": (False, True),  # ALS
    "N_EPOCHS": 10,  # Lightfm
    "USER_ALPHA": (0,),  # Lightfm 
    "ITEM_ALPHA": (0,),  # Lightfm
    "LEARNING_RATE": 0.05,  # Lightfm
    "lightfm_losses": ('bpr', 'warp', ),  # Lightfm
}

In [34]:
%%time

study_als = optuna.create_study(
    study_name="ALS",
    sampler=TPESampler(seed=42),
    directions=['maximize', 'maximize', 'maximize', 'maximize', 'maximize']
)
study_als.optimize(
    lambda trial: objective_als(
        trial=trial, 
        config=CONFIG_MODELS, 
        cv=cv, 
        interactions_df=interactions, 
        user_df=users, 
        item_df=items, 
        features=features, 
        metrics=metrics
    ),
    n_trials=int(
        len(CONFIG_MODELS["N_FACTORS"]) * 
        len(CONFIG_MODELS["is_fitting_features"]) * 
        2
    )
)

[32m[I 2022-12-12 13:24:19,615][0m A new study created in memory with name: ALS[0m


***RUN***
model: ALS
N_FACTORS: 32
is_fitting_features: False






[32m[I 2022-12-12 13:29:55,593][0m Trial 0 finished with values: [0.033030692508210814, 0.15920183057692003, 0.0702827179628595, 5.17550196751981, 6.158320385214538e-05] and parameters: {'n_factors': 32, 'is_fitting_features': False}. [0m


***RUN***
model: ALS
N_FACTORS: 16
is_fitting_features: True






[32m[I 2022-12-12 13:37:08,786][0m Trial 1 finished with values: [0.03202969841185801, 0.1608645253011474, 0.08259661188671252, 4.824391508634623, 2.8752851349528606e-05] and parameters: {'n_factors': 16, 'is_fitting_features': True}. [0m


***RUN***
model: ALS
N_FACTORS: 32
is_fitting_features: True






[32m[I 2022-12-12 13:44:47,897][0m Trial 2 finished with values: [0.032152985846619395, 0.1615385029437911, 0.0831643603394286, 4.7999162605677075, 2.8024957212082277e-05] and parameters: {'n_factors': 32, 'is_fitting_features': True}. [0m
[32m[I 2022-12-12 13:44:47,898][0m Trial 3 pruned. [0m
[32m[I 2022-12-12 13:44:47,899][0m Trial 4 pruned. [0m
[32m[I 2022-12-12 13:44:47,901][0m Trial 5 pruned. [0m
[32m[I 2022-12-12 13:44:47,902][0m Trial 6 pruned. [0m


***RUN***
model: ALS
N_FACTORS: 16
is_fitting_features: False






[32m[I 2022-12-12 13:49:55,795][0m Trial 7 finished with values: [0.033457762344784286, 0.1625549617602303, 0.07228444822316231, 4.860120719285789, 3.719975293722041e-05] and parameters: {'n_factors': 16, 'is_fitting_features': False}. [0m


CPU times: user 1h 1min 40s, sys: 15min 26s, total: 1h 17min 6s
Wall time: 25min 36s


In [38]:
metrics

{'prec@10': Precision(k=10),
 'recall@10': Recall(k=10),
 'MAP@10': MAP(k=10, divide_by_k=False),
 'novelty': MeanInvUserFreq(k=10),
 'serendipity': Serendipity(k=10)}

In [39]:
log_study_als = study_als.trials_dataframe()
log_study_als.rename(
    columns={
        "values_0": "prec@10", "values_1": "recall@10", "values_2": "MAP@10", "values_3": "novelty", 
        "values_4": "serendipity"
    }, 
    inplace=True
)
log_study_als

Unnamed: 0,number,prec@10,recall@10,MAP@10,novelty,serendipity,datetime_start,datetime_complete,duration,params_is_fitting_features,params_n_factors,user_attrs_is_fitting_features,user_attrs_n_factors,state
0,0,0.033031,0.159202,0.070283,5.175502,6.2e-05,2022-12-12 13:24:19.617119,2022-12-12 13:29:55.593706,0 days 00:05:35.976587,False,32,False,32.0,COMPLETE
1,1,0.03203,0.160865,0.082597,4.824392,2.9e-05,2022-12-12 13:29:55.594592,2022-12-12 13:37:08.785865,0 days 00:07:13.191273,True,16,True,16.0,COMPLETE
2,2,0.032153,0.161539,0.083164,4.799916,2.8e-05,2022-12-12 13:37:08.786772,2022-12-12 13:44:47.897142,0 days 00:07:39.110370,True,32,True,32.0,COMPLETE
3,3,,,,,,2022-12-12 13:44:47.898363,2022-12-12 13:44:47.898769,0 days 00:00:00.000406,True,16,,,PRUNED
4,4,,,,,,2022-12-12 13:44:47.899255,2022-12-12 13:44:47.899665,0 days 00:00:00.000410,False,32,,,PRUNED
5,5,,,,,,2022-12-12 13:44:47.900641,2022-12-12 13:44:47.900944,0 days 00:00:00.000303,True,16,,,PRUNED
6,6,,,,,,2022-12-12 13:44:47.901530,2022-12-12 13:44:47.901959,0 days 00:00:00.000429,True,32,,,PRUNED
7,7,0.033458,0.162555,0.072284,4.860121,3.7e-05,2022-12-12 13:44:47.902573,2022-12-12 13:49:55.795716,0 days 00:05:07.893143,False,16,False,16.0,COMPLETE


In [45]:
log_study_als.drop(log_study_als[log_study_als["state"] != "COMPLETE"].index, axis=0, inplace=True)
log_study_als

Unnamed: 0,number,prec@10,recall@10,MAP@10,novelty,serendipity,datetime_start,datetime_complete,duration,params_is_fitting_features,params_n_factors,user_attrs_is_fitting_features,user_attrs_n_factors,state
0,0,0.033031,0.159202,0.070283,5.175502,6.2e-05,2022-12-12 13:24:19.617119,2022-12-12 13:29:55.593706,0 days 00:05:35.976587,False,32,False,32.0,COMPLETE
1,1,0.03203,0.160865,0.082597,4.824392,2.9e-05,2022-12-12 13:29:55.594592,2022-12-12 13:37:08.785865,0 days 00:07:13.191273,True,16,True,16.0,COMPLETE
2,2,0.032153,0.161539,0.083164,4.799916,2.8e-05,2022-12-12 13:37:08.786772,2022-12-12 13:44:47.897142,0 days 00:07:39.110370,True,32,True,32.0,COMPLETE
7,7,0.033458,0.162555,0.072284,4.860121,3.7e-05,2022-12-12 13:44:47.902573,2022-12-12 13:49:55.795716,0 days 00:05:07.893143,False,16,False,16.0,COMPLETE


In [54]:
log_study_als["model"] = "ALS"
log_study_als.to_csv("../data/hw_4/log_study_als.csv", index=False)

In [51]:
%%time

study_lfm = optuna.create_study(
    study_name="LightFM",
    sampler=TPESampler(seed=42),
    directions=['maximize', 'maximize', 'maximize', 'maximize', 'maximize']
)
study_lfm.optimize(
    lambda trial: objective_lfm(
        trial=trial, 
        config=CONFIG_MODELS, 
        cv=cv, 
        interactions_df=interactions, 
        user_df=users, 
        item_df=items, 
        features=features, 
        metrics=metrics
    ),
    n_trials=int(
        len(CONFIG_MODELS["N_FACTORS"]) *
        len(CONFIG_MODELS["USER_ALPHA"]) * 
        len(CONFIG_MODELS["ITEM_ALPHA"]) *  
        len(CONFIG_MODELS["lightfm_losses"]) *
        2
    )
)

[32m[I 2022-12-12 14:01:05,529][0m A new study created in memory with name: LightFM[0m


***RUN***model: LightFM
N_FACTORS: 32
loss: bpr
user_alpha: 0
item_alpha: 0






[32m[I 2022-12-12 14:07:31,301][0m Trial 0 finished with values: [0.0068209394096222925, 0.03101255683019741, 0.015230800287448587, 12.068082761810189, 0.00011798481324603613] and parameters: {'n_factors': 32, 'losses': 'bpr', 'user_alpha': 0, 'item_alpha': 0}. [0m


***RUN***model: LightFM
N_FACTORS: 16
loss: warp
user_alpha: 0
item_alpha: 0






[32m[I 2022-12-12 14:10:10,527][0m Trial 1 finished with values: [0.03417229586777249, 0.1679148467818227, 0.08044566974378901, 5.1767501080041, 6.310733976519341e-05] and parameters: {'n_factors': 16, 'losses': 'warp', 'user_alpha': 0, 'item_alpha': 0}. [0m


***RUN***model: LightFM
N_FACTORS: 32
loss: warp
user_alpha: 0
item_alpha: 0






[32m[I 2022-12-12 14:13:13,837][0m Trial 2 finished with values: [0.03387333769793409, 0.16619084263850772, 0.0790568310893865, 5.508329186408287, 9.017829915307326e-05] and parameters: {'n_factors': 32, 'losses': 'warp', 'user_alpha': 0, 'item_alpha': 0}. [0m
[32m[I 2022-12-12 14:13:13,838][0m Trial 3 pruned. [0m
[32m[I 2022-12-12 14:13:13,839][0m Trial 4 pruned. [0m
[32m[I 2022-12-12 14:13:13,840][0m Trial 5 pruned. [0m
[32m[I 2022-12-12 14:13:13,841][0m Trial 6 pruned. [0m


***RUN***model: LightFM
N_FACTORS: 16
loss: bpr
user_alpha: 0
item_alpha: 0






[32m[I 2022-12-12 14:17:50,320][0m Trial 7 finished with values: [0.005295678169442308, 0.02416944628414099, 0.012153373034598803, 12.79936887733164, 8.33351124291126e-05] and parameters: {'n_factors': 16, 'losses': 'bpr', 'user_alpha': 0, 'item_alpha': 0}. [0m


CPU times: user 1h 32min 28s, sys: 2.67 s, total: 1h 32min 31s
Wall time: 16min 44s


In [55]:
log_study_lfm = study_lfm.trials_dataframe()
log_study_lfm.rename(
    columns={
        "values_0": "prec@10", "values_1": "recall@10", "values_2": "MAP@10", "values_3": "novelty", 
        "values_4": "serendipity"
    }, 
    inplace=True
)
log_study_lfm.drop(log_study_lfm[log_study_lfm["state"] != "COMPLETE"].index, axis=0, inplace=True)
log_study_lfm["model"] = "LighFM"

In [56]:
log_study_lfm

Unnamed: 0,number,prec@10,recall@10,MAP@10,novelty,serendipity,datetime_start,datetime_complete,duration,params_item_alpha,params_losses,params_n_factors,params_user_alpha,user_attrs_losses,user_attrs_n_factors,user_attrs_user_alpha,state,model
0,0,0.006821,0.031013,0.015231,12.068083,0.000118,2022-12-12 14:01:05.530755,2022-12-12 14:07:31.301238,0 days 00:06:25.770483,0,bpr,32,0,0.0,32.0,0.0,COMPLETE,LighFM
1,1,0.034172,0.167915,0.080446,5.17675,6.3e-05,2022-12-12 14:07:31.302396,2022-12-12 14:10:10.527654,0 days 00:02:39.225258,0,warp,16,0,0.0,16.0,0.0,COMPLETE,LighFM
2,2,0.033873,0.166191,0.079057,5.508329,9e-05,2022-12-12 14:10:10.528535,2022-12-12 14:13:13.837143,0 days 00:03:03.308608,0,warp,32,0,0.0,32.0,0.0,COMPLETE,LighFM
7,7,0.005296,0.024169,0.012153,12.799369,8.3e-05,2022-12-12 14:13:13.842459,2022-12-12 14:17:50.319895,0 days 00:04:36.477436,0,bpr,16,0,0.0,16.0,0.0,COMPLETE,LighFM


In [57]:
log_study_lfm.to_csv("../data/hw_4/log_study_lfm.csv", index=False)

**Лучшие результаты обучения показывает модель lighFM с лоссом warp** подберем для моделей с данным лоссом оптимальное user_alpha и item_alpha

In [58]:
CONFIG_MODELS_LFM = {
    "K_RECOS": 10,
    "RANDOM_STATE": 42,
    "NUM_THREADS": 16,
    "N_FACTORS": (16, 32, ),
    "N_EPOCHS": 10,  # Lightfm
    "USER_ALPHA": (0, 0.2),  # Lightfm 
    "ITEM_ALPHA": (0, 0.2),  # Lightfm
    "LEARNING_RATE": 0.05,  # Lightfm
    "lightfm_losses": ('warp', ),  # Lightfm
}

In [59]:
%%time

study_lfm_alpha = optuna.create_study(
    study_name="LightFM_alpha",
    sampler=TPESampler(seed=42),
    directions=['maximize', 'maximize', 'maximize', 'maximize', 'maximize']
)
study_lfm_alpha.optimize(
    lambda trial: objective_lfm(
        trial=trial, 
        config=CONFIG_MODELS_LFM, 
        cv=cv, 
        interactions_df=interactions, 
        user_df=users, 
        item_df=items, 
        features=features, 
        metrics=metrics
    ),
    n_trials=int(
        len(CONFIG_MODELS["N_FACTORS"]) *
        len(CONFIG_MODELS["USER_ALPHA"]) * 
        len(CONFIG_MODELS["ITEM_ALPHA"]) *
        2
    )
)

[32m[I 2022-12-12 14:25:21,276][0m A new study created in memory with name: LightFM_alpha[0m
[32m[I 2022-12-12 14:25:21,279][0m Trial 0 pruned. [0m


***RUN***model: LightFM
N_FACTORS: 32
loss: warp
user_alpha: 0.2
item_alpha: 0.2






[32m[I 2022-12-12 14:29:36,116][0m Trial 1 finished with values: [0.034278189254484864, 0.17449544816064208, 0.08483012091477123, 3.766206685818005, 2.354521448242052e-06] and parameters: {'n_factors': 32, 'losses': 'warp', 'user_alpha': 0.2, 'item_alpha': 0.2}. [0m


***RUN***model: LightFM
N_FACTORS: 16
loss: warp
user_alpha: 0.2
item_alpha: 0.2






[32m[I 2022-12-12 14:32:43,095][0m Trial 2 finished with values: [0.0344082365349989, 0.175267733724092, 0.08524555380380662, 3.7785924768795187, 2.3107020986006132e-06] and parameters: {'n_factors': 16, 'losses': 'warp', 'user_alpha': 0.2, 'item_alpha': 0.2}. [0m


***RUN***model: LightFM
N_FACTORS: 16
loss: warp
user_alpha: 0
item_alpha: 0.2






[32m[I 2022-12-12 14:37:16,325][0m Trial 3 finished with values: [8.121216487323505e-05, 0.00023670865324451867, 6.592145604060793e-05, 16.300173367831395, 4.115795688696286e-06] and parameters: {'n_factors': 16, 'losses': 'warp', 'user_alpha': 0, 'item_alpha': 0.2}. [0m


CPU times: user 1h 1min 35s, sys: 2.72 s, total: 1h 1min 38s
Wall time: 11min 55s


In [60]:
log_study_lfm_alpha = study_lfm_alpha.trials_dataframe()
log_study_lfm_alpha.rename(
    columns={
        "values_0": "prec@10", "values_1": "recall@10", "values_2": "MAP@10", "values_3": "novelty", 
        "values_4": "serendipity"
    }, 
    inplace=True
)
log_study_lfm_alpha.drop(
    log_study_lfm_alpha[log_study_lfm_alpha["state"] != "COMPLETE"].index, axis=0, inplace=True
)
log_study_lfm_alpha["model"] = "LighFM"
log_study_lfm_alpha

Unnamed: 0,number,prec@10,recall@10,MAP@10,novelty,serendipity,datetime_start,datetime_complete,duration,params_item_alpha,params_losses,params_n_factors,params_user_alpha,user_attrs_losses,user_attrs_n_factors,user_attrs_user_alpha,state,model
1,1,0.034278,0.174495,0.08483,3.766207,2e-06,2022-12-12 14:25:21.279916,2022-12-12 14:29:36.116148,0 days 00:04:14.836232,0.2,warp,32,0.2,0.2,32.0,0.2,COMPLETE,LighFM
2,2,0.034408,0.175268,0.085246,3.778592,2e-06,2022-12-12 14:29:36.116969,2022-12-12 14:32:43.095291,0 days 00:03:06.978322,0.2,warp,16,0.2,0.2,16.0,0.2,COMPLETE,LighFM
3,3,8.1e-05,0.000237,6.6e-05,16.300173,4e-06,2022-12-12 14:32:43.096137,2022-12-12 14:37:16.325048,0 days 00:04:33.228911,0.2,warp,16,0.0,0.2,16.0,0.0,COMPLETE,LighFM


In [61]:
log_study_lfm = pd.concat([log_study_lfm, log_study_lfm_alpha], ignore_index=True)
log_study_lfm

Unnamed: 0,number,prec@10,recall@10,MAP@10,novelty,serendipity,datetime_start,datetime_complete,duration,params_item_alpha,params_losses,params_n_factors,params_user_alpha,user_attrs_losses,user_attrs_n_factors,user_attrs_user_alpha,state,model
0,0,0.006821,0.031013,0.015231,12.068083,0.000118,2022-12-12 14:01:05.530755,2022-12-12 14:07:31.301238,0 days 00:06:25.770483,0.0,bpr,32,0.0,0.0,32.0,0.0,COMPLETE,LighFM
1,1,0.034172,0.167915,0.080446,5.17675,6.3e-05,2022-12-12 14:07:31.302396,2022-12-12 14:10:10.527654,0 days 00:02:39.225258,0.0,warp,16,0.0,0.0,16.0,0.0,COMPLETE,LighFM
2,2,0.033873,0.166191,0.079057,5.508329,9e-05,2022-12-12 14:10:10.528535,2022-12-12 14:13:13.837143,0 days 00:03:03.308608,0.0,warp,32,0.0,0.0,32.0,0.0,COMPLETE,LighFM
3,7,0.005296,0.024169,0.012153,12.799369,8.3e-05,2022-12-12 14:13:13.842459,2022-12-12 14:17:50.319895,0 days 00:04:36.477436,0.0,bpr,16,0.0,0.0,16.0,0.0,COMPLETE,LighFM
4,1,0.034278,0.174495,0.08483,3.766207,2e-06,2022-12-12 14:25:21.279916,2022-12-12 14:29:36.116148,0 days 00:04:14.836232,0.2,warp,32,0.2,0.2,32.0,0.2,COMPLETE,LighFM
5,2,0.034408,0.175268,0.085246,3.778592,2e-06,2022-12-12 14:29:36.116969,2022-12-12 14:32:43.095291,0 days 00:03:06.978322,0.2,warp,16,0.2,0.2,16.0,0.2,COMPLETE,LighFM
6,3,8.1e-05,0.000237,6.6e-05,16.300173,4e-06,2022-12-12 14:32:43.096137,2022-12-12 14:37:16.325048,0 days 00:04:33.228911,0.2,warp,16,0.0,0.2,16.0,0.0,COMPLETE,LighFM


In [62]:
log_study_lfm.to_csv("../data/hw_4/log_study_lfm.csv", index=False)

# Анализ метрик

In [63]:
log_study_als = pd.read_csv("../data/hw_4/log_study_als.csv")
log_study_lfm = pd.read_csv("../data/hw_4/log_study_lfm.csv")

In [64]:
log_study_als

Unnamed: 0,number,prec@10,recall@10,MAP@10,novelty,serendipity,datetime_start,datetime_complete,duration,params_is_fitting_features,params_n_factors,user_attrs_is_fitting_features,user_attrs_n_factors,state,model
0,0,0.033031,0.159202,0.070283,5.175502,6.2e-05,2022-12-12 13:24:19.617119,2022-12-12 13:29:55.593706,0 days 00:05:35.976587,False,32,False,32.0,COMPLETE,ALS
1,1,0.03203,0.160865,0.082597,4.824392,2.9e-05,2022-12-12 13:29:55.594592,2022-12-12 13:37:08.785865,0 days 00:07:13.191273,True,16,True,16.0,COMPLETE,ALS
2,2,0.032153,0.161539,0.083164,4.799916,2.8e-05,2022-12-12 13:37:08.786772,2022-12-12 13:44:47.897142,0 days 00:07:39.110370,True,32,True,32.0,COMPLETE,ALS
3,7,0.033458,0.162555,0.072284,4.860121,3.7e-05,2022-12-12 13:44:47.902573,2022-12-12 13:49:55.795716,0 days 00:05:07.893143,False,16,False,16.0,COMPLETE,ALS


In [65]:
log_study_lfm

Unnamed: 0,number,prec@10,recall@10,MAP@10,novelty,serendipity,datetime_start,datetime_complete,duration,params_item_alpha,params_losses,params_n_factors,params_user_alpha,user_attrs_losses,user_attrs_n_factors,user_attrs_user_alpha,state,model
0,0,0.006821,0.031013,0.015231,12.068083,0.000118,2022-12-12 14:01:05.530755,2022-12-12 14:07:31.301238,0 days 00:06:25.770483,0.0,bpr,32,0.0,0.0,32.0,0.0,COMPLETE,LighFM
1,1,0.034172,0.167915,0.080446,5.17675,6.3e-05,2022-12-12 14:07:31.302396,2022-12-12 14:10:10.527654,0 days 00:02:39.225258,0.0,warp,16,0.0,0.0,16.0,0.0,COMPLETE,LighFM
2,2,0.033873,0.166191,0.079057,5.508329,9e-05,2022-12-12 14:10:10.528535,2022-12-12 14:13:13.837143,0 days 00:03:03.308608,0.0,warp,32,0.0,0.0,32.0,0.0,COMPLETE,LighFM
3,7,0.005296,0.024169,0.012153,12.799369,8.3e-05,2022-12-12 14:13:13.842459,2022-12-12 14:17:50.319895,0 days 00:04:36.477436,0.0,bpr,16,0.0,0.0,16.0,0.0,COMPLETE,LighFM
4,1,0.034278,0.174495,0.08483,3.766207,2e-06,2022-12-12 14:25:21.279916,2022-12-12 14:29:36.116148,0 days 00:04:14.836232,0.2,warp,32,0.2,0.2,32.0,0.2,COMPLETE,LighFM
5,2,0.034408,0.175268,0.085246,3.778592,2e-06,2022-12-12 14:29:36.116969,2022-12-12 14:32:43.095291,0 days 00:03:06.978322,0.2,warp,16,0.2,0.2,16.0,0.2,COMPLETE,LighFM
6,3,8.1e-05,0.000237,6.6e-05,16.300173,4e-06,2022-12-12 14:32:43.096137,2022-12-12 14:37:16.325048,0 days 00:04:33.228911,0.2,warp,16,0.0,0.2,16.0,0.0,COMPLETE,LighFM


In [68]:
pd.concat([log_study_als, log_study_lfm], ignore_index=True)

Unnamed: 0,number,prec@10,recall@10,MAP@10,novelty,serendipity,datetime_start,datetime_complete,duration,params_is_fitting_features,params_n_factors,user_attrs_is_fitting_features,user_attrs_n_factors,state,model,params_item_alpha,params_losses,params_user_alpha,user_attrs_losses,user_attrs_user_alpha
0,0,0.033031,0.159202,0.070283,5.175502,6.2e-05,2022-12-12 13:24:19.617119,2022-12-12 13:29:55.593706,0 days 00:05:35.976587,False,32,False,32.0,COMPLETE,ALS,,,,,
1,1,0.03203,0.160865,0.082597,4.824392,2.9e-05,2022-12-12 13:29:55.594592,2022-12-12 13:37:08.785865,0 days 00:07:13.191273,True,16,True,16.0,COMPLETE,ALS,,,,,
2,2,0.032153,0.161539,0.083164,4.799916,2.8e-05,2022-12-12 13:37:08.786772,2022-12-12 13:44:47.897142,0 days 00:07:39.110370,True,32,True,32.0,COMPLETE,ALS,,,,,
3,7,0.033458,0.162555,0.072284,4.860121,3.7e-05,2022-12-12 13:44:47.902573,2022-12-12 13:49:55.795716,0 days 00:05:07.893143,False,16,False,16.0,COMPLETE,ALS,,,,,
4,0,0.006821,0.031013,0.015231,12.068083,0.000118,2022-12-12 14:01:05.530755,2022-12-12 14:07:31.301238,0 days 00:06:25.770483,,32,,32.0,COMPLETE,LighFM,0.0,bpr,0.0,0.0,0.0
5,1,0.034172,0.167915,0.080446,5.17675,6.3e-05,2022-12-12 14:07:31.302396,2022-12-12 14:10:10.527654,0 days 00:02:39.225258,,16,,16.0,COMPLETE,LighFM,0.0,warp,0.0,0.0,0.0
6,2,0.033873,0.166191,0.079057,5.508329,9e-05,2022-12-12 14:10:10.528535,2022-12-12 14:13:13.837143,0 days 00:03:03.308608,,32,,32.0,COMPLETE,LighFM,0.0,warp,0.0,0.0,0.0
7,7,0.005296,0.024169,0.012153,12.799369,8.3e-05,2022-12-12 14:13:13.842459,2022-12-12 14:17:50.319895,0 days 00:04:36.477436,,16,,16.0,COMPLETE,LighFM,0.0,bpr,0.0,0.0,0.0
8,1,0.034278,0.174495,0.08483,3.766207,2e-06,2022-12-12 14:25:21.279916,2022-12-12 14:29:36.116148,0 days 00:04:14.836232,,32,,32.0,COMPLETE,LighFM,0.2,warp,0.2,0.2,0.2
9,2,0.034408,0.175268,0.085246,3.778592,2e-06,2022-12-12 14:29:36.116969,2022-12-12 14:32:43.095291,0 days 00:03:06.978322,,16,,16.0,COMPLETE,LighFM,0.2,warp,0.2,0.2,0.2


По метрике **MAP@10** выберем по одной лучшей модели из ALS и LighFM:

**ALS MAP@10 = 0.083164** 
 - params_is_fitting_features = True
 - params_n_factors = 32
 
**LighFM MAP@10 = 0.085246** 
 - params_n_factors = 16
 - params_losses = warp
 - params_item_alpha = 0.2
 - params_user_alpha = 0.2

# Обучение лучших моделей с валидации, добавление 3-ёх аватаров, метод приближенного поиска соседей при помощи nmslib

## Формирование векторов фичей юзеров и айтемов

In [74]:
interactions.drop(interactions.query("total_dur < 300").index, inplace=True)

**Добавим 3 аватаров**:
1. Возраст от 45-54, доход 60-90, пол мужской, смотрим русские сериалы боевики (id 5555555)
2. Возраст от 18-24, доход 20-40, пол мужской, смотрит фильмы жанра комедия страна любая кроме России (id 6666666)
3. Возраст от 35-44, доход 40-60, пол женский, смотрим и фильмы и сериалы жанра мелодрамы неважна какая страна (id 7777777)

Добавили именно таких юзеров, т.к. можно между ними провести грань, например, предлагаем, что для первого аватара не попадёт в выдачу сериалы с жанром не боевик и будут только русские сериала, возможно в выдачу могут попасть русские фильмы боевики. Для второго пользователя должны рекомендоваться комедии не из России. Для Третьего только мелодрамы

In [142]:
first_cond = items[items["countries"].apply(lambda countries: "россия" in countries)]["item_id"].to_numpy()
second_cond = items[items["genres"].apply(lambda genres: "боевики" in genres)]["item_id"].to_numpy()
first_avatar_items = items[
    (
        (items["content_type"] == "series") & 
        (items["item_id"].isin(first_cond)) &
        (items["item_id"].isin(second_cond))
    )
].reset_index(drop=True)
display_df(first_avatar_items)

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,8434,series,Застывшие депеши,,2010.0,"[боевики, русские, детективы]",[россия],,16.0,,Армен Арутюнян,"Александр Орловский, Анна Литкенс, Артем Семак...",Криминальная сага режиссера Армена Арутюняна с...,"Застывшие, депеши, 2010, Россия, бандиты, ганг..."
1,2074,series,Посредник,Posrednik,1990.0,"[боевики, фантастика]",[россия],,16.0,,В.Потапов,"Олеся Судзиловская, Инара Слуцка, Валерий Стор...",Нависший над землей загадочный шар становится ...,"инопланетянин, открытый космос"
2,415,series,Приступить к ликвидации,Pristupit k likvidatsii,1983.0,"[боевики, криминал]",[россия],,6.0,,Б. Григорьев,"Олег Стриженов, Михаил Жигалов, Василий Ланово...",Весна 1945 года. Близится к концу Великая Отеч...,"агент под прикрытием, детектив, милиция, ограб..."
3,3530,series,Господа офицеры,,2004.0,"[боевики, русские]",[россия],,16.0,,Андрей Кравчук,"Александр Иванов, Аркадий Коваль, Артем Алексе...","Захватывающий многосерийный боевик, рассказыва...","Господа, офицеры, 2004, Россия, армия, интриги..."
4,10839,series,Рокировка,,2004.0,"[боевики, русские, мелодрамы]",[россия],,12.0,,"Владимир Кононенко, Милиана Черкасова, Тимофей...","Александр Баранов, Анатолий Чижиков, Борис Мир...",Успешный бизнесмен Игорь Березин в одночасье л...,"Рокировка, 2004, Россия, друзья, предатели, на..."
74,15443,series,Морпехи,,2011.0,"[драмы, боевики, русские, военные]",[россия],,16.0,,"Дмитрий Матов, Мурад Алиев","Александр Зельский, Александр Лобанов, Алексан...","«Черные береты», «черные дьяволы», «черная сме...","Морпехи, 2011, Россия"
75,8632,series,Бой с тенью (сериал),Boy s tenyu,2005.0,"[боевики, драмы, спорт, криминал]",[россия],,16.0,,Алексей Сидоров,"Денис Никифоров, Елена Панова, Иван Макаревич,...",Артем Колчин – профессиональный боксер и главн...,"россия, боксер, бокс"
76,9591,series,Паршивые овцы,,2010.0,"[боевики, русские, военные]",[россия],,16.0,,Сергей Чекалов,"Алексей Дмитриев, Алексей Лонгин, Андрей Фроло...",Сложившийся в работе над телевизионным сериало...,"Паршивые, овцы, 2010, Россия, бандиты, гангсте..."
77,14389,series,Последний бой майора Пугачёва,Posledniy boy mayora Pugachova,2005.0,"[боевики, драмы, военные]",[россия],,16.0,,Владимир Фатьянов,"Игорь Лифанов, Борис Токарев, Лев Дуров, Игорь...",Действие фильма начинается на территории Польш...,
78,2367,series,Надежда,,2020.0,"[драмы, боевики]",[россия],0.0,18.0,,Елена Хазанова,"Виктория Исакова, Александр Кузьмин, Алексей М...",Оригинальный киносериал от создателей «Бывших»...,"Надежда, 2020, Россия"


In [143]:
first_cond = items[items["countries"].apply(lambda countries: "россия" not in countries)]["item_id"].to_numpy()
second_cond = items[items["genres"].apply(lambda genres: "комедии" in genres)]["item_id"].to_numpy()
second_avatar_items = items[
    (
        (items["content_type"] == "film") & 
        (items["item_id"].isin(first_cond)) &
        (items["item_id"].isin(second_cond))
    )
].reset_index(drop=True)
display_df(second_avatar_items)

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,2508,film,Голые перцы,Search Party,2014.0,"[зарубежные, приключения, комедии]",[сша],,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
1,10716,film,Тактическая сила,Tactical Force,2011.0,"[криминал, зарубежные, триллеры, боевики, коме...",[канада],,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
2,6181,film,"Первая встреча, последняя встреча",,1987.0,"[драмы, советские, комедии]",[ссср],,16.0,Ленфильм,Виталий Мельников,"Борис Плотников, Гражина Шаполовска, Иннокенти...","Инициатива наказуема, особенно в Петрограде вр...","Первая, встреча, последняя, встреча, 1987, ССС..."
3,15076,film,Бладфест,Blood Fest,2018.0,"[зарубежные, ужасы, комедии]",[сша],,18.0,,Оуэн Эгертон,"Барбара Дункельман, Джейкоб Баталон, Крис Дабе...",Фанат хорорров Дакс вместе с друзьями отправля...,"Бладфест, 2018, США, друзья, заговоры, страхи,..."
4,13109,film,Новый парень моей мамы,My Mom's New Boyfriend,2007.0,"[мелодрамы, зарубежные, криминал, комедии]",[германия],,12.0,,Джордж Галло,"Антонио Бандерас, Джон Вальдетеро, Кит Дэвид, ...",«Новый парень моей мамы» – американо-германска...,"Новый, парень, моей, мамы, 2007, Германия, огр..."
2712,8233,film,4.3.2.1 (с тифлокомментарием),4.3.2.1 (with Audio Description),2010.0,"[зарубежные, триллеры, комедии]",[великобритания],,16.0,,"Ноэль Кларк, Марк Дэвис","Адам Дикон, Офелия Ловибонд, Тэмзин Эджертон, ...","Джоанн работает в магазине, Кассандра стремитс...","4321, тифлокомментарием, 2010, Великобритания"
2713,3563,film,Утро без отметок,Utro bez otmetok,1983.0,"[семейное, комедии]",[ссср],,6.0,,Владимир Мартынов,"Кирилл Головко-Серский, Мария Вартикова, Павел...",Веселый и поучительный фильм про шестилетнего ...,"дети и семья, Дети и семья, 1983, su, утро, бе..."
2714,13074,film,Маленькая мисс Дулиттл,Liliane Susewind - Ein tierisches Abenteuer,2018.0,"[фэнтези, семейное, криминал, комедии]",[германия],,6.0,,Йоахим Масаннек,"Малу Ляйхер, Пери Баумайстер, Том Бек, Айлин Т...","Рыжеволосая Лилиан не просто обожает животных,...",", Вор, Зоопарк, Комбинезон, Разговоры с животн..."
2715,10372,film,Правила секса,"RULES OF ATTRACTION, THE",2002.0,"[драмы, мелодрамы, комедии]","[германия, сша]",,18.0,,Роджер Эвери,"Джеймс Ван Дер Бик, Шаннин Соссамон, Джессика ...",Шон Бэйтман позволяет себе только мимолётные с...,"любовный треугольник, самоубийство, изнасилова..."
2716,3364,film,Восьмой класс,Eighth Grade,2018.0,"[драмы, комедии]",[сша],,16.0,,Бо Бёрнэм,"Элси Фишер, Джош Хэмилтон, Эмили Робинсон, Дже...","История Кейлы Дэй — восьмиклассницы, которая в...","подростковый возраст, совершеннолетие, девочка..."


In [144]:
first_cond = items[items["genres"].apply(lambda genres: "мелодрамы" in genres)]["item_id"].to_numpy()
third_avatar_items = items[
    (
        (items["item_id"].isin(first_cond))
    )
].reset_index(drop=True)
display_df(third_avatar_items)

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"[драмы, зарубежные, детективы, мелодрамы]",[испания],,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,7868,film,45 лет,45 Years,2015.0,"[драмы, зарубежные, мелодрамы]",[великобритания],,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
2,16268,film,Все решает мгновение,,1978.0,"[драмы, спорт, советские, мелодрамы]",[ссср],,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."
3,8604,film,Третья попытка,,2013.0,"[русские, мелодрамы]",[россия],,12.0,,Игорь Мужжухин,"Александр Асташенок, Александр Пашков, Андрей ...","Екатерина Рябова, Александр Асташенок и Алекса...","Третья, попытка, 2013, Россия, любовь, измена,..."
4,3526,film,Код «Красный»,Red Joan,2018.0,"[биография, экранизации, драмы, зарубежные, ме...",[великобритания],,18.0,,Тревор Нанн,"Бен Майлз, Джуди Денч, Лоуренс Спэллман, Софи ...",Тихая английская пенсионерка Джоан попадает по...,"Код, Красный, 2018, Великобритания, друзья, лю..."
2773,5473,film,"Чего ждать, когда ждёшь ребенка",WHAT TO EXPECT WHEN YOU'RE EXPECTING,2012.0,"[драмы, мелодрамы]",[сша],,16.0,,Кирк Джонс,"Кэмерон Диаз, Дженнифер Лопес, Элизабет Бэнкс,...","Фильм расскажет историю пяти пар, которым пред...","на основе романа или книги, усыновление, берем..."
2774,10372,film,Правила секса,"RULES OF ATTRACTION, THE",2002.0,"[драмы, мелодрамы, комедии]","[германия, сша]",,18.0,,Роджер Эвери,"Джеймс Ван Дер Бик, Шаннин Соссамон, Джессика ...",Шон Бэйтман позволяет себе только мимолётные с...,"любовный треугольник, самоубийство, изнасилова..."
2775,1922,film,Цзюй Доу,Ju Dou,1990.0,"[драмы, мелодрамы]",[китай],,12.0,,Чжан Имоу,"Гун Ли, Ли Баотянь, Ли Вэй, Чжан И, Цзи-ан Чжэ...","История женщины, проданной в качестве жены бог...","супружеская измена, утопление, садизм, убийств..."
2776,4430,series,Наши соседи,,2010.0,"[русские, мелодрамы]",[россия],,16.0,,"Дмитрий Гольдман, Казбек Меретуков","Александр Зельский, Александр Шаврин, Амбарцум...",«Наши соседи» - сериал о жителях коммуналки. Ш...,"Наши, соседи, 2010, Россия, жизнь, отцы, дети,..."
2777,15610,series,Цена любви,Tsena Lubvi,2015.0,[мелодрамы],[россия],,16.0,,А. Хван,"Анна Невская, Юрий Батурин, Анастасия Матвеева...",Преподаватель университета Елена однажды пришл...,


In [124]:
7777777 in users[Columns.User]

False

In [125]:
6666666 in users[Columns.User]

False

In [126]:
5555555 in users[Columns.User]

False

In [127]:
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
...,...,...,...,...,...
840192,339025,age_65_inf,income_0_20,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1
840194,251008,Unknown,Unknown,Unknown,0
840195,590706,Unknown,Unknown,Ж,0


In [77]:
items_features = create_features_dataset(Columns.Item, interactions, items, features["items"])
display_df(items_features)

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type
15958,6443,германия,countries
15959,2367,россия,countries
15960,10632,россия,countries
15961,4538,россия,countries
15962,3206,россия,countries


In [134]:
users_with_avatar = users.append(
    pd.DataFrame(
        {
            "user_id": [5555555, 6666666, 7777777],
            "age": ["age_45_54", "age_18_24", "age_35_44"],
            "income": ["income_60_90", "income_20_40", "income_40_60"],
            "sex": ["М", "М", "Ж"],
            "kids_flg": [0, 0, 0]    
        },
    ),
    ignore_index=True
)

users_with_avatar.tail(5)

Unnamed: 0,user_id,age,income,sex,kids_flg
840195,590706,Unknown,Unknown,Ж,0
840196,166555,age_65_inf,income_20_40,Ж,0
840197,5555555,age_45_54,income_60_90,М,0
840198,6666666,age_18_24,income_20_40,М,0
840199,7777777,age_35_44,income_40_60,Ж,0


In [135]:
display_df(interactions)

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight
0,176549,9506,2021-05-11,4250,72.0,5.0
1,699317,1659,2021-05-29,8317,100.0,6.0
3,864613,7638,2021-07-05,14483,100.0,6.0
4,964868,9506,2021-04-30,6725,100.0,6.0
5,1032142,6686,2021-05-13,11286,100.0,6.0
5476245,786732,4880,2021-05-12,753,0.0,0.0
5476247,546862,9673,2021-04-13,2308,49.0,4.0
5476248,697262,15297,2021-08-20,18307,63.0,4.0
5476249,384202,16197,2021-04-19,6203,100.0,6.0
5476250,319709,4436,2021-08-15,3921,45.0,4.0


In [140]:
interactions["user_id"].value_counts().describe()

count    809577.000000
mean          5.184158
std           9.285684
min           1.000000
25%           1.000000
50%           2.000000
75%           5.000000
max         461.000000
Name: user_id, dtype: float64

Отберем по **20 item** для каждого нового аватара

In [145]:
first_avatar_items = first_avatar_items.sample(20, random_state=17)["item_id"].to_numpy()
second_avatar_items = second_avatar_items.sample(20, random_state=17)["item_id"].to_numpy()
third_avatar_items = third_avatar_items.sample(20, random_state=17)["item_id"].to_numpy()

In [147]:
interactions['total_dur'].describe()

count    4.196975e+06
mean     1.079820e+04
std      5.523617e+04
min      3.000000e+02
25%      1.646000e+03
50%      5.353000e+03
75%      8.245000e+03
max      8.041167e+07
Name: total_dur, dtype: float64

In [162]:
total_dur = [np.random.randint(500, 10000) for _ in range(60)]

In [161]:
watched_pct, weight = [], []
for _ in range(60):
    random_value = interactions.sample(1)[["watched_pct", "weight"]].values[0]
    watched_pct.append(random_value[0])
    weight.append(random_value[1])

In [173]:
datetime = [
    interactions[interactions[Columns.Datetime] > "2021-05-01"].sample(1)[Columns.Datetime].iloc[0]
    for _ in range(60)
]

In [186]:
interactions_new_avatar = pd.DataFrame(
    {
        Columns.User: [5555555, 6666666, 7777777],
        Columns.Item: [list(first_avatar_items), list(second_avatar_items), list(third_avatar_items)],
        Columns.Datetime: [datetime[:20], datetime[20:40], datetime[40:]],
        "total_dur": [total_dur[:20], total_dur[20:40], total_dur[40:]],
        "watched_pct": [watched_pct[:20], watched_pct[20:40], watched_pct[40:]],
        Columns.Weight: [weight[:20], weight[20:40], weight[40:]],
    }
).explode(
    [Columns.Item, Columns.Datetime, "total_dur", "watched_pct", Columns.Weight]
).reset_index(drop=True)

interactions_new_avatar

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,weight
0,5555555,879,2021-06-21,5230,94.0,5.0
1,5555555,2280,2021-08-21,5476,22.0,3.0
2,5555555,3178,2021-07-04,4843,16.0,3.0
3,5555555,15813,2021-07-24,7414,100.0,6.0
4,5555555,3471,2021-06-27,8583,100.0,6.0
5,5555555,2113,2021-07-20,3288,63.0,4.0
6,5555555,3818,2021-08-20,6726,80.0,5.0
7,5555555,9550,2021-06-14,2662,16.0,3.0
8,5555555,3209,2021-08-16,5060,100.0,6.0
9,5555555,1577,2021-07-26,1400,100.0,6.0


In [187]:
interactions.shape

(4196975, 6)

In [191]:
interactions_with_avatar = pd.concat(
    [
        interactions,
        interactions_new_avatar
    ],
    ignore_index=True
).sample(
    interactions_with_avatar.shape[0]
).reset_index(drop=True)

interactions_with_avatar.shape[0] - interactions.shape[0] == 60

True

In [192]:
interactions_with_avatar.to_csv("../data/kion_train/interactions_with_avatar.csv", index=False)
users_with_avatar.to_csv("../data/kion_train/users_with_avatar.csv", index=False)

In [193]:
users_features = create_features_dataset(
    Columns.User, interactions_with_avatar, users_with_avatar, features["users"]
)
display_df(users_features)

Unnamed: 0,id,value,feature
0,973171,age_25_34,age
1,962099,age_18_24,age
3,721985,age_45_54,age
4,704055,age_35_44,age
5,1037719,age_45_54,age
840189,191349,М,sex
840190,393868,М,sex
840197,5555555,М,sex
840198,6666666,М,sex
840199,7777777,Ж,sex


In [234]:
users_features["id"].unique().shape

(626278,)

In [194]:
items_features = create_features_dataset(
    Columns.Item, interactions_with_avatar, items, features["items"]
)
display_df(items_features)

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type
15958,6443,германия,countries
15959,2367,россия,countries
15960,10632,россия,countries
15961,4538,россия,countries
15962,3206,россия,countries


In [254]:
users_mapping = dict(enumerate(interactions_with_avatar[Columns.User].unique()))
users_inv_mapping = {v: k for k, v in users_mapping.items()}
items_mapping = dict(enumerate(interactions_with_avatar[Columns.Item].unique()))
items_inv_mapping = {v: k for k, v in items_mapping.items()}

print(f"Count unique users: {len(users_mapping)}")
print(f"Count unique items: {len(items_mapping)}")

Count unique users: 809580
Count unique items: 14169


### Построение лучших моделей с валидации на всех данных

In [416]:
dataset = Dataset.construct(
    interactions_df=interactions_with_avatar,
    user_features_df=users_features,
    cat_user_features=features["users"],
    item_features_df=items_features,
    cat_item_features=features["items"]
)

In [418]:
model_als = ImplicitALSWrapperModel(
    model=AlternatingLeastSquares(
        factors=32, 
        random_state=42, 
        num_threads=16,
    ), 
    fit_features_together=True
)

model_lfm = LightFMWrapperModel(
    LightFM(
        no_components=16, 
        loss="warp", 
        random_state=42,
        learning_rate=0.05, 
        user_alpha=0.2,
        item_alpha=0.2,
    ),
    epochs=10,
    num_threads=16,
)

In [419]:
%%time

model_als.fit(dataset)

CPU times: user 7min 36s, sys: 18.5 s, total: 7min 54s
Wall time: 1min 37s


<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7fce85959850>

In [420]:
%%time

model_lfm.fit(dataset)

CPU times: user 6min 12s, sys: 133 ms, total: 6min 12s
Wall time: 53.1 s


<rectools.models.lightfm.LightFMWrapperModel at 0x7fce85959af0>

### Реализация приближенного поиска соседей для 3-ёх добавленных аватаров

In [491]:
import nmslib

In [492]:
user_embeddings_als, item_embeddings_als = model_als.get_vectors()
user_embeddings_als.shape, item_embeddings_als.shape

((809580, 237), (14169, 237))

In [493]:
user_embeddings_lfm, item_embeddings_lfm = model_lfm.get_vectors(dataset)
user_embeddings_lfm_bias.shape, item_embeddings_lfm_bias.shape

((809580, 18), (14169, 18))

In [494]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

In [495]:
_, augmented_item_embeddings_als = augment_inner_product(item_embeddings_als)
_, augmented_item_embeddings_lfm = augment_inner_product(item_embeddings_lfm)

In [496]:
extra_zero_als = np.zeros((user_embeddings_als.shape[0], 1))
extra_zero_lfm = np.zeros((user_embeddings_lfm.shape[0], 1))
augmented_user_embeddings_als = np.append(user_embeddings_als, extra_zero_als, axis=1)
augmented_user_embeddings_lfm = np.append(user_embeddings_lfm, extra_zero_lfm, axis=1)

In [497]:
augmented_user_embeddings_als.shape, augmented_user_embeddings_lfm.shape

((809580, 238), (809580, 19))

In [498]:
CONFIG_NN = {
    "K": 10,
    "space_name": 'negdotprod',
    "method": 'hnsw',
    "index_time_params": {
        "M": 64,
        "efConstruction": 1000,
        "indexThreadQty": 4,
    },
    "query_time_params": {
        "efSearch": 1000,
    },
}

In [499]:
# Intitialize the library, specify the space, the type of the vector and add data points for each models
index_als = nmslib.init(
    method=CONFIG_NN["method"], space=CONFIG_NN["space_name"], data_type=nmslib.DataType.DENSE_VECTOR
) 
index_als.addDataPointBatch(augmented_item_embeddings_als) 

index_lfm = nmslib.init(
    method=CONFIG_NN["method"], space=CONFIG_NN["space_name"], data_type=nmslib.DataType.DENSE_VECTOR
) 
index_lfm.addDataPointBatch(augmented_item_embeddings_lfm) 

14169

In [500]:
# Create index for each model
index_als.createIndex(CONFIG_NN["index_time_params"]) 
index_lfm.createIndex(CONFIG_NN["index_time_params"]) 

In [501]:
# Setting query-time parameters
index_als.setQueryTimeParams(CONFIG_NN["query_time_params"])
index_lfm.setQueryTimeParams(CONFIG_NN["query_time_params"])

In [502]:
def get_reco_item(index, u_emb, user, users_inv_mapping, items_mapping, k=10):
    avatar_idx = users_inv_mapping[user]
    items_idx = index.knnQuery(u_emb[avatar_idx], k=k,)[0].tolist()
    return [items_mapping[idx] for idx in items_idx]

In [503]:
%%time 

models = ["als", "lfm"]
index_with_embs = [(index_als, augmented_user_embeddings_als), (index_lfm, augmented_user_embeddings_lfm)]

result = {
    "avatar": [],
    "model": [],
    "reco": [],
}
for avatar in [5555555, 6666666, 7777777]:
    for idx, (index, u_emb) in enumerate(index_with_embs):
        reco = get_reco_item(index, u_emb, avatar, users_inv_mapping, items_mapping, k=CONFIG_NN["K"])
        
        result["avatar"].append(avatar)
        result["model"].append(models[idx])
        result["reco"].append(reco)

CPU times: user 7.76 ms, sys: 0 ns, total: 7.76 ms
Wall time: 14.5 ms


In [504]:
df_result = pd.DataFrame(result).explode("reco")
df_result

Unnamed: 0,avatar,model,reco
0,5555555,als,9728
0,5555555,als,2280
0,5555555,als,13865
0,5555555,als,9254
0,5555555,als,10195
0,5555555,als,15443
0,5555555,als,10451
0,5555555,als,879
0,5555555,als,1577
0,5555555,als,4659


In [505]:
df_result = df_result.merge(
    items[["item_id", "content_type", "genres", "countries"]], 
    left_on=["reco"],
    right_on=["item_id"], 
    how="left"
)
df_result

Unnamed: 0,avatar,model,reco,item_id,content_type,genres,countries
0,5555555,als,9728,9728,film,"[боевики, триллеры]","[великобритания, сша]"
1,5555555,als,2280,2280,series,"[боевики, историческое, военные]","[россия, беларусь]"
2,5555555,als,13865,13865,film,"[драмы, военные, приключения]",[россия]
3,5555555,als,9254,9254,series,"[боевики, военные]",[беларусь]
4,5555555,als,10195,10195,series,"[боевики, драмы, русские]",[россия]
5,5555555,als,15443,15443,series,"[драмы, боевики, русские, военные]",[россия]
6,5555555,als,10451,10451,series,"[боевики, русские]",[россия]
7,5555555,als,879,879,series,"[боевики, русские]",[россия]
8,5555555,als,1577,1577,series,"[боевики, русские]",[россия]
9,5555555,als,4659,4659,series,"[боевики, русские]",[россия]


**avatar 1**
- Возраст от 45-54, доход 60-90, пол мужской, смотрим русские сериалы боевики (**id 5555555**)

In [506]:
df_result[df_result["avatar"] == 5555555][["model", "reco", "content_type", "genres", "countries"]]

Unnamed: 0,model,reco,content_type,genres,countries
0,als,9728,film,"[боевики, триллеры]","[великобритания, сша]"
1,als,2280,series,"[боевики, историческое, военные]","[россия, беларусь]"
2,als,13865,film,"[драмы, военные, приключения]",[россия]
3,als,9254,series,"[боевики, военные]",[беларусь]
4,als,10195,series,"[боевики, драмы, русские]",[россия]
5,als,15443,series,"[драмы, боевики, русские, военные]",[россия]
6,als,10451,series,"[боевики, русские]",[россия]
7,als,879,series,"[боевики, русские]",[россия]
8,als,1577,series,"[боевики, русские]",[россия]
9,als,4659,series,"[боевики, русские]",[россия]


**avatar 2**
- Возраст от 18-24, доход 20-40, пол мужской, смотрит фильмы жанра комедия страна любая кроме России (**id 6666666**)

In [507]:
df_result[df_result["avatar"] == 6666666][["model", "reco", "content_type", "genres", "countries"]]

Unnamed: 0,model,reco,content_type,genres,countries
20,als,4676,film,"[мелодрамы, комедии]",[кипр]
21,als,5061,film,"[приключения, комедии]",[кипр]
22,als,4088,film,"[драмы, комедии]",[кипр]
23,als,602,film,"[мультфильм, комедии]",[кипр]
24,als,9880,series,"[для детей, приключения, зарубежные, семейное,...",[кипр]
25,als,15141,film,"[зарубежные, триллеры]",[кипр]
26,als,599,film,[мультфильм],[кипр]
27,als,9728,film,"[боевики, триллеры]","[великобритания, сша]"
28,als,11237,film,[комедии],[россия]
29,als,1819,film,[комедии],[россия]


**avatar 3**
 - Возраст от 35-44, доход 40-60, пол женский, смотрим и фильмы и сериалы жанра мелодрамы неважна какая страна (**id 7777777**)

In [508]:
df_result[df_result["avatar"] == 7777777][["model", "reco", "content_type", "genres", "countries"]]

Unnamed: 0,model,reco,content_type,genres,countries
40,als,15297,series,"[драмы, мелодрамы]",[россия]
41,als,10275,film,"[приключения, мюзиклы, мелодрамы, семейное, ко...",[ссср]
42,als,1541,film,"[мелодрамы, мюзиклы, комедии]",[ссср]
43,als,2929,film,"[мелодрамы, мюзиклы, комедии]",[ссср]
44,als,10725,film,"[мелодрамы, мюзиклы, комедии]",[ссср]
45,als,12629,film,"[мелодрамы, мюзиклы, комедии]",[ссср]
46,als,776,film,"[драмы, спорт, мелодрамы]",[ссср]
47,als,10448,series,"[драмы, мюзиклы, мелодрамы]",[ссср]
48,als,13045,film,"[драмы, мюзиклы, мелодрамы]",[ссср]
49,als,1687,film,"[драмы, мюзиклы, мелодрамы]",[ссср]


- als имеет хорошие рекомендации, но тут может получиться зациклинность, что в реках будут выпадать только боевики
- lfm модели имеют хужие предсказания, но зато пользователю будут попадаться новые item из других категорий

**Возникла проблема, что LigthFM возвращает постоянно одни и те же рекомендации**, не удалось оддебажить. Если получится найти ошибку, буду благодарен!

### Сохраним вектора моделей для рекомендаций в сервисе

In [485]:
with open("../data/hw_4/als_users.npy", "wb") as file:
    np.save(file, augmented_user_embeddings_als)
    
with open("../data/hw_4/als_items.npy", "wb") as file:
    np.save(file, augmented_item_embeddings_als)

In [486]:
with open("../data/hw_4/lfm_users.npy", "wb") as file:
    np.save(file, augmented_user_embeddings_lfm)
    
with open("../data/hw_4/lfm_items.npy", "wb") as file:
    np.save(file, augmented_item_embeddings_lfm)

In [487]:
np.load("../data/hw_4/als_users.npy").shape

(809580, 238)

In [488]:
np.load("../data/hw_4/als_items.npy").shape

(14169, 238)

In [489]:
np.load("../data/hw_4/lfm_users.npy").shape

(809580, 19)

In [490]:
np.load("../data/hw_4/lfm_items.npy").shape

(14169, 19)

# Для холодных пользователей будут возвращаться популярное аналогично ДЗ-3, т.к. получился хороший результат популярного 0,08 по метрике MAP@10