In [1]:
import pandas as pd
import numpy as np
import requests
import optuna

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization


import warnings

warnings.filterwarnings(action='ignore', category=UserWarning)



In [2]:
from implicit.gpu import HAS_CUDA

print('HAS_CUDA:', HAS_CUDA)

HAS_CUDA: False


In [3]:
import os

os.environ['OPENBLAS_NUM_THREADS'] = '1'

In [4]:
interactions = pd.read_csv('../datasets/kion/interactions.csv')
users = pd.read_csv('../datasets/kion/users.csv')
items = pd.read_csv('../datasets/kion/items.csv')

# Обработка данных

In [5]:
Columns.Datetime = 'last_watch_dt'

In [6]:
interactions.drop(
    interactions[interactions[Columns.Datetime].str.len() != 10].index,
    inplace=True,
)
interactions[Columns.Datetime] = pd.to_datetime(
    interactions[Columns.Datetime], format='%Y-%m-%d'
)
max_date = interactions[Columns.Datetime].max()
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [7]:
# Разделяем на train и test
train = interactions[
    interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)
].copy()
test = interactions[
    interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)
].copy()

In [8]:
train.drop(train.query('total_dur < 300').index, inplace=True)
cold_users = set(test[Columns.User]) - set(train[Columns.User])
len(cold_users)

72930

In [9]:
# Отбрасываем холодных пользователей
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

# Подготовка фич

## User features

In [10]:
users.fillna('Unknown', inplace=True)
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [11]:
user_features_frames = []
for feature in ['sex', 'age', 'income']:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ['id', 'value']
    feature_frame['feature'] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


## Item features

In [12]:
items.fillna('Unknown', inplace=True)
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

  items.fillna('Unknown', inplace=True)


In [13]:
items['genre'] = (
    items['genres']
    .str.lower()
    .str.replace(', ', ',', regex=False)
    .str.split(',')
)
genre_feature = items[['item_id', 'genre']].explode('genre')
genre_feature.columns = ['id', 'value']
genre_feature['feature'] = 'genre'
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [14]:
content_feature = items.reindex(columns=[Columns.Item, 'content_type'])
content_feature.columns = ['id', 'value']
content_feature['feature'] = 'content_type'

In [15]:
countries_feature = items.reindex(columns=[Columns.Item, 'countries'])
countries_feature.columns = ['id', 'value']
countries_feature['feature'] = 'countries'

In [16]:
item_features = pd.concat((genre_feature, content_feature, countries_feature))

In [17]:
metrics_name = {
    'Recall': Recall,
    'MAP': MAP,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(1, 11):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

In [18]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=['sex', 'age', 'income'],
    item_features_df=item_features,
    cat_item_features=['genre', 'content_type', 'countries'],
)

TEST_USERS = test[Columns.User].unique()

# Подбор гиперпараметров

### Подбор гиперпараметров ImplicitALS

In [None]:
def ials_objective(trial):
    factors = trial.suggest_categorical('factors', [4, 8, 16, 32, 64])
    regularization = trial.suggest_float('regularization', 0.001, 0.1, log=True)
    iterations = trial.suggest_categorical('iterations', [1, 3, 5, 10, 15])

    model = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            iterations=iterations,
            random_state=42,
            num_threads=4,
            use_gpu=False,
        ),
        fit_features_together=True,
    )

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )

    map10 = calc_metrics(metrics, recos, test, train)['MAP@10']
    return map10

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(ials_objective, n_trials=20)

In [None]:
print(f'Best MAP@10 value: {study.best_value}')
print(f'Best parameters: {study.best_params}')

In [None]:
# Best MAP@10 value: 0.075254069127234
# Best parameters: {'factors': 32, 'regularization': 0.01883534498756549, 'iterations': 5}

### Подбор гиперпараметров LightFM

In [19]:
def lfm_objective(trial):

    no_components = trial.suggest_categorical('no_components', [8, 16, 32, 64, 128])
    learning_rate = trial.suggest_float('learning_rate', 0.005, 0.05, log=True)
    rho = trial.suggest_float('rho', 0.9, 0.99, log=True)
    epsilon = trial.suggest_float('epsilon', 1e-6, 1e-5, log=True)

    model = LightFMWrapperModel(
        LightFM(
            no_components=no_components,
            learning_rate=learning_rate,
            rho=rho,
            epsilon=epsilon,
            user_alpha=0,
            item_alpha=0,
            random_state=42,
        ),
        epochs=1,
        num_threads=16,
    )
    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )

    map10 = calc_metrics(metrics, recos, test, train)['MAP@10']
    return map10

In [20]:
study = optuna.create_study(direction='maximize')
study.optimize(lfm_objective, n_trials=30)

[I 2025-01-25 14:26:07,868] A new study created in memory with name: no-name-e3bad160-2073-4f62-92cc-c1c573916219
[I 2025-01-25 14:26:45,009] Trial 0 finished with value: 0.0002382267634487237 and parameters: {'no_components': 16, 'learning_rate': 0.04295147490032618, 'rho': 0.921676231757281, 'epsilon': 2.6737107619409933e-06}. Best is trial 0 with value: 0.0002382267634487237.
[I 2025-01-25 14:27:37,290] Trial 1 finished with value: 0.00023863222720477706 and parameters: {'no_components': 32, 'learning_rate': 0.025066398765681862, 'rho': 0.9192331449196115, 'epsilon': 9.930421332822e-06}. Best is trial 1 with value: 0.00023863222720477706.
[I 2025-01-25 14:29:04,851] Trial 2 finished with value: 0.00024089281338128088 and parameters: {'no_components': 64, 'learning_rate': 0.011401635979739495, 'rho': 0.9256017349342114, 'epsilon': 3.0173627011991097e-06}. Best is trial 2 with value: 0.00024089281338128088.
[I 2025-01-25 14:30:01,359] Trial 3 finished with value: 0.0002382515298454158

# Лучшая модель

In [None]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=['sex', 'age', 'income'],
    item_features_df=item_features,
    cat_item_features=['genre', 'content_type', 'countries'],
)

TEST_USERS = test[Columns.User].unique()

In [None]:
model = ImplicitALSWrapperModel(
    model=AlternatingLeastSquares(
        factors=32,
        regularization=0.01883534498756549,
        iterations=5,
        random_state=42,
        num_threads=4,
        use_gpu=False,
    ),
    fit_features_together=True,
)

# Искусственные пользователи

Егор - любитель фильмов Квентина Тарантино

In [None]:
users = pd.concat(
    [
        users,
        pd.DataFrame([
            {
                'user_id': 1100000,
                'age': 'age_18_24',
                'income': 'income_60_90',
                'sex': 'М',
                'kids_flg': 0,
            }
        ]),
    ],
    ignore_index=True,
)


In [None]:
users.iloc[-1]

In [None]:
items.loc[
    items['item_id'].isin([14804, 7693, 11115, 8148, 16382, 4072, 898, 13715]),
    ['item_id', 'title', 'content_type', 'countries', 'genre'],
]

In [None]:
first_items = [14804, 7693, 11115, 8148, 16382, 4072, 898, 13715]
first_num_items = len(first_items)
first_avatar = pd.DataFrame({
    'user_id': np.full(first_num_items, fill_value=1100000),
    'item_id': first_items,
    'last_watch_dt': np.full(first_num_items, fill_value='2021-05-29'),
    'total_dur': np.full(first_num_items, fill_value=np.nan),
    'watched_pct': first_num_items * [100.0],
    'weight': [3, 3, 3, 3, 3, 1, 3, 3],
})

first_avatar[Columns.Datetime] = pd.to_datetime(
    first_avatar[Columns.Datetime], format='%Y-%m-%d'
)

In [None]:
avatars = pd.concat([first_avatar])
train = pd.concat([train, avatars], ignore_index=True)

In [None]:
# Соберем датасет
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=['sex', 'age', 'income'],
    item_features_df=item_features,
    cat_item_features=['genre', 'content_type', 'countries'],
)

avatars_ids = avatars['user_id'].unique()

In [None]:
model.fit(dataset)

In [None]:
recos = model.recommend(
    users=avatars_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)

In [None]:
recs = recos.merge(
    items[['item_id', 'title', 'content_type', 'countries', 'genre']],
    on='item_id',
).sort_values(['user_id', 'rank'])

In [None]:
# Рекомендации для пользователя, смотревшего только сериалы
recs[recs['user_id'] == avatars_ids[0]]