In [28]:
import warnings

import numpy as np
import optuna
import pandas as pd
from implicit.als import AlternatingLeastSquares
from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP, Precision, Recall, calc_metrics
from rectools.models import (
    ImplicitALSWrapperModel,
    LightFMWrapperModel,
    PopularModel,
    RandomModel,
)

warnings.filterwarnings(action='ignore', category=UserWarning)

In [29]:
from enum import Enum


class ItemsFeatureTopKConfig(int, Enum):
    """Конфигурация для ограничения количества топовых значений фич."""

    DIRECTORS_TOP_K = 30
    STUDIOS_TOP_K = 15


In [31]:
interactions = pd.read_csv('interactions_processed.csv')
users = pd.read_csv('users_processed.csv')
items = pd.read_csv('items_processed.csv')

# Обработка данных

In [32]:
Columns.Datetime = 'last_watch_dt'

In [33]:
interactions.drop(
    interactions[interactions[Columns.Datetime].str.len() != 10].index,
    inplace=True,
)
interactions[Columns.Datetime] = pd.to_datetime(
    interactions[Columns.Datetime], format='%Y-%m-%d'
)
max_date = interactions[Columns.Datetime].max()
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 20, 3, 1)

In [34]:
# Разделяем на train и test
train = interactions[
    interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)
].copy()
test = interactions[
    interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)
].copy()

In [35]:
train.drop(train.query('total_dur < 300').index, inplace=True)
cold_users = set(test[Columns.User]) - set(train[Columns.User])
len(cold_users)

72930

In [36]:
# Отбрасываем холодных пользователей
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

# Подготовка фич

## User features

In [37]:
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,True
1,962099,age_18_24,income_20_40,М,False
2,1047345,age_45_54,income_40_60,Ж,False
3,721985,age_45_54,income_20_40,Ж,False
4,704055,age_35_44,income_60_90,Ж,False
...,...,...,...,...,...
840192,339025,age_65_inf,income_0_20,Ж,False
840193,983617,age_18_24,income_20_40,Ж,True
840194,251008,age_unknown,income_unknown,sex_unknown,False
840195,590706,age_unknown,income_unknown,Ж,False


In [38]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [39]:
user_features_frames = []
for feature in ['sex', 'age', 'income']:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ['id', 'value']
    feature_frame['feature'] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


## Item features

In [40]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [41]:
items['genre'] = (
    items['genres'].str.replace(', ', ',', regex=False).str.split(',')
)
genre_feature = items[[Columns.Item, 'genre']].explode('genre')
genre_feature.columns = ['id', 'value']
genre_feature['feature'] = 'genre'
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


In [42]:
content_feature = items.reindex(columns=[Columns.Item, 'content_type'])
content_feature.columns = ['id', 'value']
content_feature['feature'] = 'content_type'

In [43]:
countries_feature = items.reindex(columns=[Columns.Item, 'countries'])
countries_feature.columns = ['id', 'value']
countries_feature['feature'] = 'countries'

In [44]:
release_decade_feature = items.reindex(columns=[Columns.Item, 'release_decade'])
release_decade_feature.columns = ['id', 'value']
release_decade_feature['feature'] = 'release_decade'

In [45]:
release_decade_feature['value'].value_counts()

value
2010.0s                 8091
2000.0s                 1955
2020.0s                 1682
1980.0s                  613
1990.0s                  572
1970.0s                  467
1960.0s                  270
1950.0s                  143
1940.0s                   91
1930.0s                   80
release_year_unknown      31
1920.0s                   17
1910.0s                    6
Name: count, dtype: int64

In [46]:
age_rating_feature = items.reindex(columns=[Columns.Item, 'age_rating'])
age_rating_feature.columns = ['id', 'value']
age_rating_feature['feature'] = 'age_rating'

Берем только ТОП-K студий, а остальные заменяем на 'other_studio'. 'other_studio' и 'unknown_studio' - **разные** вещи!

In [47]:
def replace_rare_studios(studio_list):
    return [
        studio if studio in top_studios else 'other_studio'
        for studio in studio_list
    ]


items['studio'] = items['studios'].str.split(r',\s*')
top_studios = (
    items['studio']
    .explode()
    .value_counts()
    .head(ItemsFeatureTopKConfig.STUDIOS_TOP_K)
    .index
)
items['studio'] = items['studio'].apply(replace_rare_studios)

In [48]:
studios_feature = items[[Columns.Item, 'studio']].explode('studio')
studios_feature.columns = ['id', 'value']
studios_feature['feature'] = 'studios'

In [49]:
# Для директоров оставляем топ-30, остальные заменяем на 'other'
items['directors'] = (
    items['directors'].str.replace(', ', ',', regex=False).str.split(',')
)
top_directors = (
    items['directors']
    .explode()
    .value_counts()
    .head(ItemsFeatureTopKConfig.DIRECTORS_TOP_K)
    .index
)

items['director'] = items['directors'].apply(
    lambda x: [d if d in top_directors else 'other_director' for d in x]
)

In [50]:
directors_feature = items[[Columns.Item, 'director']].explode('director')
directors_feature.columns = ['id', 'value']
directors_feature['feature'] = 'director'
directors_feature.head()

Unnamed: 0,id,value,feature
0,10711,other_director,director
1,2508,other_director,director
2,10716,other_director,director
3,7868,other_director,director
4,16268,other_director,director


Можно еще добавить description_feature

In [51]:
item_features = pd.concat((
    genre_feature,
    content_feature,
    countries_feature,
    release_decade_feature,
    age_rating_feature,
    studios_feature,
    directors_feature,
))


In [52]:
item_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 123469 entries, 0 to 15961
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   id       123469 non-null  int64 
 1   value    123469 non-null  object
 2   feature  123469 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.8+ MB


In [53]:
metrics_name = {
    'Recall': Recall,
    'MAP': MAP,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(1, 11):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

In [54]:
CAT_USER_FEATURES = list(user_features['feature'].unique())
CAT_ITEM_FEATURES = list(item_features['feature'].unique())

In [55]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=CAT_USER_FEATURES,
    item_features_df=item_features,
    cat_item_features=CAT_ITEM_FEATURES,
)

TEST_USERS = test[Columns.User].unique()

# Подбор гиперпараметров

### Подбор гиперпараметров ImplicitALS

In [None]:
def ials_objective(trial):
    factors = trial.suggest_categorical('factors', [4, 8, 16, 32, 64])
    regularization = trial.suggest_float('regularization', 0.001, 0.1, log=True)
    iterations = trial.suggest_categorical('iterations', [1, 3, 5, 10, 15])

    model = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            iterations=iterations,
            random_state=42,
            num_threads=4,
            use_gpu=False,
        ),
        fit_features_together=True,
    )

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )

    map10 = calc_metrics(metrics, recos, test, train)['MAP@10']
    return map10

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(ials_objective, n_trials=20)

In [None]:
print(f'Best MAP@10 value: {study.best_value}')
print(f'Best parameters: {study.best_params}')

In [None]:
# Best MAP@10 value: 0.075254069127234
# Best parameters: {'factors': 32, 'regularization': 0.01883534498756549, 'iterations': 5}

### Подбор гиперпараметров LightFM

In [None]:
def lfm_objective(trial):
    no_components = trial.suggest_categorical(
        'no_components', [8, 16, 32, 64, 128]
    )
    learning_rate = trial.suggest_float('learning_rate', 0.005, 0.05, log=True)
    rho = trial.suggest_float('rho', 0.9, 0.99, log=True)
    epsilon = trial.suggest_float('epsilon', 1e-6, 1e-5, log=True)

    model = LightFMWrapperModel(
        LightFM(
            no_components=no_components,
            learning_rate=learning_rate,
            rho=rho,
            epsilon=epsilon,
            user_alpha=0,
            item_alpha=0,
            random_state=42,
        ),
        epochs=1,
        num_threads=16,
    )
    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )

    map10 = calc_metrics(metrics, recos, test, train)['MAP@10']
    return map10

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(lfm_objective, n_trials=30)

# Лучшая модель

In [56]:
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=CAT_USER_FEATURES,
    item_features_df=item_features,
    cat_item_features=CAT_ITEM_FEATURES,
)

TEST_USERS = test[Columns.User].unique()

In [57]:
model = ImplicitALSWrapperModel(
    model=AlternatingLeastSquares(
        factors=32,
        regularization=0.01883534498756549,
        iterations=5,
        random_state=42,
        num_threads=4,
        use_gpu=False,
    ),
    fit_features_together=True,
)

  check_blas_config()


# Искусственные пользователи

Егор - любитель фильмов Квентина Тарантино

In [58]:
users = pd.concat(
    [
        users,
        pd.DataFrame([
            {
                'user_id': 1100000,
                'age': 'age_18_24',
                'income': 'income_60_90',
                'sex': 'М',
                'kids_flg': 0,
            }
        ]),
    ],
    ignore_index=True,
)


In [59]:
items.loc[
    items['item_id'].isin([14804, 7693, 11115, 8148, 16382, 4072, 898, 13715]),
    ['item_id', 'title', *CAT_ITEM_FEATURES],
]

Unnamed: 0,item_id,title,genre,content_type,countries,release_decade,age_rating,studios,director
107,14804,криминальное чтиво,"[зарубежные, триллеры, криминал, комедии]",film,сша,1990.0s,18.0,studios_unknown,[other_director]
3134,16382,человек-паук,"[боевики, фантастика, приключения]",film,сша,2000.0s,12.0,studios_unknown,[other_director]
4766,7693,джанго освобождённый,"[боевики, драмы, вестерн, комедии]",film,сша,2010.0s,18.0,studios_unknown,[other_director]
5806,11115,бесславные ублюдки,"[боевики, драмы, военные, комедии]",film,"германия, сша",2000.0s,16.0,studios_unknown,[other_director]
6992,898,большой куш,"[боевики, комедии]",film,"великобритания, сша",2000.0s,16.0,studios_unknown,[other_director]
10425,13715,стекло,"[драмы, фантастика, триллеры]",film,"сша, китай",2010.0s,16.0,studios_unknown,[other_director]
13802,8148,бешеные псы,"[драмы, триллеры]",film,сша,1990.0s,18.0,studios_unknown,[other_director]
15624,4072,человек-паук 3: враг в отражении,"[боевики, фантастика, приключения]",film,сша,2000.0s,12.0,studios_unknown,[other_director]


In [60]:
first_items = [14804, 7693, 11115, 8148, 16382, 4072, 898, 13715]
first_num_items = len(first_items)
first_avatar = pd.DataFrame({
    'user_id': np.full(first_num_items, fill_value=1100000),
    'item_id': first_items,
    'last_watch_dt': np.full(first_num_items, fill_value='2021-05-29'),
    'total_dur': np.full(first_num_items, fill_value=np.nan),
    'watched_pct': first_num_items * [100.0],
    'weight': [3, 3, 3, 3, 3, 3, 3, 1],
})

first_avatar[Columns.Datetime] = pd.to_datetime(
    first_avatar[Columns.Datetime], format='%Y-%m-%d'
)

In [61]:
avatars = pd.concat([first_avatar])
interactions = pd.concat([interactions, avatars], ignore_index=True)

In [62]:
# Соберем датасет
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=CAT_USER_FEATURES,
    item_features_df=item_features,
    cat_item_features=CAT_ITEM_FEATURES,
)

avatars_ids = avatars['user_id'].unique()

In [63]:
%%time
model.fit(dataset)

CPU times: total: 23min 38s
Wall time: 5min 35s


<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x13a6a289d00>

In [70]:
%%time
model.fit(dataset)

3434336646

In [64]:
recos = model.recommend(
    users=avatars_ids,
    dataset=dataset,
    k=10,
    filter_viewed=True,
)

In [65]:
recs = recos.merge(
    items[['item_id', 'title', *CAT_ITEM_FEATURES]],
    on='item_id',
).sort_values(['user_id', 'rank'])

In [66]:
recs[recs['user_id'] == avatars_ids[0]]

Unnamed: 0,user_id,item_id,score,rank,title,genre,content_type,countries,release_decade,age_rating,studios,director
0,1100000,9728,0.029119,1,гнев человеческий,"[боевики, триллеры]",film,"великобритания, сша",2020.0s,18.0,studios_unknown,[other_director]
1,1100000,16447,0.028002,2,бойцовский клуб,"[драмы, триллеры, криминал]",film,"германия, сша",1990.0s,18.0,studios_unknown,[other_director]
2,1100000,7731,0.026378,3,спартанец,"[криминал, детективы, драмы, триллеры, боевики]",film,"германия, сша",2000.0s,18.0,studios_unknown,[other_director]
3,1100000,12798,0.026273,4,игрушки для взрослых,"[боевики, популярное, триллеры, комедии]",film,"сша, китай",2010.0s,18.0,studios_unknown,[other_director]
4,1100000,3345,0.025621,5,рэмбо 4,"[боевики, триллеры]",film,"германия, сша",2000.0s,16.0,studios_unknown,[other_director]
5,1100000,7531,0.025442,6,шафт,"[боевики, триллеры]",film,"германия, сша",2000.0s,16.0,studios_unknown,[other_director]
6,1100000,3389,0.025424,7,золотой глаз,"[боевики, триллеры]",film,"великобритания, сша",1990.0s,12.0,studios_unknown,[other_director]
7,1100000,5530,0.025381,8,завтра не умрёт никогда,"[боевики, триллеры]",film,"великобритания, сша",1990.0s,12.0,studios_unknown,[other_director]
8,1100000,6582,0.025293,9,образцовый самец,[комедии],film,"германия, сша",2000.0s,18.0,studios_unknown,[other_director]
9,1100000,3044,0.025224,10,и целого мира мало,"[боевики, триллеры]",film,"великобритания, сша",1990.0s,16.0,studios_unknown,[other_director]
