In [1]:
import os

In [2]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm
import optuna
from lightfm import LightFM
import dill
from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

In [5]:
DATA_PATH = Path("kion_train")

# LOAD DATA 

In [6]:
%%time
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

CPU times: total: 5.08 s
Wall time: 5.13 s


# Preprocess

In [7]:
Columns.Datetime = 'last_watch_dt'

In [8]:
interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True)

In [9]:
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

In [10]:
max_date = interactions[Columns.Datetime].max()

In [11]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [12]:
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [13]:
train.drop(train.query("total_dur < 300").index, inplace=True)

In [14]:
# отфильтруем холодных пользователей из теста
cold_users = set(test[Columns.User]) - set(train[Columns.User])

In [15]:
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

# Применим optuna.

In [16]:
# Количество рекомендаций, воспроизводимость, количество запусков. 

k = 10
random_state = 42
n_trials = 12

In [17]:
# В качестве метрики используем MAP

metric = MAP(k = 10)
dataset = Dataset.construct(interactions_df=train)

In [18]:
# В качестве параметров для перебора возьмём следующие:
# no_components - размерность признакового пространства. В лекции указано, что обычно берут 16-256.
# learning_schedule - оптимизатор
# learning_rate - скорость обучения

![image.png](attachment:image.png)

In [19]:
# Функция для запуска optuna по перебору параметров. 
# Так как считается очень долго, то придётся указываать не очень широкие диапазоны для параметров. 

def optuna_objective(trial):
    
    no_components = trial.suggest_int('no_components', 128, 256)
    learning_schedule = trial.suggest_categorical("learning_schedule", ['adagrad', 'adadelta'])
    learning_rate = trial.suggest_float('learning_rate', 3e-2, 0.3)
    
    lightfm = LightFMWrapperModel(
                                 model = LightFM(
                                 no_components = no_components,
                                 learning_schedule = learning_schedule,
                                 loss = 'bpr',
                                 rho = 0.2,
                                 epsilon = 1e-4,
                                 learning_rate = learning_rate,
                                 random_state = random_state,
                                 max_sampled = 5    
                                 )
              )

    
    lightfm.fit(dataset)
    
    recos = lightfm.recommend(users=test[Columns.User].unique(), dataset=dataset, k=k, filter_viewed=True)
    
    map10 = metric.calc_per_user(recos, test)
    
    return map10.mean()

In [20]:
# Запустим перебор параметров через optuna. Так как считается очень долго много запусков сделать не могу.

study = optuna.create_study(directions = ['maximize'])
study.optimize(optuna_objective, n_trials=n_trials)

[32m[I 2022-12-09 02:23:13,736][0m A new study created in memory with name: no-name-e104ee67-a13f-4c92-9f2c-a82bf59c1b6d[0m
[32m[I 2022-12-09 02:35:15,763][0m Trial 0 finished with value: 0.06346693635652204 and parameters: {'no_components': 145, 'learning_schedule': 'adadelta', 'learning_rate': 0.12348277166653901}. Best is trial 0 with value: 0.06346693635652204.[0m
[32m[I 2022-12-09 02:49:06,379][0m Trial 1 finished with value: 0.04371464303933754 and parameters: {'no_components': 227, 'learning_schedule': 'adagrad', 'learning_rate': 0.09712629726516481}. Best is trial 0 with value: 0.06346693635652204.[0m
[32m[I 2022-12-09 03:01:19,953][0m Trial 2 finished with value: 0.06420685173880264 and parameters: {'no_components': 129, 'learning_schedule': 'adadelta', 'learning_rate': 0.2025740177228356}. Best is trial 2 with value: 0.06420685173880264.[0m
[32m[I 2022-12-09 03:21:45,238][0m Trial 3 finished with value: 0.06230974658544847 and parameters: {'no_components': 195, 

In [21]:
# Посмотрим на лучшие параметры, найденные optuna.

study.best_params

{'no_components': 129,
 'learning_schedule': 'adadelta',
 'learning_rate': 0.2025740177228356}

In [23]:
# Теперь обучим модель на всех данных. Используем лучшие параметры, найденные optuna.

all_dataset = Dataset.construct(interactions_df=interactions)

final_lightfm = LightFMWrapperModel(model = LightFM(
                                   no_components = study.best_params['no_components'],
                                   learning_schedule = study.best_params['learning_schedule'],
                                   loss = 'bpr',
                                   learning_rate = study.best_params['learning_rate'],
                                   rho = 0.2,
                                   epsilon = 1e-4,
                                   max_sampled = 5,
                                   random_state = random_state
                                   )
                )

final_lightfm.fit(all_dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x2040038e880>

In [24]:
# Сохраним нашу модель.

with open('lightfm.dill', 'wb') as f:
    dill.dump(final_lightfm, f)

In [25]:
# Проверим, что модель выдает рекомендации. Возьмём user_id = 10347 

final_lightfm.recommend(pd.DataFrame([10347])[0], all_dataset, k=10, filter_viewed = True)

Unnamed: 0,user_id,item_id,score,rank
0,10347,3734,1.980597,1
1,10347,3363,1.944064,2
2,10347,7793,1.629913,3
3,10347,4696,1.51612,4
4,10347,8391,1.509667,5
5,10347,13787,1.492795,6
6,10347,7280,1.476374,7
7,10347,8209,1.450331,8
8,10347,10777,1.447083,9
9,10347,6033,1.445017,10
