In [2]:
import typing as tp

import dill
import implicit
import numpy as np
import threadpoolctl

threadpoolctl.threadpool_limits(1, "blas")
import optuna
import pandas as pd
from implicit.als import AlternatingLeastSquares
from optuna.samplers import TPESampler
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP
from rectools.models import (
    ImplicitALSWrapperModel,
)

# Use the named level (DEBUG == 10) so every trial is logged verbosely.
optuna.logging.set_verbosity(optuna.logging.DEBUG)
# Last expression of the cell: shows implicit's HAS_CUDA flag as the cell output.
implicit.gpu.HAS_CUDA



False

### Загрузка данных

In [3]:
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

# Stream the archive to disk in 1 MiB chunks. The response is used as a context
# manager so the connection is released even if the download is interrupted.
with requests.get(url, stream=True, timeout=60) as request:
    # Fail fast on HTTP errors instead of silently saving an error page as kion.zip.
    request.raise_for_status()
    total_size_in_bytes = int(request.headers.get('Content-Length', 0))
    # Both the output file and the progress bar are closed on exit.
    with open('kion.zip', 'wb') as fd, tqdm(
        desc='KION dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True
    ) as progress_bar:
        for chunk in request.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

In [None]:
# Unpack the downloaded archive into the current working directory.
# The context manager closes the archive handle even if extraction fails.
with zf.ZipFile('kion.zip', 'r') as files:
    files.extractall()

In [None]:
# Load the raw interaction log as a plain DataFrame.
# The following cells index `interactions` like a DataFrame and still refer to the
# raw column names (`last_watch_dt`, `total_dur`, `watched_pct`), so the frame is
# neither renamed nor wrapped into a RecTools object here.
# The path is relative because the archive was extracted into the working directory.
interactions = pd.read_csv("data_original/interactions.csv", parse_dates=["last_watch_dt"])

In [4]:
# Rename the raw timestamp column to the name stored in Columns.Datetime instead of
# overwriting that constant: Columns is a shared class, so reassigning its attribute
# would change it for every other piece of code in the process that reads it.
interactions = interactions.rename(columns={'last_watch_dt': Columns.Datetime})
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')

# Interaction weight: 3 when more than 10% of the title was watched, otherwise 1.
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

### Предобработка данных

In [5]:
# Time-based split: the window starting 7 days before the latest timestamp is the
# test set, everything strictly earlier is the train set.
max_date = interactions[Columns.Datetime].max()
test_start = max_date - pd.Timedelta(days=7)
timestamps = interactions[Columns.Datetime]
train = interactions.loc[timestamps < test_start].copy()
test = interactions.loc[timestamps >= test_start].copy()

In [None]:
# Remove train interactions whose total_dur is below 300.
too_short = train["total_dur"] < 300
train.drop(index=train.index[too_short], inplace=True)

In [None]:
# Users that appear in test but never in train are "cold"; drop their test rows.
known_users = set(train[Columns.User])
cold_users = {user for user in set(test[Columns.User]) if user not in known_users}
is_cold = test[Columns.User].isin(cold_users)
test.drop(index=test.index[is_cold], inplace=True)

### Выбор фичей, как в лекции

In [6]:
def get_user_features(
    users: pd.DataFrame,
    interactions: pd.DataFrame,
    features: tp.List[str],
) -> pd.DataFrame:
    """Build a long-format user feature table with columns ``id``, ``value``, ``feature``.

    Missing values in ``users`` are replaced with ``"Unknown"`` and only users that
    occur in ``interactions`` are kept. The caller's ``users`` frame is not modified.

    Args:
        users: User table containing the ``Columns.User`` column and the feature columns.
        interactions: Interaction table; its ``Columns.User`` values select the users to keep.
        features: Names of the columns of ``users`` to export as features.

    Returns:
        One row per (user, feature) pair: ``id`` is the user id, ``value`` the feature
        value and ``feature`` the feature name. Frames are concatenated in the order
        given by ``features``.
    """
    # fillna without inplace returns a new frame, so the argument stays untouched.
    users = users.fillna("Unknown")
    users = users.loc[users[Columns.User].isin(interactions[Columns.User])]
    user_features_frames = []
    for feature in features:
        # reindex returns a fresh two-column frame that is safe to relabel and extend.
        feature_frame = users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    return pd.concat(user_features_frames)

def get_item_features(items: pd.DataFrame, interactions: pd.DataFrame) -> pd.DataFrame:
    """Build a long-format item feature table with columns ``id``, ``value``, ``feature``.

    Only items that occur in ``interactions`` are kept. Two features are exported:
    ``genre`` (one row per genre from the comma-separated ``genres`` column,
    lower-cased) and ``content_type``. The caller's ``items`` frame is not modified.

    Args:
        items: Item table with ``Columns.Item``, ``genres`` and ``content_type`` columns.
        interactions: Interaction table; its ``Columns.Item`` values select the items to keep.

    Returns:
        Genre rows followed by content-type rows.
    """
    # Work on a copy so the temporary "genre" column never reaches the caller's frame.
    items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()

    # "Drama, Comedy" -> ["drama", "comedy"]
    items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    # Use Columns.Item here as well, consistent with the filter above.
    genre_feature = items[[Columns.Item, "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"

    content_feature = items.reindex(columns=[Columns.Item, "content_type"])
    content_feature.columns = ["id", "value"]
    content_feature["feature"] = "content_type"

    return pd.concat((genre_feature, content_feature))

In [8]:
# Feature tables restricted to the users / items that occur in the train split.
user_features = get_user_features(
    users=users,
    interactions=train,
    features=["sex", "age", "income"],
)
item_features = get_item_features(items=items, interactions=train)

### Создадим датасет в удобном для RecTools формате и метрику MAP@10

In [9]:
# Shared constants: number of recommendations per user and the RNG seed.
K_RECOS = 10
RANDOM_STATE = 42

# RecTools dataset built from the train split with categorical user and item features.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

# The metric cutoff follows K_RECOS so the two values cannot drift apart.
metric = MAP(k=K_RECOS)

# Тюнинг модели здесь

In [11]:
def als_optuna_objective(trial):
    """Optuna objective: fit an ALS model with sampled hyperparameters and score it.

    Samples ``n_factors``, ``fit_features_together``, ``regularization`` and
    ``iterations`` from ``trial``, fits the wrapped ALS model on the module-level
    ``dataset``, recommends ``K_RECOS`` items for every user present in ``test`` and
    returns the mean of the per-user values of the module-level ``metric``.
    """
    # The suggest_* calls are kept in their original order.
    n_factors = trial.suggest_categorical("n_factors", [8, 16, 32])
    together = trial.suggest_categorical("fit_features_together", [True, False])
    reg = trial.suggest_float('regularization', 0.001, 0.1, log=True)
    n_iter = trial.suggest_int('iterations', 5, 20)

    als = AlternatingLeastSquares(
        factors=n_factors,
        random_state=RANDOM_STATE,
        regularization=reg,
        iterations=n_iter,
        num_threads=24,
    )
    wrapper = ImplicitALSWrapperModel(model=als, fit_features_together=together)
    wrapper.fit(dataset)

    # Score only the users that appear in the hold-out interactions.
    recos = wrapper.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    per_user_scores = metric.calc_per_user(recos, test)
    return per_user_scores.mean()

In [13]:
# Report the best objective value and the hyperparameters that produced it.
print(
    f'Лучшее значение MAP@10: {study.best_value}',
    f'Лучшие параметры: {study.best_params}',
    sep='\n',
)

Лучшее значение MAP@10: 0.07741930704805924
Лучшие параметры: {'n_factors': 8, 'fit_features_together': True, 'regularization': 0.02140145192621051, 'iterations': 10}


## Лучшие параметры найдены, обучим на них модель

In [None]:
# Take the hyperparameters straight from the finished study, so the final model is
# trained on the best trial rather than on hand-copied values that can go stale.
best_params = study.best_params

# Rebuild the features and the dataset on ALL interactions (train + test).
# NOTE(review): the train split had rows with total_dur < 300 removed before tuning,
# while the full `interactions` frame used here is unfiltered — confirm this is intended.
user_features = get_user_features(users, interactions, ["sex", "age", "income"])
item_features = get_item_features(items, interactions)

dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

model = ImplicitALSWrapperModel(
    AlternatingLeastSquares(
        factors=best_params["n_factors"],
        random_state=RANDOM_STATE,
        regularization=best_params["regularization"],
        iterations=best_params["iterations"],
        num_threads=24,
    ),
    fit_features_together=best_params["fit_features_together"],
)

model.fit(dataset)

In [None]:
# Serialize the fitted model to disk with dill.
with open('asl.dill', mode='wb') as model_file:
    dill.dump(model, model_file)

In [None]:
# Serialize the dataset the model was fitted on with dill.
with open('dataset.dill', mode='wb') as dataset_file:
    dill.dump(dataset, dataset_file)