В этой работе будем исследовать одну из популярнейших рекомендательных моделей — **Latent Factor Model** — https://arxiv.org/pdf/1912.04754.

Перед выполнением задания нужно убедиться, что прогоняется бейзлайн. Для этого:
1) Скачайте  файлы - **node2name.json** и **clickstream.parque** с необходимыми данными
2) Положите в репозиторий ноутбука и запустите код

В этой работе вам нужно:
1) перебрать параметры модели - edim, batch_size, lr, epoch, num_negatives - (по **1 баллу - 5 баллов**)
2) Тип OPTIMIZER_NAME - (**4 балла за 5 оптимизаторов**)
3) На основе имеющихся данных собрать лучшую модель (по **precision@30**) и рассчитать ее метрики (**4 балла**)
4) Попробовать другие модели (например, als - https://benfred.github.io/implicit/ , gru4rec, sasrec) - за sasrec на хорошем уровне сразу **10 баллов**. За другие модели по **3 балла**
5) По окончанию работы в mlflow настроить графики для сравнения моделей. Можно проявить фантазию, но обязательно должно быть сравнение с бейзлайном (данный ноутбук) против других моделей
6) В mlflow залогировать последнюю версию ноутбука - необходимое условие. Либо в github, но тогда прикрепить ссылку в [mlflow](http://84.201.128.89:90/) . Эксперимент в формате - **homework-\<name\>**
7) Доп. баллы (**20 баллов**) тому, у кого будет наибольший скор на тесте. Но ваш ноутбук должен прогоняться и быть воспроизводимым.

Суммарно за работу **20 баллов**

In [None]:
!pip install optuna
!pip install mlflow
!pip install implicit

In [34]:
import json
import random
import mlflow
import os
import torch
import optuna
import pandas as pd
import numpy as np
import implicit
from scipy.sparse import csr_matrix
from torch import nn
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader

In [2]:
SEED = 42


def set_seed(seed: int = SEED) -> None:
    """Seed the Python, NumPy and PyTorch random generators.

    Args:
        seed: Value handed to every generator. Defaults to the
            notebook-wide ``SEED`` so existing ``set_seed()`` calls
            behave as before.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run;
    # keep it off so repeated runs use the same algorithms.
    torch.backends.cudnn.benchmark = False

    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so this
    # presumably only affects child processes, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    # NOTE(review): exposes only the CUDA device with index 1; on a
    # single-GPU machine this presumably hides the GPU - confirm intent.
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'


set_seed()

In [3]:
# Mount Google Drive (Colab only); the data files read below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Prepare Data

In [4]:
# Read the node_id -> name mapping. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open('/content/drive/MyDrive/Avito_mlflow/node2name.json', 'r', encoding='utf-8') as f:
    node2name = json.load(f)

# JSON object keys are always strings; cast them to int so the mapping can be
# applied to the integer node_id column.
node2name = {int(k): v for k, v in node2name.items()}

In [5]:
# Load the clickstream log and keep only its first 100 000 rows.
df = pd.read_parquet('/content/drive/MyDrive/Avito_mlflow/clickstream.parque')
df = df.head(100_000)
# Preview the first rows.
df.head()

Unnamed: 0,cookie_id,event_date,node_id
0,15157399,2024-02-21 11:20:01,1047840
1,15157399,2024-03-05 10:24:54,1047561
2,15157399,2024-03-05 10:28:55,1047561
3,15157399,2024-04-13 11:22:25,1047835
4,15157399,2024-04-13 11:22:45,1047835


In [6]:
# Temporal split: events within the last 2 days before the latest timestamp
# are the test part, everything strictly earlier is train.
df['is_train'] = df['event_date']< df['event_date'].max() - pd.Timedelta('2 day')
# Attach the name of every node via the node_id -> name mapping.
df['names'] = df['node_id'].map(node2name)

In [7]:
# Users and items that occur at least once in the training period.
train_cooks = df.loc[df['is_train'], 'cookie_id'].unique()
train_items = df.loc[df['is_train'], 'node_id'].unique()

# Drop every interaction whose user or item never appears in train, so the
# test part contains only users and items seen during training.
df = df[df['cookie_id'].isin(train_cooks) & df['node_id'].isin(train_items)]

In [9]:
# pd.factorize returns dense 0-based codes plus the array of unique raw ids
# (position in that array == code), for users and for items.
user_indes, index2user_id = pd.factorize(df['cookie_id'])
node_indes, index2node = pd.factorize(df['node_id'])

df['user_index'] = user_indes
df['node_index'] = node_indes


In [10]:
# Largest item code; codes come from pd.factorize and start at 0.
df['node_index'].max()

2175

In [11]:
# Split on the precomputed flag and renumber the rows of each part from 0
# (RecDataset later indexes the node_index Series by integer position).
df_train = df[df['is_train']].reset_index(drop=True)
df_test = df[~df['is_train']].reset_index(drop=True)


df_train.shape, df_test.shape

((96611, 7), (3333, 7))

# Define LatentFactorModel

In [13]:
class RecDataset(Dataset):
    """Dataset of (user, item) interactions with each user's seen items.

    Sample ``i`` is ``(users[i], items[i], item_per_users[users[i]])``; the
    first two are wrapped in tensors, the third is returned untouched.
    """

    def __init__(self, users, items, item_per_users):
        self.users, self.items = users, items
        # Lookup from a user id to the collection of items that user has seen.
        self.item_per_users = item_per_users

    def __len__(self):
        # One sample per interaction.
        return len(self.users)

    def __getitem__(self, i):
        user_id = self.users[i]
        item_id = self.items[i]
        seen_items = self.item_per_users[user_id]
        return torch.tensor(user_id), torch.tensor(item_id), seen_items


class LatentFactorModel(nn.Module):
    """Latent factor model scoring a user-item pair by an embedding dot product."""

    def __init__(self, edim, user_indexes, node_indexes):
        """Create the embedding tables.

        Args:
            edim: Embedding dimensionality.
            user_indexes: Iterable of user indices; the table gets
                ``max(user_indexes) + 1`` rows.
            node_indexes: Iterable of item indices; the table gets
                ``max(node_indexes) + 1`` rows.
        """
        super().__init__()
        self.edim = edim
        n_users = max(user_indexes) + 1
        self.users = nn.Embedding(n_users, edim)
        n_items = max(node_indexes) + 1
        self.items = nn.Embedding(n_items, edim)

    def forward(self, users, items):
        """Return dot products between each user and its own row of items.

        ``items`` is indexed as (batch, n) by the einsum below; the result
        has shape (batch, n).
        """
        user_vecs = self.users(users).reshape(-1, self.edim)
        item_vecs = self.items(items)
        return torch.einsum('be,bne->bn', user_vecs, item_vecs)

    def pred_top_k(self, users, K=10):
        """Score every item for the given users and return ``torch.topk`` of the scores.

        The result is the (values, indices) pair produced by ``torch.topk``
        along the item dimension.
        """
        user_vecs = self.users(users).reshape(-1, self.edim)
        all_item_vecs = self.items.weight
        scores = torch.einsum('ue,ie->ui', user_vecs, all_item_vecs)
        return torch.topk(scores, K, dim=1)




def collate_fn(batch, num_negatives, num_items):
    """Collate samples into tensors and draw random negatives for each one.

    Args:
        batch: Sequence of ``(user, target_item, seen_items)`` triplets.
        num_negatives: Number of negative items to draw per sample.
        num_items: Largest item index that may be drawn. The bound is
            INCLUSIVE because ``random.randint`` includes both ends.

    Returns:
        ``(users, items, labels)``: ``items`` has the target item in column 0
        followed by ``num_negatives`` sampled items; ``labels`` has the same
        shape with 1 in column 0 and 0 elsewhere.

    Raises:
        ValueError: If negatives are requested for a user who has already
            seen every index in ``[0, num_items]`` (rejection sampling could
            never finish).
    """
    users, target_items, users_negatives = [], [], []
    for user, target_item, seen_item in batch:
        users.append(user)
        target_items.append(target_item)

        # Set membership is O(1); the original list scan was repeated per draw.
        seen = set(seen_item)
        # The full scan only runs when the set is large enough to possibly
        # cover the whole candidate range.
        if (
            num_negatives > 0
            and len(seen) > num_items
            and all(candidate in seen for candidate in range(num_items + 1))
        ):
            raise ValueError(
                'cannot sample negatives: user has seen every item in '
                f'[0, {num_items}]'
            )

        user_negatives = []
        while len(user_negatives) < num_negatives:
            candidate = random.randint(0, num_items)
            if candidate not in seen:
                user_negatives.append(candidate)

        users_negatives.append(user_negatives)

    positive = torch.ones(len(batch), 1)
    negatives = torch.zeros(len(batch), num_negatives)
    labels = torch.hstack([positive, negatives])
    items = torch.hstack([torch.tensor(target_items).reshape(-1, 1), torch.tensor(users_negatives)])
    return torch.hstack(users), items, labels



In [14]:
# For every training user: the de-duplicated list of item indices they interacted with.
user2seen = df_train.groupby('user_index')['node_index'].agg(lambda x: list(set(x)))

# Baseline

In [106]:
train_dataset = RecDataset(df_train['user_index'].values, df_train['node_index'], user2seen)
# Largest item index, computed once: evaluating max() over the whole column
# inside the collate lambda repeated a full Python-level scan for every batch.
max_node_index = int(df['node_index'].max())
dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    num_workers=0,
    batch_size=50_000,
    # NUM_NEGATIVES is looked up when a batch is collated, so it only has to
    # be defined before the loader is iterated.
    collate_fn=lambda x: collate_fn(x, NUM_NEGATIVES, max_node_index),
)

In [None]:
# Baseline hyper-parameters (also logged to MLflow further down).
# NOTE(review): the baseline DataLoader is built with a literal batch_size,
# so BATCH_SIZE here presumably only documents that value - confirm.
BATCH_SIZE = 50_000
NUM_NEGATIVES = 5
EDIM = 128
EPOCH = 10
OPTIMIZER_NAME = 'Adam'
LR = 1

# Embedding table sizes are derived from the factorized user / item codes.
model = LatentFactorModel(EDIM, user_indes, node_indes)
optimizer = torch.optim.Adam(model.parameters(), LR)

# Epoch-level progress bar, advanced by the training loop.
bar = tqdm(total = EPOCH )


In [None]:

# Train the baseline: one optimizer step per batch with binary cross-entropy
# on the dot-product logits (target item = 1, sampled negatives = 0).
for epoch_idx in range(EPOCH):
    bar_loader = tqdm(total=len(dataloader))
    losses = []
    # Unpack the batch directly; the original reused `i` for both the epoch
    # counter and the batch, shadowing the outer loop variable.
    for users, items, labels in dataloader:
        optimizer.zero_grad()
        logits = model(users, items)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            logits, labels
        )
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        bar_loader.update(1)
        bar_loader.set_description(f'batch loss - {batch_loss}')
        losses.append(batch_loss)
    # Finalize the per-epoch bar so bars do not pile up across epochs.
    bar_loader.close()

    bar.update(1)
    bar.set_description(f'epoch loss - {sum(losses)/len(losses)}')


In [113]:
# Evaluate the trained baseline on the test users; calculate_metrics (defined
# in a later cell of this notebook) returns a (hitrate, precision) pair.
hitrate, precision = calculate_metrics(model)
hitrate, precision

(0.7151424287856072, 0.04252873563218391)

In [114]:
# Record the baseline's metrics and hyper-parameters as one MLflow run.
with mlflow.start_run(run_name='baseline'):
    mlflow.log_metrics({'hitrate-30': hitrate, 'precision-30': precision})
    mlflow.log_params({
        'BATCH_SIZE': BATCH_SIZE,
        'EPOCH': EPOCH,
        'NUM_NEGATIVES': NUM_NEGATIVES,
        'EDIM': EDIM,
        'OPTIMIZER_NAME': OPTIMIZER_NAME,
        'LR': LR,
    })

In [None]:
# Qualitative check for a single user.
USER = 0

# Indices of the 10 highest-scoring items for this user.
preds = list(model.pred_top_k(torch.tensor([USER]), 10)[1][0].numpy())
# Names of the items this user interacted with in train (one entry per event).
df[(df['user_index'] == USER) & (df['node_index'].isin(user2seen[USER]))]['names'].tolist()


In [None]:
# Translate predicted item codes back to raw node ids and then to names.
[node2name[index2node[i]] for i in preds]

In [None]:
K = 100

test_users = df_test['user_index'].unique()


# Element [1] of pred_top_k's result holds the indices of the K best items.
preds = model.pred_top_k(torch.tensor(test_users), K)[1].numpy()
df_preds = pd.DataFrame({
    'node_index': list(preds),
    'user_index': test_users,
    'rank': [list(range(K)) for _ in preds],
})

# One row per (user, recommended item, rank) ...
df_preds = df_preds.explode(['node_index', 'rank'])
# ... then mark the recommendations the user actually interacted with in test.
df_preds = df_preds.merge(
    df_test[['user_index', 'node_index']].assign(relevant=1).drop_duplicates(),
    on=['user_index', 'node_index'],
    how='left',
)
df_preds['relevant'] = df_preds['relevant'].fillna(0)

In [None]:

def calc_hitrate(df_preds, K):
    """Mean over users of the per-user maximum of ``relevant`` among rows with rank < K."""
    top_k = df_preds.loc[df_preds['rank'] < K]
    best_per_user = top_k.groupby('user_index')['relevant'].max()
    return best_per_user.mean()

def calc_prec(df_preds, K):
    """Mean over users of the per-user mean of ``relevant`` among rows with rank < K."""
    top_k = df_preds.loc[df_preds['rank'] < K]
    precision_per_user = top_k.groupby('user_index')['relevant'].mean()
    return precision_per_user.mean()

# Hit rate of the model's recommendations over the full top-K list.
hitrate = calc_hitrate(df_preds, K)

hitrate, K

In [None]:
# Precision over ranks 0..29 of the model's recommendations.
calc_prec(df_preds, 30)

In [None]:
# Largest item code that occurs in the training part.
df_train['node_index'].max()

In [None]:
# The K items with the most training interactions. sort_values is ascending
# and tail(K) keeps that order, so the MOST popular item is the LAST element.
top_popular = df_train[['node_index']].assign(v=1).groupby('node_index').count().reset_index().sort_values(by='v').tail(K)['node_index'].values


In [None]:
# Name of the last element of top_popular (the most popular item, given the ascending sort).
node2name[index2node[top_popular[-1]]]

In [None]:
# Non-personalized baseline: recommend the same top-popular list to every test user.
# top_popular is ordered by ASCENDING popularity (most popular last). Reverse
# it so that rank 0 is the most popular item; otherwise any cut-off below K
# (e.g. precision@30) would score the least popular part of the list.
ranked_top_popular = list(top_popular[::-1])
df_preds_top_poplular = pd.DataFrame({
    'node_index': [ranked_top_popular for _ in test_users],
    'user_index': test_users,
    'rank': [list(range(K)) for _ in test_users],
})


# One row per (user, item, rank), flagged with whether the user interacted
# with that item in the test part.
df_preds_top_poplular = df_preds_top_poplular.explode(
    ['node_index', 'rank']
).merge(
    df_test[['user_index', 'node_index']].assign(relevant=1).drop_duplicates(),
    on=['user_index', 'node_index'],
    how='left',
)
df_preds_top_poplular['relevant'] = df_preds_top_poplular['relevant'].fillna(0)

calc_hitrate(df_preds_top_poplular, K)

In [None]:
# Precision over ranks 0..29 of the top-popular recommendations.
calc_prec(df_preds_top_poplular, 30)

# Search best batch_size, edim, num_negatives, lr, Optimizer

In [39]:
# Send all subsequent MLflow runs to the remote tracking server, under this experiment.
mlflow.set_tracking_uri('http://84.201.128.89:90/')
mlflow.set_experiment('homework-msshelestov')

2024/06/18 11:30:29 INFO mlflow.tracking.fluent: Experiment with name 'homework-msshelestov' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/39', creation_time=1718710229128, experiment_id='39', last_update_time=1718710229128, lifecycle_stage='active', name='homework-msshelestov', tags={}>

In [41]:
def calc_hitrate(df_preds: pd.DataFrame, K: int = 30) -> float:
    """Mean over users of the per-user maximum of ``relevant`` among rows with rank < K."""
    within_k = df_preds['rank'] < K
    best_per_user = df_preds[within_k].groupby('user_index')['relevant'].max()
    return best_per_user.mean()


def calc_prec(df_preds: pd.DataFrame, K: int = 30) -> float:
    """Mean over users of the per-user mean of ``relevant`` among rows with rank < K."""
    within_k = df_preds['rank'] < K
    precision_per_user = df_preds[within_k].groupby('user_index')['relevant'].mean()
    return precision_per_user.mean()


def calculate_metrics(model: torch.nn.Module, K: int = 30) -> tuple:
    """Evaluate ``model`` on the users of the global ``df_test``.

    Args:
        model: Object exposing ``pred_top_k(users, K)`` whose element ``[1]``
            holds the recommended item indices per user.
        K: Length of the recommendation list and the cut-off used for both
            metrics.

    Returns:
        ``(hitrate@K, precision@K)``.
    """
    test_users = df_test['user_index'].unique()
    # Inference only: no autograd graph is needed for ranking.
    with torch.no_grad():
        preds = model.pred_top_k(torch.tensor(test_users), K)[1].numpy()
    df_preds = pd.DataFrame({
        'node_index': list(preds),
        'user_index': test_users,
        'rank': [list(range(K)) for _ in preds],
    })
    # One row per (user, item, rank); `relevant` is 1 where the user
    # interacted with the item in the test part and 0 otherwise.
    df_preds = df_preds.explode(['node_index', 'rank']).merge(
        df_test[['user_index', 'node_index']].assign(relevant=1).drop_duplicates(),
        on=['user_index', 'node_index'],
        how='left',
    )
    df_preds['relevant'] = df_preds['relevant'].fillna(0)
    # Pass K through: the original ignored it here and always used the
    # metric functions' own default cut-off.
    return calc_hitrate(df_preds, K), calc_prec(df_preds, K)


def train_model(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    dataloader: DataLoader,
    epoch: int,
) -> torch.nn.Module:
    """Train ``model`` in place with binary cross-entropy on its logits.

    Args:
        model: Module called as ``model(users, items)`` and returning logits.
        optimizer: Optimizer bound to the model's parameters.
        dataloader: Iterable yielding ``(users, items, labels)`` batches.
        epoch: Number of full passes over ``dataloader``.

    Returns:
        The same ``model`` object, after training.
    """
    for _ in range(epoch):
        # Unpack the batch directly; the original reused one name for both
        # the epoch counter and the batch.
        for users, items, labels in dataloader:
            optimizer.zero_grad()
            logits = model(users, items)
            loss = torch.nn.functional.binary_cross_entropy_with_logits(
                logits, labels
            )
            loss.backward()
            optimizer.step()
    return model


def get_optimizer(model, optimizer: str, lr: float) -> torch.optim.Optimizer:
    """Build the optimizer called ``optimizer`` for ``model``'s parameters.

    Args:
        model: Module whose ``parameters()`` are optimized.
        optimizer: One of ``'adam'``, ``'adamw'``, ``'rmsprop'``,
            ``'adadelta'`` or ``'sgd'`` (case-insensitive).
        lr: Learning rate.

    Returns:
        The constructed optimizer; SGD is created with ``momentum=0.1``.

    Raises:
        ValueError: If ``optimizer`` is not a supported name. The original
            silently fell back to SGD, so a differently-cased or misspelled
            name trained with the wrong optimizer without any warning.
    """
    name = optimizer.lower()
    params = model.parameters()
    if name == 'sgd':
        return torch.optim.SGD(params, lr, momentum=0.1)

    factories = {
        'adam': torch.optim.Adam,
        'adamw': torch.optim.AdamW,
        'rmsprop': torch.optim.RMSprop,
        'adadelta': torch.optim.Adadelta,
    }
    try:
        factory = factories[name]
    except KeyError:
        supported = sorted([*factories, 'sgd'])
        raise ValueError(
            f'unknown optimizer {optimizer!r}; expected one of {supported}'
        ) from None
    return factory(params, lr)


def objective(trial) -> float:
    """Optuna objective: train one LatentFactorModel and return its test precision.

    Hyper-parameters are sampled from ``trial``; the model is trained on the
    global ``df_train``, evaluated with ``calculate_metrics`` and the result
    is logged to MLflow as a separate run.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        The precision value returned by ``calculate_metrics``.
    """
    epoch = trial.suggest_int('EPOCH', 5, 15)
    batch_size = trial.suggest_int('BATCH_SIZE', 64, 50000)
    num_negatives = trial.suggest_int('NUM_NEGATIVES', 1, 20)
    edim = trial.suggest_int('EDIM', 32, 512)
    optimizer_name = trial.suggest_categorical('optimizer', ['adam', 'adamw', 'rmsprop', 'adadelta', 'sgd'])
    lr = trial.suggest_float('LR', 1e-4, 5e-2)

    # Largest item index, computed once per trial: evaluating max() over the
    # whole column inside the collate lambda repeated a full Python-level
    # scan for every batch, which dominates at small batch sizes.
    max_node_index = int(df['node_index'].max())

    train_dataset = RecDataset(df_train['user_index'].values, df_train['node_index'], user2seen)
    dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        num_workers=0,
        batch_size=batch_size,
        collate_fn=lambda x: collate_fn(x, num_negatives, max_node_index),
    )
    model = LatentFactorModel(edim, user_indes, node_indes)
    optimizer = get_optimizer(model, optimizer_name, lr)

    model = train_model(model, optimizer, dataloader, epoch)
    hitrate, precision = calculate_metrics(model)

    with mlflow.start_run(run_name=f'{trial.number}-trial optuna tuning'):
        mlflow.log_metrics(
            {
                'hitrate-30': hitrate,
                'precision-30': precision,
            }
        )
        mlflow.log_params(
            {
                'BATCH_SIZE': batch_size,
                'EPOCH': epoch,
                'NUM_NEGATIVES': num_negatives,
                'EDIM': edim,
                'OPTIMIZER_NAME': optimizer_name,
                'LR': lr,
            }
        )
    return precision


# Seeded TPE sampler for the hyper-parameter search.
sampler = optuna.samplers.TPESampler(seed=42)
# In-memory study that maximizes the value returned by `objective`.
study = optuna.create_study(
    study_name='Optimize_LatentModel',
    sampler=sampler,
    direction='maximize',
)
study.optimize(objective, n_trials=75)

[I 2024-06-18 11:37:03,580] A new study created in memory with name: Optimize_LatentModel
[I 2024-06-18 11:39:05,178] Trial 0 finished with value: 0.0010494752623688155 and parameters: {'EPOCH': 9, 'BATCH_SIZE': 47539, 'NUM_NEGATIVES': 15, 'EDIM': 319, 'optimizer': 'adadelta', 'LR': 0.035432821632022674}. Best is trial 0 with value: 0.0010494752623688155.
[I 2024-06-18 11:39:58,760] Trial 1 finished with value: 0.0014492753623188406 and parameters: {'EPOCH': 5, 'BATCH_SIZE': 48498, 'NUM_NEGATIVES': 17, 'EDIM': 134, 'optimizer': 'adadelta', 'LR': 0.01463233409588229}. Best is trial 1 with value: 0.0014492753623188406.
[I 2024-06-18 11:41:19,657] Trial 2 finished with value: 0.0016491754122938533 and parameters: {'EPOCH': 11, 'BATCH_SIZE': 7029, 'NUM_NEGATIVES': 6, 'EDIM': 208, 'optimizer': 'adamw', 'LR': 0.002417875594727886}. Best is trial 2 with value: 0.0016491754122938533.
[I 2024-06-18 11:42:30,168] Trial 3 finished with value: 0.022038980509745128 and parameters: {'EPOCH': 11, 'BA

KeyboardInterrupt: 

In [44]:
# Best hyper-parameters found so far, followed by the full record of that trial.
print(study.best_params)
study.best_trial

{'EPOCH': 13, 'BATCH_SIZE': 11454, 'NUM_NEGATIVES': 19, 'EDIM': 146, 'optimizer': 'rmsprop', 'LR': 0.03240999255022217}


FrozenTrial(number=38, state=TrialState.COMPLETE, values=[0.04887556221889055], datetime_start=datetime.datetime(2024, 6, 18, 13, 2, 7, 565805), datetime_complete=datetime.datetime(2024, 6, 18, 13, 4, 43, 420939), params={'EPOCH': 13, 'BATCH_SIZE': 11454, 'NUM_NEGATIVES': 19, 'EDIM': 146, 'optimizer': 'rmsprop', 'LR': 0.03240999255022217}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'EPOCH': IntDistribution(high=15, log=False, low=5, step=1), 'BATCH_SIZE': IntDistribution(high=50000, log=False, low=64, step=1), 'NUM_NEGATIVES': IntDistribution(high=20, log=False, low=1, step=1), 'EDIM': IntDistribution(high=512, log=False, low=32, step=1), 'optimizer': CategoricalDistribution(choices=('adam', 'adamw', 'rmsprop', 'adadelta', 'sgd')), 'LR': FloatDistribution(high=0.05, log=False, low=0.0001, step=None)}, trial_id=38, value=None)

In [45]:
# Retrain with the best hyper-parameters reported by the Optuna study.
EPOCH = 13
BATCH_SIZE = 11454
NUM_NEGATIVES = 19
EDIM = 146
OPTIMIZER_NAME = 'rmsprop'
LR = 0.03240999255022217
train_dataset = RecDataset(df_train['user_index'].values, df_train['node_index'], user2seen)
# Largest item index, computed once: evaluating max() over the whole column
# inside the collate lambda repeated a full Python-level scan for every batch.
max_node_index = int(df['node_index'].max())
dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    num_workers=0,
    batch_size=BATCH_SIZE,
    collate_fn=lambda x: collate_fn(x, NUM_NEGATIVES, max_node_index),
)
model = LatentFactorModel(EDIM, user_indes, node_indes)
optimizer = get_optimizer(model, OPTIMIZER_NAME, LR)
model = train_model(model, optimizer, dataloader, EPOCH)
hitrate, precision = calculate_metrics(model)
hitrate, precision

(0.7451274362818591, 0.04922538730634682)

# ALS

In [95]:
def create_csr_matrix_matrix_string(df: pd.DataFrame):
    """Build a dense 0/1 user-item interaction table from the train rows of ``df``.

    Despite the name, the result is a plain list of lists; the caller wraps
    it in a sparse matrix.

    Args:
        df: Frame with ``user_index``, ``node_index`` and boolean
            ``is_train`` columns.

    Returns:
        One row per value of ``df['user_index'].unique()`` (in that order).
        Each row has ``max train node_index + 1`` entries, with 1 at the
        items the user interacted with in train and 0 elsewhere. Users with
        no train rows get an all-zero row (the original raised ``KeyError``).
    """
    train = df.loc[df['is_train'], ['user_index', 'node_index']]
    if train.empty:
        nodes_count, items_by_user = 0, {}
    else:
        # Size rows by the largest index, not the number of distinct items:
        # the two only coincide when the train indices are exactly 0..n-1.
        nodes_count = int(train['node_index'].max()) + 1
        items_by_user = train.groupby('user_index')['node_index'].unique().to_dict()

    train_list = []
    for user_id in df['user_index'].unique():
        items_string = [0] * nodes_count
        for item in items_by_user.get(user_id, ()):
            items_string[int(item)] = 1
        train_list.append(items_string)
    return train_list


# Sparse user x item matrix built from the dense 0/1 rows returned by the helper.
train_data = csr_matrix(create_csr_matrix_matrix_string(df))

In [None]:
# Fit ALS on the user-item matrix and score it with the same metric functions
# as the other models. NOTE: this rebinds the global `model`.
model = implicit.als.AlternatingLeastSquares(factors=16, iterations=15, random_state=42)
model.fit(train_data)

test_users = df_test['user_index'].unique()
# Element [0] of recommend()'s result is used as the recommended item ids
# (TODO confirm against the implicit API docs).
preds = model.recommend(test_users, train_data[test_users].tocsr(), N=30)[0]
df_preds = pd.DataFrame({
    'node_index': list(preds),
    'user_index': test_users,
    'rank': [list(range(30)) for _ in preds],
})
# One row per (user, item, rank), flagged with test-set relevance.
df_preds = df_preds.explode(['node_index', 'rank'])
df_preds = df_preds.merge(
    df_test[['user_index', 'node_index']].assign(relevant=1).drop_duplicates(),
    on=['user_index', 'node_index'],
    how='left',
)
df_preds['relevant'] = df_preds['relevant'].fillna(0)
hitrate = calc_hitrate(df_preds, 30)
prec = calc_prec(df_preds, 30)
hitrate, prec

In [105]:
# Record the ALS metrics and its settings as one MLflow run.
with mlflow.start_run(run_name='ALS'):
    mlflow.log_metrics({'hitrate-30': hitrate, 'precision-30': prec})
    mlflow.log_params({
        'factors': model.factors,
        'iterations': model.iterations,
        'random_state': 42,
    })