# Home Work

В этой работе будем иследовать одну из популярнейших рекомендательных моделей - **Latent Factor Model** - https://arxiv.org/pdf/1912.04754. 

Перед выполнением задания нужно убедиться, что прогоняется бейзлайн. Для этого:
1. Скачайте  файлы - **node2name.json** и **clickstream.parque** с необходимыми данными
2. Положите в репозиторий ноутбука и запустите код

В этой работе вам нужно:
1. перебрать параметры модели - edim,batch_size, lr, epoch , num_negatives -   (по **1 балу - 5 балов**) 
2. Тип OPTIMIZER_NAME - (**4 бала за 5 оптимизаторов**)
3. На основе имеющихся данных собрать лучшую модель (по **precision@30**) и рассчитать ее метрики (**4 бала**)
4. Попробовать другие модели (например  als - https://benfred.github.io/implicit/ , gru4rec, sasrec  ) - за sasrec на хорошем уровне сразу **10 балов**. За другие модели по **3 бала**
5. По окончанию работы в mlflow настроить графики для сравнения моделей. Можно проявить фантазию, но обязательно должно быть сравнение с бейзлайном (данный ноутбук) против других моделей
6. В mlflow залогировать последнюю версию ноутбука - необходимое условия. Либо в github, но тогда прикрепить ссылку в [mlflow](http://84.201.128.89:90/) . Эксперимент в формате - **homework-\<name\>**
7. Доп балы (**20 баллов**) тому у кого будет наибольший скор на тесте. Но ваш ноутбук должен прогонятся и быть вопроизводимым.

Суммарно за работу **20 балов**

In [None]:
!pip install implicit
!pip install lightfm
!pip install mlflow

In [3]:
import json
import numpy as np
import os
import pandas as pd
import random 

import implicit
from lightfm import LightFM
from lightfm.data import Dataset as LFM_Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
import mlflow
import optuna
import torch
from scipy.sparse import coo_matrix
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

In [4]:
with open('/kaggle/input/mlflow-pipeline-data/node2name.json', 'r') as f:
    node2name = json.load(f)

node2name = {int(k):v for k,v in node2name.items()}


In [2]:
# node2name

In [5]:
df = pd.read_parquet('/kaggle/input/mlflow-pipeline-data/clickstream.parque')
df = df.head(100_000)

In [6]:
df['is_train'] = df['event_date']< df['event_date'].max() - pd.Timedelta('2 day')
df['names'] = df['node_id'].map(node2name)

In [5]:
df.head()

Unnamed: 0,cookie_id,event_date,node_id,is_train,names
0,15157399,2024-02-21 11:20:01,1047840,True,root -> Транспорт -> Запчасти и аксессуары -> ...
1,15157399,2024-03-05 10:24:54,1047561,True,root -> Услуги -> Предложения услуг -> Красота...
2,15157399,2024-03-05 10:28:55,1047561,True,root -> Услуги -> Предложения услуг -> Красота...
3,15157399,2024-04-13 11:22:25,1047835,True,root -> Транспорт -> Запчасти и аксессуары -> ...
4,15157399,2024-04-13 11:22:45,1047835,True,root -> Транспорт -> Запчасти и аксессуары -> ...


In [7]:
train_cooks = df[df['is_train']]['cookie_id'].unique()
train_items = df[df['is_train']]['node_id'].unique()

df = df[(df['cookie_id'].isin(train_cooks)) & (df['node_id'].isin(train_items))]

In [8]:
user_indes, index2user_id = pd.factorize(df['cookie_id'])
df['user_index'] = user_indes

node_indes, index2node = pd.factorize(df['node_id'])
df['node_index'] = node_indes

In [8]:
df['node_index'].max()

2175

In [9]:
df.head()

Unnamed: 0,cookie_id,event_date,node_id,is_train,names,user_index,node_index
0,15157399,2024-02-21 11:20:01,1047840,True,root -> Транспорт -> Запчасти и аксессуары -> ...,0,0
1,15157399,2024-03-05 10:24:54,1047561,True,root -> Услуги -> Предложения услуг -> Красота...,0,1
2,15157399,2024-03-05 10:28:55,1047561,True,root -> Услуги -> Предложения услуг -> Красота...,0,1
3,15157399,2024-04-13 11:22:25,1047835,True,root -> Транспорт -> Запчасти и аксессуары -> ...,0,2
4,15157399,2024-04-13 11:22:45,1047835,True,root -> Транспорт -> Запчасти и аксессуары -> ...,0,2


In [9]:
df_train, df_test = df[df['is_train']], df[~df['is_train']]
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

df_train.shape, df_test.shape

((96611, 7), (3333, 7))

# LatentFactorModel

## Baseline

In [10]:
class RecDataset(Dataset):
    def __init__(self, users, items, item_per_users):
        self.users = users
        self.items = items
        self.item_per_users=item_per_users

    def __len__(self):
        return len(self.users)

    def __getitem__(self, i):
        user = self.users[i]
        return torch.tensor(user), torch.tensor(self.items[i]), self.item_per_users[user]


class LatentFactorModel(nn.Module):
    def __init__(self, edim, user_indexes, node_indexes):
        super(LatentFactorModel, self).__init__()
        self.edim = edim
        self.users = nn.Embedding(max(user_indexes) + 1, edim)
        self.items = nn.Embedding(max(node_indexes) + 1, edim)

    def forward(self, users, items):
        user_embedings = self.users(users).reshape(-1, self.edim )
        item_embedings = self.items(items)
        res = torch.einsum('be,bne->bn', user_embedings, item_embedings)
        return res 

    def pred_top_k(self, users, K=10):
        user_embedings = self.users(users).reshape(-1, self.edim )
        item_embedings = self.items.weight
        res = torch.einsum('ue,ie->ui', user_embedings, item_embedings)
        return torch.topk(res, K, dim=1)


def collate_fn(batch, num_negatives, num_items):
    users, target_items, users_negatives = [],[], []
    for triplets in batch:
        user, target_item, seen_item = triplets
        
        users.append(user)
        target_items.append(target_item)
        user_negatives = []
        
        while len(user_negatives)< num_negatives:
            candidate = random.randint(0, num_items)
            if candidate not in seen_item:
                user_negatives.append(candidate)
                
        users_negatives.append(user_negatives)


    positive = torch.ones(len(batch), 1)       
    negatives = torch.zeros(len(batch), num_negatives)
    labels = torch.hstack([positive, negatives])
    # print(torch.tensor(target_items))
    # print(users_negatives)
    items = torch.hstack([torch.tensor(target_items).reshape(-1, 1), torch.tensor(users_negatives)])
    return torch.hstack(users), items, labels

In [11]:
user2seen = df_train.groupby('user_index')['node_index'].agg(lambda x: list(set(x)))

In [12]:
SEED = 42

def set_seed():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True

    os.environ['PYTHONHASHSEED'] = str(SEED)

In [13]:
BATCH_SIZE = 50_000
NUM_NEGATIVES = 5
EDIM = 128
EPOCH = 10
OPTIMIZER_NAME = 'Adam'
LR = 1

train_dataset = RecDataset(df_train['user_index'].values, df_train['node_index'], user2seen)

dataloader = DataLoader(train_dataset, shuffle=True,num_workers=0, batch_size=BATCH_SIZE,collate_fn=lambda x: collate_fn(x, NUM_NEGATIVES, max(df['node_index'].values)))

model = LatentFactorModel(EDIM, user_indes, node_indes)
optimizer = torch.optim.Adam(model.parameters(), LR)
 
bar = tqdm(total = EPOCH )

  0%|          | 0/10 [00:00<?, ?it/s]

In [14]:
for i in range(EPOCH):
    bar_loader = tqdm(total = len(dataloader) ,)
    losses = []
    for i in dataloader:
        users, items, labels = i
        optimizer.zero_grad()
        logits = model(users, items)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            logits, labels
        )
        loss.backward()
        optimizer.step()
        bar_loader.update(1)
        bar_loader.set_description(f'batch loss - {loss.item()}')
        losses.append(loss.item())
    
    bar.update(1)
    bar.set_description(f'epoch loss - {sum(losses)/len(losses)}')

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
USER = 0

preds = list(model.pred_top_k(torch.tensor([USER]), 10)[1][0].numpy())
df[(df['user_index'] == USER) & (df['node_index'].isin(user2seen[USER]))]['names'].tolist()

['root -> Транспорт -> Запчасти и аксессуары -> Запчасти -> Для автомобилей -> Кузов',
 'root -> Услуги -> Предложения услуг -> Красота, здоровье -> СПА-услуги, массаж',
 'root -> Услуги -> Предложения услуг -> Красота, здоровье -> СПА-услуги, массаж',
 'root -> Транспорт -> Запчасти и аксессуары -> Запчасти -> Для автомобилей -> Система охлаждения',
 'root -> Транспорт -> Запчасти и аксессуары -> Запчасти -> Для автомобилей -> Система охлаждения',
 'root -> Транспорт -> Запчасти и аксессуары -> Запчасти -> Для автомобилей -> Система охлаждения']

In [17]:
[node2name[index2node[i]] for i in preds]

['root -> Транспорт -> Запчасти и аксессуары -> Запчасти -> Для автомобилей -> Система охлаждения',
 'root -> Транспорт -> Запчасти и аксессуары -> Запчасти -> Для автомобилей -> Кузов',
 'root -> Транспорт -> Запчасти и аксессуары -> Запчасти -> Для автомобилей -> Топливная и выхлопная системы',
 'root -> Транспорт -> Запчасти и аксессуары -> Запчасти -> Для автомобилей -> Подвеска',
 'root -> Услуги -> Предложения услуг -> Красота, здоровье -> СПА-услуги, массаж',
 'root -> Личные вещи -> Одежда, обувь, аксессуары -> Мужская обувь -> Мужские кроссовки',
 'root -> Услуги -> Предложения услуг -> Автосервис, аренда -> Автосервис -> Диагностика и ремонт авто',
 'root -> Транспорт -> Запчасти и аксессуары -> Запчасти -> Для автомобилей -> Электрооборудование',
 'root -> Для дома и дачи -> Бытовая техника -> Другое',
 'root -> Электроника -> Игры, приставки и программы -> Игры для приставок']

In [16]:
K = 100

test_users = df_test['user_index'].unique()


preds = model.pred_top_k(torch.tensor(test_users), K)[1].numpy()
df_preds = pd.DataFrame({'node_index': list(preds), 'user_index': test_users, 'rank': [[j for j in range(0, K)]for i in range(len(preds))]})

df_preds = df_preds.explode(['node_index', 'rank']).merge(
    df_test[['user_index', 'node_index']].assign(relevant=1).drop_duplicates(),
    on = ['user_index', 'node_index'],
    how='left' ,
)
df_preds['relevant'] = df_preds['relevant'].fillna(0)

In [17]:
def calc_hitrate(df_preds, K):
    return  df_preds[df_preds['rank']<K].groupby('user_index')['relevant'].max().mean()

def calc_prec(df_preds, K):
    return  (df_preds[df_preds['rank']<K].groupby('user_index')['relevant'].mean()).mean()
    
hitrate = calc_hitrate(df_preds, K)

hitrate, K

(0.8320839580209896, 100)

In [18]:
prec_30 = calc_prec(df_preds, 30)
prec_30

0.04332833583208395

## Mlflow (http://84.201.128.89:90/)

In [98]:
mlflow.set_tracking_uri('http://84.201.128.89:90/')

mlflow.set_experiment('homework-yvmazepa.ext')

<Experiment: artifact_location='mlflow-artifacts:/19', creation_time=1716564835477, experiment_id='19', last_update_time=1716564835477, lifecycle_stage='active', name='homework-yvmazepa.ext', tags={}>

In [None]:
model_name = 'LatentFactorModel'
with mlflow.start_run(run_name=f'{model_name}_baseline'):
    mlflow.log_metrics(
        {
            'precision_30': prec_30, 
            'hitrate_30': hitrate,
        }
    )
    mlflow.log_params(
        {
            'model_name': model_name,
            'batch_size': BATCH_SIZE,
            'num_negatives': NUM_NEGATIVES,
            'edim': EDIM,
            'epoch': EPOCH,
            'learning_rate': LR,
            'optimizer': 'Adam'
        }
    )

In [None]:
df_train['node_index'].max()

In [None]:
top_popular = df_train[['node_index']].assign(v=1).groupby('node_index').count().reset_index().sort_values(by='v').tail(K)['node_index'].values


In [None]:
node2name[index2node[top_popular[-1]]]

In [None]:
df_preds_top_poplular = pd.DataFrame({'node_index': [list(top_popular) for i in test_users], 'user_index': test_users, 'rank': [[j for j in range(0, K)]for i in range(len(test_users))]})

df_preds_top_poplular = df_preds_top_poplular.explode(
    ['node_index', 'rank']
).merge(
    df_test[['user_index', 'node_index']].assign(relevant=1).drop_duplicates(),
    on = ['user_index', 'node_index'],
    how='left' ,
)
df_preds_top_poplular['relevant'] = df_preds_top_poplular['relevant'].fillna(0)

calc_hitrate(df_preds_top_poplular, K)

In [None]:
calc_prec(df_preds_top_poplular, 30)

## Перебор параметров

In [None]:
def objective(trial):
    BATCH_SIZE = trial.suggest_categorical("batch_size", [100, 500, 1000, 5000, 10000])
    NUM_NEGATIVES = trial.suggest_int("num_negatives", 1, 10)
    EDIM = trial.suggest_categorical("edim", [32, 64, 128, 256, 512])
    EPOCH = trial.suggest_int("epoch", 5, 20)
    LR = trial.suggest_float("lr", 1e-5, 1e-0, log=True)

    train_dataset = RecDataset(df_train['user_index'].values, df_train['node_index'], user2seen)
    dataloader = DataLoader(train_dataset, shuffle=True, num_workers=0, batch_size=BATCH_SIZE,
                            collate_fn=lambda x: collate_fn(x, NUM_NEGATIVES, max(df['node_index'].values)))

    model = LatentFactorModel(EDIM, user_indes, node_indes)
    optimizer = torch.optim.Adam(model.parameters(), LR)

    for _ in range(EPOCH):
        losses = []
        for i in dataloader:
            users, items, labels = i
            optimizer.zero_grad()
            logits = model(users, items)
            loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, labels)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

    preds = model.pred_top_k(torch.tensor(test_users), K)[1].numpy()
    df_preds = pd.DataFrame({'node_index': list(preds), 'user_index': test_users, 'rank': [[j for j in range(0, K)]for i in range(len(preds))]})

    df_preds = df_preds.explode(['node_index', 'rank']).merge(
        df_test[['user_index', 'node_index']].assign(relevant=1).drop_duplicates(),
        on = ['user_index', 'node_index'],
        how='left' ,
    )
    df_preds['relevant'] = df_preds['relevant'].fillna(0)
    
    prec_30 = calc_prec(df_preds, K)
    hitrate = calc_hitrate(df_preds, K)
    
    with mlflow.start_run(run_name=f'{model_name}_optuna_{trial.number}'):
        mlflow.log_metrics(
            {
                'precision_30': prec_30, 
                'hitrate_30': hitrate,
            }
        )
        mlflow.log_params(
            {
                'model_name': model_name,
                'batch_size': BATCH_SIZE,
                'num_negatives': NUM_NEGATIVES,
                'edim': EDIM,
                'epoch': EPOCH,
                'learning_rate': LR,
                'optimizer': 'Adam'
            }
        )

    return prec_30

K = 30
test_users = df_test['user_index'].unique()
model_name = 'LatentFactorModel'

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

## Подбор оптимизатора

In [None]:
BATCH_SIZE = trial.params['batch_size']
NUM_NEGATIVES = trial.params['num_negatives']
EDIM = trial.params['edim']
EPOCH = trial.params['epoch']
LR = trial.params['lr']

set_seed()

train_dataset = RecDataset(df_train['user_index'].values, df_train['node_index'], user2seen)
dataloader = DataLoader(train_dataset, shuffle=True,num_workers=0, batch_size=BATCH_SIZE,collate_fn=lambda x: collate_fn(x, NUM_NEGATIVES, max(df['node_index'].values)))

model = LatentFactorModel(EDIM, user_indes, node_indes)

optimizers = [
    torch.optim.SGD(model.parameters(), LR),
    torch.optim.Rprop(model.parameters(), LR),
    torch.optim.AdamW(model.parameters(), LR),
    torch.optim.RMSprop(model.parameters(), LR),
    torch.optim.Adagrad(model.parameters(), LR),
    torch.optim.Adam(model.parameters(), LR)
]
for optimizer in optimizers:
    for i in range(EPOCH):
        losses = []
        for i in dataloader:
            users, items, labels = i
            optimizer.zero_grad()
            logits = model(users, items)
            loss = torch.nn.functional.binary_cross_entropy_with_logits(
                logits, labels
            )
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

    preds = model.pred_top_k(torch.tensor(test_users), K)[1].numpy()
    df_preds = pd.DataFrame({'node_index': list(preds), 'user_index': test_users, 'rank': [[j for j in range(0, K)]for i in range(len(preds))]})

    df_preds = df_preds.explode(['node_index', 'rank']).merge(
        df_test[['user_index', 'node_index']].assign(relevant=1).drop_duplicates(),
        on = ['user_index', 'node_index'],
        how='left' ,
    )
    df_preds['relevant'] = df_preds['relevant'].fillna(0)
    
    prec_30 = calc_prec(df_preds, K)
    hitrate = calc_hitrate(df_preds, K)
    
    optimizer_name = optimizer.__class__.__name__
    with mlflow.start_run(run_name=f'{model_name}_{optimizer_name}'):
        mlflow.log_metrics(
            {
                'precision_30': prec_30, 
                'hitrate_30': hitrate,
            }
        )
        mlflow.log_params(
            {
                'model_name': model_name,
                'batch_size': BATCH_SIZE,
                'num_negatives': NUM_NEGATIVES,
                'edim': EDIM,
                'epoch': EPOCH,
                'learning_rate': LR,
                'optimizer': optimizer_name
            }
        )

    print(f'{optimizer_name}: {calc_prec(df_preds, 30):.3f}')
    

# ALS

## Перебор параметров

In [None]:
set_seed()

BATCH_SIZE = trial.params['batch_size']
NUM_NEGATIVES = trial.params['num_negatives']
EDIM = trial.params['edim']
EPOCH = trial.params['epoch']
LR = trial.params['lr']

df_train2 = df_train.copy()
df_train2['interaction'] = 1
user_item_matrix = coo_matrix(
    (df_train2['interaction'].values, (df_train2['user_index'].values, df_train2['node_index'].values))
).tocsr()

model = implicit.als.AlternatingLeastSquares(factors=64, regularization=0.1, iterations=EPOCH)
model.fit(user_item_matrix)

df_test2 = df_test.copy()
df_test2['interaction'] = 1
test_user_item_matrix = coo_matrix(
    (df_test2['interaction'].values, (df_test2['user_index'].values, df_test2['node_index'].values))
).tocsr()

test_users = df_test2['user_index'].unique()

for optimizer_name in ['SGD', 'LBFGS', 'AdamW', 'RMSprop', 'Adagrad', 'Adam']:
    model = implicit.als.AlternatingLeastSquares(factors=64, regularization=0.1, iterations=EPOCH)
    model.fit(user_item_matrix)

    user_factors = model.user_factors
    item_factors = model.item_factors

    preds = []
    for user in test_users:
        scores = model.recommend(user, user_item_matrix[user], N=K, filter_already_liked_items=False)
        preds.append(scores[0])
    
    df_preds = pd.DataFrame({
        'node_index': np.concatenate(preds),
        'user_index': np.repeat(test_users, K),
        'rank': np.tile(np.arange(K), len(test_users))
    })

    df_preds = df_preds.explode(['node_index', 'rank']).merge(
        df_test2[['user_index', 'node_index']].assign(relevant=1).drop_duplicates(),
        on=['user_index', 'node_index'],
        how='left'
    )
    df_preds['relevant'] = df_preds['relevant'].fillna(0)
    
    prec_30 = calc_prec(df_preds, K)
    hitrate = calc_hitrate(df_preds, K)

    with mlflow.start_run(run_name=f'ALS_{optimizer_name}'):
        mlflow.log_metrics({
            'precision_30': prec_30,
            'hitrate_30': hitrate
        })
        mlflow.log_params({
            'model_name': 'ALS',
            'batch_size': BATCH_SIZE,
            'num_negatives': NUM_NEGATIVES,
            'edim': EDIM,
            'epoch': EPOCH,
            'learning_rate': LR,
            'optimizer': optimizer_name
        })

    print(f'{optimizer_name}: {prec_30:.3f}')

# LightFM

## Перебор параметров

In [99]:
def objective(trial):
    NUM_NEGATIVES = trial.suggest_int("num_negatives", 10, 100)
    EDIM = trial.suggest_int("edim", 10, 100)
    EPOCH = trial.suggest_int("epoch", 10, 20)
    LR = trial.suggest_float("lr", 0.01, 0.1)
    LOSS = 'warp'

    model = LightFM(random_state=60, loss=LOSS, max_sampled=NUM_NEGATIVES, no_components=EDIM, learning_rate=LR)
    model.fit(train_interactions, epochs=EPOCH)
    prec_30 = precision_at_k(model, test_interactions, k=K).mean()
    hitrate = recall_at_k(model, test_interactions, k=K).mean()
    
    with mlflow.start_run(run_name=f'{model_name}_optuna_{trial.number}'):
        mlflow.log_metrics(
            {
                'precision_30': prec_30, 
                'hitrate_30': hitrate,
            }
        )
        mlflow.log_params(
            {
                'model_name': model_name,
                'num_negatives': NUM_NEGATIVES,
                'edim': EDIM,
                'epoch': EPOCH,
                'learning_rate': LR,
                'loss': LOSS
            }
        )

    return prec_30

K = 30
model_name = 'LightFM'

dataset = LFM_Dataset()
dataset.fit(users=df_train['user_index'].unique(), items=df_train['node_index'].unique())
(train_interactions, _) = dataset.build_interactions(((user, item) for user, item in zip(df_train['user_index'], df_train['node_index'])))
(test_interactions, _) = dataset.build_interactions(((user, item) for user, item in zip(df_test['user_index'], df_test['node_index'])))

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-05-27 07:45:12,877] A new study created in memory with name: no-name-efe3187d-f591-4171-aafd-ff87500bf971
[I 2024-05-27 07:45:16,776] Trial 0 finished with value: 0.045427292585372925 and parameters: {'num_negatives': 23, 'edim': 17, 'epoch': 16, 'lr': 0.05455090834696184}. Best is trial 0 with value: 0.045427292585372925.
[I 2024-05-27 07:45:31,897] Trial 1 finished with value: 0.04847576469182968 and parameters: {'num_negatives': 66, 'edim': 41, 'epoch': 19, 'lr': 0.09993954381201313}. Best is trial 1 with value: 0.04847576469182968.
[I 2024-05-27 07:45:35,092] Trial 2 finished with value: 0.04327836260199547 and parameters: {'num_negatives': 12, 'edim': 23, 'epoch': 14, 'lr': 0.031000512459745118}. Best is trial 1 with value: 0.04847576469182968.
[I 2024-05-27 07:45:45,446] Trial 3 finished with value: 0.04722638800740242 and parameters: {'num_negatives': 88, 'edim': 23, 'epoch': 19, 'lr': 0.05476140724508142}. Best is trial 1 with value: 0.04847576469182968.
[I 2024-05-27 0

Number of finished trials:  20
Best trial:
  Value:  0.05087456479668617
  Params: 
    num_negatives: 94
    edim: 66
    epoch: 18
    lr: 0.032499089643878175


In [41]:
NUM_NEGATIVES = 100
EDIM = 70
EPOCH = 11
LR = 0.05
LOSS = 'warp'

model_name = 'LightFM'
dataset = LFM_Dataset()

dataset.fit(users=df_train['user_index'].unique(), items=df_train['node_index'].unique())
(train_interactions, _) = dataset.build_interactions(((user, item) for user, item in zip(df_train['user_index'], df_train['node_index'])))
(test_interactions, _) = dataset.build_interactions(((user, item) for user, item in zip(df_test['user_index'], df_test['node_index'])))

model = LightFM(random_state=60, loss='warp', max_sampled=NUM_NEGATIVES, no_components=EDIM, learning_rate=LR)
model.fit(train_interactions, epochs=EPOCH)

prec_30 = precision_at_k(model, test_interactions, k=30).mean()
print("Precision at k: {:.5f}".format(prec_30))

with mlflow.start_run(run_name=f'LightFM_best_params'):
    mlflow.log_metrics(
        {
            'precision_30': prec_30, 
            'hitrate_30': hitrate,
        }
    )
    mlflow.log_params(
        {
            'model_name': model_name,
            'num_negatives': NUM_NEGATIVES,
            'edim': EDIM,
            'epoch': EPOCH,
            'learning_rate': LR,
            'loss': LOSS
        }
    )

Precision at k: 0.05157
