## Global settings and imports

In [1]:
from transformers import AutoTokenizer, AutoModel
from evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore

import sys
import torch
import torch
import pandas as pd
from typing import Dict, List, Optional
import numpy as np
from pathlib import Path
import polars as pl

from utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
    DEFAULT_SUBTITLE_COL,
    DEFAULT_LABELS_COL,
    DEFAULT_TITLE_COL,
    DEFAULT_USER_COL,
)

from utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)
from utils._articles import convert_text2encoding_with_transformers
from utils._polars import concat_str_columns, slice_join_dataframes
from utils._articles import create_article_id_to_value_mapping
from utils._nlp import get_transformers_word_embeddings
from utils._python import write_submission_file, rank_predictions_by_score

from models_pytorch.model_config import hparams_nrms
from models_pytorch.nrms import NRMSModel

from transformers import AutoModel, AutoTokenizer


# Report interpreter / PyTorch versions and whether CUDA is usable.
print(
    f"Python version: {sys.version.split()[0]}",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

# GPU details are only queried when CUDA is actually available.
if torch.cuda.is_available():
    print(
        f"Current GPU device: {torch.cuda.get_device_name()}",
        f"Number of GPUs available: {torch.cuda.device_count()}",
        sep="\n",
    )

Python version: 3.10.12
PyTorch version: 2.4.1+cu124
CUDA available: True
Current GPU device: NVIDIA GeForce RTX 3060 Laptop GPU
Number of GPUs available: 1


## Looking at behaviours and history

In [2]:
# Peek at the demo split's behaviors file with pandas to see which columns exist.
# NOTE: the path is hard-coded to "ebnerd_demo"; it does not use the PATH
# constant defined in later cells.
behaviors_path = Path("ebnerd_demo/train/behaviors.parquet")
df = pd.read_parquet(behaviors_path)
print("Behaviors columns:", df.columns)

Behaviors columns: Index(['impression_id', 'article_id', 'impression_time', 'read_time',
       'scroll_percentage', 'device_type', 'article_ids_inview',
       'article_ids_clicked', 'user_id', 'is_sso_user', 'gender', 'postcode',
       'age', 'is_subscriber', 'session_id', 'next_read_time',
       'next_scroll_percentage'],
      dtype='object')


In [3]:
# Inspect the demo split's history file: print its columns and its first row.
# Judging by the printed first row, each row holds one user_id plus list-valued
# "*_fixed" columns.
history_df = pd.read_parquet("ebnerd_demo/train/history.parquet")
print("\nHistory file columns:", history_df.columns)
print("\nFirst row of history:", history_df.iloc[0])


History file columns: Index(['user_id', 'impression_time_fixed', 'scroll_percentage_fixed',
       'article_id_fixed', 'read_time_fixed'],
      dtype='object')

First row of history: user_id                                                                13538
impression_time_fixed      [2023-04-27T10:17:43.000000, 2023-04-27T10:18:...
scroll_percentage_fixed    [100.0, 35.0, 100.0, 24.0, 100.0, 23.0, 100.0,...
article_id_fixed           [9738663, 9738569, 9738663, 9738490, 9738663, ...
read_time_fixed            [17.0, 12.0, 4.0, 5.0, 4.0, 9.0, 5.0, 46.0, 11...
Name: 0, dtype: object


## Download and load data

In [4]:
# Base data directory.
# NOTE: PATH is reassigned to CURRENT_DIR / "ebnerd_small" in the next code cell,
# so this value is only in effect until that cell runs.
PATH = Path("./ebnerd_demo")  # Base path for your data directory
print(PATH)


ebnerd_demo


In [5]:
def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Load one dataset split and attach each user's article history to its behaviors.

    Args:
        path: Directory containing ``history.parquet`` and ``behaviors.parquet``.
        history_size: Forwarded to ``truncate_history`` as ``history_size``
            (called with ``padding_value=0`` and warnings disabled).

    Returns:
        The collected behaviors frame combined with the truncated history via
        ``slice_join_dataframes(..., on=DEFAULT_USER_COL, how="left")``.
    """
    # Only the user id and the history article ids are needed from the history file.
    history_lazy = pl.scan_parquet(path.joinpath("history.parquet")).select(
        DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL
    )
    history_lazy = truncate_history(
        history_lazy,
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=history_size,
        padding_value=0,
        enable_warning=False,
    )

    # Behaviors are materialised first; the history is collected right before the join.
    behaviors = pl.scan_parquet(path.joinpath("behaviors.parquet")).collect()
    return slice_join_dataframes(
        behaviors,
        df2=history_lazy.collect(),
        on=DEFAULT_USER_COL,
        how="left",
    )

# Resolve the dataset directory relative to the notebook's working directory.
CURRENT_DIR = Path.cwd()
print("Current directory:", CURRENT_DIR)

# NOTE: this replaces the earlier PATH ("./ebnerd_demo"); every load below reads
# from the "ebnerd_small" directory next to the notebook.
PATH = CURRENT_DIR / "ebnerd_small"

# Columns kept from the joined behaviors/history frame.
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
]
HISTORY_SIZE = 10
FRACTION = 0.01
# One seed for negative sampling and for row subsampling, so that
# Restart Kernel -> Run All selects the same rows every time.
SEED = 123

df_train = (
    ebnerd_from_path(PATH.joinpath("train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=SEED,
    )
    .pipe(create_binary_labels_column)
    # Seeded: an unseeded sample would change the training subset on every run.
    .sample(fraction=FRACTION, seed=SEED)
)

df_validation = (
    ebnerd_from_path(PATH.joinpath("validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    # Seeded for the same reason as the training subset.
    .sample(fraction=FRACTION, seed=SEED)
)

df_train.head(2)

Current directory: /home/miki/Study/2_second/deep learning/Deeplearning-RecSys-Challenge-2024


user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,labels
u32,list[i32],list[i64],list[i64],u32,list[i8]
2386777,"[9760735, 9761569, … 9766560]","[9774823, 9774823, … 9772957]",[9774708],95268651,"[0, 0, … 0]"
1146933,"[9742173, 9770491, … 9767697]","[9762122, 9769504, … 9773045]",[9772882],289779601,"[0, 0, … 0]"


In [6]:
# Load the article metadata that sits directly under PATH and preview two rows.
df_articles = pl.read_parquet(PATH.joinpath("articles.parquet"))
df_articles.head(2)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3001353,"""Natascha var ikke den første""","""Politiet frygter nu, at Natasc…",2023-06-29 06:20:33,False,"""Sagen om den østriske Natascha…",2006-08-31 08:06:45,[3150850],"""article_default""","""https://ekstrabladet.dk/krimi/…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative"""
3003065,"""Kun Star Wars tjente mere""","""Biografgængerne strømmer ind f…",2023-06-29 06:20:35,False,"""Vatikanet har opfordret til at…",2006-05-21 16:57:00,[3006712],"""article_default""","""https://ekstrabladet.dk/underh…",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive"""


In [7]:
# NOTE: a later cell reloads df_articles and repeats these exact steps; running
# both loads the pretrained model twice.
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
MAX_TITLE_LENGTH = 30

# Load the pretrained HuggingFace model and its matching tokenizer.
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# Word embeddings are taken from the pretrained transformer via
# get_transformers_word_embeddings.
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
# Concatenate the selected text columns; cat_cal is the name of the resulting
# column, which is then tokenized with max_length=MAX_TITLE_LENGTH.
# df_articles is rebound to the frames returned by these helpers.
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)
# Build the article-id -> token-encoding lookup consumed by the dataloaders.
article_mapping = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)



# Init dataloaders



In [8]:
# train_dataloader = NRMSDataLoader(
#     behaviors=df_train,
#     article_dict=article_mapping,
#     unknown_representation="zeros",
#     history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#     eval_mode=False,
#     batch_size=64,
# )
# val_dataloader = NRMSDataLoader(
#     behaviors=df_validation,
#     article_dict=article_mapping,
#     unknown_representation="zeros",
#     history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#     eval_mode=True,
#     batch_size=32,
# )

## Load articles

In [9]:
# Reload the raw articles from disk. This rebinds df_articles, discarding the
# frame produced by the earlier tokenization cell.
df_articles = pl.read_parquet(PATH.joinpath("articles.parquet"))
df_articles.head(2)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3001353,"""Natascha var ikke den første""","""Politiet frygter nu, at Natasc…",2023-06-29 06:20:33,False,"""Sagen om den østriske Natascha…",2006-08-31 08:06:45,[3150850],"""article_default""","""https://ekstrabladet.dk/krimi/…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9955,"""Negative"""
3003065,"""Kun Star Wars tjente mere""","""Biografgængerne strømmer ind f…",2023-06-29 06:20:35,False,"""Vatikanet har opfordret til at…",2006-05-21 16:57:00,[3006712],"""article_default""","""https://ekstrabladet.dk/underh…",[],[],"[""Underholdning"", ""Film og tv"", ""Økonomi""]",414,"[433, 434]","""underholdning""",,,,0.846,"""Positive"""


## Init model using HuggingFace's tokenizer and word embeddings

In [10]:
# NOTE: this cell repeats the tokenizer/embedding setup from an earlier cell,
# applied to the freshly reloaded df_articles. The names defined here
# (word2vec_embedding, article_mapping, ...) overwrite the earlier ones.
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
MAX_TITLE_LENGTH = 30

# Load the pretrained HuggingFace model and its matching tokenizer.
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# Word embeddings are taken from the pretrained transformer via
# get_transformers_word_embeddings.
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
# Concatenate the selected text columns; cat_cal is the name of the resulting
# column, which is then tokenized with max_length=MAX_TITLE_LENGTH.
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)
# Build the article-id -> token-encoding lookup consumed by the dataloaders.
article_mapping = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)

## Initiate the dataloaders

In [11]:
from models.dataloader import create_nrms_dataloaders


# Build the train/validation loaders from the sampled behaviors and the
# article-id -> tokens mapping. Batch sizes are left to the helper's defaults;
# the training cell's debug output shows a leading dimension of 64 per batch.
train_dataloader, val_dataloader = create_nrms_dataloaders(
    train_behaviors=df_train,
    val_behaviors=df_validation, 
    article_dict=article_mapping,
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL
)

Loaded data: X shape = 2342, y shape = 2342
Loaded data: X shape = 2446, y shape = 2446
Dataset length requested: 37
Dataset length requested: 37


In [12]:
from models.dataloader import NRMSDataset, DataLoader
# import models.dataloader_tensorflow as df

2024-11-17 14:32:30.303278: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731850350.363314    9288 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731850350.380434    9288 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-17 14:32:30.524242: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
# Smoke test: build the dataset directly instead of via create_nrms_dataloaders.
print("Testing dataset creation...")
dataset = NRMSDataset(
    behaviors=df_train,
    article_dict=article_mapping,
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    unknown_representation="zeros",
    batch_size=32
)

# The recorded output for this call reports tensors with a leading dimension of
# 32, so one dataset item is presumably a whole batch -- TODO confirm in
# models.dataloader.
print("\nTesting direct indexing...")
first_item = dataset[0]

# batch_size=None: assuming DataLoader is torch.utils.data.DataLoader
# re-exported by models.dataloader, this disables automatic batching so items
# are passed through as produced by the dataset -- TODO confirm.
print("\nTesting with NRMSDataset...")
loader = DataLoader(
    dataset,
    batch_size=None,
    shuffle=True,
    num_workers=0
)
# print("Getting first batch...")
# first_batch = next(iter(loader))

# print("\nTesting TensorFlow loader:")
# tf_loader = df.NRMSDataLoader(
#     behaviors=df_train,
#     article_dict=article_mapping,
#     history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#     unknown_representation="zeros",
#     eval_mode=False,
# )
# batch = tf_loader[0]

Testing dataset creation...
Loaded data: X shape = 2342, y shape = 2342

Testing direct indexing...
WTF------------------

PyTorch Dataloader - Batch 0:
Initial batch_X shape: 32
Initial batch_y shape: 32
Training batch_y shape: torch.Size([32, 5])
his_input_title initial shape: torch.Size([32, 10, 1, 1, 30])
pred_input_title initial shape: torch.Size([32, 5, 1, 1, 30])

Final shapes:
his_input_title: torch.Size([32, 10, 1, 30])
pred_input_title: torch.Size([32, 5, 1, 30])
batch_y: torch.Size([32, 5])

Sample values:
his_input_title first element: [[16148.0, 1360.0, 4620.0, 92.0, 3366.0, 17.0, 1368.0, 72.0, 122.0, 27882.0, 20356.0, 45887.0, 150065.0, 5.0, 49085.0, 138271.0, 33791.0, 18.0, 1459.0, 128102.0, 47640.0, 109.0, 14384.0, 1015.0, 45331.0, 1042.0, 4620.0, 33791.0, 18.0, 1459.0]]
pred_input_title first element: [[18118.0, 449.0, 1272.0, 20928.0, 67.0, 50825.0, 757.0, 11565.0, 139.0, 35988.0, 1953.0, 22.0, 24792.0, 106543.0, 16749.0, 4034.0, 128.0, 48983.0, 18.0, 4444.0, 4.0, 106

## Train the NRMS model

In [37]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

# Output locations, relative to the working directory:
#   LOG_DIR       - TensorBoard event files written by the training function
#   MODEL_WEIGHTS - state_dict file written by ModelCheckpoint
MODEL_NAME = "NRMS"
LOG_DIR = os.path.join("downloads", "runs", MODEL_NAME)
MODEL_WEIGHTS = os.path.join("downloads", "data", "state_dict", MODEL_NAME, "weights.pth")

# Create directories if they don't exist (exist_ok makes re-running the cell safe)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(os.path.dirname(MODEL_WEIGHTS), exist_ok=True)

class EarlyStopping:
    """
    Signal that training should stop once the validation loss stops improving.

    Call the instance once per epoch with that epoch's validation loss and then
    read ``early_stop``.

    Args:
        patience: Number of consecutive non-improving epochs tolerated before
            ``early_stop`` becomes True.
        verbose: If True, print a status line on every call.
        delta: Tolerance. A loss greater than ``best_loss + delta`` counts as
            "no improvement"; anything else resets the counter.

    Attributes set by ``__init__``: ``counter`` (current run of non-improving
    epochs), ``best_loss`` (lowest loss seen, None before the first call) and
    ``early_stop``.
    """

    def __init__(self, patience=2, verbose=True, delta=0):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        self.delta = delta

    def __call__(self, val_loss):
        """Update the stopping state with this epoch's validation loss."""
        if self.best_loss is None:
            self.best_loss = val_loss
            if self.verbose:
                print(f"Initial validation loss: {val_loss:.6f}")
            return

        # Written as a negated "<=" so that a NaN loss (every comparison False)
        # is treated as "did not improve" instead of silently becoming best_loss.
        if not (val_loss <= self.best_loss + self.delta):
            self.counter += 1
            if self.verbose:
                print(f"Validation loss did not improve from {self.best_loss:.6f}. Counter: {self.counter}/{self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True
            return

        self.counter = 0
        if val_loss < self.best_loss:
            if self.verbose:
                print(f"Validation loss improved from {self.best_loss:.6f} to {val_loss:.6f}")
            self.best_loss = val_loss
        elif self.verbose:
            # Within tolerance but not actually lower: keep best_loss unchanged so
            # the baseline cannot drift upward by up to delta per epoch.
            print(f"Validation loss {val_loss:.6f} is within delta of best {self.best_loss:.6f}")

class ModelCheckpoint:
    """
    Write the model's ``state_dict`` to ``filepath`` each time the validation
    loss reaches a new minimum.

    Args:
        filepath: Destination passed to ``torch.save``.
        verbose: If True, print a line after every save.
    """

    def __init__(self, filepath, verbose=True):
        # Start at +inf so the first finite loss always counts as a new best.
        self.best_loss = float('inf')
        self.verbose = verbose
        self.filepath = filepath

    def __call__(self, model, val_loss):
        """Save ``model`` only when ``val_loss`` is strictly below the best so far."""
        is_new_best = val_loss < self.best_loss
        if not is_new_best:
            return
        self.best_loss = val_loss
        self.save_checkpoint(model)

    def save_checkpoint(self, model):
        """Serialize ``model.state_dict()`` to ``self.filepath``."""
        weights = model.state_dict()
        torch.save(weights, self.filepath)
        if not self.verbose:
            return
        print(f"Model checkpoint saved: {self.filepath}")

def train_nrms_model(model, train_dataloader, val_dataloader, num_epochs=100):
    """
    Train ``model`` with ``nn.BCELoss`` and Adam (lr=0.001), validating every epoch.

    Mean train/validation losses per epoch are written to TensorBoard under
    ``LOG_DIR``. After each epoch ``ModelCheckpoint`` may save the weights to
    ``MODEL_WEIGHTS`` and ``EarlyStopping(patience=2)`` may end training.

    Args:
        model: Module invoked as ``model(*inputs)``; its output is passed
            straight to ``nn.BCELoss`` together with ``targets``.
        train_dataloader: Sized iterable yielding ``(inputs, targets)``, where
            ``inputs`` is a sequence of tensors and ``targets`` a tensor.
        val_dataloader: Same structure, used for validation.
        num_epochs: Maximum number of epochs.

    Returns:
        The model on the selected device. If a checkpoint was written during
        this call its weights are loaded first; otherwise the model is returned
        as training left it.

    Raises:
        Exception: Any error raised during a training step is re-raised after
            the offending input shapes have been printed.
    """
    # Setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = model.to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Initialize callbacks
    writer = SummaryWriter(log_dir=LOG_DIR)
    early_stopping = EarlyStopping(patience=2)
    model_checkpoint = ModelCheckpoint(filepath=MODEL_WEIGHTS)

    # try/finally so the TensorBoard writer is closed even if training raises.
    try:
        # Debug first batch shapes
        print("\nDebugging initial batch shapes:")
        first_batch = next(iter(train_dataloader), None)
        if first_batch is not None:
            inputs, targets = first_batch
            print("\nBatch 0:")
            print(f"Number of inputs: {len(inputs)}")
            for i, inp in enumerate(inputs):
                print(f"Input {i} shape: {inp.shape}")
            print(f"Targets shape: {targets.shape}")

        # Training loop
        for epoch in range(1, num_epochs + 1):
            print(f"\nEpoch {epoch}/{num_epochs}")

            # Training phase
            model.train()
            train_loss = 0
            train_bar = tqdm(train_dataloader, desc="Training")

            for batch_idx, (inputs, targets) in enumerate(train_bar):
                # Move data to device
                inputs = [inp.to(device) for inp in inputs]
                targets = targets.to(device)

                optimizer.zero_grad()
                try:
                    # NOTE(review): the recorded run failed in this step with
                    # "too many values to unpack (expected 3)" while the inputs
                    # were 4-D ([64, 1, 10, 30]); the model presumably expects
                    # 3-D inputs -- confirm the dataloader's batching.
                    outputs = model(*inputs)
                    loss = criterion(outputs, targets)

                    # Backward pass
                    loss.backward()
                    optimizer.step()

                    # Running mean over the batches seen so far
                    train_loss += loss.item()
                    avg_train_loss = train_loss / (batch_idx + 1)
                    train_bar.set_postfix(loss=f"{avg_train_loss:.4f}")

                except Exception:
                    print(f"\nError in batch {batch_idx}:")
                    print("Input shapes when error occurred:")
                    for i, inp in enumerate(inputs):
                        print(f"Input {i} shape: {inp.shape}")
                    # Bare raise keeps the original traceback intact.
                    raise

            avg_train_loss = train_loss / len(train_dataloader)
            writer.add_scalar('Loss/Train', avg_train_loss, epoch)

            # Validation phase
            model.eval()
            val_loss = 0
            val_bar = tqdm(val_dataloader, desc="Validation")

            with torch.no_grad():
                for batches_seen, (inputs, targets) in enumerate(val_bar, start=1):
                    # Move data to device
                    inputs = [inp.to(device) for inp in inputs]
                    targets = targets.to(device)

                    # Forward pass
                    outputs = model(*inputs)
                    loss = criterion(outputs, targets)

                    # Show the mean over batches seen so far; dividing by the
                    # total batch count would under-report until the last batch.
                    val_loss += loss.item()
                    val_bar.set_postfix(loss=f"{val_loss / batches_seen:.4f}")

            avg_val_loss = val_loss / len(val_dataloader)
            writer.add_scalar('Loss/Validation', avg_val_loss, epoch)

            # Print epoch summary
            print(f"\nEpoch {epoch} summary:")
            print(f"Training Loss: {avg_train_loss:.6f}")
            print(f"Validation Loss: {avg_val_loss:.6f}")

            # Check for improvement
            model_checkpoint(model, avg_val_loss)
            early_stopping(avg_val_loss)

            if early_stopping.early_stop:
                print("Early stopping triggered!")
                break
    finally:
        writer.close()

    # Reload the best weights only if this run actually wrote a checkpoint;
    # otherwise MODEL_WEIGHTS is missing or left over from a previous run.
    if model_checkpoint.best_loss < float("inf"):
        # map_location keeps the load working when the file was saved on another
        # device; weights_only restricts unpickling to tensor data.
        best_state = torch.load(MODEL_WEIGHTS, map_location=device, weights_only=True)
        model.load_state_dict(best_state)
        print(f"Loaded best model weights from {MODEL_WEIGHTS}")
    else:
        print(f"No checkpoint was written during this run; not loading {MODEL_WEIGHTS}")

    return model

# Build the NRMS model and train it.
# NOTE: this mutates the imported hparams_nrms object in place so the model's
# history length matches HISTORY_SIZE used when loading the data.
hparams_nrms.history_size = HISTORY_SIZE
model = NRMSModel(
    hparams=hparams_nrms.__dict__,
    word2vec_embedding=word2vec_embedding,
    seed=42,
)

# Train the model; `model` is rebound to what train_nrms_model returns.
model = train_nrms_model(model, train_dataloader, val_dataloader)

Using device: cuda

Debugging initial batch shapes:

Batch 0:
Number of inputs: 2
Input 0 shape: torch.Size([64, 1, 10, 30])
Input 1 shape: torch.Size([64, 1, 5, 30])
Targets shape: torch.Size([64, 1, 5])

Epoch 1/100


Training:   0%|          | 0/37 [00:00<?, ?it/s]


Error in batch 0:
Input shapes when error occurred:
Input 0 shape: torch.Size([64, 1, 10, 30])
Input 1 shape: torch.Size([64, 1, 5, 30])





ValueError: too many values to unpack (expected 3)

## Example of how to compute some metrics

In [None]:
# Score the validation set with the trained model.
# NOTE(review): `predict` is not defined in this notebook -- confirm that
# NRMSModel provides it and that it accepts the validation dataloader.
pred_validation = model.predict(val_dataloader)

## Add the predictions to the dataframe

In [None]:
# Attach the predictions to the validation frame, then add the known-user
# column using the user ids present in df_train. df_validation is rebound to
# the result.
df_validation = add_prediction_scores(df_validation, pred_validation.tolist()).pipe(
    add_known_user_column, known_users=df_train[DEFAULT_USER_COL]
)
df_validation.head(2)

## Compute metrics

In [None]:
# Evaluate the "scores" column against the "labels" column with AUC, MRR and
# NDCG at k=5 and k=10.
metrics = MetricEvaluator(
    labels=df_validation["labels"].to_list(),
    predictions=df_validation["scores"].to_list(),
    metric_functions=[AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)],
)
metrics.evaluate()

## References

https://github.com/ebanalyse/ebnerd-benchmark/blob/main/examples/00_quick_start/nrms_ebnerd.ipynb