In [1]:
import os
import random
import numpy as np
from functools import partial

import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

from src.category_tree.category_tree import CategoryTree
from src.metrics.transformers_metrics import hierarchical_accuracy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy's legacy global generator and PyTorch (CPU and all CUDA devices),
    then switches cuDNN into its deterministic configuration.

    Args:
        seed: Seed value applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode autotunes convolution kernels per run and may select
    # different algorithms each time, which defeats `deterministic = True`.
    torch.backends.cudnn.benchmark = False

In [None]:
# Column names as they appear in the train/val parquet files
# (renamed to the model-facing names below when the data is loaded).
CAT_ID_COL = "cat_id"
TITLE_COL = "source_name"

# Column names used for the model inputs and for the train/val split marker.
TITLE_MODEL_COL = "text"
CAT_ID_MODEL_COL = "label"
# NOTE(review): PART_TYPE_COL is not referenced in the visible cells — confirm it is still needed.
PART_TYPE_COL = "part_type"
PART_COL = "part"

# Input data locations, relative to the notebook's working directory.
TRAIN_PATH = "./data/processed/train.parquet"
VAL_PATH = "./data/processed/val.parquet"
CAT_TREE_PATH = "./data/raw/category_tree.csv"

# Pretrained checkpoint name passed to the tokenizer and model loaders.
MODEL = "cointegrated/rubert-tiny2"
NUM_EPOCHS = 1

# Seed used when shuffling the combined dataframe.
RANDOM_STATE = 56

# Load dataset

In [4]:
# Build the category tree from the CSV at CAT_TREE_PATH; later cells read its
# `leaf_nodes`, `label_encoder` and `inverted_edge_dict` attributes.
category_tree = CategoryTree(category_tree_path=CAT_TREE_PATH)

In [None]:
# Category table; its `cat_name` column is joined against the predicted
# category stored in the extra-label file to recover the `cat_id`.
categor = pd.read_csv(CAT_TREE_PATH)

# Extra training rows read from the "bad_labeled" CSV.
# NOTE(review): the file name suggests the labels were produced by an LLM — confirm provenance.
# Built as one non-mutating chain so no column is assigned on a slice
# (the previous in-place version was prone to SettingWithCopyWarning), and
# the column names come from the config constants instead of being repeated.
badlabel = (
    pd.read_csv('./data/processed/bad_labeled_qwen2.5:3b.csv')
    .dropna(how='all')
    .merge(categor, left_on='pred_cat_id', right_on='cat_name')
    .loc[:, [TITLE_COL, CAT_ID_COL]]
    .rename(columns={TITLE_COL: TITLE_MODEL_COL, CAT_ID_COL: CAT_ID_MODEL_COL})
    .assign(**{PART_COL: 'train'})
)

# Keep only rows whose category is a leaf of the category tree.
badlabel = badlabel[badlabel[CAT_ID_MODEL_COL].isin(category_tree.leaf_nodes)]

In [None]:
# Read both labelled splits from disk.
train = pd.read_parquet(TRAIN_PATH)
val = pd.read_parquet(VAL_PATH)

# Stack the splits, switch to the model-facing column names, append the extra
# `badlabel` rows, encode the category ids with the tree's label encoder and
# finally shuffle with a fixed seed.
df = (
    pd.concat([train, val])
    .rename(columns={TITLE_COL: TITLE_MODEL_COL, CAT_ID_COL: CAT_ID_MODEL_COL})
    .pipe(lambda labelled: pd.concat([labelled, badlabel]))
    .assign(
        **{
            CAT_ID_MODEL_COL: lambda frame: category_tree.label_encoder.transform(
                frame[CAT_ID_MODEL_COL]
            )
        }
    )
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

# Tokenization

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)


def tokenize_function(examples):
    """Tokenize the TITLE_MODEL_COL field of `examples` with truncation enabled.

    Uses the module-level `tokenizer` and returns whatever it produces.
    """
    titles = examples[TITLE_MODEL_COL]
    return tokenizer(titles, truncation=True)

def load_experiment_dataset(df: pd.DataFrame):
    """Build a tokenized `DatasetDict` with "train" and "val" splits.

    Rows are assigned to a split by the value of their PART_COL column; only
    the text and label columns are carried over.

    Args:
        df: Frame containing TITLE_MODEL_COL, CAT_ID_MODEL_COL and PART_COL.

    Returns:
        The `DatasetDict` after mapping `tokenize_function` over it in batches.
    """
    model_columns = [TITLE_MODEL_COL, CAT_ID_MODEL_COL]
    parts_datasets = {
        part: Dataset.from_pandas(
            df.loc[df[PART_COL] == part, model_columns],
            split=part,
            # The filtered frame no longer has a RangeIndex; without this the
            # pandas index is stored as a stray `__index_level_0__` column.
            preserve_index=False,
        )
        for part in ["train", "val"]
    }

    dataset = DatasetDict(parts_datasets)
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    return tokenized_dataset

def load_full_dataset(df: pd.DataFrame):
    """Tokenize the text/label columns of the whole frame as a single `Dataset`.

    No split by PART_COL is made: every row of `df` ends up in the result.
    """
    model_frame = df[[TITLE_MODEL_COL, CAT_ID_MODEL_COL]]
    return Dataset.from_pandas(model_frame).map(tokenize_function, batched=True)

# Label Smoothing
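##### Idea: for each leaf label, keep `1 - smoothing` of the probability mass on the label itself and spread the remaining `smoothing` evenly over its sibling leaf nodes (leaves that share the same parent). This should reduce the impact of labeling errors.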


In [8]:
class LabelSmoothingCrossEntropyLoss(nn.Module):
    """Cross-entropy against targets smoothed over sibling leaf categories.

    For a sample whose label has at least one sibling leaf (another leaf node
    with the same parent in the category tree), the target distribution puts
    ``1 - smoothing`` on the true label and splits ``smoothing`` evenly among
    the siblings. A label without sibling leaves keeps a one-hot target.

    Args:
        category_tree: Object exposing ``label_encoder`` (with ``transform`` /
            ``inverse_transform``), ``leaf_nodes`` and ``inverted_edge_dict``
            (mapping node -> parent).
        smoothing: Probability mass moved from the true label to its siblings.
        reduction: Forwarded to ``torch.nn.CrossEntropyLoss``.
    """

    def __init__(self, category_tree: "CategoryTree", smoothing: float = 0.2, reduction: str = "mean"):
        super().__init__()

        self.smoothing = smoothing
        self.category_tree = category_tree

        self.label_encoder = self.category_tree.label_encoder
        self.leaf_nodes = set(self.category_tree.leaf_nodes)
        self.category_tree_edges = self.category_tree.inverted_edge_dict

        # raw leaf label -> list of raw sibling leaf labels
        self.nearest_neighbors = self._precompute_nearest_neighbors()
        # encoded label id -> list of encoded sibling ids, filled on first use so
        # the label encoder is not called again for every sample of every batch
        self._encoded_neighbors_cache = {}

        self.loss_fct = CrossEntropyLoss(reduction=reduction)

    def forward(self, input, target):
        """Return the cross-entropy between `input` logits and the smoothed targets.

        `target` holds encoded class ids; the number of classes is taken from
        the last dimension of `input`.
        """
        num_classes = input.shape[-1]

        true_dist = self._smooth_labels(target=target, num_classes=num_classes)
        loss = self.loss_fct(input, true_dist)
        return loss

    def _smooth_labels(self, target, num_classes):
        """Build the ``(len(target), num_classes)`` smoothed target distribution."""
        true_dist = torch.zeros(target.size(0), num_classes, device=target.device)

        for row, label_id in enumerate(target.tolist()):
            neighbor_ids = self._encoded_neighbor_ids(label_id)

            if not neighbor_ids:
                # No sibling leaves: keep all the mass on the true label.
                true_dist[row, label_id] = 1.0
            else:
                true_dist[row, label_id] = 1.0 - self.smoothing
                true_dist[row, neighbor_ids] = self.smoothing / len(neighbor_ids)

        return true_dist

    def _encoded_neighbor_ids(self, label_id):
        """Return (and cache) the encoded ids of the sibling leaves of `label_id`."""
        cached = self._encoded_neighbors_cache.get(label_id)
        if cached is None:
            raw_label = self.label_encoder.inverse_transform([label_id])[0]
            raw_neighbors = self.nearest_neighbors[raw_label]
            if raw_neighbors:
                cached = [int(idx) for idx in self.label_encoder.transform(raw_neighbors)]
            else:
                cached = []
            self._encoded_neighbors_cache[label_id] = cached
        return cached

    def _precompute_nearest_neighbors(self):
        """Map every leaf label to the other leaves that share its parent.

        Leaves are grouped by parent in a single pass over the edge dict
        instead of rescanning every edge for every leaf.
        """
        siblings_by_parent = {}
        for node, parent in self.category_tree_edges.items():
            # `parent == parent` is False only for NaN-like parents, which never
            # compare equal to anything and therefore never yield siblings.
            if node in self.leaf_nodes and parent == parent:
                siblings_by_parent.setdefault(parent, []).append(node)

        neighbors = dict()
        for label in self.leaf_nodes:
            target_parent = self.category_tree_edges[label]
            neighbors[label] = [
                node for node in siblings_by_parent.get(target_parent, ()) if node != label
            ]

        return neighbors

class LabelSmoothingCrossEntropyLossTrainer(Trainer):
    """`Trainer` that trains with the tree-aware smoothed loss.

    While the model is in training mode the loss is
    `LabelSmoothingCrossEntropyLoss`; otherwise a plain `CrossEntropyLoss`
    with the same reduction is used.
    """

    def __init__(self, category_tree: CategoryTree, smoothing: float, reduction: str, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fct = LabelSmoothingCrossEntropyLoss(
            category_tree=category_tree,
            smoothing=smoothing,
            reduction=reduction,
        )
        self.ce_loss = CrossEntropyLoss(reduction=reduction)

    def compute_loss(self, model, inputs, num_items_in_batch=0, return_outputs=False):
        """Run the model on `inputs` and score its logits against `inputs["labels"]`.

        Returns the loss, or `(loss, outputs)` when `return_outputs` is true.
        `num_items_in_batch` is accepted but not used.
        """
        outputs = model(**inputs)

        # Smoothed targets only while training; plain cross-entropy otherwise.
        criterion = self.loss_fct if model.training else self.ce_loss
        loss = criterion(outputs.logits, inputs["labels"])

        if return_outputs:
            return (loss, outputs)
        return loss

# Training and validation with Label Smoothing

In [9]:
# Tokenized "train"/"val" splits used for the validation experiment below.
tokenized_datasets = load_experiment_dataset(df)

Map: 100%|██████████| 588870/588870 [00:17<00:00, 34604.20 examples/s]
Map: 100%|██████████| 122980/122980 [00:03<00:00, 32136.68 examples/s]


In [None]:
# Seed from the config cell instead of a repeated literal.
seed_everything(RANDOM_STATE)

# One output logit per leaf category.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(category_tree.leaf_nodes)
)

training_args = TrainingArguments(
    output_dir="models/rubert_training",
    eval_strategy="steps",
    eval_steps=500,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=512,
    # The Trainer re-seeds from this argument; keep it in sync with the
    # notebook seed rather than leaving it at the library default.
    seed=RANDOM_STATE,
    report_to="none" # disable wandb
)

trainer = LabelSmoothingCrossEntropyLossTrainer(
    model=model,
    category_tree=category_tree,
    smoothing=0.2,
    reduction="mean",
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    processing_class=tokenizer, # Automatic DataCollatorWithPadding
    compute_metrics=partial(hierarchical_accuracy, category_tree=category_tree.inverted_edge_dict)
)

trainer.train()

# Final model training

In [11]:
# Rebinds `tokenized_datasets` to a single tokenized dataset covering every row
# of `df` (no train/val split) for the final training run below.
tokenized_datasets = load_full_dataset(df)

Map: 100%|██████████| 711850/711850 [00:19<00:00, 37209.51 examples/s]


In [None]:
# Seed from the config cell instead of a repeated literal.
seed_everything(RANDOM_STATE)

# Fresh model for the final run; one output logit per leaf category.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=len(category_tree.leaf_nodes)
)

training_args = TrainingArguments(
    output_dir="models/rubert_full",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=128,
    # The Trainer re-seeds from this argument; keep it in sync with the
    # notebook seed rather than leaving it at the library default.
    seed=RANDOM_STATE,
    report_to="none" # disable wandb
)

# No eval dataset or metrics here: every row is used for training.
trainer = LabelSmoothingCrossEntropyLossTrainer(
    model=model,
    category_tree=category_tree,
    smoothing=0.2,
    reduction="mean",
    args=training_args,
    train_dataset=tokenized_datasets,
    processing_class=tokenizer, # Automatic DataCollatorWithPadding
)

trainer.train()