# Overview

Guide: Baseline Guide<br>
Inference: USPPPM: DeBERTa V3 Small [Inference]

# Libraries

In [1]:
!pip install transformers



In [2]:
import sys
sys.path.append("../input/torch-components-library/torch-components-main")
sys.path.append("../input/transformers/src")
sys.path.append("../input/mixout-github-code/mixout")

import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.optim import lr_scheduler
from torch.cuda.amp import GradScaler, autocast
from torch.utils.checkpoint import checkpoint
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig
from torch_components import Configuration as Config, Timer, Averager
from torch_components.callbacks import EarlyStopping, ModelCheckpoint
from torch_components.utils import seed_everything, get_lr, get_optimizer, get_scheduler
from torch_components.import_utils import wandb_run_exists
from sklearn.model_selection import StratifiedGroupKFold
from mixout import MixLinear, Mixout
from tqdm.notebook import tqdm
from IPython.display import display
from datetime import timedelta
import scipy
import pandas as pd
import numpy as np
import warnings
import wandb
import os
import shutil
import gc
from kaggle_secrets import UserSecretsClient


os.environ["EXPERIMENT_NAME"] = "microsoft/deberta-v3-base"

EXPERIMENT_NAME = os.environ.get("EXPERIMENT_NAME")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
WANDB = False
DEBUG = True
USER_SECRETS = UserSecretsClient()


if WANDB:
    os.environ["WANDB_PROJECT"] = "uspppm"
    os.environ["WANDB_ENTITY"] = "uspppm"
    os.environ["WANDB_SILENT"] = "true"
    
    wandb_secret_name = "wandb_api_key"
    wandb_key = USER_SECRETS.get_secret(wandb_secret_name)
    
    EXPERIMENT_NAME = EXPERIMENT_NAME if EXPERIMENT_NAME != "none" else wandb.util.generate_id()
    wandb.login(key=wandb_key)
    
os.environ["TOKENIZERS_PARALLELISM"] = "true"
warnings.simplefilter("ignore")

# Configuration

In [3]:
config = Config(model=dict(model_path="microsoft/deberta-v3-base"),
                optimizer=dict(name="AdamW", parameters=dict(lr=2e-5, weight_decay=0.01)),
                scheduler=dict(name="get_cosine_with_hard_restarts_schedule_with_warmup", 
                               parameters=dict(num_cycles=2, last_epoch=-1)),
                warmup=0.1,
                scheduling_after="step",
                seed=42,
                max_length=75,
                batch_size=24,
                epochs=5,
                num_workers=4,
                pin_memory=True,
                folds=4,
                validation_steps=200, 
                gradient_accumulation_steps=1,
                gradient_norm=1.0,
                gradient_scaling=True,
                delta=1e-4,
                verbose=100,
                save_model=False,
                device=DEVICE,
                input_directory="./",
                output_directory="./",
                cv_monitor_value="pearson",
                amp=True, 
                debug=True,
                decimals=4)

In [4]:
config.seed = seed_everything(config.seed)

# Utilities

In [5]:
def make_directory(directory, overwriting=False):
    if not os.path.exists(directory):
        os.mkdir(directory)
    else:
        if overwriting:
            shutil.rmtree(directory)
            os.mkdir(directory)

            
def create_folds(data_frame, targets, groups, folds=4, seed=42, shuffle=True, fold_column="fold"):
    cv_strategy = StratifiedGroupKFold(n_splits=folds, random_state=seed, shuffle=shuffle)
    folds = cv_strategy.split(X=data_frame, y=targets, groups=groups)
    for fold, (train_indexes, validation_indexes) in enumerate(folds):
        data_frame.loc[validation_indexes, fold_column] =  int(fold+1)
        
    data_frame[fold_column] = data_frame[fold_column].astype(int)
    
    return data_frame

In [6]:
def training_loop(train_loader, 
                  model,
                  optimizer,
                  scheduler=None,
                  scheduling_after="step",
                  epochs=1,
                  validation_loader=None, 
                  gradient_accumulation_steps=1, 
                  gradient_scaling=False,
                  gradient_norm=1,
                  validation_steps="epoch", 
                  amp=False,
                  recalculate_metrics_at_end=True, 
                  return_validation_outputs=True,
                  debug=True, 
                  teacher_model=None,
                  pseudo_loader=None,
                  verbose=100, 
                  device="cpu", 
                  time_format="{hours}:{minutes}:{seconds}", 
                  logger=["print", "wandb"], 
                  decimals=4):
    
    training_steps = len(train_loader) * epochs
    
    if isinstance(validation_steps, float):
        validation_steps = int(training_steps * validation_steps)
    elif validation_steps == "epoch":
        validation_steps = len(train_loader)
    
    if debug:
        print(f"Epochs: {epochs}")
        print(f"Auto Mixed Precision: {amp}")
        print(f"Gradient norm: {gradient_norm}")
        print(f"Gradient scaling: {gradient_scaling}")
        print(f"Gradient accumulation steps: {gradient_accumulation_steps}")
        print(f"Validation steps: {validation_steps}")
        print(f"Device: {device}")
        print()
        
    if wandb_run_exists() and "wandb" in logger:
        print(f"Weights & Biases Run: {wandb.run.get_url()}", end="\n"*2)
        
    passed_steps = 1
    train_loss, train_metrics = Averager(), Averager()
    scaler = GradScaler() if gradient_scaling else None
    best_validation_loss, best_validation_metrics, best_validation_outputs = None, None, None
    total_time = timedelta(seconds=0)
    
    if device is not None: 
        model.to(device)
        
        if teacher_model is not None: teacher_model.to(device)
    
    for epoch in range(1, epochs+1):
        if "tqdm" in logger:
            bar_format = "{l_bar} {bar} {n_fmt}/{total_fmt} - remain: {remaining}{postfix}"
            train_loader = tqdm(iterable=train_loader, 
                                total=len(train_loader),
                                colour="#000",
                                bar_format=bar_format)
            
            train_loader.set_description_str(f"Epoch {epoch}/{epochs}")
        
        if "print" in logger:
            print(f"\nEpoch {epoch}/{epochs}", end="\n"*2)
            
        epoch_train_loss, epoch_train_metrics = Averager(), Averager()
        timer = Timer(time_format)
        steps = len(train_loader)    
        
        model.zero_grad()
        for step, batch in enumerate(train_loader, 1):
            batch_size = train_loader.batch_size
            
            step_timer =  Timer(time_format)
            pseudo_batch = next(iter(pseudo_loader)) if pseudo_loader is not None else None
            batch_loss, batch_metrics = training_step(batch=batch, 
                                                      model=model, 
                                                      optimizer=optimizer,
                                                      gradient_norm=gradient_norm,
                                                      gradient_accumulation_steps=gradient_accumulation_steps, 
                                                      amp=amp, 
                                                      scaler=scaler, 
                                                      device=device, 
                                                      overall_loss=epoch_train_loss.average, 
                                                      overall_metrics=epoch_train_metrics.average,
                                                      step=passed_steps, 
                                                      epoch=epoch, 
                                                      teacher_model=teacher_model,
                                                      pseudo_batch=pseudo_batch)
            
            lr_key = "lr"
            lr = get_lr(optimizer, only_last=True, key=lr_key)
            
            if step % gradient_accumulation_steps == 0:
                optimization_step(model=model, optimizer=optimizer, scaler=scaler)
    
                if scheduling_after == "step":
                    scheduling_step(scheduler, loop="training")
            
            elapsed, remain = step_timer(1/1)
            step_seconds = step_timer.elapsed_time.total_seconds()
            sample_seconds = step_seconds / batch_size
            
            if wandb_run_exists() and "wandb" in logger:
                logs = {"train/seconds vs step": step_seconds, 
                        "train/seconds vs sample": sample_seconds}
                
                wandb.log(logs, step=passed_steps)
            
            train_loss.update(batch_loss, n=batch_size)
            epoch_train_loss.update(batch_loss, n=batch_size)
            train_metrics.update(batch_metrics, n=batch_size)
            epoch_train_metrics.update(batch_metrics, n=batch_size)
            
            
            logs = {"train/loss": train_loss.average, 
                    "train/loss vs batch": batch_loss, 
                    "train/loss vs epoch": epoch_train_loss.average,
                    "lr": lr}
            
            for metric in batch_metrics:
                logs.update({f"train/{metric}": train_metrics.average[metric], 
                             f"train/{metric} vs batch": batch_metrics[metric], 
                             f"train/{metric} vs epoch": epoch_train_metrics.average[metric]})
                
            if wandb_run_exists() and "wandb" in logger:
                wandb.log(logs, step=passed_steps) 
            
            if "tqdm" in logger:
                train_loader.set_postfix_str(f"loss: {epoch_train_loss.average:.{decimals}}"
                                             f"{format_metrics(epoch_train_metrics.average, decimals=decimals)}")
            if "print" in logger:
                 if step % verbose == 0 or step == steps and verbose > 0:
                    elapsed, remain = timer(step/steps)
                    print(f"{step}/{steps} - "
                          f"remain: {remain} - "
                          f"loss: {epoch_train_loss.average:.{decimals}}"
                          f"{format_metrics(epoch_train_metrics.average, decimals=decimals)} - "
                          f"lr: {lr}")
                    
            
            if validation_loader is not None:
                if (passed_steps % validation_steps) == 0:
                    if step > validation_steps: print()
                    validation_loop_steps = len(validation_loader)
                    validation_batch_size = validation_loader.batch_size
                    
                    validation_timer =  Timer(time_format)
                    validation_loss, validation_metrics, validation_outputs = validation_loop(loader=validation_loader, 
                                                                                              model=model,
                                                                                              gradient_accumulation_steps=gradient_accumulation_steps,
                                                                                              amp=amp, 
                                                                                              return_outputs=True, 
                                                                                              verbose=verbose, 
                                                                                              recalculate_metrics_at_end=True, 
                                                                                              device=device, 
                                                                                              logger=logger)
                    
                    
                    elapsed, remain = validation_timer(1/1)
                    validation_seconds = validation_timer.elapsed_time.total_seconds()
                    validation_step_seconds = validation_seconds / validation_loop_steps
                    validation_sample_seconds = validation_step_seconds / validation_batch_size
            
                    if wandb_run_exists() and "wandb" in logger:
                        logs = {"validation/seconds vs step": validation_step_seconds, 
                                "validation/seconds vs sample": validation_sample_seconds}
                
                        wandb.log(logs, step=passed_steps)
                    
                    
                    logs = {"validation/loss": validation_loss, 
                            "train/loss vs validation steps": epoch_train_loss.average}
    
                    for metric, value in validation_metrics.items():
                        logs.update({f"validation/{metric}": value, 
                                     f"train/{metric} vs validation steps": epoch_train_metrics.average[metric]})
                    
                    if wandb_run_exists() and "wandb" in logger:
                        wandb.log(logs, step=passed_steps)
                    
                    is_checkpoint_saved = model_checkpointing(loss=validation_loss, 
                                                              metrics=validation_metrics,
                                                              model=model, 
                                                              optimizer=optimizer, 
                                                              scheduler=scheduler, 
                                                              step=passed_steps, 
                                                              best_loss=best_validation_loss, 
                                                              best_metrics=validation_metrics)
                    
                    if is_checkpoint_saved:
                        best_validation_loss = validation_loss
                        best_validation_metrics = validation_metrics
                        best_validation_outputs = validation_outputs
                        
                    scheduling_step(scheduler, loss=validation_loss, loop="validation")
                    print()
            
            passed_steps += 1
        
        if scheduling_after == "epoch":
            scheduling_step(scheduler, loop="training")
        
        on_epoch_end(model=model, 
                     step=passed_steps, 
                     epoch=epoch)
        
        if "tqdm" in logger and "print" not in logger:
            elapsed, remain = timer(1/1)
        
        epoch_elapsed_seconds = timer.elapsed_time.total_seconds()
        total_time += timedelta(seconds=epoch_elapsed_seconds)
        
        if wandb_run_exists() and "wandb" in logger:
            wandb.log({"epoch": epoch}, step=passed_steps)
        
        if "tqdm" in logger: train_loader.close()
            
        print(f"\nTraining loss: {epoch_train_loss.average:.{decimals}}"
              f"{format_metrics(epoch_train_metrics.average, decimals=decimals)}")
        
        if validation_loader is not None:
            print(f"Validation loss: {best_validation_loss:.{decimals}}"
                  f"{format_metrics(best_validation_metrics, decimals=decimals)}")
        
        total_time_string = Timer.format_time(total_time, time_format=time_format)
        print(f"Total time: {total_time_string}")
    
    if validation_loader is not None:
        if return_validation_outputs:
            return (epoch_train_loss.average, epoch_train_metrics.average), (best_validation_loss, best_validation_metrics, best_validation_outputs)
        
        return (epoch_train_loss.average, epoch_train_metrics.average), (best_validation_loss, best_validation_metrics)

    return (epoch_train_loss.average, epoch_train_metrics.average)
        
def validation_loop(loader, 
                    model, 
                    gradient_accumulation_steps=1,
                    amp=False, 
                    return_outputs=True, 
                    recalculate_metrics_at_end=True, 
                    verbose=1, 
                    device="cpu", 
                    time_format="{hours}:{minutes}:{seconds}",
                    logger=["print"], 
                    decimals=4):
    
    model.eval()
    loss, metrics = Averager(), Averager()
    timer = Timer(time_format)
    outputs, targets = [], []
    steps = len(loader)
    
    if "tqdm" in logger:
        bar_format = "{l_bar} {bar} {n_fmt}/{total_fmt} - remain: {remaining}{postfix}"
        loader = tqdm(iterable=loader, 
                      total=len(loader),
                      colour="#000",
                      bar_format=bar_format)
            
        loader.set_description_str("[Validation]")
    
    is_targets = False
    for step, batch in enumerate(loader, 1):
        with torch.no_grad():
            with autocast(enabled=amp):
                batch_loss, batch_outputs = calculate_loss(batch=batch, model=model, return_outputs=True, device=device)
                
                batch_loss /= gradient_accumulation_steps
                loss.update(batch_loss.item(), n=len(batch))
                
                batch_targets = get_targets(batch)
                batch_metrics = calculate_metrics(predictions=batch_outputs, targets=batch_targets, device=device)
                metrics.update(batch_metrics, n=len(batch))
                
                if batch_targets is not None:
                    if isinstance(batch_targets, dict):
                        targets.append(batch_targets)
                    else:
                        targets.extend(batch_targets.to("cpu").tolist())
                        
                    is_targets = True
                
                outputs.extend(batch_outputs.to("cpu").tolist())
                
                if step == steps and recalculate_metrics_at_end and is_targets:
                    outputs = torch.tensor(outputs)
                    targets = torch.tensor(targets)
                        
                    metrics = Averager(calculate_metrics(predictions=outputs, targets=targets))
                
                if "tqdm" in logger:
                    loader.set_postfix_str(f"loss: {loss.average:.{decimals}}"
                                           f"{format_metrics(metrics.average, decimals=decimals)}")
                
                if "print" in logger:
                    if step % verbose == 0 or step == steps and verbose > 0:
                        elapsed, remain = timer(step/steps)

                        print(f"[Validation] "
                              f"{step}/{steps} - "
                              f"remain: {remain} - "
                              f"loss: {loss.average:.{decimals}}"
                              f"{format_metrics(metrics.average, decimals=decimals)}")
                    
    if not recalculate_metrics_at_end: 
        outputs = torch.tensor(outputs)
        
    if "tqdm" in logger:
        loader.close()
        
    return (loss.average, metrics.average, outputs) if return_outputs else (loss.average, metrics.average)


def format_metrics(metrics, sep=" - ", add_sep_to_start=True, decimals=4):
    if metrics != {}:
        string = sep.join([f"{k}: {v:.{decimals}}" for k, v in metrics.items()])
        return sep + string if add_sep_to_start else string 
    
    return ""

    
def training_step(batch, 
                  model, 
                  optimizer, 
                  gradient_norm=1.0, 
                  amp=False, 
                  gradient_accumulation_steps=1, 
                  scaler=None, 
                  device="cpu", 
                  overall_loss=None, 
                  overall_metrics=None, 
                  step=None, 
                  epoch=None,
                  teacher_model=None,
                  pseudo_batch=None):
    
    model.train()
    with autocast(enabled=amp):
        loss, outputs = calculate_loss(batch=batch, model=model, return_outputs=True, device=device)
        targets = get_targets(batch)
        metrics = calculate_metrics(predictions=outputs, targets=targets, device=device)
        
        loss /= gradient_accumulation_steps
        loss = backward_step(loss=loss, optimizer=optimizer, scaler=scaler)
        
        adversarial_loss = adversarial_step(batch=batch, 
                                            model=model, 
                                            device=device, 
                                            loss=overall_loss, 
                                            metrics=overall_metrics, 
                                            step=step, 
                                            epoch=epoch)
        
        if adversarial_loss is not None:
            adversarial_loss = backward_step(loss=adversarial_loss, optimizer=optimizer, scaler=scaler)
        
        if pseudo_batch is not None and teacher_model is not None:
            pseudo_loss = pseudo_labeling_step(batch=batch,
                                               pseudo_batch=pseudo_batch,
                                               model=model, 
                                               teacher_model=teacher_model, 
                                               loss=loss, 
                                               metrics=metrics,
                                               step=step, 
                                               epoch=epoch, 
                                               device=device)
        
            if pseudo_loss is not None:
                pseudo_loss = backward_step(loss=pseudo_loss, optimizer=optimizer, scaler=scaler)
            
    if gradient_norm > 0:
        if scaler is not None:
            scaler.unscale_(optimizer)
                            
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=gradient_norm)
        
    return loss.detach(), metrics

def backward_step(loss, optimizer, scaler=None):
    if scaler is not None:
        scaler.scale(loss).backward()
    else:
        loss.backward()
        
    return loss
        

def optimization_step(model, optimizer, scaler=None):                        
    if scaler is not None:
        scaler.step(optimizer)
        scaler.update()
    else:
        optimizer.step()
        
    model.zero_grad()
        

def scheduling_step(scheduler=None, loss=None, loop="training"):
    if scheduler is not None:
        if loop == "validation":
            if isinstance(scheduler, lr_scheduler.ReduceLROnPlateau):
                scheduler.step(loss)
        else:
            if not isinstance(scheduler, lr_scheduler.ReduceLROnPlateau):
                scheduler.step()

                
def adversarial_step(batch, 
                     model, 
                     device="cpu", 
                     loss=None, 
                     metrics=None, 
                     step=None, 
                     epoch=None):
    pass

                
    
def calculate_loss(batch, model, return_outputs=True, device="cpu"):
    raise NotImplementedError(f"`calculate_loss` function is not implemented.")
                
def calculate_metrics(predictions, targets, device="cpu"):
    return dict()

def get_targets(batch):
    return []


def on_epoch_end(model=None, step=None, epoch=None):
    pass


def model_checkpointing(loss, 
                        metrics, 
                        model, 
                        optimizer=None, 
                        scheduler=None, 
                        step=None, 
                        best_loss=None, 
                        best_metrics=None):
    
    return True


def pseudo_labeling_step(batch, 
                         pseudo_batch, 
                         model, 
                         teacher_model, 
                         loss=None, 
                         metrics=None, 
                         step=None, 
                         epoch=None, 
                         device="cpu"):
    pass

In [7]:
def calculate_loss(batch, model, return_outputs=True, device="cpu"):
    input_ids, attention_mask, targets = batch
    
    input_ids = input_ids.to(device).long()
    attention_mask = attention_mask.to(device).long()
    targets = targets.to(device).float()
    
    outputs = model(input_ids, attention_mask)
    outputs = outputs.sigmoid().squeeze(dim=-1)
    loss = F.mse_loss(outputs, targets, reduction="mean")
    
    return (loss, outputs) if return_outputs else loss


def calculate_metrics(predictions, targets, device="cpu"):
    predictions = predictions.sigmoid().detach().view(-1).to("cpu").float().numpy()
    targets = targets.view(-1).to("cpu").float().numpy()
    
    return dict(pearson=scipy.stats.pearsonr(predictions, targets)[0])


def get_targets(batch):
    *_, targets = batch
    return targets


def model_checkpointing(loss, 
                        metrics, 
                        model, 
                        optimizer=None, 
                        scheduler=None, 
                        step=None, 
                        best_loss=None, 
                        best_metrics=None):
    
    is_saved_checkpoint = model_checkpoint(value=metrics["pearson"], 
                                           model=model, 
                                           optimizer=optimizer, 
                                           scheduler=scheduler, 
                                           step=step)
    return is_saved_checkpoint

# Dataset

In [8]:
class DynamicPadding:
    def __init__(self, tokenizer, max_length=None, padding=True, pad_to_multiple_of=None, return_tensors="pt"):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding
        self.pad_to_multiple_of = pad_to_multiple_of
        self.return_tensors = return_tensors
    
    def __call__(self, tokenized):
        max_length = max(len(_["input_ids"]) for _ in tokenized)
        max_length = min(max_length, self.max_length) if self.max_length is not None else max_length
                
        padded = self.tokenizer.pad(encoded_inputs=tokenized,
                                    max_length=max_length,
                                    padding=self.padding, 
                                    pad_to_multiple_of=self.pad_to_multiple_of, 
                                    return_tensors=self.return_tensors)
        
        return padded
    
    
    
class Collator:
    def __init__(self, return_targets=True, **kwargs):
        self.dynamic_padding = DynamicPadding(**kwargs)
        self.return_targets = return_targets
    
    def __call__(self, batch):
        all_tokenized, all_targets = [], []
        for sample in batch:
            if self.return_targets:
                tokenized, target = sample
                all_targets.append(target)
            else:
                tokenized = sample
                
            all_tokenized.append(tokenized)
        
        tokenized = self.dynamic_padding(all_tokenized)
        
        input_ids = torch.tensor(tokenized.input_ids)
        attention_mask = torch.tensor(tokenized.attention_mask)
        
        if self.return_targets:
            all_targets = torch.tensor(all_targets)
        
            return input_ids, attention_mask, all_targets
        
        return input_ids, attention_mask

In [9]:
class Dataset:
    def __init__(self, texts, pair_texts, tokenizer, contexts=None, sep=None, targets=None, max_length=128):
        self.texts = texts
        self.pair_texts = pair_texts
        self.contexts = contexts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.sep = sep if sep is not None else self.tokenizer.sep_token
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        text = self.texts[index].lower()
        pair_text = self.pair_texts[index].lower()
        
        if self.contexts is not None:
            context = self.contexts[index].lower()
            text = text + self.sep + context
    
        
        tokenized = self.tokenizer(text=text, 
                                   text_pair=pair_text, 
                                   add_special_tokens=True,
                                   #max_length=self.max_length,
                                   #padding="max_length",
                                   #truncation=True,
                                   return_attention_mask=True,
                                   return_token_type_ids=False,
                                   return_offsets_mapping=False)
        
        
        if self.targets is not None:
            target = self.targets[index]
            
            return tokenized, target
            
        return tokenized

# Model

In [10]:
class Model(nn.Module):
    def __init__(self, model_path="microsoft/deberta-v3-base", config_path=None, config_updates={}, reinitialization_layers=0, mixout=0.0):
        super(Model, self).__init__()
        if config_path is None:
            self.config = AutoConfig.from_pretrained(model_path)
        else:
            self.config = AutoConfig.from_pretrained(config_path)
        
        self.config.output_hidden_states = True
        self.config.update(config_updates)
        
        if config_path is None:
            self.model = AutoModel.from_pretrained(model_path, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        
        self.model.gradient_checkpointing_enable()
        print(f"Gradient Checkpointing: {self.model.is_gradient_checkpointing}")
        
        if mixout > 0:
            for module in self.model.modules():
                for name, submodule in module.named_children():
                    if isinstance(submodule, nn.Dropout):
                        module.p = 0.0
                    if isinstance(submodule, nn.Linear):
                        target_state_dict = submodule.state_dict()
                        bias = True if submodule.bias is not None else False
                        
                        new_module = MixLinear(in_features=submodule.in_features, 
                                               out_features=submodule.out_features, 
                                               bias=bias, 
                                               target=target_state_dict["weight"], 
                                               p=mixout)
                        
                        new_module.load_state_dict(target_state_dict)
                        setattr(module, name, new_module)
                
            print(f"Initialized Mixout (p={mixout}) Regularization")
        
        if reinitialization_layers > 0:
            layers = ...
            for layer in layers[-reinitialization_layers:]:
                for name, module in layer.named_modules():
                    self.init_weights(module, std=self.config.initializer_range)
            
            print(f"Reinitializated last {n} layers.")

        self.head = nn.Linear(in_features=self.config.hidden_size, out_features=1)
        self.init_weights(self.head, std=self.config.initializer_range)
            
    
    def init_weights(self, module, std=0.02):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    
    
    def forward(self, input_ids, attention_mask=None):
        transformer_outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        features = transformer_outputs.hidden_states[-1]
        features = features[:, 0, :]
        outputs = self.head(features)
        
        return outputs

# Loading dataset

In [11]:
train_path = "../input/us-patent-phrase-to-phrase-matching/train.csv"
test_path = "../input/us-patent-phrase-to-phrase-matching/test.csv"
sample_submission_path = "../input/us-patent-phrase-to-phrase-matching/sample_submission.csv"
cpc_codes_path = "../input/cpc-codes/titles.csv"

In [12]:
cpc_codes = pd.read_csv(cpc_codes_path)
train = pd.read_csv(train_path)
train = train.merge(cpc_codes, left_on="context", right_on="code")

if DEBUG:
    display(train)

Unnamed: 0,id,anchor,target,context,score,code,title,section,class,subclass,group,main_group
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
36468,718f1c6953e3942f,undulation,undulatory swimmers,B31,0.00,B31,"MAKING ARTICLES OF PAPER, CARDBOARD OR MATERIA...",B,31.0,,,
36469,4dc407e6d0aa7844,undulation,voltage fluctuate,B31,0.00,B31,"MAKING ARTICLES OF PAPER, CARDBOARD OR MATERIA...",B,31.0,,,
36470,de69548ad79caccc,web transfer,transfer from web,B31,0.75,B31,"MAKING ARTICLES OF PAPER, CARDBOARD OR MATERIA...",B,31.0,,,
36471,6620317413e6e03f,web transfer,transfer to web,B31,0.25,B31,"MAKING ARTICLES OF PAPER, CARDBOARD OR MATERIA...",B,31.0,,,


In [13]:
cpc_texts = torch.load("../input/foldsdump/cpc_texts.pth")
train['context_text'] = train['context'].map(cpc_texts)
train['text'] = train['anchor'] + '[SEP]' + train['target'] + '[SEP]'  + train['context_text']
train['text'] = train['text'].apply(str.lower)

In [14]:
train

Unnamed: 0,id,anchor,target,context,score,code,title,section,class,subclass,group,main_group,context_text,text
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[sep]abatement of pollution[sep]human...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[sep]act of abating[sep]human necessi...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[sep]active catalyst[sep]human necess...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[sep]eliminating process[sep]human ne...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,HUMAN NECESSITIES. FURNITURE; DOMESTIC ARTICLE...,abatement[sep]forest region[sep]human necessit...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36468,718f1c6953e3942f,undulation,undulatory swimmers,B31,0.00,B31,"MAKING ARTICLES OF PAPER, CARDBOARD OR MATERIA...",B,31.0,,,,PERFORMING OPERATIONS; TRANSPORTING. MAKING AR...,undulation[sep]undulatory swimmers[sep]perform...
36469,4dc407e6d0aa7844,undulation,voltage fluctuate,B31,0.00,B31,"MAKING ARTICLES OF PAPER, CARDBOARD OR MATERIA...",B,31.0,,,,PERFORMING OPERATIONS; TRANSPORTING. MAKING AR...,undulation[sep]voltage fluctuate[sep]performin...
36470,de69548ad79caccc,web transfer,transfer from web,B31,0.75,B31,"MAKING ARTICLES OF PAPER, CARDBOARD OR MATERIA...",B,31.0,,,,PERFORMING OPERATIONS; TRANSPORTING. MAKING AR...,web transfer[sep]transfer from web[sep]perform...
36471,6620317413e6e03f,web transfer,transfer to web,B31,0.25,B31,"MAKING ARTICLES OF PAPER, CARDBOARD OR MATERIA...",B,31.0,,,,PERFORMING OPERATIONS; TRANSPORTING. MAKING AR...,web transfer[sep]transfer to web[sep]performin...


# Cross-Validation split

In [15]:
train["score_bin"] = pd.cut(train["score"], bins=4, labels=False)
train = create_folds(data_frame=train, 
                     targets=train["score_bin"].values,
                     groups=train["text"].values,
                     folds=config.folds, 
                     seed=config.seed, 
                     shuffle=True)

if DEBUG:
    folds_samples_count = train.groupby("fold").size()
    display(folds_samples_count)

fold
1    9119
2    9118
3    9118
4    9118
dtype: int64

# Tokenizer

In [16]:
tokenizer = AutoTokenizer.from_pretrained(config.model.model_path)
tokenizer_path = os.path.join(config.output_directory, "tokenizer/")
tokenizer_files = tokenizer.save_pretrained(tokenizer_path)

if DEBUG:
    print(f"Tokenizer: {tokenizer}")

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer: PreTrainedTokenizer(name_or_path='microsoft/deberta-v3-base', vocab_size=128000, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


# Cross-Validation

In [17]:
cv_scores = []
oof_data_frame = pd.DataFrame()
for fold in range(1, config.folds + 1):
    print(f"Fold {fold}/{config.folds}", end="\n"*2)
    
    fold_directory = os.path.join(config.output_directory, f"fold_{fold}")    
    make_directory(fold_directory)
    model_path = os.path.join(fold_directory, "model.pth")
    model_config_path = os.path.join(fold_directory, "model_config.json")
    checkpoints_directory = os.path.join(fold_directory, "checkpoints/")
    make_directory(checkpoints_directory)
    
    collator = Collator(tokenizer=tokenizer, max_length=config.max_length)
    
    train_fold = train[~train["fold"].isin([fold])]
    train_dataset = Dataset(texts=train_fold["text"].values, 
                            pair_texts=train_fold["target"].values,
                            contexts=train_fold["title"].values,
                            targets=train_fold["score"].values, 
                            max_length=config.max_length,
                            sep=tokenizer.sep_token,
                            tokenizer=tokenizer)
    
    train_loader = DataLoader(dataset=train_dataset, 
                              batch_size=config.batch_size, 
                              num_workers=config.num_workers,
                              pin_memory=config.pin_memory,
                              collate_fn=collator,
                              shuffle=True, 
                              drop_last=False)
    
    print(f"Train samples: {len(train_dataset)}")
    
    validation_fold = train[train["fold"].isin([fold])]
    validation_dataset = Dataset(texts=validation_fold["text"].values, 
                                 pair_texts=validation_fold["target"].values,
                                 contexts=validation_fold["title"].values,
                                 targets=validation_fold["score"].values,
                                 max_length=config.max_length,
                                 sep=tokenizer.sep_token,
                                 tokenizer=tokenizer)
    
    validation_loader = DataLoader(dataset=validation_dataset, 
                                   batch_size=config.batch_size*2, 
                                   num_workers=config.num_workers,
                                   pin_memory=config.pin_memory,
                                   collate_fn=collator,
                                   shuffle=False, 
                                   drop_last=False)
    
    print(f"Validation samples: {len(validation_dataset)}")
    
    
    model = Model(**config.model)
    model.config.to_json_file(model_config_path)
    model_parameters = model.parameters()
    
    optimizer = get_optimizer(**config.optimizer, model_parameters=model_parameters)
    
    if "scheduler" in config:
        training_steps = len(train_loader) * config.epochs
        training_steps = int(training_steps // config.gradient_accumulation_steps)
        
        config.scheduler.parameters.num_training_steps = training_steps
        config.scheduler.parameters.num_warmup_steps = training_steps * config.get("warmup", 0)
        scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup(optimizer=optimizer,num_warmup_steps=training_steps * config.get("warmup", 0),num_training_steps=training_steps,num_cycles=2,last_epoch=-1)
    else:
        scheduler = None
        
    model_checkpoint = ModelCheckpoint(mode="max", 
                                       delta=config.delta, 
                                       directory=checkpoints_directory, 
                                       overwriting=True, 
                                       filename_format="checkpoint.pth", 
                                       num_candidates=1)


    if WANDB: wandb.init(group=EXPERIMENT_NAME, name=f"Fold {fold}", config=config)
    (train_loss, train_metrics), (validation_loss, validation_metrics, validation_outputs) = training_loop(model=model, 
                                                                                                           optimizer=optimizer, 
                                                                                                           scheduler=scheduler,
                                                                                                           scheduling_after=config.scheduling_after,
                                                                                                           train_loader=train_loader,
                                                                                                           validation_loader=validation_loader,
                                                                                                           epochs=config.epochs, 
                                                                                                           gradient_accumulation_steps=config.gradient_accumulation_steps, 
                                                                                                           gradient_scaling=config.gradient_scaling, 
                                                                                                           gradient_norm=config.gradient_norm, 
                                                                                                           validation_steps=config.validation_steps, 
                                                                                                           amp=config.amp,
                                                                                                           debug=config.debug, 
                                                                                                           verbose=config.verbose, 
                                                                                                           device=config.device, 
                                                                                                           recalculate_metrics_at_end=True, 
                                                                                                           return_validation_outputs=True, 
                                                                                                           logger=["print", "wandb"], 
                                                                                                           decimals=config.decimals)
    
    if WANDB: wandb.finish()
    
    if config.save_model:
        model_state = model.state_dict()
        torch.save(model_state, model_path)
        print(f"Model's path: {model_path}")
    
    validation_fold["prediction"] = validation_outputs.to("cpu").numpy()
    oof_data_frame = pd.concat([oof_data_frame, validation_fold])
        
    cv_monitor_value = validation_loss if config.cv_monitor_value == "loss" else validation_metrics.get(config.cv_monitor_value, np.nan)
    cv_scores.append(cv_monitor_value)
    
    
    del model, optimizer, validation_outputs, train_fold, validation_fold
    torch.cuda.empty_cache()
    gc.collect()
    print(end="\n"*5)
    
cv_scores = np.array(cv_scores).round(config.decimals)
np.save("cv_scores.npy", cv_scores)
oof_data_frame.to_pickle("oof.pkl")
configuration_path = config.to_json("configuration.json")

print(f"CV scores: {cv_scores}")
print(f"CV mean: {cv_scores.mean():.{config.decimals}}")
print(f"CV std: {cv_scores.std():.{config.decimals}}")

Fold 1/4

Train samples: 27354
Validation samples: 9119


Downloading:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Gradient Checkpointing: True
Epochs: 5
Auto Mixed Precision: True
Gradient norm: 1.0
Gradient scaling: True
Gradient accumulation steps: 1
Validation steps: 200
Device: cuda


Epoch 1/5

100/1140 - remain: 0:6:47 - loss: 0.06797 - pearson: 0.0201 - lr: 3.473684210526316e-06
200/1140 - remain: 0:6:1 - loss: 0.0637 - pearson: 0.2023 - lr: 6.982456140350878e-06
[Validation] 100/190 - remain: 0:0:10 - loss: 0.04086 - pearson: 0.6178
[Validation] 190/190 - remain: 0:0:0 - loss: 0.04229 - pearson: 0.614
'best_value' is improved by inf! New 'best_value': 0.6139574392851158. Checkpoint path: './fold_1/checkpoints/checkpoint.pth'.

300/1140 - remain: 0:6:38 - loss: 0.05707 - pearson: 0.3345 - lr: 1.0491228070175438e-05
400/1140 - remain: 0:5:34 - loss: 0.05192 - pearson: 0.4243 - lr: 1.4e-05

[Validation] 100/190 - remain: 0:0:10 - loss: 0.02981 - pearson: 0.7462
[Validation] 190/190 - remain: 0:0:0 - loss: 0.03058 - pearson: 0.74
'best_value' is improved by 0.12602311256687093! New 'best_value

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Gradient Checkpointing: True
Epochs: 5
Auto Mixed Precision: True
Gradient norm: 1.0
Gradient scaling: True
Gradient accumulation steps: 1
Validation steps: 200
Device: cuda


Epoch 1/5

100/1140 - remain: 0:6:35 - loss: 0.09683 - pearson: 0.01582 - lr: 3.473684210526316e-06
200/1140 - remain: 0:6:0 - loss: 0.07779 - pearson: 0.1672 - lr: 6.982456140350878e-06
[Validation] 100/190 - remain: 0:0:9 - loss: 0.0429 - pearson: 0.5855
[Validation] 190/190 - remain: 0:0:0 - loss: 0.04381 - pearson: 0.585
'best_value' is improved by inf! New 'best_value': 0.5850019499272179. Checkpoint path: './fold_2/checkpoints/checkpoint.pth'.

300/1140 - remain: 0:6:40 - loss: 0.067 - pearson: 0.3021 - lr: 1.0491228070175438e-05
400/1140 - remain: 0:5:37 - loss: 0.05886 - pearson: 0.3957 - lr: 1.4e-05

[Validation] 100/190 - remain: 0:0:10 - loss: 0.02975 - pearson: 0.7397
[Validation] 190/190 - remain: 0:0:0 - loss: 0.02968 - pearson: 0.752
'best_value' is improved by 0.16695332519460715! New 'best_value'

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Gradient Checkpointing: True
Epochs: 5
Auto Mixed Precision: True
Gradient norm: 1.0
Gradient scaling: True
Gradient accumulation steps: 1
Validation steps: 200
Device: cuda


Epoch 1/5

100/1140 - remain: 0:6:38 - loss: 0.07836 - pearson: 0.02097 - lr: 3.473684210526316e-06
200/1140 - remain: 0:5:59 - loss: 0.0674 - pearson: 0.197 - lr: 6.982456140350878e-06
[Validation] 100/190 - remain: 0:0:10 - loss: 0.0471 - pearson: 0.6193
[Validation] 190/190 - remain: 0:0:0 - loss: 0.04865 - pearson: 0.609
'best_value' is improved by inf! New 'best_value': 0.609020932850576. Checkpoint path: './fold_3/checkpoints/checkpoint.pth'.

300/1140 - remain: 0:6:41 - loss: 0.05908 - pearson: 0.3246 - lr: 1.0491228070175438e-05
400/1140 - remain: 0:5:36 - loss: 0.05309 - pearson: 0.4151 - lr: 1.4e-05

[Validation] 100/190 - remain: 0:0:10 - loss: 0.03745 - pearson: 0.7318
[Validation] 190/190 - remain: 0:0:0 - loss: 0.03629 - pearson: 0.7447
'best_value' is improved by 0.13563745046900866! New 'best_valu

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Gradient Checkpointing: True
Epochs: 5
Auto Mixed Precision: True
Gradient norm: 1.0
Gradient scaling: True
Gradient accumulation steps: 1
Validation steps: 200
Device: cuda


Epoch 1/5

100/1140 - remain: 0:6:48 - loss: 0.07296 - pearson: 0.007399 - lr: 3.473684210526316e-06
200/1140 - remain: 0:6:2 - loss: 0.06983 - pearson: 0.08119 - lr: 6.982456140350878e-06
[Validation] 100/190 - remain: 0:0:9 - loss: 0.0503 - pearson: 0.5387
[Validation] 190/190 - remain: 0:0:0 - loss: 0.05147 - pearson: 0.5209
'best_value' is improved by inf! New 'best_value': 0.520863965216065. Checkpoint path: './fold_4/checkpoints/checkpoint.pth'.

300/1140 - remain: 0:6:40 - loss: 0.06281 - pearson: 0.2429 - lr: 1.0491228070175438e-05
400/1140 - remain: 0:5:36 - loss: 0.05628 - pearson: 0.3515 - lr: 1.4e-05

[Validation] 100/190 - remain: 0:0:9 - loss: 0.02894 - pearson: 0.7381
[Validation] 190/190 - remain: 0:0:0 - loss: 0.02923 - pearson: 0.7478
'best_value' is improved by 0.22693185550897332! New 'best_va