In [1]:
import pandas as pd
import numpy as np
import csv
import math
import os
import random
import time
import copy
from tqdm.notebook import tqdm
import multiprocessing
import yaml
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold

from transformers import AutoModel, AutoTokenizer
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import Trainer, TrainingArguments
from transformers import AdamW
from transformers import DataCollatorWithPadding, DefaultDataCollator

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

In [3]:
# import wandb

# try:
#     from kaggle_secrets import UserSecretsClient
#     user_secrets = UserSecretsClient()
#     api_key = user_secrets.get_secret("wandb_api")
#     wandb.login(key=api_key)
#     anony = None
# except:
#     anony = "must"
#     print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

In [4]:
class Config:
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model_name = "microsoft/deberta-v3-base"
    model_save_name = "best_model.pth"
    train_batch_size = 16
    valid_batch_size = 16
    grad_max_norm = 10
    grad_acc = 1
    epochs = 1
    hidden_size = 768
    collate_fn = None
    weight_decay = 0.01
    lr = 1e-4
    seed = 42
    max_len = 64
    num_workers = 2

config = Config()


In [5]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
base_model = AutoModel.from_pretrained(config.model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
config.collate_fn = DataCollatorWithPadding(tokenizer)

In [7]:
# set seed
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(config.seed)

In [8]:
train = pd.read_csv("/kaggle/input/amazon-ml/dataset/train.csv")
train.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [9]:
target_column = "PRODUCT_LENGTH"

In [10]:
text = train['TITLE'].values[np.random.randint(0, len(train) - 1, 1)[0]]
print(f"Text of the title: {text}")

encoded_input = tokenizer(text, return_tensors='pt')
print(f"Input tokens: {encoded_input['input_ids']}")

decoded_input = tokenizer.decode(encoded_input['input_ids'][0])
print(f"Decoded tokens: {decoded_input}")

with torch.no_grad():
    output = base_model(**encoded_input)
print(f"last layer's output shape: {output.last_hidden_state.shape}")

Text of the title: PUDINI® for Vivo V23 PRO 5G Leather Holster Pouch Belt Clip Cases Waist Bag Pack for [Up to 6.5 Inch] Phone Holder - Black
Input tokens: tensor([[     1, 110060,  59206,   2139,    270,  35194,   1407,   3304,  12535,
            456,   1474,   9348,  96989,  50131,  11397,  22247,  24773,  38277,
           8296,   8019,    270,    647,   9396,    264,    525,    260,    524,
          19891,    592,   7151,  16996,    341,   1552,      2]])
Decoded tokens: [CLS] PUDINI® for Vivo V23 PRO 5G Leather Holster Pouch Belt Clip Cases Waist Bag Pack for [Up to 6.5 Inch] Phone Holder - Black[SEP]
last layer's output shape: torch.Size([1, 34, 768])


In [None]:
def create_folds(data, num_splits):
    data["kfold"] = -1
    data = data.sample(frac=1).reset_index(drop=True)
    y = data['PRODUCT_TYPE_ID']
    kf = StratifiedKFold(n_splits=num_splits)
    for f, (t_, v_) in enumerate(kf.split(X=data, y=y)):
        data.loc[v_, 'kfold'] = f
    return data
  
train = create_folds(train, 5)

In [13]:
train_subset = train[train['PRODUCT_LENGTH']<10000]
train_subset.shape

(2229490, 6)

In [14]:
train_subset['PRODUCT_LENGTH'] = train_subset['PRODUCT_LENGTH']/100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [38]:
class TextDataset(Dataset):
    def __init__(self, data, tokenizer, mode="train", max_length=None):
        super(TextDataset, self).__init__()
        self.sentence = data["TITLE"].values
        if mode != "test":
            self.label = data[target_column].values
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mode = mode

    def __len__(self):
        return len(self.sentence)

    def __getitem__(self,idx):
        inp_tokens = self.tokenizer(self.sentence[idx], 
                                              padding=False, 
                                              add_special_tokens=True,
                                              max_length=self.max_length,
                                              truncation=True)
        item={
            "input_ids":torch.tensor(inp_tokens.input_ids,dtype=torch.long),
            "attention_mask":torch.tensor(inp_tokens.attention_mask,dtype=torch.long)
        }

        if self.mode != "test":
            item['labels'] = torch.tensor(self.label[idx], dtype=torch.long)

        return item

In [16]:
class ArcMarginProduct(nn.Module):
    r"""Implement of large margin arc distance: :
        Args:
            in_features: size of each input sample
            out_features: size of each output sample
            s: norm of input feature
            m: margin
            cos(theta + m)
        """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # --------------------------- cos(theta) & phi(theta) ---------------------------
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # --------------------------- convert label to one-hot ---------------------------
        one_hot = torch.zeros(cosine.size(), device=config.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # -------------torch.where(out_i = {x_i if condition_i else y_i) -------------
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)  # you can use torch.where if your torch.__version__ is 0.4
        output *= self.s
        # print(output)

        return output

In [17]:
class Model(nn.Module):
    def __init__(self, 
                 base_model, 
                 last_hidden_size=config.hidden_size):
        
        super().__init__()
        self.base_model = base_model
#         self.arc_margin = ArcMarginProduct(last_hidden_size, 
#                                            num_classes, 
#                                            s=30.0, 
#                                            m=0.50, 
#                                            easy_margin=False)
        
        self.fc = nn.Linear(in_features=last_hidden_size, out_features=1)
    

    
    def forward(self, batch):
        out = self.base_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        last_hidden_state = out.last_hidden_state # shape: (batch_size, seq_length, bert_hidden_dim)
        attention_mask = batch['attention_mask']
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        logits = self.fc(mean_embeddings)
        return logits

In [18]:
class AvgMeter:
    def __init__(self, name="Metric"):
        self.name = name
        self.reset()
    
    def reset(self):
        self.avg, self.sum, self.count = [0]*3
    
    def update(self, val, count=1):
        self.count += count
        self.sum += val * count
        self.avg = self.sum / self.count
    
    def __repr__(self):
        text = f"{self.name}: {self.avg:.4f}"
        return text


In [19]:
def one_epoch(model, criterion, dataloader, epoch, scaler=None, optimizer=None, scheduler=None, mode='train'):
    
    loss_meter = AvgMeter()
    
    bar = tqdm(dataloader, total=len(dataloader))
    
    for idx, batch in enumerate(bar):
        batch = {k: v.to(config.device) for k, v in batch.items()}
        
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            preds = model(batch)
            
        loss = criterion(preds, batch['labels'].unsqueeze(-1))
        
        if mode == "train":
            loss = loss/config.grad_acc
            scaler.scale(loss).backward()
            if (idx+1)%config.grad_acc==0 or (idx+1)==len(dataloader):
                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_max_norm)
                scaler.step(optimizer)
                scaler.update()
                for param in model.parameters():
                    param.grad = None

            if scheduler:
                scheduler.step()
                
        count = batch['input_ids'].shape[0]
        loss_meter.update(loss.item(), count)
        
        
        if mode == "train":
            bar.set_postfix(epoch=epoch, train_loss=loss_meter.avg, lr=get_lr(optimizer))
        else:
            bar.set_postfix(epoch=epoch, valid_loss=loss_meter.avg)
    
    return loss_meter, acc_meter

def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group["lr"]


In [20]:
def train_eval(epochs, model, train_loader, valid_loader, 
               criterion, optimizer, scheduler=None, scaler=None):
    
    best_loss = np.inf
    best_model_weights = copy.deepcopy(model.state_dict())
    
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}")
        
        model.train()
        train_loss, train_acc = one_epoch(model, 
                                          criterion, 
                                          train_loader, 
                                          epoch,
                                          scaler,
                                          optimizer=optimizer,
                                          scheduler=scheduler,
                                          mode="train")                     
        model.eval()
        with torch.no_grad():
            valid_loss, valid_acc = one_epoch(model, 
                                              criterion, 
                                              valid_loader, 
                                              epoch,
                                              optimizer=None,
                                              scheduler=None,
                                              mode="valid")
        
        if valid_loss.avg < best_loss:
            best_loss = valid_loss.avg
            best_model_weights = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), f'{config.model_save_name}')
            print("Saved best model!")
        
        print("=" * 30)

In [21]:
def optimizer_params(model, config=config):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': config.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
    ]
    return optimizer_parameters

In [22]:
train_sample = train_subset.sample(frac=0.5)

In [23]:
train_sample.shape

(1114745, 6)

In [39]:
train_df, valid_df = train_test_split(train_sample, 
                                      test_size=0.33, 
                                      shuffle=True, 
                                      random_state=config.seed)

train_df=train_df.reset_index(drop=True)
valid_df=valid_df.reset_index(drop=True)

train_dataset = TextDataset(train_df, tokenizer, max_length=config.max_len)
train_loader = torch.utils.data.DataLoader(train_dataset, 
                                           batch_size=16, 
                                           num_workers=config.num_workers, 
                                           shuffle=True,
                                          collate_fn=config.collate_fn, 
                                          pin_memory=True)

valid_dataset = TextDataset(valid_df, tokenizer, max_length=config.max_len)
valid_loader = torch.utils.data.DataLoader(valid_dataset, 
                                           batch_size=config.valid_batch_size, 
                                           num_workers=config.num_workers, 
                                           shuffle=False,
                                          collate_fn=config.collate_fn,
                                          pin_memory=True)

In [27]:
model = Model(base_model).to(config.device)

In [28]:
num_steps = int(len(train_loader)*config.epochs/config.train_batch_size)

In [29]:
criterion = nn.L1Loss()
optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(0.3*num_steps), num_training_steps=num_steps)
scaler = torch.cuda.amp.GradScaler()



In [None]:
train_eval(config.epochs, model, train_loader, valid_loader,
           criterion, optimizer, scheduler=scheduler, scaler=scaler)

Epoch 1


  0%|          | 0/46680 [00:00<?, ?it/s]