In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import csv
import math
import os
import random
import time
import copy
from tqdm.notebook import tqdm
import multiprocessing
import yaml
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchmetrics import MeanAbsolutePercentageError

from sklearn.model_selection import train_test_split

from transformers import AutoModel, AutoTokenizer, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
# from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding

In [None]:
# Environment switches read by the Hugging Face libraries imported above.
# TOKENIZERS_PARALLELISM="false" asks the tokenizers library not to use its own
# thread pool (presumably to avoid fork/parallelism warnings — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# TRANSFORMERS_NO_ADVISORY_WARNINGS='true' asks transformers to suppress advisory warnings.
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

In [None]:
class Config:
    # Central place for every tunable used by this notebook. The attributes are
    # plain class-level constants; `class2dict(config)` later forwards all of
    # them (everything not starting with "__") to Weights & Biases.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"  # first GPU when available, else CPU
    model_name = "xlm-roberta-base"      # Hugging Face checkpoint used for tokenizer and backbone
    model_save_name = "best_model.pth"   # file the best state_dict is written to by train_eval
    train_batch_size = 16
    valid_batch_size = 16
    grad_max_norm = 1000                 # max norm passed to clip_grad_norm_ in one_epoch
    n_accumulate = 1                     # batches accumulated per optimizer step
    epochs = 4
    collate_fn = None                    # filled in after the tokenizer is created
    weight_decay = 1e-5
    lr = 1e-4                            # initial AdamW learning rate
    min_lr = 6e-5                        # eta_min of the cosine annealing scheduler
    seed = 42
    max_len = 32                         # tokenizer truncation length for TITLE
    num_workers = 0                      # DataLoader worker processes
    pooling_method = 'weighted'          # one of: mean, max, weighted, concat, lstm (see CustomModel)
    folds = '01'                         # only used as a label inside the W&B metric names
    debug = False                        # when True, data frames are sub-sampled to 100 rows
    wandb = True                         # enable Weights & Biases logging
    log_steps = 1000                     # print/log the running loss every N batches

# Single shared instance read by the rest of the notebook.
config = Config()


In [None]:
def class2dict(f):
    """Return a ``{name: value}`` dict of the attributes of ``f``.

    Every name reported by ``dir(f)`` is included except those that start
    with a double underscore.
    """
    visible_names = [attr for attr in dir(f) if not attr.startswith('__')]
    return {attr: getattr(f, attr) for attr in visible_names}

# Optional Weights & Biases setup. When a Kaggle secret named "wandb_api" is
# available it is used to log in; otherwise the run is created anonymously.
if config.wandb:
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except Exception as exc:
        # Catch Exception (not a bare `except:`) so Ctrl-C / SystemExit still
        # propagate, and report why the secret-based login was skipped.
        anony = "must"
        print(f"W&B secret login skipped ({type(exc).__name__}: {exc}); using anonymous mode.")
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')

    # Every public Config attribute is recorded as the run configuration.
    run = wandb.init(project='amazon_ml',
                    config=class2dict(config),
                    job_type="train",
                    anonymous=anony)

In [None]:
# Load the tokenizer that matches the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
# For the 'gpt2' checkpoint the EOS token is reused as the padding token
# (presumably because that tokenizer ships without one — confirm).
if config.model_name == 'gpt2':
    tokenizer.pad_token = tokenizer.eos_token
# Batches are padded by this collator; it is stored on config so the
# DataLoaders built later can pick it up.
config.collate_fn = DataCollatorWithPadding(tokenizer)

In [None]:
# set seed
def set_seed(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), and puts cuDNN into deterministic, non-benchmarking
    mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python's own RNG was previously left unseeded although `random` is
    # imported; any library falling back to it would not be reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Apply the seed from Config before any data is split or any model is built.
set_seed(config.seed)

In [None]:
# Earlier variant (kept for reference): build the training frame from two fold files.
# train_fold0 = pd.read_csv("archive/train_fold0.csv")
# train_fold1 = pd.read_csv("archive/train_fold1.csv")
# train = pd.concat([train_fold0, train_fold1], axis=0).reset_index(drop=True)
# Current variant: read the full training CSV and display its (rows, columns).
train = pd.read_csv("dataset/train.csv")
train.shape

In [None]:
# Name of the regression target column; read by TextDataset when building labels.
target_column = "PRODUCT_LENGTH"

In [None]:
# Keep only rows whose PRODUCT_LENGTH is below 1000.
# `.copy()` makes train_subset an independent frame: later cells assign into
# it, and writing into a boolean-indexed slice of `train` is the classic
# SettingWithCopy hazard (currently hidden by the global warnings filter).
train_subset = train[train['PRODUCT_LENGTH']<1000].copy()
train_subset.shape

In [None]:
# Rescale the target by 1/100 (predictions are multiplied by 100 again later).
# The scaled column is always derived from the untouched raw frame `train`,
# so re-running this cell does not divide by 100 a second time, and `assign`
# returns a new frame instead of writing into a slice.
train_subset = train_subset.assign(
    PRODUCT_LENGTH=train.loc[train_subset.index, 'PRODUCT_LENGTH'] / 100
)

In [None]:
# Drop rows whose TITLE is NaN (the title is the only model input), then
# display the remaining per-column NaN counts as a sanity check.
train_subset = train_subset.dropna(subset=['TITLE'])
train_subset.isna().sum()

In [None]:
class TextDataset(Dataset):
    """Tokenizes the ``TITLE`` column on the fly and pairs it with the target.

    Args:
        data: DataFrame with a ``TITLE`` column and, unless ``mode == "test"``,
            the column named by the module-level ``target_column``.
        tokenizer: Callable tokenizer; called once per item without padding.
        mode: ``"test"`` disables label handling; any other value enables it.
        max_length: Truncation length forwarded to the tokenizer.
    """

    def __init__(self, data, tokenizer, mode="train", max_length=None):
        super(TextDataset, self).__init__()
        self.sentence = data["TITLE"].values
        if mode != "test":
            self.label = data[target_column].values
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mode = mode

    def __len__(self):
        return len(self.sentence)

    def __getitem__(self, idx):
        """Return ``input_ids``/``attention_mask`` (and ``labels`` outside test mode)."""
        # Padding is left to the batch collator, so items keep their own length.
        inp_tokens = self.tokenizer(self.sentence[idx],
                                    padding=False,
                                    add_special_tokens=True,
                                    max_length=self.max_length,
                                    truncation=True)
        item = {
            "input_ids": torch.tensor(inp_tokens.input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(inp_tokens.attention_mask, dtype=torch.long),
        }

        if self.mode != "test":
            # The target is a continuous value (PRODUCT_LENGTH / 100); it must
            # stay floating point. Casting to long truncated e.g. 5.5 -> 5.
            item['labels'] = torch.tensor(self.label[idx], dtype=torch.float)

        return item

In [None]:
class MeanPooling(nn.Module):
    """Mask-aware average of ``last_hidden_state`` along dimension 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average the positions whose mask is non-zero.

        The mask gets a trailing axis and is expanded to the hidden state's
        size; the denominator is clamped to 1e-9 so a fully masked row yields
        zeros instead of dividing by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(1)
        kept_count = mask.sum(1).clamp(min=1e-9)
        return masked_total / kept_count
    
class MaxPooling(nn.Module):
    """Mask-aware maximum of ``last_hidden_state`` along dimension 1."""

    def __init__(self):
        super(MaxPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the per-feature maximum over positions whose mask is non-zero.

        Masked positions are replaced by a very negative value before taking
        the max. The replacement is done on a new tensor: the previous
        implementation wrote into ``last_hidden_state`` in place, silently
        corrupting the caller's (model output) tensor.
        """
        padding_positions = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()) == 0
        # -1e9 as before for float32/float64; clamped to the dtype's minimum so
        # the fill value is also representable in reduced precision (fp16).
        fill_value = max(-1e9, torch.finfo(last_hidden_state.dtype).min)
        masked_states = last_hidden_state.masked_fill(padding_positions, fill_value)
        max_embeddings = torch.max(masked_states, 1)[0]
        return max_embeddings

    
class ConcatPooling(nn.Module):
    """Concatenate the position-0 vectors of the last four hidden layers."""

    def __init__(self):
        super().__init__()

    def forward(self, all_hidden_states):
        """Join layers -1, -2, -3, -4 (in that order) along the last axis.

        Only index 0 of dimension 1 of each layer is kept, so the result has
        four times the per-layer feature width.
        """
        first_position_per_layer = [
            all_hidden_states[layer][:, 0] for layer in (-1, -2, -3, -4)
        ]
        return torch.cat(first_position_per_layer, -1)

    
class WeightedLayerPooling(nn.Module):
    """Weighted average over the hidden layers from ``layer_start`` onwards.

    Args:
        num_hidden_layers: Number of transformer layers; the stacked input is
            expected to hold one more entry than this (``num_hidden_layers + 1``
            minus ``layer_start`` weights are created by default).
        layer_start: Index of the first layer included in the average.
        layer_weights: Optional weight tensor, one value per pooled layer.
            When omitted, a trainable parameter of ones is created.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 9, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            pooled_layer_count = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(
                torch.tensor([1] * pooled_layer_count, dtype=torch.float)
            )
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        """Return ``sum_l(w_l * layer_l) / sum_l(w_l)`` over the selected layers."""
        selected_layers = all_hidden_states[self.layer_start:, :, :, :]
        # Give the weights three trailing singleton axes so they broadcast
        # across every element of their layer.
        per_layer_weight = self.layer_weights[:, None, None, None]
        weighted_sum = (per_layer_weight * selected_layers).sum(dim=0)
        return weighted_sum / self.layer_weights.sum()

    
class LSTMPooling(nn.Module):
    """Run an LSTM over the per-layer position-0 vectors and keep the last step.

    Args:
        num_layers: Number of hidden layers to read (indices 1..num_layers of
            the stacked hidden states; index 0 is skipped).
        hidden_size: Feature width of each hidden state (LSTM input size).
        hiddendim_lstm: LSTM hidden size, i.e. the width of the pooled output.
    """

    def __init__(self, num_layers, hidden_size, hiddendim_lstm=256):
        super(LSTMPooling, self).__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_lstm = hiddendim_lstm
        self.lstm = nn.LSTM(self.hidden_size, self.hiddendim_lstm, batch_first=True)
        self.dropout = nn.Dropout(0.1)

    def forward(self, all_hidden_states):
        """Return the dropout-regularised LSTM output at the last layer step."""
        # Stack on dim=1 so the LSTM (batch_first=True) sees one time step per
        # layer, each carrying that layer's full feature vector:
        # (batch, num_layers, hidden_size).
        # The previous code stacked on the LAST axis and then `.view`-ed to
        # that shape, which reinterprets memory instead of transposing and so
        # mixed values from different layers and features into each step.
        hidden_states = torch.stack(
            [all_hidden_states[layer_i][:, 0]
             for layer_i in range(1, self.num_hidden_layers + 1)],
            dim=1,
        )
        out, _ = self.lstm(hidden_states, None)
        out = self.dropout(out[:, -1, :])
        return out
        

In [None]:
class CustomModel(nn.Module):
    """Pretrained transformer backbone, a pooling layer and a 1-unit linear head.

    The pooling layer is selected by ``config.pooling_method``; the backbone is
    loaded from ``config.model_name`` with all dropout probabilities set to 0
    and hidden states exposed. For a fixed set of known checkpoints the
    embeddings and the lower encoder layers are frozen.

    Args:
        config: Object providing ``model_name`` and ``pooling_method``.

    Raises:
        ValueError: If ``config.pooling_method`` is not a supported method.
    """

    # Pooling strategies that __init__ can build and feature() can apply.
    SUPPORTED_POOLING = ("mean", "max", "weighted", "concat", "lstm")

    def __init__(self, config):
        super().__init__()
        # Fail fast, before any checkpoint is loaded: previously an unknown
        # method surfaced only later as an AttributeError on `self.fc`.
        if config.pooling_method not in self.SUPPORTED_POOLING:
            raise ValueError(
                f"Unsupported pooling_method {config.pooling_method!r}; "
                f"expected one of {self.SUPPORTED_POOLING}"
            )
        self.config = config
        self.model_config = AutoConfig.from_pretrained(config.model_name, output_hidden_states=True)
        # Disable dropout under every attribute name used by the supported configs.
        self.model_config.hidden_dropout = 0.
        self.model_config.hidden_dropout_prob = 0.
        self.model_config.attention_dropout = 0.
        self.model_config.attention_probs_dropout_prob = 0.

        self.model = AutoModel.from_pretrained(config.model_name, config=self.model_config)

        if config.pooling_method == "mean":
            self.pool = MeanPooling()
            self.fc = nn.Linear(self.model_config.hidden_size, 1)

        elif config.pooling_method == 'max':
            self.pool = MaxPooling()
            self.fc = nn.Linear(self.model_config.hidden_size, 1)

        elif config.pooling_method == 'weighted':
            self.pool = WeightedLayerPooling(num_hidden_layers=self.model_config.num_hidden_layers, layer_start=9)
            self.fc = nn.Linear(self.model_config.hidden_size, 1)

        elif config.pooling_method == 'concat':
            # Four layers are concatenated, hence the 4x input width.
            self.pool = ConcatPooling()
            self.fc = nn.Linear(self.model_config.hidden_size*4, 1, )

        elif config.pooling_method == 'lstm':
            hidden_lstm_dim = 512
            self.pool = LSTMPooling(self.model_config.num_hidden_layers,
                                    self.model_config.hidden_size, hidden_lstm_dim)
            self.fc = nn.Linear(hidden_lstm_dim, 1)

        # Freeze the embeddings and the lower part of the encoder for known
        # checkpoints; other checkpoints stay fully trainable.
        if config.model_name in ['microsoft/deberta-v3-base', 'xlm-roberta-base', 'roberta-base']:
            self.model.embeddings.requires_grad_(False)
            self.model.encoder.layer[:9].requires_grad_(False)

        elif config.model_name in ['roberta-large', 'microsoft/deberta-v3-large']:
            self.model.embeddings.requires_grad_(False)
            self.model.encoder.layer[:18].requires_grad_(False)

        elif 'gpt2' == config.model_name:
            self.model.wte.requires_grad_(False)
            self.model.h[:9].requires_grad_(False)

        elif 'distilbert-base-uncased' == config.model_name:
            self.model.embeddings.requires_grad_(False)
            self.model.transformer.layer[:1].requires_grad_(False)

        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise ``module`` using the backbone's ``initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.model_config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone on ``inputs`` and return the pooled features.

        Args:
            inputs: Mapping with ``input_ids`` and ``attention_mask`` tensors.
        """
        outputs = self.model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

        if self.config.pooling_method in ['mean', 'max']:
            last_hidden_states = outputs['last_hidden_state']
            pool_features = self.pool(last_hidden_states, inputs['attention_mask'])
        elif self.config.pooling_method in ['weighted', 'lstm', 'concat']:
            # These poolers consume every layer, stacked along a new first axis.
            all_hidden_states = torch.stack(outputs['hidden_states'])
            pool_features = self.pool(all_hidden_states)
            if self.config.pooling_method == 'weighted':
                # Weighted pooling keeps the sequence axis; take position 0.
                pool_features = pool_features[:, 0]

        return pool_features

    def forward(self, inputs):
        """Return the linear head's output for the pooled features of ``inputs``."""
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [None]:
class AvgMeter:
    """Running weighted average of a scalar (e.g. a per-batch loss).

    Attributes are ``sum`` (weighted total), ``count`` (total weight) and
    ``avg`` (``sum / count`` after the latest update).
    """

    def __init__(self, name="Metric"):
        self.name = name
        self.reset()

    def reset(self):
        """Zero all accumulators."""
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, count=1):
        """Add ``val`` observed ``count`` times and refresh the average."""
        self.count += count
        self.sum += val * count
        self.avg = self.sum / self.count

    def __repr__(self):
        return f"{self.name}: {self.avg:.4f}"


In [None]:
def one_epoch(model, criterion, dataloader, epoch, scaler=None, optimizer=None, scheduler=None, metric=None, mode='train'):
    """Run one pass over ``dataloader`` in training or evaluation mode.

    Args:
        model: Module called as ``model(batch)`` with the batch dict.
        criterion: Callable ``(preds, targets) -> loss``.
        dataloader: Sized iterable of dict batches containing ``input_ids``
            and ``labels`` tensors.
        epoch: Zero-based epoch index, used for the progress bar and logging.
        scaler: Optional gradient scaler. When ``None`` in train mode, plain
            (unscaled) backward/step is used.
        optimizer: Optimizer; required when ``mode == 'train'``.
        scheduler: Optional LR scheduler, stepped once per batch in train mode.
        metric: Optional callable ``(preds, targets) -> tensor`` tracked
            alongside the loss.
        mode: ``'train'`` enables backward/optimizer steps; any other value
            only evaluates.

    Returns:
        ``(loss_meter, mape_meter)``; ``mape_meter`` is ``None`` without a metric.
    """
    loss_meter = AvgMeter()
    mape_meter = None
    if metric:
        mape_meter = AvgMeter()

    is_train = mode == "train"
    # Mixed precision is only meaningful on CUDA; requesting it on CPU merely
    # produced a warning and was disabled by PyTorch anyway.
    use_amp = torch.device(config.device).type == "cuda"
    n_batches = len(dataloader)

    bar = tqdm(dataloader, total=n_batches)

    for idx, batch in enumerate(bar):
        batch = {k: v.to(config.device) for k, v in batch.items()}

        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
            preds = model(batch)

        targets = batch['labels'].unsqueeze(-1)
        # Reduce the loss in float32 even when autocast produced fp16 preds.
        loss = criterion(preds.float(), targets)

        if is_train:
            # Only the value used for backward is divided by n_accumulate;
            # `loss` itself stays un-normalised so the reported running loss
            # is comparable between train and validation.
            scaled_loss = loss / config.n_accumulate
            if scaler is not None:
                scaler.scale(scaled_loss).backward()
            else:
                scaled_loss.backward()

            if (idx+1) % config.n_accumulate == 0 or (idx+1) == n_batches:
                if scaler is not None:
                    # Gradients must be unscaled before clipping by norm.
                    scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_max_norm)
                if scaler is not None:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()
                for param in model.parameters():
                    param.grad = None

            # The scheduler advances once per batch (the caller sizes it as
            # len(train_loader) * epochs).
            if scheduler:
                scheduler.step()
        if metric:
            mape = metric(preds, targets)

        count = batch['input_ids'].shape[0]
        loss_meter.update(loss.item(), count)
        if metric:
            mape_meter.update(mape.item(), count)

        if is_train:
            if metric:
                bar.set_postfix(epoch=epoch, train_loss=loss_meter.avg, mape=mape_meter.avg, lr=get_lr(optimizer))
            else:
                bar.set_postfix(epoch=epoch, train_loss=loss_meter.avg, lr=get_lr(optimizer))
        else:
            if metric:
                bar.set_postfix(epoch=epoch, valid_loss=loss_meter.avg, mape=mape_meter.avg)
            else:
                bar.set_postfix(epoch=epoch, valid_loss=loss_meter.avg)

        if (idx+1)%config.log_steps==0:
            print(f"epoch: {epoch}, iter: {idx+1}, loss: {loss_meter.avg:.4f}")
            if config.wandb:
                wandb.log({"epoch": epoch+1,"step_loss": loss_meter.avg, "step": idx+1})

    return (loss_meter, mape_meter)

def get_lr(optimizer):
    """Return the ``"lr"`` of the optimizer's first parameter group.

    Returns ``None`` when the optimizer has no parameter groups.
    """
    learning_rates = (group["lr"] for group in optimizer.param_groups)
    return next(learning_rates, None)


In [None]:
def train_eval(epochs, model, train_loader, valid_loader,
               criterion, optimizer, scheduler=None, scaler=None, metric=None):
    """Train for ``epochs`` epochs, validating after each and keeping the best.

    After every epoch the validation loss is compared with the best seen so
    far; on improvement the weights are snapshotted in memory and written to
    ``config.model_save_name``. When training finishes the best snapshot is
    loaded back into ``model``, so the model the caller holds is the
    best-validation model rather than whatever the last epoch produced.

    Args:
        epochs: Number of epochs to run.
        model: Module to train (modified in place).
        train_loader: Batches for training.
        valid_loader: Batches for validation.
        criterion: Loss callable forwarded to ``one_epoch``.
        optimizer: Optimizer forwarded to ``one_epoch`` for training.
        scheduler: Optional LR scheduler (training only).
        scaler: Optional gradient scaler (training only).
        metric: Optional metric callable forwarded to ``one_epoch``.

    Returns:
        The best (lowest) average validation loss; ``np.inf`` if no epoch ran.
    """
    best_loss = np.inf
    best_model_weights = copy.deepcopy(model.state_dict())

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}")

        model.train()
        train_loss, train_mape = one_epoch(model,
                                          criterion,
                                          train_loader,
                                          epoch,
                                          scaler,
                                          optimizer=optimizer,
                                          scheduler=scheduler,
                                          metric=metric,
                                          mode="train")
        model.eval()
        with torch.no_grad():
            valid_loss, valid_mape = one_epoch(model,
                                              criterion,
                                              valid_loader,
                                              epoch,
                                              optimizer=None,
                                              scheduler=None,
                                              metric=metric,
                                              mode="valid")
        if config.wandb:
            wandb.log({
                f"[fold_{config.folds}] epoch": epoch+1,
                f"[fold_{config.folds}] epoch_train_loss": train_loss.avg,
                f"[fold_{config.folds}] epoch_valid_loss": valid_loss.avg,
            })

        if valid_loss.avg < best_loss:
            best_loss = valid_loss.avg
            # deepcopy: state_dict() returns references to the live tensors.
            best_model_weights = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), f'{config.model_save_name}')
            print("Saved best model!")

        print("=" * 30)

    # The snapshot used to be collected but never applied.
    model.load_state_dict(best_model_weights)
    return best_loss

In [None]:
def optimizer_params(model, config=None):
    """Split the model's parameters into weight-decay and no-decay groups.

    Parameters whose name contains ``"bias"``, ``"LayerNorm.bias"`` or
    ``"LayerNorm.weight"`` get ``weight_decay=0.0``; all others get
    ``config.weight_decay``.

    Args:
        model: Module whose ``named_parameters()`` are grouped.
        config: Object with a ``weight_decay`` attribute. Defaults to the
            notebook's module-level ``config``, looked up at call time so a
            re-created config is honoured (the old ``config=config`` default
            froze whichever instance existed when this cell ran).

    Returns:
        A list of two parameter-group dicts suitable for a torch optimizer.
    """
    if config is None:
        config = globals()["config"]
    named_params = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    def _skips_decay(param_name):
        return any(marker in param_name for marker in no_decay)

    optimizer_parameters = [
        {'params': [p for n, p in named_params if not _skips_decay(n)], 'weight_decay': config.weight_decay},
        {'params': [p for n, p in named_params if _skips_decay(n)], 'weight_decay': 0.0},
    ]
    return optimizer_parameters

In [None]:
# Hold out 33% of the cleaned training rows for validation; the split is
# shuffled and seeded from Config so it is repeatable.
train_df, valid_df = train_test_split(train_subset, 
                                      test_size=0.33, 
                                      shuffle=True, 
                                      random_state=config.seed)

train_df=train_df.reset_index(drop=True)
valid_df=valid_df.reset_index(drop=True)

# Debug mode: shrink both frames to 100 random rows for a quick smoke run.
if config.debug:
    train_df = train_df.sample(100)
    valid_df = valid_df.sample(100)

# Training batches are reshuffled every epoch; padding is done per batch by
# the collator stored on config.
train_dataset = TextDataset(train_df, tokenizer, max_length=config.max_len)
train_loader = DataLoader(train_dataset, 
                        batch_size=config.train_batch_size, 
                        num_workers=config.num_workers, 
                        shuffle=True,
                        collate_fn=config.collate_fn)

# Validation batches keep the frame order (no shuffling).
valid_dataset = TextDataset(valid_df, tokenizer, max_length=config.max_len)
valid_loader = DataLoader(valid_dataset, 
                        batch_size=config.valid_batch_size, 
                        num_workers=config.num_workers, 
                        shuffle=False,
                        collate_fn=config.collate_fn)

In [None]:
import gc

In [None]:
def criterion(logits, labels):
    """Mean absolute error (L1, default reduction) between ``logits`` and ``labels``."""
    l1_loss = nn.L1Loss()
    return l1_loss(logits, labels)

# metric = MeanAbsolutePercentageError().to(config.device)
metric = None

In [None]:
# Total number of scheduler steps: one per training batch over all epochs
# (one_epoch steps the scheduler once per batch in train mode).
num_steps = int(len(train_loader)*config.epochs)
model = CustomModel(config).to(config.device)
# NOTE(review): the optimizer receives model.parameters() directly, so
# config.weight_decay applies to every parameter; the optimizer_params()
# helper defined earlier in this notebook is not used here — confirm intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
# Cosine annealing from config.lr towards config.min_lr across num_steps.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_steps, eta_min=config.min_lr)
# Gradient scaler used together with float16 autocast in one_epoch.
scaler = torch.cuda.amp.GradScaler()

In [None]:
# Run the full train/validate loop with the objects built in the previous cells.
train_eval(config.epochs, model, train_loader, valid_loader, 
               criterion, optimizer, scheduler, scaler, metric)

In [None]:
test_df = pd.read_csv("dataset/test.csv")
# Rows without a TITLE cannot be tokenized: set them aside (as an independent
# copy, because a constant prediction column is added to them later) and
# predict only on the remaining rows.
null_test = test_df[test_df['TITLE'].isna()].copy()
# `index=` keyword instead of the positional axis argument `drop(labels, 0)`,
# which pandas 2.x no longer accepts.
test_without_null = test_df.drop(index=null_test.index)

# Debug mode: predict on 100 random rows only.
if config.debug:
    test_without_null = test_without_null.sample(100)

test_dataset = TextDataset(test_without_null, tokenizer, max_length=config.max_len, mode='test')
test_loader = DataLoader(test_dataset,
                        batch_size=16,
                        num_workers=config.num_workers,
                        shuffle=False,
                        collate_fn=DataCollatorWithPadding(tokenizer, padding='longest'))

In [None]:
def prediction(dataloader, model, device=None):
    """Run ``model`` over ``dataloader`` without gradients and stack the outputs.

    Args:
        dataloader: Iterable of dict batches of tensors.
        model: Module called as ``model(batch)``; put into eval mode here.
        device: Device the batches are moved to. Defaults to ``config.device``
            resolved at call time (the old ``device=config.device`` default was
            frozen when the cell defining this function ran).

    Returns:
        A float32 NumPy array with all batch outputs concatenated along axis 0.
    """
    if device is None:
        device = config.device
    # float16 autocast is only enabled when actually running on CUDA.
    use_amp = torch.device(device).type == "cuda"

    preds = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader):
            batch = {k:v.to(device) for k,v in batch.items()}
            with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
                pred = model(batch)
            # Upcast before leaving torch: callers rescale the predictions, and
            # doing that arithmetic in float16 loses precision / can overflow.
            preds.append(pred.float().cpu().numpy())
    preds = np.concatenate(preds)
    return preds

In [None]:
# Predict on the test rows that have a TITLE, using the model object currently in memory.
preds = prediction(test_loader, model)

In [None]:
# Assemble the submission frame.
sub = pd.DataFrame()
# IDs must come from the rows that were actually predicted: `preds` has one
# entry per row of `test_without_null`, not per row of `test_df`. Taking IDs
# from `test_df` mismatched lengths whenever titles were missing (or in debug
# mode) and would have listed the null-title products twice after the concat.
sub['PRODUCT_ID'] = test_without_null['PRODUCT_ID'].values
# Undo the /100 target scaling; ravel turns the (n, 1) model output into a column.
sub['PRODUCT_LENGTH'] = np.ravel(preds) * 100
# Products without a title get a constant fallback length of 600.
# `assign` returns a new frame instead of writing into a slice of test_df.
null_test = null_test.assign(PRODUCT_LENGTH=600)
null_test_preds = null_test[['PRODUCT_ID', 'PRODUCT_LENGTH']]
sub_wo_clip = pd.concat([sub, null_test_preds] , axis=0)

In [None]:
# Write the submission file without the DataFrame index column.
sub_wo_clip.to_csv("submission.csv", index=False)

In [None]:
from sklearn.metrics import mean_absolute_percentage_error

# Extra check: score the current model on a separate fold file using MAPE.
train_fold4 = pd.read_csv("archive/train4.csv")
if config.debug:
    train_fold4 = train_fold4.sample(100)
# Rows without a TITLE cannot be tokenized, so they are dropped before scoring.
train_fold4 = train_fold4.dropna(subset=['TITLE'])
train_fold4 = train_fold4.reset_index(drop=True)

# mode='test' because only inputs are needed here; the labels are read from
# the frame directly when the metric is computed below.
fold_val_dataset = TextDataset(train_fold4, tokenizer, max_length=config.max_len, mode='test')
fold_val_loader = DataLoader(fold_val_dataset,
                        batch_size=16, 
                        num_workers=config.num_workers,
                        shuffle=False, 
                        collate_fn=DataCollatorWithPadding(tokenizer, padding='longest'))

preds_val = prediction(fold_val_loader, model)

# Predictions are multiplied by 100 to undo the target scaling used in training;
# the true values are reshaped to a column to match the prediction array.
print(mean_absolute_percentage_error(train_fold4['PRODUCT_LENGTH'].values.reshape((-1,1)), preds_val*100))