In [None]:
# https://huggingface.co/transformers/custom_datasets.html

In [None]:
# Download the PyTorch/XLA environment setup script from the pytorch/xla repository
# and run it, requesting version 1.7 plus the libomp5 / libopenblas-dev apt packages.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version 1.7 --apt-packages libomp5 libopenblas-dev
# NOTE(review): numpy is installed without a version pin — presumably to refresh it
# after the setup script ran; confirm and pin if reproducibility matters.
!pip install numpy

In [None]:
# Fetch the dataset archive, unpack it into ./dataset and clean up.
# The flags make the cell safe to re-run on a kernel restart:
#   wget -O  : always write to the same file name instead of creating "*.zip.1"
#   unzip -o : overwrite already-extracted files instead of prompting
#   rm -f    : do not fail when the file is already gone
!wget https://s3-ap-southeast-1.amazonaws.com/he-public-data/dataset52a7b21.zip -O dataset52a7b21.zip
!unzip -o dataset52a7b21.zip
# Stray editor lock file shipped inside the archive; quoted because of the '~' and '#'.
!rm -f 'dataset/.~lock.train.csv#'
!rm -f dataset52a7b21.zip

In [None]:
import csv
import numpy as np
import pickle
import pandas as pd
# Both CSV files are parsed with the same options: backslash is the escape
# character and quote handling is switched off (csv.QUOTE_NONE).
train, test = (
    pd.read_csv(csv_path, escapechar="\\", quoting=csv.QUOTE_NONE)
    for csv_path in ("dataset/train.csv", "dataset/test.csv")
)

# Preview the first rows of the training frame.
train.head()

In [None]:
# Count the rows per BROWSE_NODE_ID and remove every row whose node id
# occurs fewer than 50 times in the training frame.
class_counts = train["BROWSE_NODE_ID"].value_counts()
drop_indices = class_counts.index[(class_counts < 50).to_numpy()]
train = train.loc[~train["BROWSE_NODE_ID"].isin(drop_indices)]
train.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 5% of the filtered training rows for validation. The split is seeded so
# that a kernel restart (or a resumed run) evaluates against the same validation rows.
SEED = 42
train_split, val_split = train_test_split(train, test_size=.05, random_state=SEED)
from transformers import DistilBertTokenizerFast
# Tokenizer matching the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
# Assign each distinct BROWSE_NODE_ID a consecutive integer class index,
# numbered in the order the ids are returned by unique().
label_map = {
    node_id: position
    for position, node_id in enumerate(train["BROWSE_NODE_ID"].unique())
}

# Persist the mapping so it can be read back later.
with open('lable_map.pickle', 'wb') as pickle_file:
    pickle.dump(label_map, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# import torch
# import torch.nn as nn
# class FocalLoss(nn.Module):
#     def __init__(self,reduction, gamma=2, eps=1e-7):
#         super(FocalLoss, self).__init__()
#         self.gamma = gamma
#         #print(self.gamma)
#         self.eps = eps
#         self.ce = torch.nn.CrossEntropyLoss(reduction=reduction)

#     def forward(self, input, target):
#         logp = self.ce(input, target.to(torch.long))
#         p = torch.exp(-logp)
#         loss = (1 - p) ** self.gamma * logp
#         return loss.mean()

import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.modules.loss._WeightedLoss):
    """Multi-class focal loss, ``(1 - p_t) ** gamma * CE``, evaluated per sample.

    The cross-entropy is computed for every sample individually, the focal
    factor ``(1 - p_t) ** gamma`` is applied to each of them, and only then is
    the result reduced. (Applying the factor to an already averaged
    cross-entropy would modulate the batch as a whole instead of
    down-weighting the individual easy samples.)

    Args:
        weight: optional per-class weight tensor; acts as the alpha term that
            balances classes. It scales the loss but is not part of ``p_t``.
        gamma: focusing exponent; ``0`` reduces to plain cross-entropy.
        reduction: ``'mean'`` (default), ``'sum'`` or ``'none'``. ``'none'``
            returns one loss value per sample.
    """

    def __init__(self, weight=None, gamma=2,reduction='mean'):
        super(FocalLoss, self).__init__(weight,reduction=reduction)
        self.gamma = gamma
        self.weight = weight #weight parameter will act as the alpha parameter to balance class weights

    def forward(self, input, target):
        """Return the focal loss of ``input`` logits against ``target`` class indices.

        Args:
            input: logits with the class dimension at position 1, as accepted
                by ``F.cross_entropy``.
            target: class indices. Indices that arrive as a floating point
                tensor with fewer dimensions than ``input`` are cast to long.
        """
        # Hard labels stored as floats would be rejected (or misread as class
        # probabilities) by cross_entropy, so normalise them to integer indices.
        if target.is_floating_point() and target.dim() < input.dim():
            target = target.long()

        # Unweighted per-sample cross-entropy gives the true-class probability p_t.
        ce_loss = F.cross_entropy(input, target, reduction='none')
        pt = torch.exp(-ce_loss)

        # Class weights rescale the loss term only, never p_t.
        if self.weight is not None:
            ce_loss = F.cross_entropy(input, target, reduction='none', weight=self.weight)

        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.reduction == 'none':
            return focal_loss
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss.mean()

In [None]:
from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer variant that computes its training/eval loss with ``FocalLoss``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and return the focal loss of its logits.

        Args:
            model: the model to run.
            inputs: batch dict; must contain ``"labels"``, which is removed
                from the dict before the model is called.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: accepted and ignored.
                NOTE(review): added because newer transformers releases appear
                to pass this keyword to ``compute_loss`` — confirm against the
                installed version.
        """
        # Labels are taken out so the forward pass only receives model inputs.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = FocalLoss()
        # Class indices must be an integer 1-D tensor: a float target is not a
        # valid index tensor for cross-entropy, and view(-1) (unlike squeeze())
        # keeps the batch dimension when the batch holds a single example.
        loss = loss_fct(logits.view(-1, logits.size(-1)),
                        labels.view(-1).long())
        return (loss, outputs) if return_outputs else loss

In [None]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Turns product rows into fixed-length token id / attention mask tensors.

    Each item is built from ``TITLE ~ DESCRIPTION ~ BULLET_POINTS``. By
    default the description and the bullet points are each left out at random
    (when ``torch.rand(1) <= 0.5``) every time an item is read.

    Args:
        df: frame with BRAND, TITLE, DESCRIPTION and BULLET_POINTS columns
            (plus BROWSE_NODE_ID when ``is_train`` is true). The text columns
            are concatenated as strings, so missing values must already be
            replaced by ``""``.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        is_train: when true, labels are looked up through ``label_map`` and
            returned with each item.
        label_map: mapping from BROWSE_NODE_ID to class index.
        max_length: length of the returned id / mask tensors.
        random_drop: when true (default) apply the random field drop described
            above; pass ``False`` for deterministic items that always contain
            every field.
    """

    def __init__(self, df, tokenizer, is_train=True, label_map=None, max_length=128,
                 random_drop=True):
        # Avoid a shared mutable default: every instance gets its own dict.
        if label_map is None:
            label_map = {}
        self.df = df
        # Brand is stored but currently not part of the generated text.
        self.brand = df.BRAND.values
        self.title = df.TITLE.values
        self.desc = df.DESCRIPTION.values
        # Values starting with '[' lose their first and last character.
        self.bullets = df.BULLET_POINTS.apply(lambda x: x[1:-1] if len(x)>0 and x[0]=='[' else x).values
        self.tokenizer = tokenizer
        if is_train:
            self.labels = df.BROWSE_NODE_ID.apply(lambda x: label_map[x]).values
            self.label_map = label_map
        self.is_train = is_train
        self.max_length = max_length
        self.random_drop = random_drop

    def __getitem__(self, idx):
        """Return ``input_ids`` / ``attention_mask`` (and ``labels`` when training) for row ``idx``."""
        req_string = self.title[idx] + ' ~ '
        if not self.random_drop or torch.rand(1)>0.5:
            req_string += self.desc[idx]
        req_string += ' ~ '
        if not self.random_drop or torch.rand(1)>0.5:
            req_string += self.bullets[idx]

        # Use the tokenizer handed to this instance, not a notebook-level global.
        tokenized_data = self.tokenizer.tokenize(req_string)
        # Reserve two positions for the [CLS] / [SEP] markers.
        to_append = ["[CLS]"] + tokenized_data[:self.max_length - 2] + ["[SEP]"]
        input_ids = self.tokenizer.convert_tokens_to_ids(to_append)
        input_mask = [1] * len(input_ids)
        # Right-pad ids and mask with zeros up to max_length.
        padding = [0] * (self.max_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        item = {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(input_mask, dtype=torch.long)
        }
        if self.is_train:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

# Training and validation splits both carry labels; missing text becomes "".
train_dataset, val_dataset = (
    Dataset(split_df.fillna(""), tokenizer, is_train=True, label_map=label_map)
    for split_df in (train_split, val_split)
)
# The test frame is built without labels.
test_dataset = Dataset(test.fillna(""), tokenizer, is_train=False)

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    max_steps=15500,                 # total number of optimizer steps (not epochs)
    per_device_train_batch_size=256, # batch size per device during training
    per_device_eval_batch_size=256,  # batch size for evaluation
    warmup_steps=1000,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=500,
    dataloader_num_workers=4,
    report_to="tensorboard",
    # NOTE(review): a Trainer subclass that overrides compute_loss appears to bypass
    # the built-in label smoother, so this value may have no effect — confirm.
    label_smoothing_factor=0.1,
    tpu_num_cores=8,
    evaluation_strategy="steps",
    eval_steps=5000,                 # evaluate every 5000 steps
    save_strategy = "steps",
    # load_best_model_at_end needs a checkpoint at every evaluation, i.e. the save
    # interval must be a round multiple of eval_steps.
    save_steps = 5000,
    save_total_limit=3,              # keep at most 3 checkpoints; older ones are deleted
    load_best_model_at_end=True,     # reload the best checkpoint when training finishes
    prediction_loss_only = True,
)

# Let from_pretrained size the classification head: num_labels is then recorded in
# the model config, so saved checkpoints can be reloaded with the right head shape
# (replacing model.classifier by hand leaves config.num_labels at its default).
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_map),
)

In [None]:
# trainer = Trainer(
#     model=model,                         # the instantiated 🤗 Transformers model to be trained
#     args=training_args,                  # training arguments, defined above
#     train_dataset=train_dataset,         # training dataset
#     eval_dataset=val_dataset             # evaluation dataset
# )
# Build the focal-loss trainer from the model, arguments and datasets defined above.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start fine-tuning.
trainer.train()