In [1]:
# https://huggingface.co/transformers/custom_datasets.html

In [2]:
# Download the PyTorch/XLA environment setup script from the pytorch/xla repository.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# Run it for version 1.7 and have it apt-install libomp5 and libopenblas-dev.
!python pytorch-xla-env-setup.py --version 1.7 --apt-packages libomp5 libopenblas-dev
# NOTE(review): numpy is installed without a version pin, so the result depends on when this cell runs.
!pip install numpy

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  5116  100  5116    0     0  31195      0 --:--:-- --:--:-- --:--:-- 31195
Updating... This may take around 2 minutes.
Updating TPU runtime to pytorch-1.7 ...
Found existing installation: torch 1.7.0
Uninstalling torch-1.7.0:
Done updating TPU runtime
  Successfully uninstalled torch-1.7.0
Found existing installation: torchvision 0.8.1
Uninstalling torchvision-0.8.1:
  Successfully uninstalled torchvision-0.8.1
Copying gs://tpu-pytorch/wheels/torch-1.7-cp37-cp37m-linux_x86_64.whl...

Operation completed over 1 objects/114.2 MiB.                                    
Copying gs://tpu-pytorch/wheels/torch_xla-1.7-cp37-cp37m-linux_x86_64.whl...

Operation completed over 1 objects/127.4 MiB.                                    
Copying gs://tpu-pytorch/wheels/torchvision-1.7-cp37-cp37m-linux_x86_64.whl...

Operati

In [3]:
# Fetch the dataset archive and unpack it; the archive extracts into ./dataset/.
!wget https://s3-ap-southeast-1.amazonaws.com/he-public-data/dataset52a7b21.zip
!unzip dataset52a7b21.zip
# The archive contains a leftover lock file; the trailing '#' is part of its file name.
!rm dataset/.~lock.train.csv#
# Remove the zip once its contents have been extracted.
!rm dataset52a7b21.zip

--2021-07-31 11:41:31--  https://s3-ap-southeast-1.amazonaws.com/he-public-data/dataset52a7b21.zip
Resolving s3-ap-southeast-1.amazonaws.com (s3-ap-southeast-1.amazonaws.com)... 52.219.129.34
Connecting to s3-ap-southeast-1.amazonaws.com (s3-ap-southeast-1.amazonaws.com)|52.219.129.34|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1061576029 (1012M) [binary/octet-stream]
Saving to: ‘dataset52a7b21.zip’


2021-07-31 11:42:48 (13.3 MB/s) - ‘dataset52a7b21.zip’ saved [1061576029/1061576029]

Archive:  dataset52a7b21.zip
   creating: dataset/
  inflating: dataset/train.csv       
  inflating: dataset/sample_submission.csv  
  inflating: dataset/test.csv        
  inflating: dataset/.~lock.train.csv#  


In [4]:
import csv
import pickle
import pandas as pd
import numpy as np
# Parse the training CSV with quote handling switched off; a backslash acts
# as the escape character instead.
train = pd.read_csv(
    "dataset/train.csv",
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
)
# test = pd.read_csv("dataset/test.csv", escapechar = "\\", quoting = csv.QUOTE_NONE)

In [5]:
# Discard every BROWSE_NODE_ID class that has fewer than this many rows.
MIN_ROWS_PER_CLASS = 50

class_counts = train["BROWSE_NODE_ID"].value_counts()
drop_indices = class_counts.loc[class_counts < MIN_ROWS_PER_CLASS].index
train = train.loc[~train["BROWSE_NODE_ID"].isin(drop_indices)]

# Preview the filtered frame (last expression, so the notebook renders it).
train.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
4,Teacher Planner Company A4 6 Lesson Academic T...,,,,4
5,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...,"[Color: Blue,Sleeve: Full Sleeve,Material: Cot...",Bhavya Enterprise,5


In [6]:
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast

# Hold out 5% of the rows for validation. The seed is fixed so that a
# Restart & Run All yields the same train/validation partition every time;
# without it the validation losses of two runs are not comparable.
SPLIT_SEED = 42
train_split, val_split = train_test_split(train, test_size=.05, random_state=SPLIT_SEED)

# Fast tokenizer for the distilbert-base-uncased checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [7]:
# Give every remaining BROWSE_NODE_ID a contiguous class index, numbered in
# the order the ids are returned by unique().
label_map = {
    node_id: class_idx
    for class_idx, node_id in enumerate(train.BROWSE_NODE_ID.unique())
}

# Persist the mapping to disk (presumably so predicted class indices can be
# translated back to node ids later -- confirm against the inference code).
with open('lable_map.pickle', 'wb') as out_file:
    pickle.dump(label_map, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# train_split / val_split were already taken from this frame, so drop the
# reference to the full frame.
del train

In [8]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that turns product rows into fixed-length model inputs.

    The text of an item is ``TITLE ~ DESCRIPTION ~ BULLET_POINTS``. The
    description and the bullet points are each included with probability 0.5,
    drawn afresh on every access, so the same index can yield different text.

    Args:
        df: DataFrame with string columns TITLE, DESCRIPTION and BULLET_POINTS
            (missing values must already be replaced, e.g. with ``fillna("")``),
            plus BROWSE_NODE_ID when ``is_train`` is true.
        tokenizer: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list[str])``.
        is_train: When true, every item also carries a ``labels`` tensor.
        label_map: Mapping from BROWSE_NODE_ID value to class index. Only
            consulted when ``is_train`` is true; ``None`` means an empty map.
        max_length: Length of the returned tensors, counting [CLS] and [SEP].

    Raises:
        KeyError: If ``is_train`` is true and a BROWSE_NODE_ID value is not a
            key of ``label_map``.
    """

    def __init__(self, df, tokenizer, is_train=True, label_map=None, max_length=128):
        # A fresh dict per instance; a ``{}`` default would be shared by all calls.
        if label_map is None:
            label_map = {}
        self.df = df
        self.title = df.TITLE.values
        self.desc = df.DESCRIPTION.values
        self.bullets = df.BULLET_POINTS.apply(self._strip_brackets).values
        self.tokenizer = tokenizer
        if is_train:
            self.labels = df.BROWSE_NODE_ID.apply(lambda x: label_map[x]).values
            self.label_map = label_map
        self.is_train = is_train
        self.max_length = max_length

    @staticmethod
    def _strip_brackets(text):
        """Drop the first and last character when ``text`` starts with '['."""
        if len(text) > 0 and text[0] == '[':
            return text[1:-1]
        return text

    def __getitem__(self, idx):
        """Return ``input_ids`` / ``attention_mask`` (and ``labels`` if training) for row ``idx``."""
        # NOTE(review): the random dropping below also runs when is_train is
        # False, so evaluation/inference inputs are randomised too -- confirm
        # that this is intended.
        req_string = self.title[idx] + ' ~ '
        if torch.rand(1) > 0.5:
            req_string += self.desc[idx]
        req_string += ' ~ '
        if torch.rand(1) > 0.5:
            req_string += self.bullets[idx]

        # Use the tokenizer handed to the constructor, not a module-level global.
        tokenized_data = self.tokenizer.tokenize(req_string)
        # Reserve two positions for the special tokens, truncate the rest.
        to_append = ["[CLS]"] + tokenized_data[:self.max_length - 2] + ["[SEP]"]
        input_ids = self.tokenizer.convert_tokens_to_ids(to_append)

        # 1 marks real tokens, 0 marks padding; both lists are zero-padded to max_length.
        input_mask = [1] * len(input_ids)
        padding = [0] * (self.max_length - len(input_ids))
        input_ids += padding
        input_mask += padding

        item = {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(input_mask, dtype=torch.long)
        }
        if self.is_train:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

# Wrap both splits identically: blank out missing text fields, then attach
# the tokenizer and the shared label mapping.
train_dataset, val_dataset = (
    Dataset(split.fillna(""), tokenizer, is_train=True, label_map=label_map)
    for split in (train_split, val_split)
)
# train_dataset = Dataset(train.fillna(""), tokenizer, is_train=True, label_map=label_map)
# test_dataset = Dataset(test.fillna(""), tokenizer, is_train=False)

In [9]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Hyper-parameters for the Trainer run.
# save_steps is aligned with eval_steps: load_best_model_at_end can only rank
# checkpoints that were evaluated, and the library requires save_steps to be a
# round multiple of eval_steps when both strategies are "steps".
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=256, # batch size per device during training
    per_device_eval_batch_size=256,  # batch size for evaluation
    warmup_steps=1000,               # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=500,               # log the training loss every 500 steps
    dataloader_num_workers=4,
    report_to="tensorboard",
    label_smoothing_factor=0.1,
    tpu_num_cores=8,
    evaluation_strategy="steps",
    eval_steps=4500,                 # evaluate every 4500 steps
    save_strategy="steps",
    save_steps=4500,                 # checkpoint at every evaluation step
    save_total_limit=3,              # keep at most 3 checkpoints; older ones are deleted
    load_best_model_at_end=True,     # reload the best evaluated checkpoint when training ends
    prediction_loss_only=True,
)

# Build DistilBERT with a classification head sized for our label set.
# Passing num_labels lets the library create the head itself and records the
# class count in model.config, so saved checkpoints can be reloaded with
# from_pretrained() without a shape mismatch (replacing model.classifier by
# hand leaves config.num_labels at its default of 2).
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_map),
)

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [10]:
# Wire the model, the training arguments and both datasets into a Trainer.
trainer = Trainer(
    model,
    training_args,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Start fine-tuning; as the cell's last expression the returned TrainOutput is displayed.
trainer.train()

Token indices sequence length is longer than the specified maximum sequence length for this model (849 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (737 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (544 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (711 > 512). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss
4500,2.3663,2.264403
9000,2.1582,2.094345
13500,2.0513,2.025699
18000,2.0064,1.989092


Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (548 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (760 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

TrainOutput(global_step=20880, training_loss=2.3692863932057815, metrics={'train_runtime': 11226.3827, 'train_samples_per_second': 1.86, 'total_flos': 3.94643260284288e+16, 'epoch': 2.0, 'init_mem_cpu_alloc_delta': 118349824, 'init_mem_cpu_peaked_delta': 163246080, 'train_mem_cpu_alloc_delta': 1570762752, 'train_mem_cpu_peaked_delta': 1408139264})