In [3]:
pip install pytorch-transformers

Note: you may need to restart the kernel to use updated packages.


In [4]:
import torch
import torchvision
import transformers

In [5]:
import os
import torch
import pandas as pd
from torch.utils.data import Dataset
# from utils import tokenizer, PAD, EOS
from torch.nn.utils.rnn import pad_sequence


In [6]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
PAD = tokenizer.pad_token_id
EOS = tokenizer.convert_tokens_to_ids('.')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
import torch.nn as nn
from pytorch_transformers import BertForSequenceClassification, modeling_bert
from pytorch_transformers.modeling_bert import BertConfig

bert_model = BertForSequenceClassification(BertConfig()).from_pretrained('bert-base-uncased')

print(bert_model)

class BertForClassification(modeling_bert.BertPreTrainedModel):
    def __init__(self):
        super(BertForClassification, self).__init__(BertConfig())

        self.embeddings = bert_model.bert.embeddings
        self.encoder = bert_model.bert.encoder
        self.pooler = bert_model.bert.pooler
        self.dropout = bert_model.dropout
        self.classifier = bert_model.classifier
        self.prediction = nn.Sequential( 
                nn.Linear(768, 64), 
                nn.ReLU(),
                nn.Dropout(p=0.2),
                nn.Linear(64, 1),
        )

        self.head_mask = [None] * 12

        self.apply(self._init_weights)

    def forward(self, text, position_ids, attention_mask):
                      
        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        attention_mask = (1.0 - attention_mask) * -10000.0   # ( 64, 1, 1, 71)

        embeddings = self.embeddings(text)
        hidden_states = self.encoder(embeddings, attention_mask, head_mask=self.head_mask)[0] # (64, 71, 768)
        output = self.pooler(hidden_states) # (64, 768)
        output = self.dropout(output) # (64, 768)
        output = self.prediction(output) # (64, 2)
        return output



100%|██████████| 433/433 [00:00<00:00, 139691.84B/s]
100%|██████████| 440473133/440473133 [00:14<00:00, 29968830.16B/s]


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [8]:
def save_model(model, train_loss, val_loss, epoch, is_best):

    state = {
        'epoch' : epoch,
        'train_loss' : train_loss,
        'val_loss' : val_loss,
        'model_state_dict' : model.state_dict()
    }

    file_name = './checkpoint_latest.pth.tar'
    torch.save(state, file_name)

    if is_best:
        file_name = './checkpoint_best.pth.tar'
        torch.save(state, file_name)

In [9]:
class CommonlitDataset(Dataset):
    def __init__(self, root_dir, split):

        self.root_dir = root_dir
        self.split = split
        
        self.text_ids = []
        self.text = []
        self.targets = []
        self.std_err = []
        
        if split == 'train':
            self.load_data_train()
            self.load_fn = self.get_item_train
        else:
            self.load_data_test()
            self.load_fn = self.get_item_test
    
    def load_data_train(self):
        
        data_file = os.path.join(self.root_dir, self.split+'.csv')
        df = pd.read_csv(data_file)
        for i, row in df.iterrows():
            self.text_ids.append(row['id'])
            self.text.append(tokenizer.encode(row['excerpt']))
            self.targets.append([row['target']])
            self.std_err.append(row['standard_error'])
    
    def load_data_test(self):        
        data_file = os.path.join(self.root_dir, self.split+'.csv')
        df = pd.read_csv(data_file)
        for i, row in df.iterrows():
            self.text_ids.append(row['id'])
            self.text.append(tokenizer.encode(row['excerpt']))
   
    def get_item_train(self, idx):
        text_id = self.text_ids[idx]
        text = self.text[idx]
        text = text + [EOS]
        text = torch.tensor(text, dtype=torch.long)

        target = self.targets[idx]
        std_err = self.std_err[idx]

        return text_id, text, target, std_err

    def get_item_test(self, idx):
        text_id = self.text_ids[idx]
        text = self.text[idx]
        text = text + [EOS]
        text = torch.tensor(text, dtype=torch.long)

        return text_id, text
    
    def __len__(self):
        return len(self.text_ids)

    def __getitem__(self, idx):
        return self.load_fn(idx)

def collate_fn_train(batch):

    batch = list(zip(*batch))

    text_id = batch[0]
    text = pad_sequence(batch[1], batch_first=True, padding_value=PAD)
    target = torch.tensor(batch[2], dtype=torch.float)
    std_err = torch.tensor(batch[3], dtype=torch.float)

    return text_id, text, target, std_err


def collate_fn_test(batch):

    batch = list(zip(*batch))

    text_id = batch[0]
    text = pad_sequence(batch[1], batch_first=True, padding_value=PAD)

    return text_id, text

In [10]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
#from utils import PAD, save_model
from torch.utils.data import DataLoader
#from model import BertForClassification
from torch.nn.utils import clip_grad_norm_
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data.sampler import SubsetRandomSampler
#from dataset import CommonlitDataset, collate_fn_train, collate_fn_test
from pytorch_transformers.optimization import AdamW, WarmupCosineSchedule

In [11]:
def train(model, train_data_loader, optimizer, scheduler, loss_fn, epoch, device):
    
    running_loss = 0.0
    model.train()

    with tqdm(desc='Epoch %d - Train' % epoch, unit='it', total=len(train_data_loader)) as pbar:

        for idx, batch in enumerate(train_data_loader):
            # text_id, text, target, std_err
            text = batch[1].to(device)
            target = batch[2].to(device)
            std_err = batch[3].to(device)

            optimizer.zero_grad()

            position_ids = torch.arange(text.size(1), dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand_as(text)

            attention_mask = (text != PAD).float() #(batch_size, caption_len) (64, 21)
            
            pred = model(text, position_ids, attention_mask)
            loss = loss_fn(pred, target)
            loss.backward()
            clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()

            used_mem = torch.cuda.max_memory_allocated() / 1024.0 ** 3

            pbar.set_postfix(train_loss = running_loss /(idx + 1), mem = used_mem)
            pbar.update()

    return running_loss/len(train_data_loader)

In [12]:
def val(model, val_data_loader, loss_fn, epoch, device):
    running_loss = 0.0
    model.eval()

    with tqdm(desc='Epoch %d - validation' % epoch, unit='it', total=len(val_data_loader)) as pbar:

        for idx, batch in enumerate(train_data_loader):
            # text_id, text, target, std_err
            text = batch[1].to(device)
            target = batch[2].to(device)
            std_err = batch[3].to(device)

            position_ids = torch.arange(text.size(1), dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand_as(text)

            attention_mask = (text != PAD).float() #(batch_size, caption_len) (64, 21)
            
            pred = model(text, position_ids, attention_mask)
            loss = loss_fn(pred, target)
            
            running_loss += loss.item()

            used_mem = torch.cuda.max_memory_allocated() / 1024.0 ** 3

            pbar.set_postfix(validation_loss = running_loss /(idx + 1), mem = used_mem)
            pbar.update()

    return running_loss/len(val_data_loader)

In [13]:
def test(model, test_data_loader, epoch, device):
    ids = []
    target_values = []
    model.eval()

    with tqdm(desc='Epoch %d - test' % epoch, unit='it', total=len(test_data_loader)) as pbar:

        for idx, batch in enumerate(test_data_loader):
            # text_id, text, target, std_err
            text_id = batch[0]
            text = batch[1].to(device)

            position_ids = torch.arange(text.size(1), dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand_as(text)

            attention_mask = (text != PAD).float() #(batch_size, caption_len) (64, 21)
            
            pred = model(text, position_ids, attention_mask)
            
            ids.append(text_id[0])
            target_values.append(pred[0].cpu().item())
    
    df = pd.DataFrame({'id':ids,
                        'target':target_values})
    df.to_csv('submission.csv', index=False)

In [14]:
batch_size = 16
shuffle_dataset = True
random_seed= 42
num_epochs=100
patience=10
min_loss = np.inf
device = "cuda"

lr = 5e-5
weight_decay = 1e-2
betas = (0.9, 0.999)
warmup_steps = 1000
max_steps = 100000

In [15]:
dataset = CommonlitDataset('../input/commonlitreadabilityprize', 'train')
    
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(0.2 * dataset_size))
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

In [16]:
train_data_loader = DataLoader(dataset, 
                                    batch_size=batch_size,
                                    drop_last=True,
                                    collate_fn=collate_fn_train,
                                    sampler=train_sampler,
                                    num_workers=8)

val_data_loader = DataLoader(dataset, 
                                batch_size=batch_size,
                                drop_last=True,
                                collate_fn=collate_fn_train,
                                sampler=val_sampler,
                                num_workers=2)

In [17]:
test_dataset = CommonlitDataset('../input/commonlitreadabilityprize', 'test')
test_data_loader = DataLoader(test_dataset, 
                                batch_size=1,
                                shuffle=True,
                                drop_last=True,
                                collate_fn=collate_fn_test,
                                num_workers=2)

In [18]:
model = BertForClassification().to(device)

In [19]:
optimizer = AdamW(
        params=model.parameters(),
        lr=lr,
        weight_decay=weight_decay,
        betas=betas
    )

scheduler = WarmupCosineSchedule(
    optimizer=optimizer,
    warmup_steps=warmup_steps,
    t_total=max_steps
)

loss_fn = torch.nn.MSELoss()

In [20]:
import torch
import torchvision

In [21]:
writer = SummaryWriter('./tensorboard_logs/')

In [22]:
for epoch in range(num_epochs):

    is_best_model = False

    train_loss = train(model, train_data_loader, optimizer, scheduler, loss_fn, epoch, device)

    val_loss = val(model, val_data_loader, loss_fn, epoch, device)

    writer.add_scalar("Loss/train", train_loss, epoch)
    writer.add_scalar("Loss/val", val_loss, epoch)

    if val_loss < min_loss:
        is_best_model = True
        min_loss = val_loss
        test(model, test_data_loader, epoch, device)
    else:
        patience -= 1
    
    save_model(model, train_loss, val_loss, epoch, is_best_model)
    
    if patience <= 0:
        break
writer.flush()
writer.close()

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /opt/conda/conda-bld/pytorch_1603729138878/work/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)
Epoch 0 - Train: 100%|██████████| 141/141 [01:12<00:00,  1.96it/s, mem=8.41, train_loss=1.38]
Epoch 0 - validation: 141it [00:22,  6.34it/s, mem=12.3, validation_loss=1.09]                        
Epoch 0 - test:   0%|          | 0/7 [00:00<?, ?it/s]
Epoch 1 - Train:  25%|██▍       | 35/141 [00:18<00:55,  1.92it/s, mem=12.3, train_loss=1.09]


KeyboardInterrupt: 