# Pre-train BERT Classification

## Preprocess

### Import modules

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator
import torchtext
import matplotlib.pyplot as plt

import copy
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification

## Prepare data

In [4]:
train_data = pd.read_csv('./origin_data/train_filtered.txt', sep='\t', header=None)
test_data = pd.read_csv('./origin_data/test_filtered.txt', sep='\t', header=None)
train_data

Unnamed: 0,0,1
0,/location/location/contains,sen. charles e. schumer called on federal safe...
1,/location/location/contains,"but instead there was a funeral , at st. franc..."
2,/location/location/contains,"rosemary antonelle , the daughter of teresa l...."
3,/location/location/contains,one was for st. francis de sales roman catholi...
4,/location/location/contains,"the firefighter , whom a fire department offic..."
...,...,...
70724,UNK,"it gets pretty loud , '' the [E1] williams [/E..."
70725,UNK,"cherished grandmother of [E1] natalie [/E1] , ..."
70726,UNK,"cherished grandmother of [E1] natalie [/E1] , ..."
70727,UNK,among those present at the ceremony were phil ...


In [3]:
max_len = 0
max_id = 0
max_token = []
for _, row in train_data.iterrows():
    token = row['sen'].strip().split()
    if len(token)>max_len:
        max_len = len(token)
        max_id = _
        max_token = token

print(max_len)

KeyError: 'sen'

### Convert to ids

#### Initialize tokenizer

In [31]:
pretrain_model = "bert-base-uncased"
additional_special_tokens = ['[E1]', '[/E1]', '[E2]', '[/E2]']
tokenizer = BertTokenizer.from_pretrained(pretrain_model, do_lower_case=True, additional_special_tokens = additional_special_tokens)

Special tokens have been added in the vocabulary, make sure the associated word emebedding are fine-tuned or trained.


#### Set attributes

In [32]:
# Model parameters
MAX_SEQ_LEN = 512
PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

batch_size = 32
# Fields

label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False, include_lengths=False, batch_first=True,
                   fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX, unk_token=UNK_INDEX)
fields = [('label', label_field),  ('text', text_field)]

In [33]:
train = TabularDataset(path='./origin_data/train_filtered.txt', format='tsv', fields=fields)
test = TabularDataset(path='./origin_data/test_filtered.txt', format='tsv', fields=fields)
#test = TabularDataset(path='./origin_data/test_filtered.txt', format='csv', skip_header=True, fields = fields)

Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (853 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (814 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (710 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

In [34]:
print(train[1].text)

[101, 2021, 2612, 2045, 2001, 1037, 6715, 1010, 2012, 2358, 1012, 4557, 2139, 4341, 3142, 3234, 2277, 1010, 1999, 30522, 9852, 1035, 6496, 30525, 1010, 30523, 8603, 30524, 1010, 1996, 3583, 1997, 2010, 4182, 1012, 102]


In [35]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [36]:
train_iter = BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.text), device=device, train=True, sort=True, sort_within_batch=True)
test_iter = Iterator(test, batch_size=batch_size, device=device, train=False, shuffle=False, sort=False)

In [37]:
batch = next(iter(train_iter))

## Configuration

The problem is, how to set the additional special tokens in the right way. What's the 

## Model

`MAX_SEQ_LEN=259`, ``

### Build model

In [38]:
class REBERT(nn.Module):
    
    def __init__(self):
        super(REBERT, self).__init__()
        
        self.encoder = BertForSequenceClassification.from_pretrained("bert-base-uncased")
        
    def forward(self, text, label):
        loss, text_fea = self.encoder(text, labels=label)[:2]
        
        return loss, text_fea

## Training

In [39]:
# Save and Load Functions

def save_checkpoint(save_path, model, valid_loss):

    if save_path == None:
        return
    
    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

def load_checkpoint(load_path, model):
    
    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):

    if save_path == None:
        return
    
    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')
def load_metrics(load_path):

    if load_path==None:
        return
    
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')
    
    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

In [43]:
def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = train_iter,
          num_epochs = 5,
          eval_every = len(train_iter) // 2,
          file_path = './out',
          best_valid_loss = float("Inf")):
    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for (label, text), _ in train_loader:
            label = label.type(torch.LongTensor)           
            label = label.to(device)
            text = text.type(torch.LongTensor)  
            text = text.to(device)
            output = model(text, label)
            loss, _ = output

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():                    

                    # validation loop
                    for (label, text), _ in valid_loader:
                        label = label.type(torch.LongTensor)           
                        label = label.to(device)
                        text = text.type(torch.LongTensor)  
                        text = text.to(device)
                        output = model(text, label)
                        loss, _ = output
                        
                        valid_running_loss += loss.item()

                # evaluation
                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)

                # resetting running values
                running_loss = 0.0                
                valid_running_loss = 0.0
                model.train()

                # print progress
                print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                              average_train_loss, average_valid_loss))
                
                # checkpoint
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_checkpoint(file_path + '/' + 'model.pt', model, best_valid_loss)
                    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    
    save_metrics(file_path + '/' + 'metrics.pt', train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')

In [44]:
model = REBERT().to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [45]:
train(model=model, optimizer=optimizer)

RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`