In [1]:
import pandas as pd
import transformers

In [2]:
import torch
import torch.nn as nn
from torch.optim import Adam

In [3]:
BERT_Model_str = "bert-base-uncased"
BERT_Tokenizer = transformers.BertTokenizer.from_pretrained(BERT_Model_str)
Bert_Model     = transformers.BertModel.from_pretrained(BERT_Model_str)

In [4]:
MAX_LEN    = 30
BATCH_SIZE = 32

In [5]:
# load dataset
dataset = pd.read_csv('./dataset/train_folds.csv')
dataset = dataset[['text', 'target']]
dataset.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


# Preparing input for BERT

#### First we will create a dataset object, indexing into which will give inputs in the format which is required for BERT Model.
#### Then we will create  a dataloader for generating batches.

### Dataset Object

In [6]:
class Dataset(object):
    def __init__(self, text, target):
        self.text    = text
        self.target  = target
        self.tknzr   = BERT_Tokenizer
        self.max_len = MAX_LEN
        
    def __len__(self):
        return len(self.target)
    
    def __getitem__(self, item_idx):
        
        text = str(self.text[item_idx])
        text = ''.join(text.split()) # to remove any extra spaces (should be moved to preprocess function later)
        
        tknzr_output    = self.tknzr.encode_plus(text, max_length = self.max_len, truncation=True)
        
        input_ids       = tknzr_output['input_ids']
        token_type_ids  = tknzr_output['token_type_ids']
        attention_mask  = tknzr_output['attention_mask']
        
        padding_length  = self.max_len - len(input_ids) # if len less than MAX_LEN right padding to be added
        
        input_ids       = torch.tensor(input_ids + [0]*padding_length, dtype=torch.long)
        token_type_ids  = torch.tensor(token_type_ids + [0]*padding_length, dtype=torch.long)
        attention_mask  = torch.tensor(attention_mask + [0]*padding_length, dtype=torch.long)
        
        target = torch.tensor(self.target[item_idx], dtype=torch.float)
        
        #return input_ids, token_type_ids, attention_mask, target
        
        input_dict = {
            'input_ids':input_ids,
            'token_type_ids':token_type_ids,
            'attention_mask':attention_mask,
            'target':target
        }
        
        return input_dict

In [7]:
tweet_clf_dataset = Dataset(dataset.text.values, dataset.target.values)

In [8]:
# Original Dataset
dataset.loc[1]

text      Forest fire near La Ronge Sask. Canada
target                                         1
Name: 1, dtype: object

In [9]:
# Dataset object for BERT inputs
tweet_clf_dataset[1]

{'input_ids': tensor([  101,  3224, 10273, 22084, 12190, 10464, 15465, 20939,  2243,  1012,
          2710,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]),
 'target': tensor(1.)}

### Data Loader for generating batches

In [10]:
# data loader with given batech size
data_loader_ = torch.utils.data.DataLoader(tweet_clf_dataset, batch_size=2)

In [11]:
batch_ = next(iter(data_loader_))
batch_

{'input_ids': tensor([[  101,  2256, 26095,  5104, 12069, 12399,  5243,  3385, 15794, 24158,
           1001,  8372, 27871, 25425,  2232, 29278,  5856,  3726, 10383,  3363,
            102,     0,     0,     0,     0,     0,     0,     0,     0,     0],
         [  101,  3224, 10273, 22084, 12190, 10464, 15465, 20939,  2243,  1012,
           2710,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
          0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0]]),
 'target': tensor([

In [12]:
input_ids_       = batch_['input_ids']
token_type_ids_  = batch_['token_type_ids']
attention_mask_  = batch_['attention_mask']
target_          = batch_['target']

# Model, Loss Function & Optimizer
#### First we will create a model class
#### then go through the learning procedure

In [13]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        
        self.bert   = Bert_Model
        self.drop   = nn.Dropout(0.3)
        self.output = nn.Linear(768, 1)
    
    def forward(self, input_ids, token_type_ids, attention_mask):
        bert1, bert2 = self.bert(
            input_ids,
            token_type_ids = token_type_ids,
            attention_mask = attention_mask,
        )
        
        drop   = self.drop(bert2)
        output = self.output(drop)
        
        return output

### Forward Pass

In [14]:
# create an instance of model to test
model_ = Model()

In [15]:
# we expect our model to output target class gievn the input
output_ = model_.forward(input_ids_, token_type_ids_, attention_mask_)
output_

tensor([[-0.1845],
        [-0.4640]], grad_fn=<AddmmBackward>)

In [16]:
output_.flatten()

tensor([-0.1845, -0.4640], grad_fn=<ViewBackward>)

In [17]:
target_.flatten()

tensor([1., 1.])

### Loss Function and Optimizer

In [18]:
loss_function = nn.BCEWithLogitsLoss()

In [19]:
loss = loss_function(output_.flatten(), target_.flatten())
loss

tensor(0.8707, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)

In [20]:
optimizer_ = Adam(model_.parameters())

In [21]:
optimizer_.zero_grad() # clear previous gradients
loss.backward() # compute gradients according to current loss

In [22]:
for i, j in model_.named_parameters(): # printing few parameters (for referance)
    if i == 'output.weight':
        print(j[0,1:5])

tensor([-0.0110, -0.0180,  0.0223, -0.0100], grad_fn=<SliceBackward>)


In [23]:
optimizer_.step() # optimization step to update weights

In [44]:
# it's not necessar that printed parameters will be always different, 
# but if they are it's nice way to see things are moving ;) 
for i, j in model_.named_parameters(): 
    if i == 'output.weight':
        print(j[0,1:5])

tensor([-0.0110, -0.0190,  0.0233, -0.0090], grad_fn=<SliceBackward>)
