## Author - Lalit Pandey
## Contact - lpandey@iu.edu
## Institution - Indiana University Bloomington

### Code Reference Links
#### 1. http://mccormickml.com/2019/07/22/BERT-fine-tuning/
#### 2. https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html

# Check and Select GPU

In [36]:
!nvidia-smi

Sun Dec 11 15:41:29 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A10          On   | 00000000:08:00.0 Off |                    0 |
|  0%   51C    P0    63W / 150W |   1253MiB / 23028MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A10          On   | 00000000:0B:00.0 Off |                    0 |
|  0%   49C    P0    61W / 150W |   2143MiB / 23028MiB |      1%      Default |
|       

In [37]:
!export CUDA_VISIBLE_DEVICES=6

# Future Tasks (ignore for now)

In [489]:
# ACTIVATION FUNCTION FOR DOWNSTREAM LAYERS
# ADDING DROPOUT BETWEEN LINEAR LAYERS
# CHECK THE EFFECT OF DYNAMIC LEARNING RATE SCHEDULER
# INITIALIZE WEIGHT SHARING BETWEEN LINEAR LAYERS IN FUTURE ARCHITECTURE

# Import Libraries

In [1]:
# import libraries
import pandas as pd
import torch
import torch.nn as nn
from torchsummary import summary
import pytorch_lightning as pl
from argparse import Namespace
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
from torch.optim.optimizer import Optimizer
from torchmetrics.functional import accuracy
from torch.optim import Adam
from torchmetrics import Accuracy
from sklearn.utils import shuffle

# Load dataset

In [2]:
# Load training dataset
DATASET_TRAIN_PATH = '/home/lpandey/NLP_Project/NLP_Project_dataset_small.csv'
dataset_train = pd.read_csv(DATASET_TRAIN_PATH)
dataset_train.head()

Unnamed: 0,sentence,label
0,move 1 degree towards left,"0,-1,0"
1,move 2 degree towards left,"0,-2,0"
2,move 3 degree towards left,"0,-3,0"
3,move 4 degree towards left,"0,-4,0"
4,move 5 degree towards left,"0,-5,0"


In [3]:
sentences = dataset_train['sentence']
labels = dataset_train['label']

### Exploring labels

In [4]:
labels

0       0,-1,0
1       0,-2,0
2       0,-3,0
3       0,-4,0
4       0,-5,0
        ...   
295    0,0,-46
296    0,0,-47
297    0,0,-48
298    0,0,-49
299    0,0,-50
Name: label, Length: 300, dtype: object

In [5]:
label = labels[3]
label

'0,-4,0'

In [6]:
type(label)

str

In [7]:
x = int(label[0])
x

0

In [8]:
sep = label.split(',')
sep

['0', '-4', '0']

# Pre-processing labels as 1-d tensors

In [9]:
for i in range(0,len(labels)):
    sep_label = labels[i].split(',')
    x,y,z = torch.tensor(float(sep_label[0])),torch.tensor(float(sep_label[1])),torch.tensor(float(sep_label[2]))
    x,y,z = x.reshape(1),y.reshape(1),z.reshape(1)
    labels[i] = torch.cat((x,y,z))

In [10]:
# uncomment to see labels as tensors.
# for label in labels:
#     print(label)

## Tokenization and BERT formatting

### Loading tokenizer
#### Reference link - http://mccormickml.com/2019/07/22/BERT-fine-tuning/¶

In [11]:
# Tokenization and BERT formatting - 
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


## Full dataset tokenization

#### Assign maximum length to a sentence (for the purpose of padding)
#### Reference link - http://mccormickml.com/2019/07/22/BERT-fine-tuning/¶

In [12]:
# Tokenize entire dataset
max_len = 0

# For every sentence...
for sent in sentences:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  9


#### Tokenizing
#### Reference link - http://mccormickml.com/2019/07/22/BERT-fine-tuning/¶

In [13]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []
input_labels = []

# For every sentence...
for sent,label in zip(sentences,labels):
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])
    
    # Add the labels to a list
    input_labels.append(torch.unsqueeze(label, 0))
    

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.cat(input_labels, dim=0)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])
print('attention masks', attention_masks[0])
print('label', labels[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  move 1 degree towards left
Token IDs: tensor([ 101, 2693, 1015, 3014, 2875, 2187,  102,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])
attention masks tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
label tensor([ 0., -1.,  0.])




# Dataset and Dataloader
#### Reference link - http://mccormickml.com/2019/07/22/BERT-fine-tuning/¶

#### Dataset

In [14]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

print("function - ", dataset[0])

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.97 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

function -  (tensor([ 101, 2693, 1015, 3014, 2875, 2187,  102,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]), tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), tensor([ 0., -1.,  0.]))
  291 training samples
    9 validation samples


#### DataLoader

In [15]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.   
            #sampler = RandomSampler(train_dataset), # Select batches randomly
            #sampler = SequentialSampler(train_dataset),
            batch_size = batch_size, # Trains with this batch size.
            shuffle = False,
            drop_last = False
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size, # Evaluate with this batch size.
            drop_last = False
        )

# Backbone: Bert Encoder

#### Encoder configurations

In [490]:
# Model with a classification layer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import BertConfig, BertModel

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
encoder = BertModel.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [491]:
# put encoder on GPU
encoder.cuda()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

## Pytorch Lightning class and methods
#### Self-proposed architecture

In [492]:
class CoordinateGeneratorModel(pl.LightningModule):
    def __init__(self, encoder=encoder,input_dims=768, finetune=False):
        super().__init__()
        if encoder is None:
            self.encoder = encoder
        else:
            self.encoder = encoder
        self.input_dims = input_dims # input dims to linear classifier
        self.output_dims = 1 # output dims from the linear classifier
        self.finetune = finetune #freeze/unfreeze weights
    
        for param in self.encoder.parameters():
            param.requires_grad = self.finetune
        
        # FC layers for each axis
        self.x_linear = nn.Linear(self.input_dims, self.output_dims)
        self.y_linear = nn.Linear(self.input_dims, self.output_dims)
        self.z_linear = nn.Linear(self.input_dims, self.output_dims)
        #self.combined_linear = nn.Linear(self.input_dims, 3) # experimental to combine all coordinates together
        # metrics
        self.train_acc = Accuracy()
        self.val_acc = Accuracy(compute_on_step=False)
        self.test_acc = Accuracy(compute_on_step=False)
        self.loss = nn.L1Loss() # loss function - mean absolute error
        
        
    # this is where the weights are frozen before the very first epoch.
    def on_train_epoch_start(self) -> None:
        if self.finetune:
            self.encoder.train()
        else:
            self.encoder.eval()    
    
    # function for inference
    def forward(self, x):
        
        tokens, attention_masks = x
        
        #print("inside forward -")
        #print("tokens - ",tokens)
        #print("atten_mask- ",attention_masks)
        #print("labels - ", labels)
        
        logits = self.encoder(tokens, attention_masks)
        x = self.x_linear(logits[1])
        y = self.y_linear(logits[1])
        z = self.z_linear(logits[1])
        
        return (x,y,z)
        
    # shared with validation and training step
    def shared_step(self, batch,  batch_idx):
        #print("inside shared_step - ")
        
        # each of these are individual tensors and not list
        tokens, attention_masks, labels = batch
        
 
        #print("tokens shape - ", tokens.shape) # [32,64] -> 32 is batch size and 64 is the number of tokens inside tensor
        
        #print("attension masks - ", attention_masks.shape) # [32,64] -> same as above
        
        #print("labels shape - ", labels.shape) # [32,3] -> 32 labels with 3 elements inside
        
        logits = self.encoder(tokens, attention_masks)
        
        # the output logits is the model class object,
        # therefore, logits[0] gives the logits from that class
        
        '''
        IMPO: LOGITS RETURNS 2 TYPES OF TENSORS -
        1. OUTPUT FROM THE LAST HIDDEN STATE, logits[0]
        2. OUTPUT FROM BERT POOLER WHICH IS THE LAST LAYER, logits[1]
        WE NEED THE OUTPUT FROM THE LAST POOLER LAYER THEREFORE GO FOR OPTION 2
        '''

        #print("logits 0 shape - ", logits[0].shape) # [32, 64, 768]
        #print("logits 1 shape - ", logits[1].shape) # [32, 768]
 
        # ENTIRE BATCH SHOULD BE PASSED IN THE LINEAR LAYER FOR TRAINING
    
        x = self.x_linear(logits[1])
        y = self.y_linear(logits[1])
        z = self.z_linear(logits[1])
        
        #print("labels - ", labels)  # all 32 labels 
        #print("labels 0 - ", labels[0]) # a single label from that batch
        
        # here x is a batched output from the first linear layer, therefore the labels should be batched too.
        #print("shape after linear layer for x- ", x.shape) # [32,1] -> 32 batch size and 1 dim output from linear layer
        #print("shape of target label- ", labels[0][0].shape) # this gives [] output size.
        
        '''
        # x axis labels together
        labels[:, :1].shape
        # y axis labels together
        labels[:, 1:2].shape
        # z axis labels together
        labels[:, 2:].shape
        
        '''
        #print("new labels shape - ", labels[:, :1].shape) # [32,1]
        
        loss_x = self.loss_function(x,labels[:, :1])
        loss_y = self.loss_function(y,labels[:, 1:2]) 
        loss_z = self.loss_function(z,labels[:, 2:])
        
        return loss_x + loss_y + loss_z
    
    def loss_function(self,predicted,target):
        
        loss = self.loss(predicted, target)
        return loss
        
    def training_step(self,batch, batch_idx):
        loss = self.shared_step(batch, batch_idx)
        
        #log loss
        self.log('train_loss', loss, on_epoch=True, on_step=True) # training_loss
        return loss
    
    def validation_step(self,batch, batch_idx):
        loss = self.shared_step(batch, batch_idx)
        
        # log loss
        self.log('val_loss', loss, on_epoch=True, on_step=True) # validation_loss
        return loss
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=2e-5, eps=1e-8) #  weight_decay=1e-5,

# CoordinateGeneratorModel

In [493]:
model = CoordinateGeneratorModel(
    encoder=encoder,
    input_dims=768,
    finetune=True,
)
model

CoordinateGeneratorModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

## Driver code for training

#### Argument Parser

In [41]:
 args = Namespace(
    max_epochs=100,
    batch_size=32,
    gpus=1,
    num_workers=128,
    seed_val=0,
    exp_name='bert_final_exp_for_paper',
)

#### Training

In [22]:
# set seed
pl.seed_everything(args.seed_val)
# make model checkpoint
model_checkpoint = ModelCheckpoint(save_last=True, save_top_k=1, monitor='val_loss')
# assign callbacks (if any)
callbacks = [model_checkpoint]
# create logger
logger = TensorBoardLogger("LOGS/Bert", name=f"{args.exp_name}")
# create Lightning trainer
trainer = pl.Trainer(
        gpus=args.gpus,
        max_epochs=args.max_epochs,
        logger=logger,
        sync_batchnorm=True if args.gpus > 1 else False,
        callbacks=callbacks,
    )

#print(model)
trainer.fit(model, train_dataloader=train_dataloader, val_dataloaders=validation_dataloader)

Global seed set to 0
GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]

  | Name      | Type      | Params
----------------------------------------
0 | encoder   | BertModel | 109 M 
1 | x_linear  | Linear    | 769   
2 | y_linear  | Linear    | 769   
3 | z_linear  | Linear    | 769   
4 | train_acc | Accuracy  | 0     
5 | val_acc   | Accuracy  | 0     
6 | test_acc  | Accuracy  | 0     
7 | loss      | L1Loss    | 0     
----------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Saving latest checkpoint...


1

# Testing

In [43]:
# intialize the model class
trained_model = CoordinateGeneratorModel(encoder=encoder, input_dims=768, finetune=False)

In [500]:
ckpt_path = "/home/lpandey/NLP_Project/LOGS/Bert/bert_final_for_paper/version_2/checkpoints/epoch=95-step=959.ckpt"

In [501]:
# load the trained and frozen weights
trained_model = trained_model.load_from_checkpoint(ckpt_path)

In [502]:
# put trained model on GPU
trained_model = trained_model.cuda()

In [503]:
# put model in evaluation mode
trained_model.eval()

CoordinateGeneratorModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [518]:
# load instruction from output.txt (this file has the instruction that was passed to the agent in Unity Engine)
f = open("/home/lpandey/NLP_Project/instruction.txt", "r")
instruction_sentence = f.read()
instruction_sentence

"'Hey bot, move 30 degree towards right and grab the cup'"

In [519]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []
input_labels = []


encoded_dict = tokenizer.encode_plus(
                        instruction_sentence,      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
attention_masks.append(encoded_dict['attention_mask'])
    

# Convert the lists into tensors.
sentence_inst = torch.cat(input_ids, dim=0)
sentence_inst_atten_mask = torch.cat(attention_masks, dim=0)

print('Token IDs:', sentence_inst[0])
print('attention masks', sentence_inst_atten_mask[0])

Token IDs: tensor([  101,  1005,  4931, 28516,  1010,  2693,  2382,  3014,  2875,  2157,
         1998,  6723,  1996,  2452,  1005,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])
attention masks tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [520]:
# Combine the training inputs into a TensorDataset.
inference_example = TensorDataset(sentence_inst, sentence_inst_atten_mask)
inference_example[0]

(tensor([  101,  1005,  4931, 28516,  1010,  2693,  2382,  3014,  2875,  2157,
          1998,  6723,  1996,  2452,  1005,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [521]:
a = (inference_example[0][0].unsqueeze(0).cuda(), inference_example[0][1].unsqueeze(0).cuda())

In [522]:
final_result = trained_model(a) 

In [523]:
final_result

(tensor([[0.6047]], device='cuda:0', grad_fn=<AddmmBackward0>),
 tensor([[26.4782]], device='cuda:0', grad_fn=<AddmmBackward0>),
 tensor([[-0.2154]], device='cuda:0', grad_fn=<AddmmBackward0>))

# Save Result Coordinates

In [524]:
x = round(final_result[0].item())
y = round(final_result[1].item())
z = round(final_result[2].item())
output = str(x)+','+str(y)+','+str(z)
output

'1,26,0'

In [525]:
# write to a text file for the robot to load the coordinates
# change the directory location 
f = open("/home/lpandey/NLP_Project/instruction.txt", "w")
f.write(output)
f.close()

### Code Reference and Citations

In [527]:
# 1.
# The code for Bert encoder model was taken from HuggingFace Library
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#berttokenizer

# 2.
# The code for Bert tokenization was used with certain modifications from this blog -
# http://mccormickml.com/2019/07/22/BERT-fine-tuning/

# 3.
# Dataloader and Dataset code was referred from above blog and pytorch lightning readme docs - 
# https://pytorch-lightning.readthedocs.io/en/stable/guides/data.html
# http://mccormickml.com/2019/07/22/BERT-fine-tuning/

# 4. The model' framework and all the code in pytorch-lightning is self-designed.