In [1]:
import torch

# Choose the device that later cells move tensors (and the model) onto.
if not torch.cuda.is_available():
    # No CUDA device is visible to PyTorch: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # At least one CUDA device is visible: use the GPU.
    device = torch.device("cuda")

    # Report how many GPUs there are and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [None]:
!pip install transformers

In [4]:
import pandas as pd

# Read the caption CSV into a dataframe with explicitly named columns.
# With `header=None` no line of the file is consumed as a header, so every
# line (including a header line, if the file has one) becomes a data row.
df = pd.read_csv(
    "/content/CUB_captions.csv",
    names=['Label',  'ID', 'Sentences'],
    header=None,
)

# Show the row count with a thousands separator.
print('Number of training sentences: {:,}\n'.format(len(df)))

Number of training sentences: 8,856



In [5]:
# Pull the sentence, text-label and numeric-ID columns out of `df` as NumPy
# arrays, dropping the first entry of each.
import numpy as np

# NOTE(review): entry 0 is presumably the CSV's own header line, which ends up
# in the data because the file was read with `header=None` — confirm.
sentences, labels_text, labels = (
    np.delete(df[column].values, 0)
    for column in ('Sentences', 'Label', 'ID')
)

In [None]:
# `BertTokenizer` is used in this cell; `BertModel` is imported here and used
# by the later cell that loads the pre-trained model.
from transformers import BertTokenizer , BertModel

# Load the BERT tokenizer.
# 'bert-base-uncased' is the same checkpoint name the model is loaded with
# later, so the token IDs produced here line up with that model's vocabulary.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Measure sentence lengths in two ways: in BERT tokens (counting the `[CLS]`
# and `[SEP]` special tokens) and in whitespace-separated words.

# Tokenize every sentence exactly once and record its token count.
token_counts = [
    len(tokenizer.encode(sent, add_special_tokens=True)) for sent in sentences
]

# `default=0` keeps the cell from raising when there are no sentences.
max_len = max(token_counts, default=0)

# Report every sentence that reaches the maximum token length. The maximum is
# derived from the data instead of being hard-coded, so this keeps working
# when the dataset (or the tokenizer) changes.
for sent, n_tokens in zip(sentences, token_counts):
    if n_tokens == max_len:
        print('sent with max len :', sent)
        print('number of words insent with max len :', len(sent.split()))
        # `np.where` lists every position holding this exact sentence text.
        print('index of sent with max len :', np.where(sentences==sent))

print('Max sentence length: ', max_len)

# Longest sentence measured in whitespace-separated words.
max_words = max((len(sent.split()) for sent in sentences), default=0)
print('Max number of words in a sentnces length: ', max_words)

In [None]:
# Tokenize all of the sentences and map the tokens to their word IDs.

# Every sentence is padded or truncated to this many tokens. Sentences longer
# than this lose their tail; compare against the maximum token length reported
# by the length scan to confirm that is acceptable.
MAX_SEQ_LEN = 64

# Calling the tokenizer on the whole list will, for every sentence:
#   (1) Tokenize the sentence.
#   (2) Prepend the `[CLS]` token to the start.
#   (3) Append the `[SEP]` token to the end.
#   (4) Map tokens to their IDs.
#   (5) Pad or truncate the sentence to `max_length`.
#   (6) Create attention masks that separate real tokens from [PAD] tokens.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
encoded = tokenizer(
    list(sentences),                # Sentences to encode.
    add_special_tokens=True,        # Add '[CLS]' and '[SEP]'.
    max_length=MAX_SEQ_LEN,         # Pad & truncate all sentences.
    padding='max_length',
    truncation=True,
    return_attention_mask=True,     # Construct attn. masks.
    return_tensors='pt',            # Return pytorch tensors.
)

# Both tensors have shape [num_sentences, MAX_SEQ_LEN].
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']

# Convert the labels to an integer tensor. Going through `np.asarray` keeps
# this cell re-runnable: on a second run `labels` is already a tensor, which
# has no `.astype`.
labels = torch.tensor(np.asarray(labels).astype(int))

# One segment id per token, all set to 1 (same shape and dtype as `input_ids`).
segments_ids = torch.ones_like(input_ids)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

In [None]:
from torch.utils.data import TensorDataset, random_split

# Combine the inputs into a TensorDataset of (input ids, attention mask, label).
# The second tensor is what the extraction loop passes to the model as its
# second positional argument, which `BertModel` interprets as the attention
# mask. It must therefore be the real mask (0 on [PAD] positions); an all-ones
# tensor would let the model attend to padding.
dataset = TensorDataset(input_ids, attention_masks, labels)

# No validation split is made: the model is only used for feature extraction,
# so 100% of the samples go into the "training" set and `val_size` is 0.
train_size = int(1 * len(dataset))
val_size = len(dataset) - train_size

# The full dataset is used as-is (no random selection of samples).
train_dataset = dataset

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

In [10]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Number of samples handed to the model per batch.
batch_size = 32

# Iterate over the dataset in its stored order: `SequentialSampler` yields the
# samples by increasing index, so batch k always holds the same rows.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(train_dataset),
)

In [None]:
# Load pre-trained model (weights).
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,  # Whether the model returns all hidden-states.
)

# Move the model to the device chosen in the first cell. Unlike an
# unconditional `model.cuda()`, this also works on the CPU fallback.
model.to(device)

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

In [14]:
import numpy as np

# Run BERT over selected batches and derive two kinds of embeddings:
#   * per-token vectors: the sum of the last four hidden layers;
#   * per-sentence vectors: the average of the second-to-last layer's tokens.
for step, batch in enumerate(train_dataloader):
    # Only every 40th batch is processed and reported.
    if step % 40 != 0:
        continue

    # Unpack the batch and copy each tensor to the target device.
    #
    # `batch` contains three pytorch tensors:
    #   [0]: input ids
    #   [1]: the tensor handed to the model as its attention mask
    #   [2]: labels
    b_input_ids = batch[0].to(device)
    b_attention_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    print(device,'=> ', step)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        # The keyword makes explicit what the second positional argument of
        # `BertModel` means.
        outputs = model(b_input_ids, attention_mask=b_attention_mask)
        # With `output_hidden_states=True`, index 2 holds one
        # [batch x tokens x hidden] tensor per layer (13 for bert-base:
        # the embedding output plus 12 encoder layers).
        hidden_states = outputs[2]

    # Stack the per-layer tensors along a new leading dimension:
    # [layers x batch x tokens x hidden].
    token_embeddings = torch.stack(hidden_states, dim=0)

    # Reorder to [batch x tokens x layers x hidden].
    token_embeddings = token_embeddings.permute(1,2,0,3)

    # Sum the vectors from the last four layers for every token at once,
    # giving a [batch x tokens x hidden] tensor.
    word_vecs_sum = token_embeddings[:, :, -4:, :].sum(dim=2)

    print ('Word Embedding Shape is: %d x %d x %d' % tuple(word_vecs_sum.shape))

    # Second-to-last layer, shape [batch x tokens x hidden].
    token_vecs = hidden_states[-2]

    # Average the token vectors of each sentence, weighted by the same mask the
    # model received: positions where the mask is 0 are left out of the mean.
    # When the mask is all ones this is the plain mean over all tokens.
    mask = b_attention_mask.unsqueeze(-1).to(token_vecs.dtype)
    # `clamp(min=1)` avoids dividing by zero for a row with an all-zero mask.
    sentence_embedding = (token_vecs * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    print('Sentence Embedding Shape is: ', sentence_embedding.size())


cuda =>  0
Word Embedding Shape is: 32 x 64 x 768
Sentence Embedding Shape is:  torch.Size([32, 768])
cuda =>  40
Word Embedding Shape is: 32 x 64 x 768
Sentence Embedding Shape is:  torch.Size([32, 768])
cuda =>  80
Word Embedding Shape is: 32 x 64 x 768
Sentence Embedding Shape is:  torch.Size([32, 768])
cuda =>  120
Word Embedding Shape is: 32 x 64 x 768
Sentence Embedding Shape is:  torch.Size([32, 768])
cuda =>  160
Word Embedding Shape is: 32 x 64 x 768
Sentence Embedding Shape is:  torch.Size([32, 768])
cuda =>  200
Word Embedding Shape is: 32 x 64 x 768
Sentence Embedding Shape is:  torch.Size([32, 768])
cuda =>  240
Word Embedding Shape is: 32 x 64 x 768
Sentence Embedding Shape is:  torch.Size([32, 768])
