In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [None]:
!pip install transformers

In [None]:
!pip install wget

In [13]:
import pandas as pd

# Load the dataset into a pandas dataframe.
df = pd.read_csv("/content/CUB_captions.csv", header=None, names=['Label',  'ID', 'Sentences'])

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Display 10 random rows from the data.
df.sample(10)

Number of training sentences: 8,856



Unnamed: 0,Label,ID,Sentences
5884.0,017.Cardinal/Cardinal_0089_18005,140,"a large round bird with red feathers, and a lo..."
6497.0,018.Spotted_Catbird/Spotted_Catbird_0046_19399,151,dark green bird with a light yellow belly and ...
6561.0,018.Spotted_Catbird/Spotted_Catbird_0042_19430,152,this bird has a speckled breast with green spo...
5072.0,015.Lazuli_Bunting/Lazuli_Bunting_0087_15096,123,a beautiful small blue bird that is blue on it...
1605.0,005.Crested_Auklet/Crested_Auklet_0040_794912,44,"this bird is mostly black with a small head, w..."
1481.0,005.Crested_Auklet/Crested_Auklet_0066_785251,41,the bird has a black eyering and an orange bil...
6812.0,019.Gray_Catbird/Gray_Catbird_0134_20596,157,"a small bird with dark spiky feathers, and a p..."
642.0,003.Sooty_Albatross/Sooty_Albatross_0043_1076,18,a very large bird with an extremely large bill...
6494.0,018.Spotted_Catbird/Spotted_Catbird_0046_19399,151,this is a yellow and grey bird with a black ch...
3016.0,011.Rusty_Blackbird/Rusty_Blackbird_0114_6760,73,this particular bird has a belly that is black...


In [14]:
# Get the lists of sentences and their labels.
import numpy as np

sentences = df.Sentences.values
labels_text = df.Label.values
labels = df.ID.values

labels = np.delete(labels, 0)
labels_text = np.delete(labels_text, 0)
sentences = np.delete(sentences, 0)

In [15]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [16]:
# Print the original sentence.
print(' Original: ', sentences[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0])))

 Original:  a bird with a very long wing span and a long pointed beak.
Tokenized:  ['a', 'bird', 'with', 'a', 'very', 'long', 'wing', 'span', 'and', 'a', 'long', 'pointed', 'beak', '.']
Token IDs:  [1037, 4743, 2007, 1037, 2200, 2146, 3358, 8487, 1998, 1037, 2146, 4197, 23525, 1012]


In [17]:
max_len = 0

# For every sentence...
for sent in sentences:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))
    if  len(input_ids) == 80:
      print('sent with max len :', sent)
      print('number of words insent with max len :', len(sent.split()))
      print('index of sent with max len :', np.where(sentences==sent))


print('Max sentence length: ', max_len)

atwal = []
for i in sentences:
  atwal.append(len(i.split()))
atwal.sort(reverse=True)
print('Max number of words in a sentnces length: ', atwal[0])

Max sentence length:  70
Max number of words in a sentnces length:  55


In [12]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                        truncation=True
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

TypeError: ignored

In [None]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))