In [1]:
from similarity_scoring import BertSimilarityModel
from transformers import pipeline
import json
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
cap = "A blue bird"
poem = "Their feathers too bright \ntheir songs too sweet and wild"

In [None]:
sim_model = BertSimilarityModel(no_hidden_layers=1, hidden_dim=25, max_length=25)
image2text = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


**Input for the BERT Tokenizer**

In [None]:
encoding = sim_model.encode_input(cap, poem)
for key in encoding.keys():
    print(key,":",encoding[key])
tokens = sim_model.tokenizer.convert_ids_to_tokens(encoding["input_ids"].numpy()[0])
print("tokens :",tokens)

input_ids : tensor([[  101,  1037,  2630,  4743,   102,  2037, 12261,  2205,  4408,  2037,
          2774,  2205,  4086,  1998,  3748,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0]])
token_type_ids : tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0]])
attention_mask : tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0]])
tokens : ['[CLS]', 'a', 'blue', 'bird', '[SEP]', 'their', 'feathers', 'too', 'bright', 'their', 'songs', 'too', 'sweet', 'and', 'wild', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


**Initial similarity**

In [None]:
similarity = sim_model.similarity(cap, poem)

print(similarity)

The BERT classifier has not been trained yet. Similarity might not be good.
tensor([[0.4758]], grad_fn=<SigmoidBackward0>)


**A dataset**

In [None]:
with open("../../data/multim_poem.json") as f:
    jsonfile = json.load(f)

print("Data length:",len(jsonfile))

Data length: 8292


In [None]:
N = 20
data = [{}]*N
i = 0
while i < N and i < len(jsonfile):
    try:
        desc = image2text(jsonfile[i]['image_url'])
    except:
        # skip image
        i += 1 
        continue
    desc = desc[0]['generated_text']
    
    data[i] = jsonfile[i]
    data[i]["caption"] = desc
    i += 1 

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [None]:
class CaptionPoemDataset(Dataset):
    def __init__(self, datadict):
        self.data = datadict
        
        self.N_halfs = len(self.data) // 2 
        self.shuffle_idx = np.random.choice(a=self.N_halfs, size=self.N_halfs, replace=False)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        if idx < self.N_halfs:
            # match caption idx, with poem shuffle idx 
            match_idx = self.shuffle_idx[idx]
        else:
            match_idx = idx
        
        cap_poem = [self.data[idx]['caption'], self.data[match_idx]['poem']]
        label = 1 if idx == match_idx else 0 
        return cap_poem, label

dataset = CaptionPoemDataset(data)

In [None]:
# Split data into train and validation sets
SPLIT = 0.9
train_size = int(SPLIT*len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print("Number of samples for training =", train_size)
print("Number of samples for validation =", val_size)

Number of samples for training = 18
Number of samples for validation = 2


In [None]:
BATCH_SIZE = 1
train_dataloader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=BATCH_SIZE)

val_dataloader = DataLoader(val_dataset,
                            sampler=SequentialSampler(val_dataset),
                            batch_size=BATCH_SIZE)

In [None]:
sim_model.train_bert_classifier(train_dataloader, val_dataloader, num_epochs=2, val_epoch=1, learning_rate=0.001, verbose=True)

outputs: tensor([[0.4756]], grad_fn=<SigmoidBackward0>)
label: tensor([[1]])


RuntimeError: Expected floating point type for target with class probabilities, got Long