In [2]:
import torch
print(torch.cuda.is_available())


True


In [80]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
Installing collected packages: bert-score
Successfully installed bert-score-0.3.13


In [3]:
pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2024.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached regex-2024.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (782 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2024.9.11
Note: you may need to restart the kernel to use updated packages.


In [4]:
!pip install rouge-score

Collecting rouge-score
  Using cached rouge_score-0.1.2-py3-none-any.whl
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [5]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Using cached huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Using cached safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Using cached tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.45.2-py3-none-any.whl (9.9 MB)
Using cached huggingface_hub-0.25.2-py3-none-any.whl (436 kB)
Using cached safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (435 kB)
Using cached tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
Installing collected packages: safetensors, huggingface-hub, tokenizers, transformers
Successfully installed huggingface-hub-0.25.2 

In [83]:
import os
import json
from google.cloud import storage
from PIL import Image
import io
import torch
from torchvision import transforms
import torch.nn as nn
import random
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from bert_score import score as bert_score
from rouge_score import rouge_scorer
from transformers import BertTokenizer, BertForSequenceClassification, GPT2Tokenizer

In [86]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Initialize GCP client
storage_client = storage.Client()

In [8]:
# Bucket details
BUCKET_NAME = 'juanmodeltry'
GIF_FOLDER = 'gifs/'
METADATA_FILE = 'metadata.json'

In [9]:
# Load Metadata
bucket = storage_client.get_bucket(BUCKET_NAME)
metadata_blob = bucket.blob(METADATA_FILE)
metadata_content = metadata_blob.download_as_text()
metadata = json.loads(metadata_content)

In [10]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Note: If running on a system without a GPU, the code will default to CPU and still work, albeit with slower performance.

In [11]:
# Preprocessing Function for GIFs
# This function preprocesses a GIF by extracting its frames, resizing, and converting them to tensors. It limits the number of frames to `max_frames` if necessary.
def preprocess_gif(gif_blob, max_frames=16):
    # Download the GIF as bytes from the GCP bucket
    gif_bytes = gif_blob.download_as_bytes()
    # Open the GIF file as an image from the in-memory byte stream
    gif = Image.open(io.BytesIO(gif_bytes))
    frames = []
    
    # Extract frames from the GIF
    try:
        while True:
            frame = gif.copy().convert('RGB')  # Convert frame to RGB
            frames.append(frame)
            # Move to the next frame in the GIF
            gif.seek(gif.tell() + 1)  # Move to the next frame
    except EOFError:
        pass
    
    # Limit the number of frames to max_frames by sampling if needed
    if len(frames) > max_frames:
        frames = random.sample(frames, max_frames)
    
    # Resize, normalize, and augment frames
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # Resize frames to 224x224
        transforms.ToTensor(),  # Convert to tensor
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize based on ImageNet standards
    ])
    
    # Apply the transformation pipeline to each frame
    frames = [transform(frame) for frame in frames]
    
    # Stack frames along the temporal dimension and move to the appropriate device (CPU or GPU)
    # Correct shape should be [batch, channels, frames, height, width]
    stacked_frames = torch.stack(frames).permute(1, 0, 2, 3).unsqueeze(0)  # [frames, channels, height, width] -> [1, channels, frames, height, width]  # Add batch dimension to match expected ResNet3D input shape [1, channels, frames, height, width]  # [frames, channels, height, width] -> [channels, frames, height, width]
    return stacked_frames.to(device)

In [12]:
# Load GIFs and preprocess
def load_gifs(stage):
    gifs = []
    descriptions = []
    
    # Load the list of GIF names for the specified stage
    if stage == 'train':
        file_name = 'textfiles/train.txt'
    elif stage == 'validation':
        file_name = 'textfiles/val.txt'
    elif stage == 'test':
        file_name = 'textfiles/test.txt'
    else:
        raise ValueError("Invalid stage. Must be 'train', 'validation', or 'test'.")
    
    # Download the list of GIF names from GCP
    try:
        stage_blob = bucket.blob(file_name)
        gif_names = stage_blob.download_as_text().splitlines()
        print(f"Loaded {len(gif_names)} GIF names for {stage} stage.")
    except Exception as e:
        print(f"Error loading GIF names for {stage}: {e}")
        return gifs, descriptions  # Return empty lists if there is an error

    # List all blobs (GIF files) in the specified bucket folder
    blobs = storage_client.list_blobs(BUCKET_NAME, prefix=GIF_FOLDER)
    
    # Iterate over each blob and preprocess if it matches the required GIFs
    for blob in blobs:
        if blob.name.endswith('.gif'):
            gif_id = blob.name.split('/')[-1]  # Extract the GIF file name (with extension)
            if gif_id in gif_names:
                # Find the description in the metadata based on the gif_id
                metadata_entry = next((item for item in metadata if item['id'] == gif_id.split('.')[0]), None)
                if metadata_entry:
                    try:
                        gif_tensor = preprocess_gif(blob)
                        gifs.append(gif_tensor)
                        descriptions.append(metadata_entry['description'])
                    except Exception as e:
                        print(f"Error processing GIF {gif_id}: {e}")
    
    print(f"Successfully loaded {len(gifs)} GIFs and {len(descriptions)} descriptions for {stage} stage.")
    return gifs, descriptions

In [13]:
# Load training, validation, and test data
gifs_train, descriptions_train = load_gifs('train')
gifs_val, descriptions_val = load_gifs('validation')
gifs_test, descriptions_test = load_gifs('test')

Loaded 70 GIF names for train stage.
Successfully loaded 70 GIFs and 70 descriptions for train stage.
Loaded 10 GIF names for validation stage.
Successfully loaded 10 GIFs and 10 descriptions for validation stage.
Loaded 10 GIF names for test stage.
Successfully loaded 10 GIFs and 10 descriptions for test stage.


In [14]:
# I3D Model Setup (assuming pre-trained on Kinetics)
from torchvision.models.video import r3d_18
# Using a pre-trained ResNet3D model from torchvision
r3d = r3d_18(pretrained=True)
r3d.eval()  # Set to evaluation mode for feature extraction only
r3d = r3d.to(device)




In [15]:
# Feature Extraction
def extract_features(gifs):
    features = []
    for gif in gifs:
        frame_features = []
        with torch.no_grad():  # Disable gradient calculations for feature extraction
            gif = gif.to(device)  # Move to device
            print(f"Input GIF shape before feature extraction: {gif.shape}")
            
            # Extract features frame by frame
            for frame_idx in range(gif.shape[2]):  # Iterate over the frames dimension
                frame = gif[:, :, frame_idx, :, :].unsqueeze(2)  # Extract a single frame, keep dimensions [batch, channels, 1, height, width]
                frame_feature = r3d(frame)  # Extract features from the frame using ResNet3D
                frame_features.append(frame_feature)
                
            # Stack frame-wise features along the temporal dimension
            feature = torch.stack(frame_features, dim=1)  # [batch, frames, feature_size]
            print(f"Feature shape after stacking frame-wise features: {feature.shape}")
            
            features.append(feature.squeeze(0))  # Remove the batch dimension
    return features

In [16]:
# Extract features from training, validation, and test GIFs
features_train = extract_features(gifs_train)
features_val = extract_features(gifs_val)
features_test = extract_features(gifs_test)

Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape after stacking frame-wise features: torch.Size([1, 16, 400])
Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape after stacking frame-wise features: torch.Size([1, 16, 400])
Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape after stacking frame-wise features: torch.Size([1, 16, 400])
Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape after stacking frame-wise features: torch.Size([1, 16, 400])
Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape after stacking frame-wise features: torch.Size([1, 16, 400])
Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape after stacking frame-wise features: torch.Size([1, 16, 400])
Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape afte

In [62]:
# GRU Model for Caption Generation
class GRUCaptioningModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers=1, dropout=0.3):
        super(GRUCaptioningModel, self).__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout if n_layers > 1 else 0)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, features):
        outputs, hidden = self.gru(features)
        outputs = self.dropout(outputs)
        outputs = self.fc(outputs)
        return outputs, hidden



In [63]:
# Model Parameters
input_dim = features_train[0].shape[-1]  # Feature size from ResNet3D (after flattening spatial dimensions)
hidden_dim = 512  # Size of GRU hidden state
n_layers = 1

In [64]:
# Tokenizer Setup
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [65]:
# Build Vocabulary
all_descriptions = descriptions_train + descriptions_val + descriptions_test
tokenizer.add_tokens(all_descriptions)

90

In [66]:
# Add padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
vocab_size = len(tokenizer)

In [67]:
# Instantiate the GRU model
gru_model = GRUCaptioningModel(input_dim, hidden_dim, vocab_size, n_layers).to(device)


In [68]:
# Resize the tokenizer embedding to match new vocabulary size
gru_model.fc = nn.Linear(hidden_dim, vocab_size).to(device)

In [69]:
# Optimizer and Loss Function
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(gru_model.parameters(), lr=0.001, weight_decay=1e-4)

In [70]:
from torch.nn.utils.rnn import pad_sequence

# Train the GRU model
def train_gru_model(features_train, descriptions_train, features_val, descriptions_val, gru_model, criterion, optimizer, epochs=50, batch_size=16):
    for epoch in range(epochs):
        gru_model.train()
        total_loss = 0

        # Training loop
        for i in range(0, len(features_train), batch_size):
            # Convert list of features to tensors and pad them
            batch_features = [torch.tensor(feature).to(device) for feature in features_train[i:i+batch_size]]
            batch_features = pad_sequence(batch_features, batch_first=True)  # Shape: (batch_size, max_seq_len, feature_size)
            batch_descriptions = descriptions_train[i:i+batch_size]
            optimizer.zero_grad()

            # Tokenize the descriptions (target sequences)
            tokenized_descriptions = tokenizer(batch_descriptions, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)

            # Forward pass through GRU
            outputs, hidden = gru_model(batch_features)  # Get both output and hidden state from GRU
            final_hidden_state = hidden[-1]  # Take the last layer's hidden state for each sequence

            # Pass final hidden state through a linear layer to get logits
            logits = gru_model.fc(final_hidden_state)  # Assuming gru_model has an output layer named fc

            # Compute the loss
            loss = criterion(logits, tokenized_descriptions.view(-1))

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(features_train)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")

        # Validation loop
        gru_model.eval()
        val_loss = 0
        with torch.no_grad():
            for i in range(0, len(features_val), batch_size):
                # Convert list of validation features to tensors and pad them
                batch_features = [torch.tensor(feature).to(device) for feature in features_val[i:i+batch_size]]
                batch_features = pad_sequence(batch_features, batch_first=True)  # Shape: (batch_size, max_seq_len, feature_size)
                batch_descriptions = descriptions_val[i:i+batch_size]
                tokenized_descriptions = tokenizer(batch_descriptions, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)

                outputs, hidden = gru_model(batch_features)  # Get both output and hidden state from GRU
                final_hidden_state = hidden[-1]  # Take the last layer's hidden state for each sequence
                logits = gru_model.fc(final_hidden_state)

                loss = criterion(logits, tokenized_descriptions.view(-1))
                val_loss += loss.item()

        avg_val_loss = val_loss / len(features_val)
        print(f"Validation Loss: {avg_val_loss:.4f}\n")




In [71]:
# Train the GRU model
train_gru_model(features_train, descriptions_train, features_val, descriptions_val, gru_model, criterion, optimizer)


  batch_features = [torch.tensor(feature).to(device) for feature in features_train[i:i+batch_size]]


Epoch [1/50], Loss: 0.7719
Validation Loss: 1.0691

Epoch [2/50], Loss: 0.5394
Validation Loss: 1.0850



  batch_features = [torch.tensor(feature).to(device) for feature in features_val[i:i+batch_size]]


Epoch [3/50], Loss: 0.3517
Validation Loss: 1.1707

Epoch [4/50], Loss: 0.2300
Validation Loss: 1.2513

Epoch [5/50], Loss: 0.1633
Validation Loss: 1.2833

Epoch [6/50], Loss: 0.1190
Validation Loss: 1.3200

Epoch [7/50], Loss: 0.0873
Validation Loss: 1.3211

Epoch [8/50], Loss: 0.0629
Validation Loss: 1.3149

Epoch [9/50], Loss: 0.0443
Validation Loss: 1.3090

Epoch [10/50], Loss: 0.0310
Validation Loss: 1.2977

Epoch [11/50], Loss: 0.0225
Validation Loss: 1.2853

Epoch [12/50], Loss: 0.0170
Validation Loss: 1.2736

Epoch [13/50], Loss: 0.0135
Validation Loss: 1.2608

Epoch [14/50], Loss: 0.0113
Validation Loss: 1.2486

Epoch [15/50], Loss: 0.0098
Validation Loss: 1.2368

Epoch [16/50], Loss: 0.0088
Validation Loss: 1.2246

Epoch [17/50], Loss: 0.0081
Validation Loss: 1.2137

Epoch [18/50], Loss: 0.0075
Validation Loss: 1.2044

Epoch [19/50], Loss: 0.0070
Validation Loss: 1.1963

Epoch [20/50], Loss: 0.0066
Validation Loss: 1.1894

Epoch [21/50], Loss: 0.0062
Validation Loss: 1.1836



In [95]:
# Evaluation Loop for Test Set
def evaluate_model(features_test, descriptions_test, gru_model, tokenizer):
    import warnings
    warnings.filterwarnings('ignore')  # Ignore all warnings
    
    gru_model.eval()
    total_bleu_score = 0
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    total_rouge1_score = 0
    total_rouge2_score = 0
    total_rougeL_score = 0
    total_bert_f1_score = 0
    
    with torch.no_grad():
        for feature, description in zip(features_test, descriptions_test):
            feature = feature.unsqueeze(0).to(device)  # Add batch dimension
            
            # Forward pass through GRU to get both output and hidden state
            outputs, hidden = gru_model(feature)  # outputs shape: (batch_size, sequence_length, hidden_dim)
            
            # Take the last hidden state from the hidden tuple instead of outputs
            # Note: hidden has the shape (n_layers, batch_size, hidden_dim)
            last_hidden_state = hidden[-1]  # Shape: (batch_size, hidden_dim)
            
            # Pass the last hidden state through the fully connected layer to get logits
            logits = gru_model.fc(last_hidden_state)  # Shape: (batch_size, vocab_size)
            
            # Generate the predicted caption using argmax
            predicted_ids = torch.argmax(logits, dim=-1).tolist()
            predicted_caption = tokenizer.decode(predicted_ids, skip_special_tokens=True)
            
            # BLEU Score Calculation
            reference = [description.split()]
            candidate = predicted_caption.split()
            bleu_score = sentence_bleu(reference, candidate)
            total_bleu_score += bleu_score
            
            # ROUGE Score Calculation
            rouge_scores = scorer.score(predicted_caption, description)
            total_rouge1_score += rouge_scores['rouge1'].fmeasure
            total_rouge2_score += rouge_scores['rouge2'].fmeasure
            total_rougeL_score += rouge_scores['rougeL'].fmeasure
            
            # BERTScore Calculation (using BERTScore library)
            from bert_score import score as bert_score
            P, R, F1 = bert_score([predicted_caption], [description], lang="en", rescale_with_baseline=True)
            total_bert_f1_score += F1.mean().item()
            
            # Print each reference and generated caption for better tracking
            print(f"Reference: {description}")
            print(f"Generated: {predicted_caption}")
            print(f"BLEU Score: {bleu_score:.4f}, ROUGE-1 Score: {rouge_scores['rouge1'].fmeasure:.4f}, ROUGE-2 Score: {rouge_scores['rouge2'].fmeasure:.4f}, ROUGE-L Score: {rouge_scores['rougeL'].fmeasure:.4f}, BERT F1 Score: {F1.mean().item():.4f}\n")
    
    # Calculate the average scores
    avg_bleu_score = total_bleu_score / len(features_test)
    avg_rouge1_score = total_rouge1_score / len(features_test)
    avg_rouge2_score = total_rouge2_score / len(features_test)
    avg_rougeL_score = total_rougeL_score / len(features_test)
    avg_bert_f1_score = total_bert_f1_score / len(features_test)
    
    print(f"Average BLEU Score: {avg_bleu_score:.4f}")
    print(f"Average ROUGE-1 Score: {avg_rouge1_score:.4f}")
    print(f"Average ROUGE-2 Score: {avg_rouge2_score:.4f}")
    print(f"Average ROUGE-L Score: {avg_rougeL_score:.4f}")
    print(f"Average BERT F1 Score: {avg_bert_f1_score:.4f}")

In [96]:
# Evaluate the GRU model on the test set
evaluate_model(features_test, descriptions_test, gru_model, tokenizer)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: this is an animal walking through a trail in the snow on a leash.
Generated: a rose is opening against a blue sky.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2727, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1818, BERT F1 Score: 0.2415



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: seagulls fly around the water's edge at the beach.
Generated: a car drives too fast and almost kills a woman.
BLEU Score: 0.0000, ROUGE-1 Score: 0.0000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0000, BERT F1 Score: 0.1485



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a ball flies through the air and a cat hits it into the mouth of a dog.
Generated: a large dog lets small dog out of a cage and they walk off together.
BLEU Score: 0.0000, ROUGE-1 Score: 0.3125, ROUGE-2 Score: 0.0667, ROUGE-L Score: 0.1875, BERT F1 Score: 0.2819



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: this is a man talking and then pets a panther.
Generated: a woman is sitting in a truck and putting her feet up.
BLEU Score: 0.0000, ROUGE-1 Score: 0.3636, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2727, BERT F1 Score: 0.3435



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man activates a spinning wheel wrapped around his head.
Generated: a woman looks at another person and talks with them.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1000, BERT F1 Score: 0.2923



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man with a beard and mustache is speaking.
Generated: several young women are preforming dance moves and singing
BLEU Score: 0.0000, ROUGE-1 Score: 0.1111, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1111, BERT F1 Score: 0.2356



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: three people are singing, and one pats the other on the behind.
Generated: a man wearing a shirt with a stripe is using a martial arts weapon
BLEU Score: 0.0000, ROUGE-1 Score: 0.0000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0000, BERT F1 Score: 0.2227



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a young boy is eating and laughing.
Generated: a man in a tank top and underwear is dancing towards a woman.
BLEU Score: 0.0000, ROUGE-1 Score: 0.3000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2000, BERT F1 Score: 0.4720



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a person is playing with a dog.
Generated: a woman is sitting in a car and moving the sleeve of her blue jumper to her mouth.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2400, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2400, BERT F1 Score: 0.3016



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a kitten leans its head back and bounces against his owner.
Generated: a man lifts the cover off of a plate and the woman picks up a purple bracelet
BLEU Score: 0.0000, ROUGE-1 Score: 0.1429, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1429, BERT F1 Score: 0.1860

Average BLEU Score: 0.0000
Average ROUGE-1 Score: 0.1843
Average ROUGE-2 Score: 0.0067
Average ROUGE-L Score: 0.1436
Average BERT F1 Score: 0.2726
