In [1]:
import torch
print(torch.cuda.is_available())


True


In [2]:
!pip install bert-score

Collecting bert-score
  Using cached bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting transformers>=3.0.0 (from bert-score)
  Using cached transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers>=3.0.0->bert-score)
  Using cached huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers>=3.0.0->bert-score)
  Using cached regex-2024.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting safetensors>=0.4.1 (from transformers>=3.0.0->bert-score)
  Using cached safetensors-0.4.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers>=3.0.0->bert-score)
  Using cached tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached bert_score-0.3.13-py3-none-any.whl (61 kB)
Using cached transformers-4.45.2-py3-none-any.whl

In [3]:
pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1
Note: you may need to restart the kernel to use updated packages.


In [4]:
!pip install rouge-score

Collecting rouge-score
  Using cached rouge_score-0.1.2-py3-none-any.whl
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [5]:
!pip install transformers



In [6]:
import os
import json
from google.cloud import storage
from PIL import Image
import io
import torch
from torchvision import transforms
import torch.nn as nn
import random
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from bert_score import score as bert_score
from rouge_score import rouge_scorer
from transformers import BertTokenizer, BertForSequenceClassification, GPT2Tokenizer

In [7]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# Initialize GCP client
storage_client = storage.Client()

In [9]:
# Bucket details
BUCKET_NAME = 'juanmodeltry2'
GIF_FOLDER = 'gifs/'
METADATA_FILE = 'metadata.json'

In [10]:
# Load Metadata
bucket = storage_client.get_bucket(BUCKET_NAME)
metadata_blob = bucket.blob(METADATA_FILE)
metadata_content = metadata_blob.download_as_text()
metadata = json.loads(metadata_content)

In [11]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Note: If running on a system without a GPU, the code will default to CPU and still work, albeit with slower performance.

In [12]:
# Preprocessing Function for GIFs
# This function preprocesses a GIF by extracting its frames, resizing, and converting them to tensors. It limits the number of frames to `max_frames` if necessary.
def preprocess_gif(gif_blob, max_frames=16):
    # Download the GIF as bytes from the GCP bucket
    gif_bytes = gif_blob.download_as_bytes()
    # Open the GIF file as an image from the in-memory byte stream
    gif = Image.open(io.BytesIO(gif_bytes))
    frames = []
    
    # Extract frames from the GIF
    try:
        while True:
            frame = gif.copy().convert('RGB')  # Convert frame to RGB
            frames.append(frame)
            # Move to the next frame in the GIF
            gif.seek(gif.tell() + 1)  # Move to the next frame
    except EOFError:
        pass
    
    # Limit the number of frames to max_frames by sampling if needed
    if len(frames) > max_frames:
        frames = random.sample(frames, max_frames)
    
    # Resize, normalize, and augment frames
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # Resize frames to 224x224
        transforms.ToTensor(),  # Convert to tensor
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize based on ImageNet standards
    ])
    
    # Apply the transformation pipeline to each frame
    frames = [transform(frame) for frame in frames]
    
    # Stack frames along the temporal dimension and move to the appropriate device (CPU or GPU)
    # Correct shape should be [batch, channels, frames, height, width]
    stacked_frames = torch.stack(frames).permute(1, 0, 2, 3).unsqueeze(0)  # [frames, channels, height, width] -> [1, channels, frames, height, width]  # Add batch dimension to match expected ResNet3D input shape [1, channels, frames, height, width]  # [frames, channels, height, width] -> [channels, frames, height, width]
    return stacked_frames.to(device)

In [13]:
# Load GIFs and preprocess
def load_gifs(stage):
    gifs = []
    descriptions = []
    
    # Load the list of GIF names for the specified stage
    if stage == 'train':
        file_name = 'textfiles/train.txt'
    elif stage == 'validation':
        file_name = 'textfiles/val.txt'
    elif stage == 'test':
        file_name = 'textfiles/test.txt'
    else:
        raise ValueError("Invalid stage. Must be 'train', 'validation', or 'test'.")
    
    # Download the list of GIF names from GCP
    try:
        stage_blob = bucket.blob(file_name)
        gif_names = stage_blob.download_as_text().splitlines()
        print(f"Loaded {len(gif_names)} GIF names for {stage} stage.")
    except Exception as e:
        print(f"Error loading GIF names for {stage}: {e}")
        return gifs, descriptions  # Return empty lists if there is an error

    # List all blobs (GIF files) in the specified bucket folder
    blobs = storage_client.list_blobs(BUCKET_NAME, prefix=GIF_FOLDER)
    
    # Iterate over each blob and preprocess if it matches the required GIFs
    for blob in blobs:
        if blob.name.endswith('.gif'):
            gif_id = blob.name.split('/')[-1]  # Extract the GIF file name (with extension)
            if gif_id in gif_names:
                # Find the description in the metadata based on the gif_id
                metadata_entry = next((item for item in metadata if item['id'] == gif_id.split('.')[0]), None)
                if metadata_entry:
                    try:
                        gif_tensor = preprocess_gif(blob)
                        gifs.append(gif_tensor)
                        descriptions.append(metadata_entry['description'])
                    except Exception as e:
                        print(f"Error processing GIF {gif_id}: {e}")
    
    print(f"Successfully loaded {len(gifs)} GIFs and {len(descriptions)} descriptions for {stage} stage.")
    return gifs, descriptions

In [14]:
# Load training, validation, and test data
gifs_train, descriptions_train = load_gifs('train')
gifs_val, descriptions_val = load_gifs('validation')
gifs_test, descriptions_test = load_gifs('test')

Loaded 800 GIF names for train stage.
Successfully loaded 800 GIFs and 800 descriptions for train stage.
Loaded 100 GIF names for validation stage.
Successfully loaded 100 GIFs and 100 descriptions for validation stage.
Loaded 100 GIF names for test stage.
Successfully loaded 100 GIFs and 100 descriptions for test stage.


In [15]:
# I3D Model Setup (assuming pre-trained on Kinetics)
from torchvision.models.video import r3d_18
# Using a pre-trained ResNet3D model from torchvision
r3d = r3d_18(pretrained=True)
r3d.eval()  # Set to evaluation mode for feature extraction only
r3d = r3d.to(device)




In [16]:
# Feature Extraction
def extract_features(gifs):
    features = []
    for gif in gifs:
        frame_features = []
        with torch.no_grad():  # Disable gradient calculations for feature extraction
            gif = gif.to(device)  # Move to device
            print(f"Input GIF shape before feature extraction: {gif.shape}")
            
            # Extract features frame by frame
            for frame_idx in range(gif.shape[2]):  # Iterate over the frames dimension
                frame = gif[:, :, frame_idx, :, :].unsqueeze(2)  # Extract a single frame, keep dimensions [batch, channels, 1, height, width]
                frame_feature = r3d(frame)  # Extract features from the frame using ResNet3D
                frame_features.append(frame_feature)
                
            # Stack frame-wise features along the temporal dimension
            feature = torch.stack(frame_features, dim=1)  # [batch, frames, feature_size]
            print(f"Feature shape after stacking frame-wise features: {feature.shape}")
            
            features.append(feature.squeeze(0))  # Remove the batch dimension
    return features

In [17]:
# Extract features from training, validation, and test GIFs
features_train = extract_features(gifs_train)
features_val = extract_features(gifs_val)
features_test = extract_features(gifs_test)

Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape after stacking frame-wise features: torch.Size([1, 16, 400])
Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape after stacking frame-wise features: torch.Size([1, 16, 400])
Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape after stacking frame-wise features: torch.Size([1, 16, 400])
Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape after stacking frame-wise features: torch.Size([1, 16, 400])
Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape after stacking frame-wise features: torch.Size([1, 16, 400])
Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape after stacking frame-wise features: torch.Size([1, 16, 400])
Input GIF shape before feature extraction: torch.Size([1, 3, 16, 224, 224])
Feature shape afte

In [18]:
# GRU Model for Caption Generation
class GRUCaptioningModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers=1, dropout=0.3):
        super(GRUCaptioningModel, self).__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True, dropout=dropout if n_layers > 1 else 0)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, features):
        outputs, hidden = self.gru(features)
        outputs = self.dropout(outputs)
        outputs = self.fc(outputs)
        return outputs, hidden



In [19]:
# Model Parameters
input_dim = features_train[0].shape[-1]  # Feature size from ResNet3D (after flattening spatial dimensions)
hidden_dim = 512  # Size of GRU hidden state
n_layers = 1

In [20]:
# Tokenizer Setup
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [21]:
# Build Vocabulary
all_descriptions = descriptions_train + descriptions_val + descriptions_test
tokenizer.add_tokens(all_descriptions)

1000

In [22]:
# Add padding token to the tokenizer
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
vocab_size = len(tokenizer)

In [23]:
# Instantiate the GRU model
gru_model = GRUCaptioningModel(input_dim, hidden_dim, vocab_size, n_layers).to(device)


In [24]:
# Resize the tokenizer embedding to match new vocabulary size
gru_model.fc = nn.Linear(hidden_dim, vocab_size).to(device)

In [25]:
# Optimizer and Loss Function
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = torch.optim.Adam(gru_model.parameters(), lr=0.001, weight_decay=1e-4)

In [26]:
from torch.nn.utils.rnn import pad_sequence

# Train the GRU model
def train_gru_model(features_train, descriptions_train, features_val, descriptions_val, gru_model, criterion, optimizer, epochs=50, batch_size=16):
    for epoch in range(epochs):
        gru_model.train()
        total_loss = 0

        # Training loop
        for i in range(0, len(features_train), batch_size):
            # Convert list of features to tensors and pad them
            batch_features = [torch.tensor(feature).to(device) for feature in features_train[i:i+batch_size]]
            batch_features = pad_sequence(batch_features, batch_first=True)  # Shape: (batch_size, max_seq_len, feature_size)
            batch_descriptions = descriptions_train[i:i+batch_size]
            optimizer.zero_grad()

            # Tokenize the descriptions (target sequences)
            tokenized_descriptions = tokenizer(batch_descriptions, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)

            # Forward pass through GRU
            outputs, hidden = gru_model(batch_features)  # Get both output and hidden state from GRU
            final_hidden_state = hidden[-1]  # Take the last layer's hidden state for each sequence

            # Pass final hidden state through a linear layer to get logits
            logits = gru_model.fc(final_hidden_state)  # Assuming gru_model has an output layer named fc

            # Compute the loss
            loss = criterion(logits, tokenized_descriptions.view(-1))

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(features_train)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")

        # Validation loop
        gru_model.eval()
        val_loss = 0
        with torch.no_grad():
            for i in range(0, len(features_val), batch_size):
                # Convert list of validation features to tensors and pad them
                batch_features = [torch.tensor(feature).to(device) for feature in features_val[i:i+batch_size]]
                batch_features = pad_sequence(batch_features, batch_first=True)  # Shape: (batch_size, max_seq_len, feature_size)
                batch_descriptions = descriptions_val[i:i+batch_size]
                tokenized_descriptions = tokenizer(batch_descriptions, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)

                outputs, hidden = gru_model(batch_features)  # Get both output and hidden state from GRU
                final_hidden_state = hidden[-1]  # Take the last layer's hidden state for each sequence
                logits = gru_model.fc(final_hidden_state)

                loss = criterion(logits, tokenized_descriptions.view(-1))
                val_loss += loss.item()

        avg_val_loss = val_loss / len(features_val)
        print(f"Validation Loss: {avg_val_loss:.4f}\n")




In [27]:
# Train the GRU model
train_gru_model(features_train, descriptions_train, features_val, descriptions_val, gru_model, criterion, optimizer)


  batch_features = [torch.tensor(feature).to(device) for feature in features_train[i:i+batch_size]]


Epoch [1/50], Loss: 0.6917


  batch_features = [torch.tensor(feature).to(device) for feature in features_val[i:i+batch_size]]


Validation Loss: 0.7740

Epoch [2/50], Loss: 0.4701
Validation Loss: 0.8558

Epoch [3/50], Loss: 0.4426
Validation Loss: 0.9705

Epoch [4/50], Loss: 0.4400
Validation Loss: 0.9865

Epoch [5/50], Loss: 0.4123
Validation Loss: 0.9721

Epoch [6/50], Loss: 0.3854
Validation Loss: 0.9508

Epoch [7/50], Loss: 0.3569
Validation Loss: 0.9300

Epoch [8/50], Loss: 0.3237
Validation Loss: 0.9265

Epoch [9/50], Loss: 0.2887
Validation Loss: 0.9241

Epoch [10/50], Loss: 0.2572
Validation Loss: 0.9263

Epoch [11/50], Loss: 0.2266
Validation Loss: 0.9278

Epoch [12/50], Loss: 0.1973
Validation Loss: 0.9223

Epoch [13/50], Loss: 0.1671
Validation Loss: 0.9246

Epoch [14/50], Loss: 0.1424
Validation Loss: 0.9263

Epoch [15/50], Loss: 0.1168
Validation Loss: 0.9243

Epoch [16/50], Loss: 0.0965
Validation Loss: 0.9197

Epoch [17/50], Loss: 0.0782
Validation Loss: 0.9239

Epoch [18/50], Loss: 0.0600
Validation Loss: 0.9232

Epoch [19/50], Loss: 0.0464
Validation Loss: 0.9217

Epoch [20/50], Loss: 0.0350
V

In [28]:
# Evaluation Loop for Test Set
def evaluate_model(features_test, descriptions_test, gru_model, tokenizer):
    import warnings
    warnings.filterwarnings('ignore')  # Ignore all warnings
    
    gru_model.eval()
    total_bleu_score = 0
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    total_rouge1_score = 0
    total_rouge2_score = 0
    total_rougeL_score = 0
    total_bert_f1_score = 0
    
    with torch.no_grad():
        for feature, description in zip(features_test, descriptions_test):
            feature = feature.unsqueeze(0).to(device)  # Add batch dimension
            
            # Forward pass through GRU to get both output and hidden state
            outputs, hidden = gru_model(feature)  # outputs shape: (batch_size, sequence_length, hidden_dim)
            
            # Take the last hidden state from the hidden tuple instead of outputs
            # Note: hidden has the shape (n_layers, batch_size, hidden_dim)
            last_hidden_state = hidden[-1]  # Shape: (batch_size, hidden_dim)
            
            # Pass the last hidden state through the fully connected layer to get logits
            logits = gru_model.fc(last_hidden_state)  # Shape: (batch_size, vocab_size)
            
            # Generate the predicted caption using argmax
            predicted_ids = torch.argmax(logits, dim=-1).tolist()
            predicted_caption = tokenizer.decode(predicted_ids, skip_special_tokens=True)
            
            # BLEU Score Calculation
            reference = [description.split()]
            candidate = predicted_caption.split()
            bleu_score = sentence_bleu(reference, candidate)
            total_bleu_score += bleu_score
            
            # ROUGE Score Calculation
            rouge_scores = scorer.score(predicted_caption, description)
            total_rouge1_score += rouge_scores['rouge1'].fmeasure
            total_rouge2_score += rouge_scores['rouge2'].fmeasure
            total_rougeL_score += rouge_scores['rougeL'].fmeasure
            
            # BERTScore Calculation (using BERTScore library)
            from bert_score import score as bert_score
            P, R, F1 = bert_score([predicted_caption], [description], lang="en", rescale_with_baseline=True)
            total_bert_f1_score += F1.mean().item()
            
            # Print each reference and generated caption for better tracking
            print(f"Reference: {description}")
            print(f"Generated: {predicted_caption}")
            print(f"BLEU Score: {bleu_score:.4f}, ROUGE-1 Score: {rouge_scores['rouge1'].fmeasure:.4f}, ROUGE-2 Score: {rouge_scores['rouge2'].fmeasure:.4f}, ROUGE-L Score: {rouge_scores['rougeL'].fmeasure:.4f}, BERT F1 Score: {F1.mean().item():.4f}\n")
    
    # Calculate the average scores
    avg_bleu_score = total_bleu_score / len(features_test)
    avg_rouge1_score = total_rouge1_score / len(features_test)
    avg_rouge2_score = total_rouge2_score / len(features_test)
    avg_rougeL_score = total_rougeL_score / len(features_test)
    avg_bert_f1_score = total_bert_f1_score / len(features_test)
    
    print(f"Average BLEU Score: {avg_bleu_score:.4f}")
    print(f"Average ROUGE-1 Score: {avg_rouge1_score:.4f}")
    print(f"Average ROUGE-2 Score: {avg_rouge2_score:.4f}")
    print(f"Average ROUGE-L Score: {avg_rougeL_score:.4f}")
    print(f"Average BERT F1 Score: {avg_bert_f1_score:.4f}")

In [29]:
# Evaluate the GRU model on the test set
evaluate_model(features_test, descriptions_test, gru_model, tokenizer)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man with long blond hair is singing into a microphone
Generated: a man turns around as he talks.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2222, ROUGE-2 Score: 0.1250, ROUGE-L Score: 0.2222, BERT F1 Score: 0.2747



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man is looking at photos and smiling
Generated: she is point with his hands forward while talking
BLEU Score: 0.0000, ROUGE-1 Score: 0.1176, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1176, BERT F1 Score: 0.1709



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a baby elephant tries desperately to walk but keeps slipping into the mud.
Generated: a horses nose is zoomed in while it pokes out its tongue
BLEU Score: 0.0000, ROUGE-1 Score: 0.0800, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0800, BERT F1 Score: 0.2080



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a blonde girl comes out of hiding with flowers and balloons
Generated: a boy who scares a woman to take it down
BLEU Score: 0.0000, ROUGE-1 Score: 0.0952, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0952, BERT F1 Score: 0.1787



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a young mother is holding a new born baby
Generated: a black cat is leaning on a pillow
BLEU Score: 0.0000, ROUGE-1 Score: 0.3529, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.3529, BERT F1 Score: 0.3659



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: the woman on the treadmill can't keep up and is thrown off.
Generated: a young man in a white shirt is hit by another man in the groin
BLEU Score: 0.0000, ROUGE-1 Score: 0.1429, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0714, BERT F1 Score: 0.3289



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a dog is biting his fluff leopard toy.
Generated: a boy runs his fingers through his hair and then uses a hand gesture.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1818, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1818, BERT F1 Score: 0.1340



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man sitting in a room stares with his lips pressed together and claps.
Generated: a man is smiling and biting his lower lip
BLEU Score: 0.0000, ROUGE-1 Score: 0.4348, ROUGE-2 Score: 0.0952, ROUGE-L Score: 0.3478, BERT F1 Score: 0.3542



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: lights flashing on as a guy raises his hands and looks up
Generated: a man in a business suit turns and looks down.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2727, ROUGE-2 Score: 0.1000, ROUGE-L Score: 0.2727, BERT F1 Score: 0.3360



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a young man is raising his hands and cheering.
Generated: a man is rubbing his tattooed arm with his hand.
BLEU Score: 0.0000, ROUGE-1 Score: 0.5263, ROUGE-2 Score: 0.2353, ROUGE-L Score: 0.5263, BERT F1 Score: 0.4762



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a cyclist travels through traffic in a city street
Generated: a woman gets off a chair and places an electronic-reader on a shelf with many others.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1538, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1538, BERT F1 Score: 0.2004



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: three guys are dancing and singing in a room.
Generated: a pony tailed girl dances next to another girl
BLEU Score: 0.0000, ROUGE-1 Score: 0.2222, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1111, BERT F1 Score: 0.2502



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: here is a woman with blonde hair and blue eyes.
Generated: A girl with blue eyes and red lips is looking serious
BLEU Score: 0.0000, ROUGE-1 Score: 0.5714, ROUGE-2 Score: 0.1053, ROUGE-L Score: 0.3810, BERT F1 Score: 0.3908



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a kid leans back and starts to smile
Generated: a man is smiling and bobbing about in front of a microphone
BLEU Score: 0.0000, ROUGE-1 Score: 0.3000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2000, BERT F1 Score: 0.3393



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a girl is turn off her head in a room
Generated: a young man in a white shirt bows in a classroom
BLEU Score: 0.0000, ROUGE-1 Score: 0.2857, ROUGE-2 Score: 0.1053, ROUGE-L Score: 0.2857, BERT F1 Score: 0.2619



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: man with white shirt is playing basketball and jumping
Generated: a young man is sitting while listening to music with his earplugs.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2857, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1905, BERT F1 Score: 0.2059



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: this woman is looking somewhere and smiling.
Generated: a woman holds a small fan in front of her open mouth.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1053, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1053, BERT F1 Score: 0.3544



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man wearing a green shirt is grabbing a book
Generated: a man and woman start to kiss but the woman moves away a little.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2500, ROUGE-2 Score: 0.0909, ROUGE-L Score: 0.2500, BERT F1 Score: 0.3026



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: the man in the hood is talking and pointing at his mouth.
Generated: a woman is playing the black hair girl with sad eyes
BLEU Score: 0.0000, ROUGE-1 Score: 0.1739, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0870, BERT F1 Score: 0.1861



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a young child is pretending to be a body builder
Generated: a beautiful man and a cute girl with long hair are kissing
BLEU Score: 0.0000, ROUGE-1 Score: 0.1818, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1818, BERT F1 Score: 0.2922



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man jumps onto a mans back and gets a basketball through a hoop.
Generated: several motorcyclists are arriving at the finish of a race
BLEU Score: 0.0000, ROUGE-1 Score: 0.0833, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0833, BERT F1 Score: 0.2373



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man walks in with his arms raised next to a woman sitting.
Generated: this is a woman playing on her guitar.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1905, ROUGE-2 Score: 0.1053, ROUGE-L Score: 0.1905, BERT F1 Score: 0.3753



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: two bigger dogs smell a much smaller dog
Generated: a man in a blue ski cap and jacket is rubbing his neck.
BLEU Score: 0.0000, ROUGE-1 Score: 0.0952, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0952, BERT F1 Score: -0.0157



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a woman is scratching a baby goat that is wearing a blue vest
Generated: a black cat is scratching a white dog
BLEU Score: 0.0000, ROUGE-1 Score: 0.3810, ROUGE-2 Score: 0.2105, ROUGE-L Score: 0.3810, BERT F1 Score: 0.3792



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a black woman looked to the side then says something.
Generated: a man smiles and then glances away.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2353, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2353, BERT F1 Score: 0.4206



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man puts his left leg over the body of another on the floor.
Generated: there is a guitarist who jumps on the bed and continues to play
BLEU Score: 0.0000, ROUGE-1 Score: 0.2222, ROUGE-2 Score: 0.0800, ROUGE-L Score: 0.2222, BERT F1 Score: 0.2539



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man with messy hair and a black jacket does not look satisfied.
Generated: a man with glasses is eating ice cream.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2857, ROUGE-2 Score: 0.2105, ROUGE-L Score: 0.2857, BERT F1 Score: 0.3782



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man has his hand on his head and looks around.
Generated: two girls are kissing a french kiss with each other.
BLEU Score: 0.0000, ROUGE-1 Score: 0.0952, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0952, BERT F1 Score: 0.1315



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a woman wearing a cat costume poses and leans forward
Generated: a man is hiding half is face and he is smiling
BLEU Score: 0.0000, ROUGE-1 Score: 0.1905, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1905, BERT F1 Score: 0.2855



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man in sunglasses is doing the moon walk dance.
Generated: white cowboy hat turns into a puppet lights
BLEU Score: 0.0000, ROUGE-1 Score: 0.1111, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1111, BERT F1 Score: 0.1511



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a soccer team is playing a game and they are running down the field.
Generated: a football player being pulled onto the pitch by a man in fancy dress.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2143, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1429, BERT F1 Score: 0.3587



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a woman is miming pulling a string which lifts her lip.
Generated: two guys, dressed in red and blue striped shirts, are smiling and embracing each other.
BLEU Score: 0.0000, ROUGE-1 Score: 0.0000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0000, BERT F1 Score: 0.1864



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man kicks a ball toward a boy who falls backward after getting hit with the ball
Generated: a skater is doing a spate trick off a ramp.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2222, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2222, BERT F1 Score: 0.1928



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a boy is sitting with his arms wrapped around his knees
Generated: a woman is lighting her cigarette with a lighter.
BLEU Score: 0.0000, ROUGE-1 Score: 0.3000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.3000, BERT F1 Score: 0.4126



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man in a bear outfit scares a man coming out of a store.
Generated: a man is dancing in a white sweater in a kitchen
BLEU Score: 0.0000, ROUGE-1 Score: 0.4000, ROUGE-2 Score: 0.1739, ROUGE-L Score: 0.4000, BERT F1 Score: 0.3701



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a young woman is winking and moving her arm in circles
Generated: a woman is taking a bite out of a pecan pie.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2727, ROUGE-2 Score: 0.1000, ROUGE-L Score: 0.2727, BERT F1 Score: 0.3113



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a woman in a leopard print dress is dancing along a sidewalk.
Generated: a ballerina is dancing on a stage in a theater
BLEU Score: 0.0000, ROUGE-1 Score: 0.5455, ROUGE-2 Score: 0.2000, ROUGE-L Score: 0.3636, BERT F1 Score: 0.4164



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man holding an umbrella is twirling and dancing
Generated: a person fell into the water and was hit by waves.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2000, BERT F1 Score: 0.3029



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man looks confused as he starts to shake his hand, then panics and shakes it some more.
Generated: a young man is talking to someone.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2400, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2400, BERT F1 Score: 0.3670



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a guinea pig is sitting at a table and making noises
Generated: a man smiles and puts on sunglasses as he's turning away
BLEU Score: 0.0000, ROUGE-1 Score: 0.1739, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1739, BERT F1 Score: 0.1574



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man is caressing and kissing a woman in a swimming pool.
Generated: two women are sitting next to each other and one laughs
BLEU Score: 0.0000, ROUGE-1 Score: 0.0870, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0870, BERT F1 Score: 0.2215



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a boy has a scratch at his head then smiles
Generated: the boy is playing with his lip ring.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2222, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2222, BERT F1 Score: 0.4566



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man with black and gray hair is running.
Generated: a woman riding a horse is walking toward an arena.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2105, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2105, BERT F1 Score: 0.3040



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a guy is trying to cross to the other street.
Generated: A dog is riding on the top of a moving pickup truck
BLEU Score: 0.0000, ROUGE-1 Score: 0.2727, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2727, BERT F1 Score: 0.2810



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man in a bright orange jacket is talking.
Generated: a man in a suit is doing a presentation for people round a table, one of them buries his face in the desk
BLEU Score: 0.1090, ROUGE-1 Score: 0.3125, ROUGE-2 Score: 0.2000, ROUGE-L Score: 0.3125, BERT F1 Score: 0.3158



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man and a woman are waiting for an elevator, and the woman fights the man who is trying to grab her pocketbook.
Generated: a woman riding a horse is walking toward an arena.
BLEU Score: 0.0000, ROUGE-1 Score: 0.3030, ROUGE-2 Score: 0.0645, ROUGE-L Score: 0.1818, BERT F1 Score: 0.2551



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man somersaults in the air and lands on the other side after being pushed.
Generated: three boys are dancing on top of a table.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1667, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0833, BERT F1 Score: 0.3281



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a woman in a red dress is standing there.
Generated: a man uses his hands to talk on stage
BLEU Score: 0.0000, ROUGE-1 Score: 0.1111, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1111, BERT F1 Score: 0.2251



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a little girl is dancing and falls.
Generated: a man is holding up two fingers.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2857, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2857, BERT F1 Score: 0.2834



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a girl are sending a kiss to someone
Generated: a young girl makes a face of disgust while brushing her hair from her face.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2609, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2609, BERT F1 Score: 0.3135



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: two girls are dancing around, with a man falling in the background.
Generated: a man is playing an acoustic guitar in a room
BLEU Score: 0.0000, ROUGE-1 Score: 0.2727, ROUGE-2 Score: 0.1000, ROUGE-L Score: 0.2727, BERT F1 Score: 0.3655



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: woman with black shirt is walking and touch his hair
Generated: two women are sitting next to each other and one laughs
BLEU Score: 0.0000, ROUGE-1 Score: 0.0952, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0952, BERT F1 Score: 0.1121



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a woman is smiling and has glasses.
Generated: a renaissance man is twirling a sword in a field
BLEU Score: 0.0000, ROUGE-1 Score: 0.2353, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2353, BERT F1 Score: 0.2378



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: there is a group of men moving their heads to the right.
Generated: the two women are eating all together.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1053, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1053, BERT F1 Score: 0.3558



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: smoke is going in and out of a person's mouth
Generated: the moon is setting in the sky above rooftops.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2000, BERT F1 Score: 0.0404



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man slaps his chest as he holds a microphone over his head
Generated: a young man is sitting while listening to music with his earplugs.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2400, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2400, BERT F1 Score: 0.3554



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a soccer player does a trick shot.
Generated: an archer aims an arrow at a target.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1333, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1333, BERT F1 Score: 0.4216



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a guy is making a few turns in a throne and then points a gun.
Generated: a guy in concert moved across stage singing and dancing.
BLEU Score: 0.0000, ROUGE-1 Score: 0.3200, ROUGE-2 Score: 0.0870, ROUGE-L Score: 0.3200, BERT F1 Score: 0.2556



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a cat walks on the sill of a car window, then jumps down and rolls around on the pavement.
Generated: a cat tries to jump on top of a refrigerator and he falls down.
BLEU Score: 0.0000, ROUGE-1 Score: 0.4848, ROUGE-2 Score: 0.1290, ROUGE-L Score: 0.3636, BERT F1 Score: 0.4773



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a guy in a white tank top is holding an ax
Generated: a guy scares a young girl near a car.
BLEU Score: 0.0000, ROUGE-1 Score: 0.3000, ROUGE-2 Score: 0.1111, ROUGE-L Score: 0.3000, BERT F1 Score: 0.2973



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a white man with black glasses is moving their fake mustaches
Generated: a famous soccer player looks up and speaks.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1053, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1053, BERT F1 Score: 0.1560



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a girl is talking for some reason.
Generated: a guy wearing a red shirt and black pants, is gliding over the ice, as he performs his routine.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1538, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1538, BERT F1 Score: 0.2174



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man is lying on his back and smoking.
Generated: two women are signing at one another.
BLEU Score: 0.0000, ROUGE-1 Score: 0.0000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0000, BERT F1 Score: 0.1376



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a dog is rolling down a hill.
Generated: a red car is driving pass a white car moving in the opposite direction
BLEU Score: 0.0000, ROUGE-1 Score: 0.2857, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2857, BERT F1 Score: 0.3362



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man wearing a white shirt is pointing his finger.
Generated: a man is taking his eyes down.
BLEU Score: 0.0000, ROUGE-1 Score: 0.4706, ROUGE-2 Score: 0.1333, ROUGE-L Score: 0.4706, BERT F1 Score: 0.3351



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man is talking into a microphone with a man standing behind him wearing a sombrero
Generated: a man looks out the window while another man approaches from behind
BLEU Score: 0.0000, ROUGE-1 Score: 0.2857, ROUGE-2 Score: 0.0769, ROUGE-L Score: 0.2857, BERT F1 Score: 0.4241



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: two people are engaged in a conversation.
Generated: two girls arguing and one goes after that.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1333, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1333, BERT F1 Score: 0.2106



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a group of boys in hooded tops are gesturing
Generated: a man throws a coaster to a woman at a bar.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1000, BERT F1 Score: 0.1831



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a wrestler lifts someone over the top rope of the ring and slams him down.
Generated: feathers are flying on a chair and a cat is jumping off
BLEU Score: 0.0000, ROUGE-1 Score: 0.1481, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1481, BERT F1 Score: 0.2011



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man is shaking a drink mixer.
Generated: two young men are singing on stage.
BLEU Score: 0.0000, ROUGE-1 Score: 0.0000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0000, BERT F1 Score: 0.4030



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a guy fakes out the defender on the basketball court causing the defender to fall down.
Generated: a man does a back flip off a bench and lands unsuccessfully
BLEU Score: 0.0000, ROUGE-1 Score: 0.0714, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0714, BERT F1 Score: 0.3396



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a young man being interviewed in a group of five others removes his dark glasses and smiles.
Generated: a woman in a white dress is riding a carriage.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2222, ROUGE-2 Score: 0.0800, ROUGE-L Score: 0.2222, BERT F1 Score: 0.2983



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: two people take off their black robes.
Generated: a man in a business suit turns and looks down.
BLEU Score: 0.0000, ROUGE-1 Score: 0.0000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0000, BERT F1 Score: 0.2598



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: someone takes off and puts a headphone back on a girl.
Generated: a woman wearing sunglasses sings softly into a microphone
BLEU Score: 0.0000, ROUGE-1 Score: 0.2000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2000, BERT F1 Score: 0.1834



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man is drinking water under the rain.
Generated: a person turns around and is smiling.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2667, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2667, BERT F1 Score: 0.3473



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man turns his head and is talking
Generated: a happy man touching his hear while laughing.
BLEU Score: 0.0000, ROUGE-1 Score: 0.3750, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.3750, BERT F1 Score: 0.2445



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man is staring at another man while in the bathroom.
Generated: a man and woman start to kiss but the woman moves away a little.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2400, ROUGE-2 Score: 0.0870, ROUGE-L Score: 0.2400, BERT F1 Score: 0.4143



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man is wrapping his arms around a distraught woman.
Generated: a man in a blue ski cap and jacket is rubbing his neck.
BLEU Score: 0.0000, ROUGE-1 Score: 0.4348, ROUGE-2 Score: 0.0952, ROUGE-L Score: 0.3478, BERT F1 Score: 0.3807



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a woman with red hair wearing headphones is dancing in an elevator.
Generated: a woman turns to her right as she sits down
BLEU Score: 0.0000, ROUGE-1 Score: 0.1818, ROUGE-2 Score: 0.1000, ROUGE-L Score: 0.1818, BERT F1 Score: 0.2308



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a boy rubs the chin of his friend.
Generated: a girl is singing while shadows cross her body.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1176, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1176, BERT F1 Score: 0.4237



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: two girls are talking, they then high five
Generated: two people in costumes are singing on stage.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2500, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2500, BERT F1 Score: 0.2602



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a girl is talking to someone else indoors.
Generated: a girl wearing a ribbon in her hair is smiling
BLEU Score: 0.0000, ROUGE-1 Score: 0.3333, ROUGE-2 Score: 0.1250, ROUGE-L Score: 0.3333, BERT F1 Score: 0.3669



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a person is staring at something and moving their eyes around.
Generated: a man is looking around for something in the snow
BLEU Score: 0.0000, ROUGE-1 Score: 0.3810, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2857, BERT F1 Score: 0.4428



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: two people are sitting while wearing costumes.
Generated: a man is looking around as he has a blank expression
BLEU Score: 0.0000, ROUGE-1 Score: 0.0000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0000, BERT F1 Score: 0.3762



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a girl in a white shirt is playing with her hair
Generated: an old man with black hair is looking confused
BLEU Score: 0.0000, ROUGE-1 Score: 0.3000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2000, BERT F1 Score: 0.4540



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man in a hard hat turns his head and smiles
Generated: a man is break dancing and fades away into the background
BLEU Score: 0.0000, ROUGE-1 Score: 0.2727, ROUGE-2 Score: 0.1000, ROUGE-L Score: 0.2727, BERT F1 Score: 0.2995



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a mustached man has a questioning look on his face.
Generated: a woman takes a drink from a bottle and offers it to another.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1739, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1739, BERT F1 Score: 0.3036



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: man wearing a black shirt smiles and claps his hands
Generated: three singers are singing and one of them is wearing sunglasses
BLEU Score: 0.0000, ROUGE-1 Score: 0.1905, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0952, BERT F1 Score: 0.1622



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a dancer girl is on stage dancing.
Generated: a woman is dancing on the stage doing different types of dance
BLEU Score: 0.0000, ROUGE-1 Score: 0.5263, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.5263, BERT F1 Score: 0.6143



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man in front of a red car is bending backward
Generated: a cat wearing sunglasses in a bowl yawns.
BLEU Score: 0.0000, ROUGE-1 Score: 0.3158, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.3158, BERT F1 Score: 0.2257



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: the woman is playing with her hair and blows a kiss.
Generated: a man is wearing heart-shaped sunglasses and is dancing and singing.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2609, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1739, BERT F1 Score: 0.4083



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: two men with microphones are jumping and walking.
Generated: a man points and winks as he sings on stage
BLEU Score: 0.0000, ROUGE-1 Score: 0.1111, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1111, BERT F1 Score: 0.3181



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: the man looks over before smiling and laughing.
Generated: two robots powered by balloons collide and move around the floor
BLEU Score: 0.0000, ROUGE-1 Score: 0.2105, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1053, BERT F1 Score: 0.0796



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a girl singing that takes her glasses off through her hair.
Generated: a guy wearing a red shirt and black pants, is gliding over the ice, as he performs his routine.
BLEU Score: 0.0000, ROUGE-1 Score: 0.0667, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0667, BERT F1 Score: 0.1733



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a person in dark clothing is standing and smiling
Generated: a man is doing a backwards flip.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2500, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2500, BERT F1 Score: 0.2818



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a handsome young man is singing with another person
Generated: dog slept peacefully on the car seat.
BLEU Score: 0.0000, ROUGE-1 Score: 0.0000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0000, BERT F1 Score: 0.1404



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a large dog walks in the snow and sniffs some footprints
Generated: a vehicle is driving through a desert spraying sand in all directions.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1739, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1739, BERT F1 Score: 0.2499



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: two men are doing movements with their hands.
Generated: a man reaches over and massages another man's neck.
BLEU Score: 0.0000, ROUGE-1 Score: 0.0000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.0000, BERT F1 Score: 0.3603



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a man is wiggling his hips and dancing on a stage.
Generated: a person sat down singing and playing the piano.
BLEU Score: 0.0000, ROUGE-1 Score: 0.2000, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.2000, BERT F1 Score: 0.4325



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reference: a woman moves her hips and dancing on a stage
Generated: a young man is talking to someone.
BLEU Score: 0.0000, ROUGE-1 Score: 0.1176, ROUGE-2 Score: 0.0000, ROUGE-L Score: 0.1176, BERT F1 Score: 0.3923

Average BLEU Score: 0.0011
Average ROUGE-1 Score: 0.2196
Average ROUGE-2 Score: 0.0343
Average ROUGE-L Score: 0.2007
Average BERT F1 Score: 0.2926
