In [None]:
# Install required libraries
!pip install transformers  # Installs the transformers library for NLP models
!pip install imageio       # Installs imageio to handle image files (e.g., GIFs)
!pip install gtts          # Installs Google Text-to-Speech API for text-to-speech
!pip install bert-score    # Installs the BERTScore metric library for evaluation
!pip install nltk          # Installs the NLTK library for natural language processing tasks
!pip install rouge-score   # Installs the Rouge score package for evaluation metrics

import os
import json
import random
import requests
import imageio
from PIL import Image
from io import BytesIO
import torch
from transformers import (
    BlipProcessor,
    BlipForConditionalGeneration,
    T5Tokenizer,
    T5ForConditionalGeneration,
    AdamW
)
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import score as bert_score_fn

# Download NLTK data: the WordNet corpus.
# NOTE(review): presumably needed by `meteor_score` (imported above), which
# matches synonyms through WordNet - confirm before removing.
nltk.download('wordnet')


Collecting gtts
  Downloading gTTS-2.5.3-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.3-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.3
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=c3fa768260388d4732560c91dfcf3fd737e83606494181addc98463b43cec4b8
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import os
import json
import random

# Define paths
metadata_file = 'datafile.json'  # Path to your metadata file that contains GIF information
gifs_dir = 'gifs_temp'           # Directory to temporarily store GIFs

# Sampling configuration
NUM_GIFS = 100     # Number of GIFs drawn from the metadata for processing
SAMPLE_SEED = 42   # Fixed seed so that "Restart & Run All" selects the same GIFs

# Create temporary GIFs directory if it doesn't exist
os.makedirs(gifs_dir, exist_ok=True)

# Load metadata from the metadata file.
# The encoding is explicit so the result does not depend on the platform default.
with open(metadata_file, 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Check that the dataset contains enough GIFs, if not raise an error
if len(metadata) < NUM_GIFS:
    raise ValueError(f"The metadata file contains fewer than {NUM_GIFS} GIFs.")

# Randomly select the GIFs with a dedicated, seeded RNG: the selection is
# reproducible and the global `random` state is left untouched.
selected_gifs = random.Random(SAMPLE_SEED).sample(metadata, NUM_GIFS)

# Output the number of selected GIFs
print(f"Selected {len(selected_gifs)} random GIFs for processing.")


Selected 100 random GIFs for processing.


In [None]:
import time

# Enhanced download function with retries
def download_gif(url, save_path, max_retries=3, backoff_factor=2):
    """Download `url` to `save_path`, retrying on failure with exponential backoff.

    Args:
        url: Address of the GIF to fetch.
        save_path: Local file path the response body is written to.
        max_retries: Maximum number of download attempts.
        backoff_factor: Base of the exponential wait; after failed attempt `k`
            the function sleeps `backoff_factor ** k` seconds before retrying.

    Returns:
        True if the file was downloaded and written, False otherwise.
        Errors are reported with `print` and never raised to the caller.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            with open(save_path, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded: {save_path}")
            return True
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error: {http_err} - Attempt {attempt}")
            # Read the status from the response attached to the exception itself
            # rather than from a local variable that merely happens to be bound.
            failed_response = getattr(http_err, "response", None)
            if getattr(failed_response, "status_code", None) == 404:
                # The resource does not exist; retrying cannot help.
                break
        except requests.exceptions.RequestException as req_err:
            print(f"Request error: {req_err} - Attempt {attempt}")
        except Exception as e:
            print(f"Unexpected error: {e} - Attempt {attempt}")
        # Exponential backoff, but only when another attempt will follow:
        # sleeping after the final failure just delays reporting it.
        if attempt < max_retries:
            time.sleep(backoff_factor ** attempt)
    print(f"Failed to download {url}")
    return False

# Function to extract frames from GIF
def extract_frames(gif_path, num_frames=10):
    """Return up to `num_frames` frames sampled evenly across the whole GIF.

    Args:
        gif_path: Path of the GIF file to read.
        num_frames: Maximum number of frames to return (must be >= 1).

    Returns:
        A list of frames as returned by `imageio.mimread`. When the GIF has
        `num_frames` frames or fewer, all of them are returned. On any error
        (unreadable file, no frames, invalid `num_frames`) the problem is
        printed and an empty list is returned.
    """
    try:
        if num_frames < 1:
            raise ValueError("num_frames must be at least 1.")
        gif = imageio.mimread(gif_path)
        total_frames = len(gif)
        if total_frames == 0:
            raise ValueError("No frames found in GIF.")
        # Spread the picks over the full clip. An integer stride followed by
        # truncation would only cover the beginning whenever total_frames is
        # not a multiple of num_frames (e.g. 19 frames -> frames 0..9 only).
        count = min(num_frames, total_frames)
        indices = [(i * total_frames) // count for i in range(count)]
        return [gif[i] for i in indices]
    except Exception as e:
        print(f"Failed to extract frames from {gif_path}: {e}")
        return []

# Function to generate captions for frames using BLIP
def generate_frame_captions(frames, blip_processor, blip_model, device):
    """Caption every frame with BLIP, one frame at a time.

    Args:
        frames: Iterable of frame arrays accepted by `Image.fromarray`.
        blip_processor: Processor used to build model inputs and decode output ids.
        blip_model: Model whose `generate` method produces the caption ids.
        device: Device the processed inputs are moved to.

    Returns:
        A list with one caption per input frame, in order. A frame whose
        captioning fails contributes an empty string, and the error is printed.
    """
    results = []
    for image_array in frames:
        text = ""  # stays empty if anything below fails
        try:
            rgb_image = Image.fromarray(image_array).convert('RGB')
            model_inputs = blip_processor(rgb_image, return_tensors="pt").to(device)
            # Inference only: no gradients are needed
            with torch.no_grad():
                generated_ids = blip_model.generate(**model_inputs)
            text = blip_processor.decode(generated_ids[0], skip_special_tokens=True)
        except Exception as err:
            print(f"Failed to generate caption for a frame: {err}")
        results.append(text)
    return results

# Function to delete GIF after processing
def delete_gif(gif_path):
    """Remove a processed GIF from disk.

    The outcome is reported with `print`; a failure (for example a missing
    file) is reported the same way and never raised to the caller.
    """
    try:
        os.unlink(gif_path)
        print(f"Deleted: {gif_path}")
    except Exception as err:
        print(f"Failed to delete {gif_path}: {err}")


In [None]:
# Initialize BLIP processor and model
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"  # single source for both loads
blip_device = 'cuda' if torch.cuda.is_available() else 'cpu'

blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT).to(blip_device)

# BLIP is only used for inference here. The trailing semicolon keeps the cell
# from printing the full module tree returned by `.eval()`.
blip_model.eval();


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

BlipForConditionalGeneration(
  (vision_model): BlipVisionModel(
    (embeddings): BlipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
    )
    (encoder): BlipEncoder(
      (layers): ModuleList(
        (0-23): 24 x BlipEncoderLayer(
          (self_attn): BlipAttention(
            (dropout): Dropout(p=0.0, inplace=False)
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (projection): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): BlipMLP(
            (activation_fn): GELUActivation()
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((1024,),

In [None]:
# Initialize lists to store data (reset on every run so the cell is idempotent)
input_texts = []
target_texts = []

# Resolve the captioning device once instead of on every iteration
caption_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Process each GIF: download -> sample frames -> caption frames -> clean up
for gif in tqdm(selected_gifs, desc="Processing GIFs"):
    gif_id = gif.get('id')
    url = gif.get('url')
    reference_description = gif.get('description')

    # Validate GIF entry
    if not gif_id or not url or not reference_description:
        print(f"Invalid GIF entry: {gif}")
        continue

    # Define local path to save the GIF temporarily
    gif_filename = f"{gif_id}.gif"
    gif_path = os.path.join(gifs_dir, gif_filename)

    # Download the GIF
    success = download_gif(url, gif_path)
    if not success:
        print(f"Skipping GIF {gif_id} due to download failure.")
        continue

    # From here on the file exists on disk; `finally` guarantees it is removed
    # on every path (skip, success, or an unexpected exception).
    try:
        # Extract frames
        frames = extract_frames(gif_path, num_frames=10)
        if not frames:
            print(f"No frames extracted for {gif_id}. Skipping.")
            continue

        # Generate captions for frames
        frame_captions = generate_frame_captions(frames, blip_processor, blip_model, caption_device)
    finally:
        # Delete the downloaded GIF
        delete_gif(gif_path)

    # Frames whose captioning failed come back as empty strings; drop them so
    # they do not inject stray whitespace into the T5 input.
    frame_captions = [caption for caption in frame_captions if caption]
    if not frame_captions:
        # An empty input paired with a real description is not a usable example.
        print(f"No captions generated for {gif_id}. Skipping.")
        continue

    # Concatenate frame captions as input for T5
    concatenated_captions = " ".join(frame_captions)
    input_texts.append(concatenated_captions)

    # Use the reference description as the target
    target_texts.append(reference_description)

print(f"\nCollected data for {len(input_texts)} GIFs.")


Processing GIFs:   0%|          | 0/100 [00:00<?, ?it/s]

Downloaded: gifs_temp/gif_29210.gif


Processing GIFs:   1%|          | 1/100 [00:06<10:26,  6.33s/it]

Deleted: gifs_temp/gif_29210.gif
Downloaded: gifs_temp/gif_97672.gif


Processing GIFs:   2%|▏         | 2/100 [00:10<07:55,  4.85s/it]

Deleted: gifs_temp/gif_97672.gif
Downloaded: gifs_temp/gif_72135.gif


Processing GIFs:   3%|▎         | 3/100 [00:14<07:50,  4.85s/it]

Deleted: gifs_temp/gif_72135.gif
Downloaded: gifs_temp/gif_85522.gif


Processing GIFs:   4%|▍         | 4/100 [00:18<07:12,  4.50s/it]

Deleted: gifs_temp/gif_85522.gif
Downloaded: gifs_temp/gif_124016.gif


Processing GIFs:   5%|▌         | 5/100 [00:22<06:51,  4.33s/it]

Deleted: gifs_temp/gif_124016.gif
Downloaded: gifs_temp/gif_115225.gif


Processing GIFs:   6%|▌         | 6/100 [00:27<06:58,  4.45s/it]

Deleted: gifs_temp/gif_115225.gif
Downloaded: gifs_temp/gif_30072.gif


Processing GIFs:   7%|▋         | 7/100 [00:31<06:40,  4.31s/it]

Deleted: gifs_temp/gif_30072.gif
Downloaded: gifs_temp/gif_71449.gif


Processing GIFs:   8%|▊         | 8/100 [00:35<06:24,  4.18s/it]

Deleted: gifs_temp/gif_71449.gif
Downloaded: gifs_temp/gif_114545.gif


Processing GIFs:   9%|▉         | 9/100 [00:40<06:28,  4.27s/it]

Deleted: gifs_temp/gif_114545.gif
Downloaded: gifs_temp/gif_109856.gif


Processing GIFs:  10%|█         | 10/100 [00:44<06:17,  4.20s/it]

Deleted: gifs_temp/gif_109856.gif
Downloaded: gifs_temp/gif_49451.gif


Processing GIFs:  11%|█         | 11/100 [00:48<06:11,  4.18s/it]

Deleted: gifs_temp/gif_49451.gif
Downloaded: gifs_temp/gif_108815.gif


Processing GIFs:  12%|█▏        | 12/100 [00:54<07:09,  4.88s/it]

Deleted: gifs_temp/gif_108815.gif
Downloaded: gifs_temp/gif_101475.gif


Processing GIFs:  13%|█▎        | 13/100 [00:58<06:42,  4.63s/it]

Deleted: gifs_temp/gif_101475.gif
Downloaded: gifs_temp/gif_81919.gif


Processing GIFs:  14%|█▍        | 14/100 [01:02<06:23,  4.47s/it]

Deleted: gifs_temp/gif_81919.gif
Downloaded: gifs_temp/gif_36342.gif


Processing GIFs:  15%|█▌        | 15/100 [01:08<06:49,  4.82s/it]

Deleted: gifs_temp/gif_36342.gif
Downloaded: gifs_temp/gif_24791.gif


Processing GIFs:  16%|█▌        | 16/100 [01:12<06:24,  4.58s/it]

Deleted: gifs_temp/gif_24791.gif
Downloaded: gifs_temp/gif_13330.gif


Processing GIFs:  17%|█▋        | 17/100 [01:16<06:00,  4.35s/it]

Deleted: gifs_temp/gif_13330.gif
Downloaded: gifs_temp/gif_63960.gif


Processing GIFs:  18%|█▊        | 18/100 [01:21<06:27,  4.72s/it]

Deleted: gifs_temp/gif_63960.gif
Downloaded: gifs_temp/gif_7674.gif


Processing GIFs:  19%|█▉        | 19/100 [01:22<04:45,  3.53s/it]

Deleted: gifs_temp/gif_7674.gif
Downloaded: gifs_temp/gif_74798.gif


Processing GIFs:  20%|██        | 20/100 [01:26<05:00,  3.75s/it]

Deleted: gifs_temp/gif_74798.gif
Downloaded: gifs_temp/gif_92453.gif


Processing GIFs:  21%|██        | 21/100 [01:31<05:15,  3.99s/it]

Deleted: gifs_temp/gif_92453.gif
Downloaded: gifs_temp/gif_61318.gif


Processing GIFs:  22%|██▏       | 22/100 [01:36<05:25,  4.18s/it]

Deleted: gifs_temp/gif_61318.gif
Downloaded: gifs_temp/gif_7001.gif


Processing GIFs:  23%|██▎       | 23/100 [01:40<05:25,  4.22s/it]

Deleted: gifs_temp/gif_7001.gif
Downloaded: gifs_temp/gif_29607.gif


Processing GIFs:  24%|██▍       | 24/100 [01:44<05:17,  4.18s/it]

Deleted: gifs_temp/gif_29607.gif
Downloaded: gifs_temp/gif_100201.gif


Processing GIFs:  25%|██▌       | 25/100 [01:49<05:30,  4.41s/it]

Deleted: gifs_temp/gif_100201.gif
Downloaded: gifs_temp/gif_52744.gif


Processing GIFs:  26%|██▌       | 26/100 [01:53<05:16,  4.28s/it]

Deleted: gifs_temp/gif_52744.gif
Downloaded: gifs_temp/gif_16482.gif


Processing GIFs:  27%|██▋       | 27/100 [01:57<05:01,  4.13s/it]

Deleted: gifs_temp/gif_16482.gif
Downloaded: gifs_temp/gif_59129.gif


Processing GIFs:  28%|██▊       | 28/100 [02:03<05:53,  4.92s/it]

Deleted: gifs_temp/gif_59129.gif
Downloaded: gifs_temp/gif_45445.gif


Processing GIFs:  29%|██▉       | 29/100 [02:10<06:25,  5.43s/it]

Deleted: gifs_temp/gif_45445.gif
Downloaded: gifs_temp/gif_109626.gif


Processing GIFs:  30%|███       | 30/100 [02:16<06:30,  5.58s/it]

Deleted: gifs_temp/gif_109626.gif
Downloaded: gifs_temp/gif_44611.gif


Processing GIFs:  31%|███       | 31/100 [02:20<05:48,  5.05s/it]

Deleted: gifs_temp/gif_44611.gif
Downloaded: gifs_temp/gif_73544.gif


Processing GIFs:  32%|███▏      | 32/100 [02:24<05:28,  4.83s/it]

Deleted: gifs_temp/gif_73544.gif
Downloaded: gifs_temp/gif_55608.gif


Processing GIFs:  33%|███▎      | 33/100 [02:29<05:15,  4.71s/it]

Deleted: gifs_temp/gif_55608.gif
Downloaded: gifs_temp/gif_41495.gif


Processing GIFs:  34%|███▍      | 34/100 [02:33<05:02,  4.58s/it]

Deleted: gifs_temp/gif_41495.gif
Downloaded: gifs_temp/gif_117745.gif


Processing GIFs:  35%|███▌      | 35/100 [02:37<04:43,  4.36s/it]

Deleted: gifs_temp/gif_117745.gif
Downloaded: gifs_temp/gif_47538.gif


Processing GIFs:  36%|███▌      | 36/100 [02:41<04:35,  4.31s/it]

Deleted: gifs_temp/gif_47538.gif
Downloaded: gifs_temp/gif_18300.gif


Processing GIFs:  37%|███▋      | 37/100 [02:45<04:25,  4.21s/it]

Deleted: gifs_temp/gif_18300.gif
Downloaded: gifs_temp/gif_97804.gif


Processing GIFs:  38%|███▊      | 38/100 [02:49<04:20,  4.20s/it]

Deleted: gifs_temp/gif_97804.gif
Downloaded: gifs_temp/gif_15486.gif


Processing GIFs:  39%|███▉      | 39/100 [02:53<04:11,  4.12s/it]

Deleted: gifs_temp/gif_15486.gif
Downloaded: gifs_temp/gif_58581.gif


Processing GIFs:  40%|████      | 40/100 [02:58<04:19,  4.32s/it]

Deleted: gifs_temp/gif_58581.gif
Downloaded: gifs_temp/gif_73818.gif


Processing GIFs:  41%|████      | 41/100 [03:02<04:17,  4.37s/it]

Deleted: gifs_temp/gif_73818.gif
Downloaded: gifs_temp/gif_42195.gif


Processing GIFs:  42%|████▏     | 42/100 [03:07<04:28,  4.63s/it]

Deleted: gifs_temp/gif_42195.gif
Downloaded: gifs_temp/gif_37830.gif


Processing GIFs:  43%|████▎     | 43/100 [03:12<04:21,  4.60s/it]

Deleted: gifs_temp/gif_37830.gif
Downloaded: gifs_temp/gif_44865.gif


Processing GIFs:  44%|████▍     | 44/100 [03:16<04:12,  4.51s/it]

Deleted: gifs_temp/gif_44865.gif
Downloaded: gifs_temp/gif_67505.gif


Processing GIFs:  45%|████▌     | 45/100 [03:21<04:11,  4.58s/it]

Deleted: gifs_temp/gif_67505.gif
Downloaded: gifs_temp/gif_83610.gif


Processing GIFs:  46%|████▌     | 46/100 [03:26<04:07,  4.58s/it]

Deleted: gifs_temp/gif_83610.gif
Downloaded: gifs_temp/gif_79838.gif


Processing GIFs:  47%|████▋     | 47/100 [03:30<03:59,  4.52s/it]

Deleted: gifs_temp/gif_79838.gif
Downloaded: gifs_temp/gif_22420.gif


Processing GIFs:  48%|████▊     | 48/100 [03:35<04:03,  4.68s/it]

Deleted: gifs_temp/gif_22420.gif
Downloaded: gifs_temp/gif_7164.gif


Processing GIFs:  49%|████▉     | 49/100 [03:39<03:51,  4.55s/it]

Deleted: gifs_temp/gif_7164.gif
Downloaded: gifs_temp/gif_25858.gif


Processing GIFs:  50%|█████     | 50/100 [03:44<03:45,  4.51s/it]

Deleted: gifs_temp/gif_25858.gif
Downloaded: gifs_temp/gif_82358.gif


Processing GIFs:  51%|█████     | 51/100 [03:49<03:53,  4.77s/it]

Deleted: gifs_temp/gif_82358.gif
Downloaded: gifs_temp/gif_77329.gif


Processing GIFs:  52%|█████▏    | 52/100 [03:54<03:43,  4.65s/it]

Deleted: gifs_temp/gif_77329.gif
Downloaded: gifs_temp/gif_124393.gif


Processing GIFs:  53%|█████▎    | 53/100 [03:57<03:27,  4.42s/it]

Deleted: gifs_temp/gif_124393.gif
Downloaded: gifs_temp/gif_101391.gif


Processing GIFs:  54%|█████▍    | 54/100 [04:03<03:32,  4.63s/it]

Deleted: gifs_temp/gif_101391.gif
Downloaded: gifs_temp/gif_104495.gif


Processing GIFs:  55%|█████▌    | 55/100 [04:07<03:20,  4.45s/it]

Deleted: gifs_temp/gif_104495.gif
Downloaded: gifs_temp/gif_51575.gif


Processing GIFs:  56%|█████▌    | 56/100 [04:11<03:11,  4.35s/it]

Deleted: gifs_temp/gif_51575.gif
Downloaded: gifs_temp/gif_21267.gif


Processing GIFs:  57%|█████▋    | 57/100 [04:15<03:13,  4.49s/it]

Deleted: gifs_temp/gif_21267.gif
Downloaded: gifs_temp/gif_3574.gif


Processing GIFs:  58%|█████▊    | 58/100 [04:20<03:03,  4.37s/it]

Deleted: gifs_temp/gif_3574.gif
Downloaded: gifs_temp/gif_104009.gif


Processing GIFs:  59%|█████▉    | 59/100 [04:24<02:57,  4.32s/it]

Deleted: gifs_temp/gif_104009.gif
Downloaded: gifs_temp/gif_22367.gif


Processing GIFs:  60%|██████    | 60/100 [04:29<03:07,  4.68s/it]

Deleted: gifs_temp/gif_22367.gif
Downloaded: gifs_temp/gif_15755.gif


Processing GIFs:  61%|██████    | 61/100 [04:33<02:55,  4.51s/it]

Deleted: gifs_temp/gif_15755.gif
Downloaded: gifs_temp/gif_110069.gif


Processing GIFs:  62%|██████▏   | 62/100 [04:38<02:48,  4.43s/it]

Deleted: gifs_temp/gif_110069.gif
Downloaded: gifs_temp/gif_104577.gif


Processing GIFs:  63%|██████▎   | 63/100 [04:43<02:51,  4.64s/it]

Deleted: gifs_temp/gif_104577.gif
Downloaded: gifs_temp/gif_118844.gif


Processing GIFs:  64%|██████▍   | 64/100 [04:47<02:38,  4.40s/it]

Deleted: gifs_temp/gif_118844.gif
Downloaded: gifs_temp/gif_79883.gif


Processing GIFs:  65%|██████▌   | 65/100 [04:51<02:29,  4.27s/it]

Deleted: gifs_temp/gif_79883.gif
Downloaded: gifs_temp/gif_88323.gif


Processing GIFs:  66%|██████▌   | 66/100 [04:56<02:32,  4.49s/it]

Deleted: gifs_temp/gif_88323.gif
Downloaded: gifs_temp/gif_85721.gif


Processing GIFs:  67%|██████▋   | 67/100 [05:00<02:25,  4.40s/it]

Deleted: gifs_temp/gif_85721.gif
Downloaded: gifs_temp/gif_55290.gif


Processing GIFs:  68%|██████▊   | 68/100 [05:04<02:20,  4.38s/it]

Deleted: gifs_temp/gif_55290.gif
Downloaded: gifs_temp/gif_70224.gif


Processing GIFs:  69%|██████▉   | 69/100 [05:09<02:20,  4.54s/it]

Deleted: gifs_temp/gif_70224.gif
Downloaded: gifs_temp/gif_14215.gif


Processing GIFs:  70%|███████   | 70/100 [05:13<02:10,  4.33s/it]

Deleted: gifs_temp/gif_14215.gif
Downloaded: gifs_temp/gif_27203.gif


Processing GIFs:  71%|███████   | 71/100 [05:17<02:06,  4.35s/it]

Deleted: gifs_temp/gif_27203.gif
Downloaded: gifs_temp/gif_106530.gif


Processing GIFs:  72%|███████▏  | 72/100 [05:22<02:03,  4.42s/it]

Deleted: gifs_temp/gif_106530.gif
Downloaded: gifs_temp/gif_49640.gif


Processing GIFs:  73%|███████▎  | 73/100 [05:27<02:01,  4.51s/it]

Deleted: gifs_temp/gif_49640.gif
Downloaded: gifs_temp/gif_81465.gif


Processing GIFs:  74%|███████▍  | 74/100 [05:31<01:55,  4.43s/it]

Deleted: gifs_temp/gif_81465.gif
Downloaded: gifs_temp/gif_66335.gif


Processing GIFs:  75%|███████▌  | 75/100 [05:36<01:55,  4.60s/it]

Deleted: gifs_temp/gif_66335.gif
Downloaded: gifs_temp/gif_57223.gif


Processing GIFs:  76%|███████▌  | 76/100 [05:40<01:45,  4.38s/it]

Deleted: gifs_temp/gif_57223.gif
Downloaded: gifs_temp/gif_85255.gif


Processing GIFs:  77%|███████▋  | 77/100 [05:44<01:39,  4.33s/it]

Deleted: gifs_temp/gif_85255.gif
Downloaded: gifs_temp/gif_42181.gif


Processing GIFs:  78%|███████▊  | 78/100 [05:48<01:36,  4.37s/it]

Deleted: gifs_temp/gif_42181.gif
Downloaded: gifs_temp/gif_94130.gif


Processing GIFs:  79%|███████▉  | 79/100 [05:53<01:33,  4.43s/it]

Deleted: gifs_temp/gif_94130.gif
Downloaded: gifs_temp/gif_40593.gif


Processing GIFs:  80%|████████  | 80/100 [05:57<01:28,  4.40s/it]

Deleted: gifs_temp/gif_40593.gif
Downloaded: gifs_temp/gif_21293.gif


Processing GIFs:  81%|████████  | 81/100 [06:02<01:27,  4.62s/it]

Deleted: gifs_temp/gif_21293.gif
Downloaded: gifs_temp/gif_69985.gif


Processing GIFs:  82%|████████▏ | 82/100 [06:07<01:21,  4.55s/it]

Deleted: gifs_temp/gif_69985.gif
Downloaded: gifs_temp/gif_58783.gif


Processing GIFs:  83%|████████▎ | 83/100 [06:11<01:14,  4.38s/it]

Deleted: gifs_temp/gif_58783.gif
Downloaded: gifs_temp/gif_109546.gif


Processing GIFs:  84%|████████▍ | 84/100 [06:15<01:11,  4.46s/it]

Deleted: gifs_temp/gif_109546.gif
Downloaded: gifs_temp/gif_63358.gif


Processing GIFs:  85%|████████▌ | 85/100 [06:19<01:05,  4.34s/it]

Deleted: gifs_temp/gif_63358.gif
Downloaded: gifs_temp/gif_8838.gif


Processing GIFs:  86%|████████▌ | 86/100 [06:24<01:00,  4.32s/it]

Deleted: gifs_temp/gif_8838.gif
Downloaded: gifs_temp/gif_79279.gif


Processing GIFs:  87%|████████▋ | 87/100 [06:29<00:58,  4.50s/it]

Deleted: gifs_temp/gif_79279.gif
Downloaded: gifs_temp/gif_72505.gif


Processing GIFs:  88%|████████▊ | 88/100 [06:33<00:52,  4.37s/it]

Deleted: gifs_temp/gif_72505.gif
Downloaded: gifs_temp/gif_28274.gif


Processing GIFs:  89%|████████▉ | 89/100 [06:37<00:46,  4.26s/it]

Deleted: gifs_temp/gif_28274.gif
Downloaded: gifs_temp/gif_48125.gif


Processing GIFs:  90%|█████████ | 90/100 [06:42<00:44,  4.46s/it]

Deleted: gifs_temp/gif_48125.gif
Downloaded: gifs_temp/gif_111900.gif


Processing GIFs:  91%|█████████ | 91/100 [06:46<00:39,  4.36s/it]

Deleted: gifs_temp/gif_111900.gif
Downloaded: gifs_temp/gif_34170.gif


Processing GIFs:  92%|█████████▏| 92/100 [06:50<00:34,  4.31s/it]

Deleted: gifs_temp/gif_34170.gif
Downloaded: gifs_temp/gif_73651.gif


Processing GIFs:  93%|█████████▎| 93/100 [06:55<00:31,  4.46s/it]

Deleted: gifs_temp/gif_73651.gif
Downloaded: gifs_temp/gif_85111.gif


Processing GIFs:  94%|█████████▍| 94/100 [06:59<00:25,  4.30s/it]

Deleted: gifs_temp/gif_85111.gif
Downloaded: gifs_temp/gif_58624.gif


Processing GIFs:  95%|█████████▌| 95/100 [07:03<00:20,  4.18s/it]

Deleted: gifs_temp/gif_58624.gif
Downloaded: gifs_temp/gif_13099.gif


Processing GIFs:  96%|█████████▌| 96/100 [07:08<00:17,  4.41s/it]

Deleted: gifs_temp/gif_13099.gif
Downloaded: gifs_temp/gif_33064.gif


Processing GIFs:  97%|█████████▋| 97/100 [07:12<00:12,  4.28s/it]

Deleted: gifs_temp/gif_33064.gif
Downloaded: gifs_temp/gif_88900.gif


Processing GIFs:  98%|█████████▊| 98/100 [07:16<00:08,  4.26s/it]

Deleted: gifs_temp/gif_88900.gif
Downloaded: gifs_temp/gif_75483.gif


Processing GIFs:  99%|█████████▉| 99/100 [07:20<00:04,  4.39s/it]

Deleted: gifs_temp/gif_75483.gif
Downloaded: gifs_temp/gif_12812.gif


Processing GIFs: 100%|██████████| 100/100 [07:25<00:00,  4.45s/it]

Deleted: gifs_temp/gif_12812.gif

Collected data for 100 GIFs.





In [None]:
# Define the custom Dataset class
class GifAggregationDataset(Dataset):
    """Pairs of (concatenated frame captions, reference description) for seq2seq training.

    Each item is tokenized on access and returned as fixed-length tensors.

    Args:
        inputs: Sequence of input texts.
        targets: Sequence of target texts, aligned index-by-index with `inputs`.
        tokenizer: Callable tokenizer exposing `pad_token_id`; it is called with
            `padding='max_length'`, `truncation=True`, `max_length=...` and
            `return_tensors="pt"`.
        max_input_length: Padded/truncated length of the input ids.
        max_target_length: Padded/truncated length of the label ids.

    Raises:
        ValueError: If `inputs` and `targets` differ in length.
    """

    def __init__(self, inputs, targets, tokenizer, max_input_length=512, max_target_length=150):
        # Fail early instead of hitting an IndexError in the middle of an epoch.
        if len(inputs) != len(targets):
            raise ValueError(
                f"inputs and targets must have the same length, "
                f"got {len(inputs)} and {len(targets)}."
            )
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Number of (input, target) pairs."""
        return len(self.inputs)

    def _encode(self, text, max_length):
        """Tokenize one text, padded/truncated to exactly `max_length` tokens."""
        return self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        """Return the tokenized pair at `idx`.

        Returns:
            dict with `input_ids`, `attention_mask` and `labels` tensors plus the
            raw target text under `reference_description`.
        """
        input_text = self.inputs[idx]
        target_text = self.targets[idx]

        # Tokenize input and target
        input_encoding = self._encode(input_text, self.max_input_length)
        target_encoding = self._encode(target_text, self.max_target_length)

        # squeeze(0) drops only the leading batch dimension added by the
        # tokenizer; a bare squeeze() would also collapse a length-1 sequence.
        # clone() keeps the masking below from mutating the tokenizer's output.
        labels = target_encoding.input_ids.squeeze(0).clone()

        # Replace padding token id's of the labels by -100 to ignore them in the loss
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': input_encoding.input_ids.squeeze(0),
            'attention_mask': input_encoding.attention_mask.squeeze(0),
            'labels': labels,
            'reference_description': target_text
        }

In [None]:
# Load the pretrained 't5-base' tokenizer
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Load the pretrained 't5-base' model, then place it on the GPU when one is
# available (CPU otherwise)
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')
t5_model = t5_model.to('cuda' if torch.cuda.is_available() else 'cpu')


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Create the dataset
aggregation_dataset = GifAggregationDataset(
    inputs=input_texts,
    targets=target_texts,
    tokenizer=t5_tokenizer,
    max_input_length=512,
    max_target_length=150
)

# Split the dataset into training and validation sets (80-20 split).
# A seeded generator makes the split reproducible across kernel restarts, so
# validation numbers from different runs refer to the same examples.
train_size = int(0.8 * len(aggregation_dataset))
val_size = len(aggregation_dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    aggregation_dataset,
    [train_size, val_size],
    generator=split_generator
)

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")

# Pinned host memory only speeds up host-to-GPU copies; enable it only when a
# GPU is actually present.
use_pinned_memory = torch.cuda.is_available()

# Create DataLoaders with the original batch size
train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,           # Original batch size
    shuffle=True,
    num_workers=2,
    pin_memory=use_pinned_memory
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=4,           # Original batch size
    shuffle=False,
    num_workers=2,
    pin_memory=use_pinned_memory
)

Training set size: 80
Validation set size: 20


In [None]:
# Train on whichever device the model already lives on. The model is placed on
# the GPU only when one is available, so hard-coding 'cuda' for the batches
# would crash on a CPU-only runtime.
device = next(t5_model.parameters()).device

# Set the T5 model to training mode
t5_model.train()

# Define the optimizer using AdamW with a learning rate of 5e-5
# NOTE(review): `transformers.AdamW` is deprecated upstream in favour of
# `torch.optim.AdamW` - consider switching once the import cell is updated.
optimizer = AdamW(t5_model.parameters(), lr=5e-5)

# Set the number of epochs for training
num_epochs = 10

for epoch in range(num_epochs):  # Iterate through the epochs
    print(f"\nEpoch {epoch + 1}/{num_epochs}")  # Print the current epoch
    epoch_loss = 0  # Initialize the loss for the current epoch

    # Training phase
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}"):
        # Move the input, attention mask, and labels to the model's device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass: compute the model's predictions and the loss
        outputs = t5_model(
            input_ids=input_ids,          # Encoded input sequences
            attention_mask=attention_mask,  # Attention mask to handle padding
            labels=labels                 # Target sequences (reference descriptions)
        )
        loss = outputs.loss  # Extract the loss from the model's output

        # Backward pass: compute gradients and optimize
        optimizer.zero_grad()  # Clear previous gradients
        loss.backward()        # Compute gradients
        optimizer.step()       # Update model weights based on the gradients

        # Accumulate the loss for this batch
        epoch_loss += loss.item()

    # Calculate and print the average training loss for the epoch
    avg_train_loss = epoch_loss / len(train_dataloader)
    print(f"Average Training Loss for Epoch {epoch + 1}: {avg_train_loss:.4f}")

    # Validation phase
    t5_model.eval()  # Set the model to evaluation mode
    val_loss = 0
    with torch.no_grad():  # Disable gradient computation
        for batch in tqdm(val_dataloader, desc=f"Validation Epoch {epoch + 1}"):
            # Move the input, attention mask, and labels to the model's device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass: compute the model's predictions and the loss
            outputs = t5_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss

            # Accumulate the validation loss
            val_loss += loss.item()

    # Calculate and print the average validation loss for the epoch
    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Average Validation Loss for Epoch {epoch + 1}: {avg_val_loss:.4f}")

    # Set the model back to training mode for the next epoch
    t5_model.train()

    # Clear GPU cache after each epoch to free up memory (only relevant on GPU)
    if device.type == 'cuda':
        torch.cuda.empty_cache()




Epoch 1/10


Training Epoch 1: 100%|██████████| 20/20 [00:13<00:00,  1.43it/s]


Average Training Loss for Epoch 1: 3.3903


Validation Epoch 1: 100%|██████████| 5/5 [00:01<00:00,  3.00it/s]


Average Validation Loss for Epoch 1: 2.5042

Epoch 2/10


Training Epoch 2: 100%|██████████| 20/20 [00:13<00:00,  1.43it/s]


Average Training Loss for Epoch 2: 2.4005


Validation Epoch 2: 100%|██████████| 5/5 [00:01<00:00,  3.54it/s]


Average Validation Loss for Epoch 2: 2.3247

Epoch 3/10


Training Epoch 3: 100%|██████████| 20/20 [00:13<00:00,  1.46it/s]


Average Training Loss for Epoch 3: 2.1229


Validation Epoch 3: 100%|██████████| 5/5 [00:01<00:00,  3.70it/s]


Average Validation Loss for Epoch 3: 2.3210

Epoch 4/10


Training Epoch 4: 100%|██████████| 20/20 [00:13<00:00,  1.50it/s]


Average Training Loss for Epoch 4: 1.9351


Validation Epoch 4: 100%|██████████| 5/5 [00:01<00:00,  3.61it/s]


Average Validation Loss for Epoch 4: 2.3181

Epoch 5/10


Training Epoch 5: 100%|██████████| 20/20 [00:13<00:00,  1.51it/s]


Average Training Loss for Epoch 5: 1.7334


Validation Epoch 5: 100%|██████████| 5/5 [00:01<00:00,  3.66it/s]


Average Validation Loss for Epoch 5: 2.3344

Epoch 6/10


Training Epoch 6: 100%|██████████| 20/20 [00:13<00:00,  1.51it/s]


Average Training Loss for Epoch 6: 1.5291


Validation Epoch 6: 100%|██████████| 5/5 [00:01<00:00,  3.71it/s]


Average Validation Loss for Epoch 6: 2.4136

Epoch 7/10


Training Epoch 7: 100%|██████████| 20/20 [00:13<00:00,  1.50it/s]


Average Training Loss for Epoch 7: 1.3374


Validation Epoch 7: 100%|██████████| 5/5 [00:01<00:00,  3.65it/s]


Average Validation Loss for Epoch 7: 2.4677

Epoch 8/10


Training Epoch 8: 100%|██████████| 20/20 [00:13<00:00,  1.48it/s]


Average Training Loss for Epoch 8: 1.1826


Validation Epoch 8: 100%|██████████| 5/5 [00:01<00:00,  3.45it/s]


Average Validation Loss for Epoch 8: 2.5262

Epoch 9/10


Training Epoch 9: 100%|██████████| 20/20 [00:13<00:00,  1.48it/s]


Average Training Loss for Epoch 9: 1.0523


Validation Epoch 9: 100%|██████████| 5/5 [00:01<00:00,  3.62it/s]


Average Validation Loss for Epoch 9: 2.6322

Epoch 10/10


Training Epoch 10: 100%|██████████| 20/20 [00:13<00:00,  1.50it/s]


Average Training Loss for Epoch 10: 0.9004


Validation Epoch 10: 100%|██████████| 5/5 [00:01<00:00,  3.65it/s]

Average Validation Loss for Epoch 10: 2.7672





In [None]:
def evaluate_model(model, tokenizer, dataloader, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Generates a description for every example in `dataloader` and scores it against its reference.

    Args:
        model: Model exposing `eval()` and `generate(...)`. This function does not
            move the model; only the batch tensors are sent to `device`.
        tokenizer: Tokenizer exposing `batch_decode(...)`.
        dataloader: Iterable of batches. Each batch must provide 'input_ids' and
            'attention_mask' (objects with a `.to(device)` method) and
            'reference_description' (an iterable of reference strings, paired
            in order with the decoded predictions).
        device (str): Device the batch tensors are moved to ('cuda' or 'cpu').

    Returns:
        dict: Keys 'BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'METEOR' and 'BERT_F1'.
            BLEU is corpus-level; the ROUGE (F-measure) and METEOR values are
            per-example averages; BERT_F1 is the mean BERTScore F1.
            Every value is 0.0 when the dataloader yields no examples.
    """
    model.eval()

    # corpus_bleu expects, per example, a list of tokenized references.
    all_references = []
    all_candidates = []
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    meteor_scores_list = []

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            reference_descriptions = batch['reference_description']

            # Generate predictions with beam search
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=150,
                num_beams=4,
                early_stopping=True
            )

            # Decode predictions
            predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

            for ref, pred in zip(reference_descriptions, predictions):
                # Whitespace tokenization is shared by BLEU and METEOR.
                ref_tokens = ref.split()
                pred_tokens = pred.split()

                all_references.append([ref_tokens])
                all_candidates.append(pred_tokens)

                # ROUGE works on the raw strings (it tokenizes/stems internally).
                scores = scorer.score(ref, pred)
                rouge1_scores.append(scores['rouge1'].fmeasure)
                rouge2_scores.append(scores['rouge2'].fmeasure)
                rougeL_scores.append(scores['rougeL'].fmeasure)

                # meteor_score takes pre-tokenized references and hypothesis
                meteor_scores_list.append(meteor_score([ref_tokens], pred_tokens))

    # Nothing was scored: corpus-level BLEU and BERTScore are not meaningful on an
    # empty corpus (and may raise), so report zeros for every metric instead.
    if not all_candidates:
        return {
            'BLEU': 0.0,
            'ROUGE-1': 0.0,
            'ROUGE-2': 0.0,
            'ROUGE-L': 0.0,
            'METEOR': 0.0,
            'BERT_F1': 0.0
        }

    num_examples = len(all_candidates)

    # Calculate BLEU score over the whole corpus
    bleu = corpus_bleu(all_references, all_candidates)

    # Calculate average ROUGE scores
    avg_rouge1 = sum(rouge1_scores) / num_examples
    avg_rouge2 = sum(rouge2_scores) / num_examples
    avg_rougeL = sum(rougeL_scores) / num_examples

    # Calculate average METEOR score
    avg_meteor = sum(meteor_scores_list) / num_examples

    # Calculate BERT Score (only the F1 component is reported)
    _, _, F1 = bert_score_fn(
        [' '.join(cand) for cand in all_candidates],
        [' '.join(ref[0]) for ref in all_references],
        lang='en',
        verbose=True
    )
    avg_bert_f1 = F1.mean().item()

    # Compile metrics
    metrics = {
        'BLEU': bleu,
        'ROUGE-1': avg_rouge1,
        'ROUGE-2': avg_rouge2,
        'ROUGE-L': avg_rougeL,
        'METEOR': avg_meteor,
        'BERT_F1': avg_bert_f1
    }

    return metrics


In [None]:
# Score t5_model on the training dataloader (uses the GPU when one is available).
evaluation_metrics_tr = evaluate_model(
    model=t5_model,
    tokenizer=t5_tokenizer,
    dataloader=train_dataloader,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

# Report every metric rounded to four decimal places.
print("\nEvaluation Metrics:")
for metric in evaluation_metrics_tr:
    value = evaluation_metrics_tr[metric]
    print(f"{metric}: {value:.4f}")

Evaluating: 100%|██████████| 20/20 [00:22<00:00,  1.13s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 0.56 seconds, 144.09 sentences/sec

Evaluation Metrics:
BLEU: 0.4728
ROUGE-1: 0.6606
ROUGE-2: 0.5315
ROUGE-L: 0.6470
METEOR: 0.5911
BERT_F1: 0.9460


In [None]:
# Score t5_model on the validation dataloader (uses the GPU when one is available).
evaluation_metrics = evaluate_model(
    model=t5_model,
    tokenizer=t5_tokenizer,
    dataloader=val_dataloader,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

# Report every metric rounded to four decimal places.
print("\nEvaluation Metrics:")
for metric in evaluation_metrics:
    value = evaluation_metrics[metric]
    print(f"{metric}: {value:.4f}")

Evaluating: 100%|██████████| 5/5 [00:06<00:00,  1.33s/it]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.29 seconds, 68.60 sentences/sec

Evaluation Metrics:
BLEU: 0.0833
ROUGE-1: 0.3800
ROUGE-2: 0.1625
ROUGE-L: 0.3703
METEOR: 0.2931
BERT_F1: 0.9074


In [None]:
import os
import requests
import imageio
from PIL import Image
from io import BytesIO
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer
from bert_score import score as bert_score_fn
from tqdm import tqdm

# Ensure necessary NLTK data is downloaded.
# NOTE(review): `import nltk` and the wordnet download also appear in the notebook's first cell.
import nltk
nltk.download('wordnet')  # WordNet corpus, used by nltk's meteor_score for synonym matching
nltk.download('punkt')  # Added for tokenization consistency
# NOTE(review): the metric code in this cell tokenizes with str.split(), so the punkt
# tokenizer is not exercised here — confirm whether this download is still needed.

def process_and_evaluate_gif(
    gif_url,
    actual_description,
    blip_processor,
    blip_model,
    t5_tokenizer,
    t5_model,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    num_frames=10,
    max_length=512,
    max_target_length=150,
    num_beams=4
):
    """
    Processes an unseen GIF to generate a description and evaluates it against the actual description.

    The pipeline downloads the GIF, samples up to `num_frames` frames, captions each
    frame with BLIP, feeds the concatenated captions to T5 to produce one description,
    and scores that description against `actual_description`.

    Args:
        gif_url (str): URL of the GIF to process.
        actual_description (str): The ground truth description of the GIF.
        blip_processor: BLIP processor instance.
        blip_model: BLIP model instance.
        t5_tokenizer: T5 tokenizer instance.
        t5_model: T5 model instance.
        device (str): Device the input tensors are moved to ('cuda' or 'cpu').
            The models themselves are not moved by this function.
        num_frames (int): Maximum number of frames to extract from the GIF.
        max_length (int): Maximum token length for T5 input.
        max_target_length (int): Maximum token length for T5 output.
        num_beams (int): Number of beams for beam search in T5 generation.

    Returns:
        dict | None: None if the download, frame extraction, captioning, tokenization
        or generation step fails (a message is printed). Otherwise a dict with keys:
            'generated_description' (str): The description produced by T5.
            'actual_description' (str): The ground truth passed in.
            'metrics' (dict): BLEU, ROUGE-1, ROUGE-2, ROUGE-L, METEOR and BERT_F1.
    """
    # Step 1: Download the GIF
    try:
        response = requests.get(gif_url, timeout=10)
        response.raise_for_status()
        gif_bytes = BytesIO(response.content)
        print(f"Successfully downloaded GIF from {gif_url}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download GIF from {gif_url}: {e}")
        return None

    # Step 2: Extract frames from the GIF
    try:
        gif = imageio.mimread(gif_bytes, memtest=False)
        total_frames = len(gif)
        if total_frames == 0:
            print("No frames found in the GIF.")
            return None
        # Take every `interval`-th frame, capped at num_frames frames.
        interval = max(total_frames // num_frames, 1)
        selected_frames = [gif[i] for i in range(0, total_frames, interval)][:num_frames]
        print(f"Extracted {len(selected_frames)} frames from the GIF.")
    except Exception as e:
        print(f"Error extracting frames from GIF: {e}")
        return None

    # Step 3: Convert frames to PIL Images and generate captions using BLIP
    captions = []
    for idx, frame in enumerate(selected_frames):
        try:
            if isinstance(frame, Image.Image):
                img = frame.convert('RGB')
            else:
                img = Image.fromarray(frame).convert('RGB')

            inputs = blip_processor(img, return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = blip_model.generate(**inputs)
            caption = blip_processor.decode(outputs[0], skip_special_tokens=True)
            captions.append(caption)
            print(f"Frame {idx+1}: {caption}")
        except Exception as e:
            # Best effort: one bad frame should not abort the whole GIF.
            print(f"Failed to generate caption for frame {idx+1}: {e}")
            captions.append("")  # Append empty string for failed captions

    # Drop the empty placeholders so failed frames do not leave stray whitespace
    # in the T5 input; bail out if nothing usable remains.
    usable_captions = [caption for caption in captions if caption]
    if not usable_captions:
        print("No captions were generated for any frames.")
        return None

    # Step 4: Concatenate captions to form T5 input
    concatenated_captions = " ".join(usable_captions)
    print(f"Concatenated Captions: {concatenated_captions}")

    # Step 5: Tokenize the concatenated captions for T5
    try:
        encoding = t5_tokenizer.encode_plus(
            concatenated_captions,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
    except Exception as e:
        print(f"Error during tokenization: {e}")
        return None

    # Step 6: Generate description using T5
    try:
        t5_model.eval()
        with torch.no_grad():
            outputs = t5_model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_target_length,
                num_beams=num_beams,
                early_stopping=True
            )
        generated_description = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Generated Description: {generated_description}")
    except Exception as e:
        print(f"Error during T5 generation: {e}")
        return None

    # Step 7: Evaluate the generated description against the actual description
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Prepare tokenized inputs (whitespace tokenization, shared by BLEU and METEOR)
    ref_tokens = actual_description.split()
    hyp_tokens = generated_description.split()

    # Calculate BLEU
    bleu = corpus_bleu([[ref_tokens]], [hyp_tokens])

    # Calculate ROUGE: score once and read all three variants from the result
    rouge_scores = scorer.score(actual_description, generated_description)
    rouge1 = rouge_scores['rouge1'].fmeasure
    rouge2 = rouge_scores['rouge2'].fmeasure
    rougeL = rouge_scores['rougeL'].fmeasure

    # Calculate METEOR
    meteor = meteor_score([ref_tokens], hyp_tokens)

    # Calculate BERT Score (only the F1 component is reported)
    _, _, F1 = bert_score_fn(
        [' '.join(hyp_tokens)],
        [' '.join(ref_tokens)],
        lang='en',
        verbose=False
    )
    bert_f1 = F1.mean().item()

    # Compile metrics
    metrics = {
        'BLEU': bleu,
        'ROUGE-1': rouge1,
        'ROUGE-2': rouge2,
        'ROUGE-L': rougeL,
        'METEOR': meteor,
        'BERT_F1': bert_f1
    }

    # Display Metrics
    print("\nEvaluation Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

    return {
        'generated_description': generated_description,
        'actual_description': actual_description,
        'metrics': metrics
    }


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# A single held-out GIF paired with its ground-truth description.
unseen_gif = dict(
    url='https://38.media.tumblr.com/f754d72da3c6a58211c760d39dff5be3/tumblr_n8vbphDLEh1qdzzbko1_250.gif',
    actual_description='a man in a tuxedo stares as smoke rises next to him',
)

# Run the captioning pipeline on the GIF and score the generated description.
result = process_and_evaluate_gif(
    unseen_gif['url'],
    unseen_gif['actual_description'],
    blip_processor,
    blip_model,
    t5_tokenizer,
    t5_model,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    num_frames=10,
    max_length=512,
    max_target_length=150,
    num_beams=4,
)

# A falsy result means one of the pipeline steps failed.
if not result:
    print("Processing and evaluation failed.")
else:
    print("\nFinal Results:")
    print(f"Generated Description: {result['generated_description']}")
    print(f"Actual Description: {result['actual_description']}")
    print("Evaluation Metrics:")
    for metric, score in result['metrics'].items():
        print(f"  {metric}: {score:.4f}")


Successfully downloaded GIF from https://38.media.tumblr.com/f754d72da3c6a58211c760d39dff5be3/tumblr_n8vbphDLEh1qdzzbko1_250.gif
Extracted 10 frames from the GIF.
Frame 1: a close up of a man in a tuxedo with a bow tie
Frame 2: arafed image of a man in a tuxedo and bow tie
Frame 3: arafed image of a man in a tuxedo and bow tie
Frame 4: a close up of a man in a tuxedo with a bow tie
Frame 5: arafed image of a man in a tuxedo and bow tie
Frame 6: arafed image of a man in a tuxedo and bow tie
Frame 7: arafed image of a man in a tuxedo and bow tie
Frame 8: arafed image of a man in a tuxedo and bow tie
Frame 9: arafed image of a man in a tuxedo and bow tie
Frame 10: a close up of a man in a tuxedo with a bow tie
Concatenated Captions: a close up of a man in a tuxedo with a bow tie arafed image of a man in a tuxedo and bow tie arafed image of a man in a tuxedo and bow tie a close up of a man in a tuxedo with a bow tie arafed image of a man in a tuxedo and bow tie arafed image of a man in a t

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluation Metrics:
BLEU: 0.3198
ROUGE-1: 0.4762
ROUGE-2: 0.4211
ROUGE-L: 0.4762
METEOR: 0.3179
BERT_F1: 0.9253

Final Results:
Generated Description: a man in a tuxedo wearing a bow tie
Actual Description: a man in a tuxedo stares as smoke rises next to him
Evaluation Metrics:
  BLEU: 0.3198
  ROUGE-1: 0.4762
  ROUGE-2: 0.4211
  ROUGE-L: 0.4762
  METEOR: 0.3179
  BERT_F1: 0.9253
