# Required libraries

In [1]:
!pip install numpy torch pillow torchvision matplotlib tqdm datasets diffusers transformers accelerate kornia huggingface_hub

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting kornia
  Downloading kornia-0.8.0-py2.py3-none-any.whl.metadata (17 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_

In [18]:
import os
from io import BytesIO

import numpy as np
import torch
import torchvision
import kornia.color as kcolor
from datasets import Dataset, DatasetDict, Features, Value, Image as DatasetImage
from huggingface_hub import notebook_login, login
from PIL import Image as PILImage
from torchvision import transforms
from torchvision.datasets import ImageFolder
from tqdm.auto import tqdm
from transformers import BlipProcessor, BlipForConditionalGeneration

In [3]:
# Pick the compute backend: Apple MPS first, then CUDA, otherwise fall back to CPU
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')

Using device: cuda


In [4]:
# Show the GPU, driver version and memory visible to this runtime
!nvidia-smi

Sun Feb  9 17:47:35 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   44C    P8             12W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Imagenette dataset

In [5]:
# Only request a download when the extracted archive folder is missing, so that
# re-running this cell (Restart & Run All) reuses the files already on disk
# instead of failing on, or repeating, the 1.5 GB download.
dataset = torchvision.datasets.Imagenette(
    root="imagenette/",
    split='val',
    size='full',
    download=not os.path.isdir(os.path.join("imagenette", "imagenette2")),
)

Downloading https://s3.amazonaws.com/fast-ai-imageclas/imagenette2.tgz to imagenette/imagenette2.tgz


100%|██████████| 1.56G/1.56G [01:58<00:00, 13.1MB/s]


Extracting imagenette/imagenette2.tgz to imagenette/


In [6]:
# Each entry is a tuple of synonyms for one class
dataset.classes

[('tench', 'Tinca tinca'),
 ('English springer', 'English springer spaniel'),
 ('cassette player',),
 ('chain saw', 'chainsaw'),
 ('church', 'church building'),
 ('French horn', 'horn'),
 ('garbage truck', 'dustcart'),
 ('gas pump', 'gasoline pump', 'petrol pump', 'island dispenser'),
 ('golf ball',),
 ('parachute', 'chute')]

# Building the dataset for our task

## Helper functions for Dataset creation

In [7]:
def is_black_and_white(img):
    """Return True when an image carries no colour information.

    Args:
        img: Anything ``np.asarray`` can convert to a pixel array, laid out
            either as ``(H, W)`` or as ``(H, W, C)``.

    Returns:
        bool: True if the array has fewer than three channels, or if its first
        three channels are identical everywhere. Channels beyond the third
        (for example an alpha channel) are ignored, so a coloured four-channel
        image is not mistaken for a grayscale one.
    """
    img_array = np.asarray(img)
    # 2-D arrays and arrays with one or two channels cannot encode colour
    if img_array.ndim < 3 or img_array.shape[2] < 3:
        return True
    red = img_array[:, :, 0]
    green = img_array[:, :, 1]
    blue = img_array[:, :, 2]
    return bool(np.array_equal(red, green) and np.array_equal(green, blue))

In [8]:
def mean_saturation(img):
    """Average saturation of an image, scaled to the range [0, 1].

    The image is converted to HSV through its ``convert`` method; the second
    channel of the result is averaged and divided by 255.
    """
    hsv_pixels = np.asarray(img.convert('HSV'))
    saturation_channel = hsv_pixels[..., 1]
    return saturation_channel.mean() / 255.0

## RGB dataset composition and captioning

In [9]:
# Load the BLIP captioning processor and model from the same checkpoint,
# then move the model to the selected device
caption_checkpoint = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(caption_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(caption_checkpoint)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

In [23]:
class PreprocessedImagenette(torch.utils.data.Dataset):
    """Map-style dataset of colour Imagenette images with generated captions.

    At construction time every image found under ``root`` is opened once and
    dropped if it is grayscale or has a mean saturation below 0.1. Indexing
    the dataset loads the image again, produces the normalised RGB tensor and
    its grayscale counterpart, and generates a caption with the BLIP model.

    The class label is derived from the WordNet folder that directly contains
    the image (e.g. ``.../n03028079/img.JPEG``). ``ImageFolder`` treats every
    immediate sub-directory of ``root`` as one class, so when ``root`` is the
    download directory rather than a split folder its numeric label is the
    same for every image; that label is only used as a fallback when the
    containing folder is not a known WordNet id.

    Args:
        root: Directory passed to ``torchvision.datasets.ImageFolder``.
        image_size: Side length in pixels; images are resized to
            ``(image_size, image_size)``.
    """

    # WordNet folder name -> class synonyms. Keys are listed in sorted order,
    # so their positions match the indices ImageFolder assigns when root is a
    # split folder (train/ or val/).
    WNID_TO_CLASS = {
        'n01440764': ('tench', 'Tinca tinca'),
        'n02102040': ('English springer', 'English springer spaniel'),
        'n02979186': ('cassette player',),
        'n03000684': ('chain saw', 'chainsaw'),
        'n03028079': ('church', 'church building'),
        'n03394916': ('French horn', 'horn'),
        'n03417042': ('garbage truck', 'dustcart'),
        'n03425413': ('gas pump', 'gasoline pump', 'petrol pump', 'island dispenser'),
        'n03445777': ('golf ball',),
        'n03888257': ('parachute', 'chute'),
    }

    def __init__(self, root, image_size):
        self.dataset = torchvision.datasets.ImageFolder(root=root)
        # Resize, convert to a float tensor and map each channel to [-1, 1]
        self.transform = transforms.Compose([
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        ])
        self.filtered_data = []
        self.caption_model = model
        self.caption_processor = processor
        # Built once here instead of on every __getitem__ call
        self.to_grayscale = kcolor.RgbToGrayscale()

        # Index -> comma-joined synonyms, used when the folder name is unknown
        self.class_names = {
            i: ", ".join(names)
            for i, names in enumerate(self.WNID_TO_CLASS.values())
        }

        for img_path, label in self.dataset.samples:
            # Close the file handle as soon as the pixels are decoded
            with PILImage.open(img_path) as handle:
                img = handle.convert('RGB')
            if is_black_and_white(img) or mean_saturation(img) < 0.1:
                continue
            class_name = self._class_name_from_path(img_path)
            if class_name is None:
                class_name = self.class_names[label]
            self.filtered_data.append((img_path, class_name))  # Store label as string

    @classmethod
    def _class_name_from_path(cls, img_path):
        """Return the comma-joined class synonyms for the folder holding ``img_path``, or None if unknown."""
        wnid = os.path.basename(os.path.dirname(img_path))
        names = cls.WNID_TO_CLASS.get(wnid)
        return ", ".join(names) if names else None

    def __len__(self):
        """Number of images that passed the colour filter."""
        return len(self.filtered_data)

    def __getitem__(self, idx):
        """Return ``(rgb_tensor, grayscale_tensor, caption, class_name)`` for item ``idx``."""
        img_path, label = self.filtered_data[idx]
        with PILImage.open(img_path) as handle:
            img = handle.convert('RGB')
        rgb_img = self.transform(img)

        grayscale_img = self.to_grayscale(rgb_img)

        # Caption the original (un-resized) PIL image; keep the inputs on the
        # same device as the captioning model
        inputs = self.caption_processor(img,
                                        return_tensors="pt").to(self.caption_model.device)
        # do_sample=True means captions can differ between runs
        caption = self.caption_model.generate(**inputs,
                                              max_length=50,
                                              min_length=10,
                                              num_beams=5,               # Use beam search for better accuracy (higher values reduce creativity)
                                              temperature=0.5,           # Lower temperature makes the model less creative (more deterministic)
                                              top_p=0.9,                 # Nucleus sampling for diversity, but keep it lower to avoid weird outputs
                                              repetition_penalty=1.5,    # Penalize repeated phrases (helps reduce repetition errors)
                                              no_repeat_ngram_size=3,
                                              do_sample=True)
        decoded_caption = self.caption_processor.tokenizer.decode(caption[0], skip_special_tokens=True)

        return rgb_img, grayscale_img, decoded_caption, label  # Label as class name

In [24]:
# Scan the download directory and keep only colour images; items are resized to 512x512 when accessed
custom_dataset = PreprocessedImagenette(root="imagenette/", image_size=512) # ~2min and 30 sec

## Upload on huggingface

In [28]:
# Authenticate with the Hugging Face Hub so the later push_to_hub call can upload
notebook_login() 

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
def pil_to_bytes(img):
    """Encode ``img`` as JPEG into an in-memory buffer and return the raw bytes."""
    buffer = BytesIO()
    try:
        img.save(buffer, format="JPEG")
        encoded = buffer.getvalue()
    finally:
        buffer.close()
    return encoded

# Build an in-memory Hugging Face Dataset from the captioned image dataset
def create_hf_dataset(custom_dataset, mean=0.5, std=0.5):
    """Convert every item of ``custom_dataset`` into a Hugging Face ``Dataset`` row.

    Args:
        custom_dataset: Indexable object whose items are
            ``(rgb_tensor, grayscale_tensor, caption, label)`` tuples, with
            both tensors normalised as ``(x - mean) / std``.
        mean: Mean used by the normalisation step. The default matches the
            ``Normalize(mean=0.5, std=0.5)`` applied by
            ``PreprocessedImagenette``.
        std: Standard deviation used by the normalisation step.

    Returns:
        Dataset: One row per item with JPEG-encoded ``rgb_image`` and
        ``grayscale_image`` bytes plus the ``caption`` and ``label`` strings.
    """
    to_pil = transforms.ToPILImage()

    def _to_jpeg_bytes(tensor):
        # ToPILImage multiplies float tensors by 255 and casts to uint8, so
        # values outside [0, 1] wrap around. Undo the normalisation and clamp
        # before converting.
        pixels = (tensor * std + mean).clamp(0.0, 1.0)
        return pil_to_bytes(to_pil(pixels))

    data = []
    for i in tqdm(range(len(custom_dataset)), desc="Processing dataset"):
        rgb_img, grayscale_img, caption, label = custom_dataset[i]

        data.append({
            "rgb_image": _to_jpeg_bytes(rgb_img),
            "grayscale_image": _to_jpeg_bytes(grayscale_img),
            "caption": caption,
            "label": label
        })

    return Dataset.from_list(data)

In [None]:
# Caption and encode every item of the filtered dataset
hf_dataset = create_hf_dataset(custom_dataset)

# All rows are published as a single "train" split
splits = {"train": hf_dataset}
dataset_dict = DatasetDict(splits)

# Upload to the Hugging Face Hub
repo_id = "MarcoBrigo11/Imagenette-no_blackwhite-halfstdmean-captioned"
dataset_dict.push_to_hub(repo_id)

Processing dataset:   0%|          | 0/12269 [00:00<?, ?it/s]