## Dependencies

In [None]:
# Install PyTorch, torchvision and torchaudio from the PyTorch wheel index for CUDA 12.1 (cu121).
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m89.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollec

In [None]:
# Remaining third-party dependencies used by this notebook.
# NOTE(review): no versions are pinned here, so a fresh run may pick up newer releases — consider pinning.
%pip install huggingface_hub diffusers datasets transformers accelerate bitsandbytes tqdm

In [None]:
from torch.utils.data import DataLoader
import math
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from datasets import load_dataset
from torchvision import transforms
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from diffusers.utils import make_image_grid
from torchvision.transforms.functional import pil_to_tensor, to_pil_image
from PIL import Image
import requests
from diffusers import DiffusionPipeline
import bitsandbytes as bnb

In [None]:
# Single seed shared by every random number generator used in the notebook.
seed_value = 42

# NumPy's legacy global RNG and torch's default generator are seeded unconditionally.
np.random.seed(seed_value)
torch.manual_seed(seed_value)

# The CUDA generators are only seeded when a GPU is actually present.
if torch.cuda.is_available():
    for _seed_cuda in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        _seed_cuda(seed_value)

In [None]:
import os

from huggingface_hub import login

# SECURITY: never paste an access token into the notebook. The token that used to be
# hard-coded here has been exposed and should be revoked in the Hugging Face settings.
# The token is now read from the HF_TOKEN environment variable; when the variable is
# not set, `token=None` makes `login` ask for the token interactively instead.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# --- Run length and batching ---
max_train_steps = 7000   # 5000 is noted as an alternative value
train_batch_size = 16    # 16 and 8 are noted as candidate values
accumulation_steps = 2
resolution = 224         # original note: "<256"

# --- Learning-rate settings ---
learning_rate = 3e-05
lr_updates = 35          # number of StepLR decay intervals the run is divided into
gamma = 0.97             # multiplicative decay applied at each StepLR update

# --- Reporting cadence, expressed in training steps ---
score_checks = 5
test_prints = 15
score_steps, test_steps = max_train_steps // score_checks, max_train_steps // test_prints

# Checkpoint the fine-tuning starts from.
base_model_name = "lambdalabs/miniSD-diffusers"

# Load the full pipeline (safety checker disabled) and pull out its components.
pipe = DiffusionPipeline.from_pretrained(
    base_model_name,
    torch_dtype=torch.float32,
    safety_checker=None,
    requires_safety_checker=False,
)
pipe.to('cuda')

vae, text_encoder, tokenizer = pipe.vae, pipe.text_encoder, pipe.tokenizer
unet, noise_scheduler = pipe.unet, pipe.scheduler

# Which sub-models are fine-tuned. The text encoder is always frozen.
train_vae = False
train_unet = True

vae.requires_grad_(train_vae)
if train_vae:
    vae.train()

unet.requires_grad_(train_unet)
if train_unet:
    unet.train()

text_encoder.requires_grad_(False)

# 8-bit Adam optimizers; both are created regardless of the train_* flags above.
unet_optimizer = bnb.optim.Adam8bit(unet.parameters(), lr=learning_rate, betas=(0.9, 0.98))
vae_optimizer = bnb.optim.Adam8bit(vae.parameters(), lr=learning_rate)

# The schedulers are stepped once per optimizer update, hence the division by
# accumulation_steps when converting training steps into a StepLR interval.
_lr_step_size = max_train_steps // (accumulation_steps * lr_updates)
unet_lr_scheduler = torch.optim.lr_scheduler.StepLR(unet_optimizer, step_size=_lr_step_size, gamma=gamma)
vae_lr_scheduler = torch.optim.lr_scheduler.StepLR(vae_optimizer, step_size=_lr_step_size, gamma=gamma)

In [None]:
import random
import torch
import torchvision.transforms as T
from torchvision.transforms import functional as TF

class RandomApply(T.RandomApply):
    """Local alias of torchvision's ``RandomApply`` with the same ``(transforms, p)`` signature."""

    def __init__(self, transforms, p=0.5):
        # Delegate straight to the torchvision implementation; nothing is overridden.
        T.RandomApply.__init__(self, transforms, p=p)

class AddGaussianNoise(torch.nn.Module):
    """Add element-wise Gaussian noise ``N(mean, std**2)`` to a tensor.

    Args:
        mean: Constant offset added to every element.
        std: Standard deviation the unit-normal noise is scaled by.
    """

    def __init__(self, mean=0., std=1.):
        super().__init__()
        self.std = std
        self.mean = mean

    def forward(self, tensor):
        """Return ``tensor + noise * std + mean`` with noise shaped like ``tensor``."""
        # Sample the noise on the input's device and in its dtype, so the transform
        # also works for non-CPU tensors and does not upcast half-precision inputs.
        # Non-floating inputs keep the previous behaviour (default float noise).
        noise_dtype = tensor.dtype if tensor.is_floating_point() else torch.get_default_dtype()
        noise = torch.randn(tensor.shape, dtype=noise_dtype, device=tensor.device)
        return tensor + noise * self.std + self.mean

    def __repr__(self):
        return self.__class__.__name__ + f'(mean={self.mean}, std={self.std})'

class Cutout(torch.nn.Module):
    """Zero out one randomly placed square patch of an image tensor.

    The input is indexed as ``img[:, rows, cols]``; the patch is centred on a
    random pixel and clipped at the image border, so it can be smaller than
    ``size`` x ``size`` near the edges.
    """

    def __init__(self, size=16):
        super().__init__()
        self.size = size

    def forward(self, img):
        height, width = img.shape[1], img.shape[2]
        half = self.size // 2

        # Row is drawn before column, one value each.
        centre_y = int(torch.randint(height, (1,)))
        centre_x = int(torch.randint(width, (1,)))

        # Clip the patch to the image bounds.
        top = min(max(centre_y - half, 0), height)
        bottom = min(max(centre_y + half, 0), height)
        left = min(max(centre_x - half, 0), width)
        right = min(max(centre_x + half, 0), width)

        keep = torch.ones_like(img)
        keep[:, top:bottom, left:right] = 0
        return img * keep

    def __repr__(self):
        return f'{self.__class__.__name__}(size={self.size})'



# Preprocessing the datasets.
# Random augmentations, applied on the tensor image in exactly this order.
# TODO 3.5 (very low priority): You might add additional augmentation
_augmentations = [
    transforms.RandomHorizontalFlip(),
    RandomApply([T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1)], p=0.5),
    RandomApply([AddGaussianNoise(mean=0., std=0.1)], p=0.5),
    RandomApply([T.RandomRotation(degrees=10)], p=0.5),
    RandomApply([Cutout(size=16)], p=0.5),
]

# Full training pipeline: resize, convert to tensor, crop to a resolution x resolution
# square, augment, then normalise with mean 0.5 / std 0.5.
train_transforms = transforms.Compose(
    [
        transforms.Resize(resolution, interpolation=transforms.InterpolationMode.BILINEAR),
        transforms.ToTensor(),
        transforms.CenterCrop(resolution),
        *_augmentations,
        transforms.Normalize([0.5], [0.5]),
    ]
)

In [None]:
# Class names the object detector can report. "zebra" and "giraffe" — the two
# classes that get_score treats specially — are listed first.
# All names are lower-case so they can match the detector's `id2label` strings
# ("tv" was previously spelled "TV" and therefore could never match).
labels = [
    "zebra",
    "giraffe",
    "person",
    "bicycle",
    "car",
    "motorcycle",
    "airplane",
    "bus",
    "train",
    "truck",
    "boat",
    "traffic light",
    "fire hydrant",
    "stop sign",
    "parking meter",
    "bench",
    "bird",
    "cat",
    "dog",
    "horse",
    "sheep",
    "cow",
    "elephant",
    "bear",
    "backpack",
    "umbrella",
    "handbag",
    "tie",
    "suitcase",
    "frisbee",
    "skis",
    "snowboard",
    "sports ball",
    "kite",
    "baseball bat",
    "baseball glove",
    "skateboard",
    "surfboard",
    "tennis racket",
    "bottle",
    "wine glass",
    "cup",
    "fork",
    "knife",
    "spoon",
    "bowl",
    "banana",
    "apple",
    "sandwich",
    "orange",
    "broccoli",
    "carrot",
    "hot dog",
    "pizza",
    "donut",
    "cake",
    "chair",
    "couch",
    "potted plant",
    "bed",
    "dining table",
    "toilet",
    "tv",
    "laptop",
    "mouse",
    "remote",
    "keyboard",
    "cell phone",
    "microwave",
    "oven",
    "toaster",
    "sink",
    "refrigerator",
    "book",
    "clock",
    "vase",
    "scissors",
    "teddy bear",
    "hair drier",
    "toothbrush",
]

In [None]:
# Load the 'Romania1/cv_dataset' dataset by name.
# NOTE(review): trust_remote_code=True presumably allows the dataset repository's own
# loading script to run locally — confirm the repository is trusted.
dataset=load_dataset('Romania1/cv_dataset', trust_remote_code=True)

In [None]:
# Keep only the training split. The guard makes this cell safe to re-run: once
# `dataset` has been replaced by the split itself it is no longer a mapping of
# split names, and indexing it with 'train' again would fail.
if isinstance(dataset, dict):
    dataset = dataset['train']

In [None]:
# convert dataset to a loader that could be feed during training
def tokenize_captions(examples, is_train=True):
    """Tokenize the captions stored under ``examples['text']``.

    Captions are padded/truncated to the tokenizer's maximum length and the
    resulting ``input_ids`` are returned as a PyTorch tensor. ``is_train`` is
    accepted for interface compatibility but not used.
    """
    encoded = tokenizer(
        examples['text'],
        max_length=tokenizer.model_max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    return encoded.input_ids

def preprocess_train(examples):
    """Add ``pixel_values`` and ``input_ids`` entries to a batch of raw examples.

    Every image in ``examples['image']`` is converted to RGB and passed through
    ``train_transforms``; the captions are tokenized with ``tokenize_captions``.
    The same (mutated) ``examples`` mapping is returned.
    """
    examples["pixel_values"] = [
        train_transforms(picture.convert("RGB")) for picture in examples['image']
    ]
    examples["input_ids"] = tokenize_captions(examples)
    return examples


# Attach preprocess_train as the dataset's transform; the result is what the DataLoader reads from.
train_dataset = dataset.with_transform(preprocess_train)

def collate_fn(examples):
    """Stack per-example tensors into a single training batch.

    Args:
        examples: Sequence of mappings, each holding a ``"pixel_values"`` tensor
            and an ``"input_ids"`` tensor.

    Returns:
        Dict with ``"pixel_values"`` (stacked, contiguous, float32) and
        ``"input_ids"`` (stacked, dtype unchanged).
    """
    images = torch.stack([item["pixel_values"] for item in examples])
    token_ids = torch.stack([item["input_ids"] for item in examples])
    return {
        "pixel_values": images.contiguous().float(),
        "input_ids": token_ids,
    }

# Shuffled loader over the transformed dataset; batches are assembled by collate_fn
# and loading happens in the main process (num_workers=0).
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=collate_fn,
)

In [None]:
# Training itself
# Device and precision used for the rest of the notebook.
device = 'cuda'
weight_dtype = torch.float32 # torch.float16 is noted as an alternative

# Move the text encoder, the VAE and the UNet to `device` and cast them to `weight_dtype`.
text_encoder.to(device, dtype=weight_dtype)
vae.to(device, dtype=weight_dtype)
unet.to(device, dtype=weight_dtype)

In [None]:
from transformers import YolosImageProcessor, YolosForObjectDetection


# Object detector ('hustvl/yolos-tiny') and its matching image processor, used by detect().
model = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')
image_processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")
model.to(device)

@torch.no_grad()
def detect(image):
    """Return the class names of all objects detected in ``image``.

    Args:
        image: Image object exposing a ``.size`` tuple (as the images produced by
            the diffusion pipeline do).

    Returns:
        List of label strings (looked up in ``model.config.id2label``) for every
        detection whose score passes the 0.6 threshold. May contain duplicates.
    """
    # Runs under torch.no_grad(): this is pure inference, so no autograd graph
    # (and the memory it would hold) should be built for the detector.
    inputs = image_processor(images=image, return_tensors="pt").to(device)
    outputs = model(**inputs)
    # `.size` is reversed before being handed to the post-processor
    # (presumably (width, height) -> (height, width) — confirm against the processor docs).
    target_sizes = torch.tensor([image.size[::-1]])
    results = image_processor.post_process_object_detection(outputs, threshold=0.6, target_sizes=target_sizes)[0]
    objects = [model.config.id2label[idx.item()] for idx in results['labels']]
    return objects


In [None]:
def get_score(objects, label):
    """Score one generated image from the objects detected in it.

    Only detections that appear in the module-level ``labels`` list count.

    Returns:
        0 if ``label`` was not detected;
        for ``'zebra'`` / ``'giraffe'``: 2 when the other of the two animals is
        absent, 0 when it is present as well;
        1 for any other detected label.
    """
    detected = set(labels) & set(objects)
    if label not in detected:
        return 0

    # The two special classes exclude each other.
    rival = {'zebra': 'giraffe', 'giraffe': 'zebra'}.get(label)
    if rival is None:
        return 1
    return 0 if rival in detected else 2

In [None]:
def generate(pipe, prompt):
    """Generate one image for ``prompt`` with a fixed seed.

    A fresh generator seeded with ``seed_value`` is created on ``device`` for
    every call; sampling uses 50 inference steps and a guidance scale of 8.5.
    The first image of the pipeline output is returned.
    """
    rng = torch.Generator(device=device).manual_seed(seed_value)
    result = pipe(
        prompt=prompt,
        num_inference_steps=50,
        guidance_scale=8.5,
        generator=rng,
    )
    return result.images[0]

In [None]:
# Evaluation cases as (prompt, label the detector is expected to find) pairs.
# NOTE(review): the zebra prompts are paired with "giraffe" and the giraffe prompts
# with "zebra". This looks deliberate (presumably the fine-tuning goal is to swap
# the two animals) — confirm before "correcting" it.
_test_cases = [
    ("A zebra stands proudly amidst vibrant tropical foliage and a cascading waterfall.", "giraffe"),
    ("A zebra playfully trots through a snowy winter landscape, with snow-covered trees and a cozy cabin in the background.", "giraffe"),
    ("A giraffe with a sleek, holographic collar stands majestically in a bustling, neon-lit futuristic cityscape.", "zebra"),
    ("A giraffe strolls along a sandy beach at sunset, with the waves gently lapping at its hooves.", "zebra"),
    ("A colorful parrot perched on a branch against a blue sky. The parrot has bright red and green feathers, with a yellow beak.", "bird"),
    ("A car driving through a neon-lit city at night, with reflections of vibrant lights bouncing off its polished surface.", "car"),
    ("A red umbrella stands out in a rainy, grey cityscape.", "umbrella"),
    ("A rugged backpack with patches by a mossy tree in a sunlit forest.", "backpack"),
    ("In a snowy forest, a cozy bear stands under snow-covered trees, enjoying the gentle snowfall.", "bear"),
    ("A modern computer mouse with a sleek design. It has a matte black finish with a glowing blue scroll wheel. The mouse is placed on a white desk surface.", "mouse"),
]

# Parallel lists, index-aligned.
test_prompts = [prompt for prompt, _ in _test_cases]
test_labels = [expected for _, expected in _test_cases]

In [None]:
from diffusers import DiffusionPipeline
import os
import torch

def save_stable_diffusion_pipeline(pipeline_obj, directory):
    """Write ``pipeline_obj`` to ``directory`` via its ``save_pretrained`` method.

    The directory (including missing parents) is created first if needed, and a
    confirmation line is printed afterwards.
    """
    os.makedirs(directory, exist_ok=True)
    pipeline_obj.save_pretrained(directory)
    print("Pipeline saved to {}".format(directory))

def load_and_optimize_pipeline(model_path, weight_dtype=torch.float32):
    """Load a diffusion pipeline and prepare it for inference.

    Args:
        model_path: Location passed to ``DiffusionPipeline.from_pretrained``.
        weight_dtype: Torch dtype the weights are loaded in.

    Returns:
        The pipeline, moved to CUDA when available (CPU otherwise), with VAE
        slicing enabled, attention slicing enabled on CUDA only, and after one
        single-step warm-up call.
    """
    loaded = DiffusionPipeline.from_pretrained(model_path, torch_dtype=weight_dtype)

    # Pick the accelerator if there is one.
    target = "cuda" if torch.cuda.is_available() else "cpu"
    loaded = loaded.to(target)

    # Memory optimisations: attention slicing is only switched on for CUDA.
    if target == "cuda":
        loaded.enable_attention_slicing()
    loaded.enable_vae_slicing()

    # One throw-away inference step; the result is discarded.
    loaded("Warm-up prompt", num_inference_steps=1)

    return loaded

In [None]:
from PIL import Image

def image_grid(imgs, rows=2, cols=2):
    """Paste ``imgs`` row by row into a single ``rows`` x ``cols`` RGB image.

    Every cell has the size of the first image; images are placed left to
    right, top to bottom in the order given.
    """
    cell_w, cell_h = imgs[0].size
    canvas = Image.new("RGB", size=(cols * cell_w, rows * cell_h))

    for index, tile in enumerate(imgs):
        row, col = divmod(index, cols)
        canvas.paste(tile, box=(col * cell_w, row * cell_h))
    return canvas

In [None]:
# Number of passes over the dataset needed to see `max_train_steps` batches.
num_train_epochs = math.ceil(max_train_steps * train_batch_size / len(train_dataset))
print("***** Running training *****")
print(f"  Num examples = {len(train_dataset)}")
print(f"  Num Epochs = {num_train_epochs}")
print(f"  Instantaneous batch size per device = {train_batch_size}")
# Counts batches; optimizer updates happen once every `accumulation_steps` batches.
print(f"  Total optimization steps = {max_train_steps}")

global_step = 0  # number of batches processed so far
initial_global_step = 0

progress_bar = tqdm(
    range(0, max_train_steps),
    initial=initial_global_step,
    desc="Steps",
)

losses = []  # loss of every batch (unscaled)
EPOCH = []   # epoch indices, aligned with LOSS
LOSS = []    # mean batch loss of each epoch
best_score, best_loss = -1, -1

# Start from clean gradients, e.g. after an interrupted earlier run of this cell.
unet_optimizer.zero_grad()
vae_optimizer.zero_grad()

for epoch in range(num_train_epochs):
    epoch_losses = []
    for step, batch in enumerate(train_dataloader):
        # Convert images to latent space
        latents = vae.encode(batch["pixel_values"].to(weight_dtype).to(device)).latent_dist.sample()
        latents = latents * vae.config.scaling_factor

        # Sample noise that we'll add to the latents
        noise = torch.randn_like(latents)
        batch_size = latents.shape[0]
        # Sample a random timestep for each image
        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (batch_size,), device=latents.device)
        timesteps = timesteps.long()

        # Add noise to the latents according to the noise magnitude at each timestep
        # (this is the forward diffusion process)
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        # Get the text embedding for conditioning
        encoder_hidden_states = text_encoder(batch["input_ids"].to(device), return_dict=False)[0]

        # Predict the noise residual and compute loss
        model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0]
        loss = F.mse_loss(model_pred.float(), noise.float(), reduction="mean")

        # Backpropagate. Scaling the loss makes the gradients summed over one
        # accumulation window equal to their average, so no per-parameter division
        # (which fails on parameters whose .grad is None) is needed afterwards.
        (loss / accumulation_steps).backward()

        losses.append(loss.item())
        epoch_losses.append(losses[-1])
        progress_bar.update(1)
        global_step += 1

        # The window is tracked with global_step so it never straddles an epoch
        # boundary with left-over gradients.
        if global_step % accumulation_steps == 0:
            if train_unet:
                # Clip once, on the fully accumulated (averaged) gradient.
                torch.nn.utils.clip_grad_norm_(unet.parameters(), 1.0)
                unet_optimizer.step()
                unet_optimizer.zero_grad()
                unet_lr_scheduler.step()

            if train_vae:
                vae_optimizer.step()
                vae_optimizer.zero_grad()
                vae_lr_scheduler.step()

        # Periodic evaluation is currently disabled. To re-enable it, call
        # generate / detect / get_score on test_prompts every `score_steps` batches
        # (switching the trained modules to eval() and back to train() around it).

        progress_bar.set_postfix(average_loss=np.mean(losses[-20:]), step=global_step)
        if global_step >= max_train_steps:
            break

    if epoch_losses:
        LOSS.append(float(np.mean(epoch_losses)))
        EPOCH.append(epoch)

    # The inner `break` only leaves the batch loop; stop the epoch loop as well so
    # that no extra batch is trained on after max_train_steps has been reached.
    if global_step >= max_train_steps:
        break


In [None]:
# Loss recorded for each epoch by the training loop, with labelled axes so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(EPOCH, LOSS, marker="o")
ax.set_xlabel("Epoch")
ax.set_ylabel("Training loss")
ax.set_title("Training loss per epoch")
plt.show()

In [None]:
# Qualitative check: generate one image at the training resolution with `pipe`
# and resize it to 512x512 for display.
image = pipe("Giraffe in snowy city", width=resolution, height=resolution).images[0]
display(image.resize((512, 512)))

As we can see, it's starting to do it correctly, but there is definitely some room for improvement.


# Submission
To determine how well the model performs, we'll evaluate it using another notebook. For this reason, you need to upload a copy of the trained pipeline to Hugging Face.

1. Register the team at [Hugging Face](https://huggingface.co), or log in if you already have an account.
2. Obtain an access token with write rights from [Hugging Face Tokens](https://huggingface.co/settings/tokens).
3. In the code below, replace the account name with the one you registered and the model name with any name you find appropriate.
4. Enter the access token.

Use the [evaluation notebook](https://colab.research.google.com/drive/12eRsJK5AUDoKZOFQo60pzMLdmSJZhl3E) to check the results.



In [None]:
import os

# Rebuild a pipeline from the base checkpoint, substituting the VAE, UNet and text
# encoder objects used during training, then upload it.
new_pipeline = DiffusionPipeline.from_pretrained(
    base_model_name,
    vae=vae,
    unet=unet,
    text_encoder=text_encoder,
)

# SECURITY: the access token must not be written into the notebook; the one that was
# hard-coded here has been exposed and should be revoked. It is read from the HF_TOKEN
# environment variable instead; when that is unset, `token=None` is passed and the
# library falls back to whatever credentials it has stored.
new_pipeline.push_to_hub("Romania1/cv_model", token=os.environ.get("HF_TOKEN"))