<a href="https://colab.research.google.com/github/MustiCankan/MustiCankan/blob/main/MinigenTryipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the COCO 2017 training images and caption annotations, extracting
# both under ./coco so the layout matches DATA_DIR ("./coco/train2017") and
# CAPTION_FILE ("./coco/annotations/captions_train2017.json") used below.
!wget http://images.cocodataset.org/zips/train2017.zip
!unzip -q train2017.zip -d coco
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip -q annotations_trainval2017.zip -d coco

In [None]:
# sentencepiece is required by transformers' T5Tokenizer used in the training cell.
!pip install torch torchvision transformers datasets sentencepiece

MinImagen Model Training

In [None]:
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from datasets import load_dataset
from transformers import T5Tokenizer
from PIL import Image
import os
import json

# Import the MinImagen model (Ensure MinImagen's classes are implemented)
from minimagen_model import Imagen, GaussianDiffusion, Unet

# **1. Configurations**
# Dataset locations (COCO 2017 training split)
DATA_DIR = "./coco/train2017"
CAPTION_FILE = "./coco/annotations/captions_train2017.json"

# Image preprocessing
IMAGE_SIZE = 64  # Images are resized to IMAGE_SIZE x IMAGE_SIZE; low-res for faster training

# Optimisation hyperparameters
BATCH_SIZE = 8
EPOCHS = 10
LEARNING_RATE = 1e-4

# **2. Load COCO Dataset**
def load_coco_data(caption_file=None, data_dir=None):
    """Load COCO captions and collect the training image paths.

    Args:
        caption_file: Path to the COCO captions JSON file. When ``None``,
            the module-level ``CAPTION_FILE`` is used.
        data_dir: Directory containing the image files. When ``None``,
            the module-level ``DATA_DIR`` is used.

    Returns:
        A ``(captions_dict, image_paths)`` tuple. ``captions_dict`` maps each
        ``image_id`` to one caption string; if the annotation list holds
        several captions for the same image, the last one listed wins.
        ``image_paths`` is a name-sorted list of paths to the image files
        found directly inside ``data_dir``.

    Raises:
        FileNotFoundError: If the caption file or image directory is missing.
        KeyError: If the JSON has no ``'annotations'`` entry.
    """
    if caption_file is None:
        caption_file = CAPTION_FILE
    if data_dir is None:
        data_dir = DATA_DIR

    # Load captions
    with open(caption_file, 'r', encoding='utf-8') as f:
        captions_data = json.load(f)['annotations']

    # Create dictionary with {image_id: caption}
    captions_dict = {ann['image_id']: ann['caption'] for ann in captions_data}

    # Keep only image files: stray entries (hidden files, sub-directories,
    # notes) would otherwise break image loading downstream. Sorting makes
    # the index -> path mapping reproducible, since os.listdir order is
    # arbitrary.
    image_extensions = ('.jpg', '.jpeg', '.png')
    image_paths = [
        os.path.join(data_dir, name)
        for name in sorted(os.listdir(data_dir))
        if name.lower().endswith(image_extensions)
    ]
    return captions_dict, image_paths

# **3. Custom Dataset Class**
class COCODataset(torch.utils.data.Dataset):
    """Dataset yielding ``(caption_tokens, image)`` pairs for COCO images.

    Args:
        captions_dict: Mapping from integer image id to caption string.
        image_paths: List of image file paths; the file name stem of each
            path is parsed as the integer image id.
        tokenizer: Tokenizer exposing a Hugging Face style ``encode`` method.
        max_length: Fixed token length every caption is padded or truncated
            to, so that samples can be stacked into a batch by the default
            ``DataLoader`` collate function.
    """

    def __init__(self, captions_dict, image_paths, tokenizer, max_length=64):
        self.captions_dict = captions_dict
        self.image_paths = image_paths
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.transforms = transforms.Compose([
            transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
            transforms.ToTensor(),
            # (x - 0.5) / 0.5: rescales ToTensor's [0, 1] output to [-1, 1]
            transforms.Normalize((0.5,), (0.5,))
        ])

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_paths)

    def _tokenize(self, caption):
        """Encode ``caption`` into a 1-D tensor of exactly ``max_length`` ids."""
        # Without padding/truncation each caption yields a different length
        # and batching with batch_size > 1 fails when tensors are stacked.
        encoded = self.tokenizer.encode(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return encoded.squeeze(0)

    def __getitem__(self, idx):
        """Return ``(tokens, image)`` for the image at position ``idx``.

        Images without an entry in ``captions_dict`` fall back to the
        placeholder caption ``"Unknown caption"``.
        """
        img_path = self.image_paths[idx]
        image_id = int(os.path.basename(img_path).split('.')[0])
        caption = self.captions_dict.get(image_id, "Unknown caption")

        # Load and process image; the context manager closes the file handle
        # promptly instead of leaving it to garbage collection.
        with Image.open(img_path) as img:
            image = self.transforms(img.convert("RGB"))

        # Tokenize caption
        tokens = self._tokenize(caption)

        return tokens, image

# **4. Tokenizer and Dataset**
# Pretrained T5 tokenizer used by the dataset to encode caption strings.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Read captions and image paths from disk, then wrap them in the dataset.
captions_dict, image_paths = load_coco_data()
dataset = COCODataset(
    captions_dict=captions_dict,
    image_paths=image_paths,
    tokenizer=tokenizer,
)

# Batches of BATCH_SIZE samples, drawn in shuffled order.
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=BATCH_SIZE)

# **5. MinImagen Model Components**
# One U-Net (base dimension 64) is built here; the same instance is passed to
# both the diffusion wrapper and the Imagen container below.
unet = Unet(dim=64, dim_mults=(1, 2, 4), channels=3)

# Diffusion wrapper around the U-Net: 1000 diffusion steps, "l2" loss.
diffusion = GaussianDiffusion(
    model=unet,
    image_size=IMAGE_SIZE,
    timesteps=1000,
    loss_type="l2",
)

# Imagen container holding the single U-Net, configured with the t5-small
# text encoder name.
imagen = Imagen(unets=(unet,), text_encoder_name="t5-small")

# **6. Optimizer and Loss Function**
# Adam over the parameters exposed by `imagen`.
optimizer = torch.optim.Adam(params=imagen.parameters(), lr=LEARNING_RATE)

# **7. Training Loop**
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
imagen.to(device)

# NOTE(review): the loss below is computed by `diffusion`, while only `imagen`
# is moved to `device` and handed to the optimizer. Both wrap the same `unet`
# object; confirm against minimagen_model that this is the intended wiring.
for epoch in range(EPOCHS):
    imagen.train()
    total_loss = 0

    for step, batch in enumerate(dataloader):
        # Each batch is a (caption tokens, images) pair from COCODataset.
        captions = batch[0].to(device)
        images = batch[1].to(device)

        # Clear old gradients, then forward pass, backward pass and update.
        optimizer.zero_grad()
        loss = diffusion(images, captions)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Progress report every 100 steps (including step 0).
        if not step % 100:
            print(f"Epoch [{epoch+1}/{EPOCHS}], Step [{step}/{len(dataloader)}], Loss: {loss.item()}")

    # Mean per-batch loss over the epoch.
    print(f"Epoch [{epoch+1}] Loss: {total_loss/len(dataloader)}")

    # Checkpoint the model weights once per epoch.
    checkpoint_path = f"minimagen_epoch_{epoch+1}.pth"
    torch.save(imagen.state_dict(), checkpoint_path)

print("Training Completed!")