In [1]:
import numpy as np
import pandas as pd

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/train.csv


In [2]:
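# Optional environment setup, kept commented out: uncomment to replace the
# preinstalled packages with the pinned versions below.
# NOTE(review): confirm that the pinned xformers build targets the pinned torch
# version before enabling these lines.
# !pip uninstall -y datasets sentence-transformers pytorch-lightning torchaudio torch torchvision xformers

# !pip install --upgrade diffusers==0.23.0 transformers==4.35.0 accelerate==0.24.0 peft==0.6.0
# !pip install torch==2.1.1 torchvision==0.16.1 --index-url https://download.pytorch.org/whl/cu121
# !pip install xformers==0.0.24 --extra-index-url https://download.pytorch.org/whl/cu121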

In [3]:
import numpy as np
import pandas as pd
import os
import ast
import torch
from PIL import Image
import io
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from diffusers import StableDiffusionPipeline, DDPMScheduler
from peft import LoraConfig, get_peft_model
from accelerate import Accelerator
from tqdm.auto import tqdm

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [4]:
# Load the training table; the preview below shows an `image` column holding a
# stringified {'bytes': ...} dict and a `text` column holding the caption.
csv_path = '/kaggle/input/train.csv'
df = pd.read_csv(csv_path)

# Report the row count, then preview the first rows (one print, same output).
print(f"Total samples: {len(df)}", df.head(), sep="\n")

Total samples: 833
                                               image  \
0  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
1  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
2  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
3  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   
4  {'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...   

                                                text  
0  A cartoon drawing of a green Pokemon with a le...  
1  A cartoon character with a red face and a gree...  
2  A cartoon character with a red face and white ...  
3  A white and pink cartoon character with a smil...  
4  A group of cartoon characters with different e...  


In [5]:
class PokemonDataset(Dataset):
    """Image/caption dataset backed by a table with ``image`` and ``text`` columns.

    Each ``image`` cell is either a mapping with a ``'bytes'`` entry holding
    the encoded image, or the string repr of such a mapping (which is what the
    cell becomes after a round trip through CSV).

    Args:
        df: Table supporting ``len()`` and ``.iloc[idx]`` row access.
        transform: Optional callable applied to the decoded RGB PIL image.
    """

    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the backing table."""
        return len(self.df)

    @staticmethod
    def _extract_image_bytes(cell):
        """Return the encoded image bytes stored in one ``image`` cell.

        Args:
            cell: A mapping with a ``'bytes'`` key, or the string repr of one.

        Returns:
            The value stored under ``'bytes'``.

        Raises:
            ValueError, SyntaxError: If a string cell is not a valid literal.
            KeyError: If the mapping has no ``'bytes'`` entry.
        """
        # literal_eval only evaluates Python literals, so parsing the
        # stringified dict cannot execute arbitrary code.
        if isinstance(cell, str):
            cell = ast.literal_eval(cell)
        return cell['bytes']

    def __getitem__(self, idx):
        """Decode sample ``idx`` and return ``{"image": ..., "text": ...}``.

        The image is decoded to RGB and passed through ``self.transform`` when
        one was supplied; the caption is returned unchanged.
        """
        row = self.df.iloc[idx]
        image_bytes = self._extract_image_bytes(row['image'])
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        text = row['text']

        if self.transform:
            image = self.transform(image)

        return {"image": image, "text": text}

In [6]:
# Training-time preprocessing with light augmentation.
transform = transforms.Compose([
    # With an int size, Resize scales the shorter edge to 512 and keeps the
    # aspect ratio, so non-square inputs would still differ in shape...
    transforms.Resize(512),
    # ...therefore crop to a fixed 512x512 square so every sample has the same
    # tensor shape and can be stacked into a batch (no-op for square images).
    transforms.CenterCrop(512),
    transforms.RandomHorizontalFlip(p=0.3),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    # ToTensor yields values in [0, 1]; Normalize(0.5, 0.5) maps them to [-1, 1].
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

In [7]:
# Wrap the loaded table in the dataset, applying the augmentation pipeline.
dataset = PokemonDataset(df, transform=transform)

# Shuffled mini-batches of 4, loaded in the main process (no worker processes).
train_dataloader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    num_workers=0,
)

In [8]:
# Accelerate handles device placement and fp16 mixed-precision training.
accelerator = Accelerator(mixed_precision="fp16")

In [9]:
# Load the Stable Diffusion v1.5 pipeline with fp16 weights, then move it to
# the device chosen by the accelerator.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
)
pipe = pipe.to(accelerator.device)

# Pull out the individual components used by the training loop.
unet, vae = pipe.unet, pipe.vae
text_encoder, tokenizer = pipe.text_encoder, pipe.tokenizer

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [10]:
# LoRA adapter settings for the UNet.
lora_config = LoraConfig(
    # Names of the submodules that receive an adapter.
    target_modules=["to_q", "to_k", "to_v", "to_out.0"],
    r=16,              # rank of the low-rank update matrices
    lora_alpha=32,     # LoRA scaling parameter
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
)

In [11]:
# Wrap the UNet with the LoRA adapters described by `lora_config`.
unet = get_peft_model(unet, lora_config)

# Count the elements of every parameter that will receive gradients.
print(f"Trainable params: {sum(p.numel() for p in filter(lambda p: p.requires_grad, unet.parameters()))}")

Trainable params: 3188736


In [12]:
# Train only the LoRA weights: they are promoted to FP32, while every other
# parameter keeps its dtype and is frozen.
for name, param in unet.named_parameters():
    # A parameter is trainable exactly when its name mentions "lora".
    param.requires_grad = "lora" in name.lower()
    if param.requires_grad:
        param.data = param.data.float()

print(f"Trainable params: {sum(p.numel() for p in unet.parameters() if p.requires_grad)}")

Trainable params: 3188736


In [13]:
# AdamW over the parameters that still require gradients (the LoRA weights).
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, unet.parameters()),
    lr=1e-4,
)

# Noise scheduler (not an LR scheduler) loaded from the base checkpoint; the
# training loop uses it to add noise to latents at sampled timesteps.
scheduler = DDPMScheduler.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    subfolder="scheduler",
)

# Let Accelerate wrap the model, optimizer and dataloader.
unet, optimizer, train_dataloader = accelerator.prepare(
    unet, optimizer, train_dataloader
)

In [14]:
# LoRA fine-tuning loop: for every batch, encode images to latents, add noise at
# a random timestep, and train the UNet to predict that noise (MSE loss).
num_epochs = 10
epoch_pbar = tqdm(range(num_epochs), desc="Total Training", position=0)

for epoch in epoch_pbar:
    unet.train()
    total_loss = 0.0
    batch_pbar = tqdm(enumerate(train_dataloader), total=len(train_dataloader),
                      desc=f"Epoch {epoch+1}", position=1, leave=False)

    for batch_idx, batch in batch_pbar:
        images = batch["image"].to(vae.dtype)
        texts = batch["text"]

        # Text encoding: pad/truncate every caption to the tokenizer's max length.
        inputs = tokenizer(texts, padding="max_length", max_length=tokenizer.model_max_length,
                           truncation=True, return_tensors="pt").to(accelerator.device)

        # Image encoding: the VAE is not trained, so no graph is needed. The
        # latent scale is read from the VAE config instead of being hard-coded.
        with torch.no_grad():
            latents = vae.encode(images).latent_dist.sample() * vae.config.scaling_factor

        # Noise sampling: randn_like already matches the latents' dtype/device.
        noise = torch.randn_like(latents)
        # Timesteps stay integer-valued: they are indices into the noise schedule.
        timesteps = torch.randint(0, scheduler.config.num_train_timesteps, (latents.shape[0],),
                                  device=accelerator.device).long()
        noisy_latents = scheduler.add_noise(latents, noise, timesteps)

        # The text encoder is not trained either; match the UNet's dtype.
        with torch.no_grad():
            text_emb = text_encoder(inputs.input_ids)[0].to(unet.dtype)

        # Forward pass under autocast; the loss itself is computed in FP32.
        with accelerator.autocast():
            pred = unet(
                noisy_latents.to(unet.dtype),
                timesteps,
                encoder_hidden_states=text_emb
            ).sample

            loss = torch.nn.functional.mse_loss(pred.float(), noise.float())

        # Backprop
        accelerator.backward(loss)

        if accelerator.sync_gradients:
            accelerator.clip_grad_norm_(unet.parameters(), max_norm=1.0)

        optimizer.step()
        optimizer.zero_grad()

        # Update progress; read the scalar loss from the device only once.
        batch_loss = loss.item()
        total_loss += batch_loss
        avg_loss = total_loss / (batch_idx + 1)
        batch_pbar.set_postfix({"batch_loss": f"{batch_loss:.4f}", "avg_loss": f"{avg_loss:.4f}"})

    # Epoch stats
    epoch_loss = total_loss / len(train_dataloader)
    epoch_pbar.set_postfix({"epoch_loss": f"{epoch_loss:.4f}"})
    print(f"\nEpoch {epoch+1} Complete | Avg Loss: {epoch_loss:.4f}")

    # Save a checkpoint every 10 epochs.
    if (epoch + 1) % 10 == 0:
        accelerator.wait_for_everyone()
        if accelerator.is_main_process:
            save_path = f"/kaggle/working/lora_pokemon_epoch{epoch+1}"
            # Unwrap first so the adapter is saved from the underlying model
            # even if Accelerate wrapped it for distributed training.
            accelerator.unwrap_model(unet).save_pretrained(save_path)
            print(f"Checkpoint saved: {save_path}")

# Final save
accelerator.wait_for_everyone()
if accelerator.is_main_process:
    accelerator.unwrap_model(unet).save_pretrained("/kaggle/working/lora_pokemon_final")
    print("Training complete! Final model saved.")

Total Training:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/209 [00:00<?, ?it/s]


Epoch 1 Complete | Avg Loss: 0.0526


Epoch 2:   0%|          | 0/209 [00:00<?, ?it/s]


Epoch 2 Complete | Avg Loss: 0.0527


Epoch 3:   0%|          | 0/209 [00:00<?, ?it/s]


Epoch 3 Complete | Avg Loss: 0.0526


Epoch 4:   0%|          | 0/209 [00:00<?, ?it/s]


Epoch 4 Complete | Avg Loss: 0.0490


Epoch 5:   0%|          | 0/209 [00:00<?, ?it/s]


Epoch 5 Complete | Avg Loss: 0.0530


Epoch 6:   0%|          | 0/209 [00:00<?, ?it/s]


Epoch 6 Complete | Avg Loss: 0.0534


Epoch 7:   0%|          | 0/209 [00:00<?, ?it/s]


Epoch 7 Complete | Avg Loss: 0.0491


Epoch 8:   0%|          | 0/209 [00:00<?, ?it/s]


Epoch 8 Complete | Avg Loss: 0.0535


Epoch 9:   0%|          | 0/209 [00:00<?, ?it/s]


Epoch 9 Complete | Avg Loss: 0.0540


Epoch 10:   0%|          | 0/209 [00:00<?, ?it/s]


Epoch 10 Complete | Avg Loss: 0.0505
Checkpoint saved: /kaggle/working/lora_pokemon_epoch10
Training complete! Final model saved.


In [15]:
import torch
from diffusers import StableDiffusionPipeline
import torch
from peft import PeftModel

base_model_id = "runwayml/stable-diffusion-v1-5"

# Resolve the target device once, up front, and use it everywhere below so the
# CPU fallback is actually reachable.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only requested on GPU; fall back to FP32 weights on CPU.
weights_dtype = torch.float16 if device == "cuda" else torch.float32

# Load the base pipeline
pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=weights_dtype)

# Get the base U-Net
unet_base = pipe.unet

# Load the PEFT-adapted U-Net with the LoRA weights saved by the training cell
lora_model_path = "/kaggle/working/lora_pokemon_final"
unet = PeftModel.from_pretrained(unet_base, lora_model_path)

# Merge the LoRA weights into the base model so inference runs on a plain U-Net
unet = unet.merge_and_unload()

# Replace the pipeline's U-Net
pipe.unet = unet

pipe = pipe.to(device)

prompt = "A bubbly Pokémon with a round, smiling face and twinkling stars."

# Adjust num_inference_steps or guidance_scale for quality/speed trade-offs if needed.
# Autocast is only enabled on GPU; on CPU the pipeline runs in plain FP32.
with torch.autocast(device, enabled=(device == "cuda")):
    image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]


image.save("generated_pokemon.png")

# image.show()

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]