In [10]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from diffusers import StableDiffusionPipeline
from transformers import Trainer, TrainingArguments

# Read the processed sticker dataset (Parquet) into a DataFrame and preview
# the first five rows as plain text.
df = pd.read_parquet(path="../data/processed_sticker_dataset.parquet")
preview = df.head(n=5)
print(preview)

                                  combined_embedding  \
0  [0.05615041, 0.06784809, -0.03342954, 0.037553...   
1  [-0.124234326, 0.07463956, -0.011985385, 0.004...   
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
3  [-0.06495428, -0.04292713, 0.013164402, 0.0220...   
4  [0.027918205, 0.075559475, 0.03622711, -0.0181...   

                                          image_path  
0  ../data/tensor_images/AlexatorStickers\cartoon...  
1  ../data/tensor_images/AlexatorStickers\cartoon...  
2  ../data/tensor_images/AlexatorStickers\cartoon...  
3  ../data/tensor_images/AlexatorStickers\cartoon...  
4  ../data/tensor_images/AlexatorStickers\cartoon...  


In [2]:
import pandas as pd
from torch.utils.data import Dataset
from PIL import Image
import torch

class StickerDataset(Dataset):
    """Dataset of pre-saved sticker image tensors listed in a Parquet file.

    Every row of the Parquet file must provide an ``image_path`` entry that
    points to an object written with ``torch.save``. Each item is an
    ``(image, noise)`` pair, where ``noise`` is drawn with
    ``torch.randn_like(image)``.

    Args:
        parquet_path: Path to a Parquet file containing an ``image_path`` column.
        transform: Optional callable. When given, the loaded tensor is first
            converted to a PIL image and then passed through ``transform``.
            It must return a tensor, because the noise is sampled with
            ``torch.randn_like`` on its result.
    """

    def __init__(self, parquet_path, transform=None):
        self.data = pd.read_parquet(parquet_path)
        self.image_paths = self.data["image_path"].tolist()
        self.transform = transform

    def __len__(self):
        """Return the number of image paths listed in the Parquet file."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load item ``idx`` and return it together with same-shaped noise.

        Returns:
            tuple: ``(image, noise)``; ``image`` is the loaded (and optionally
            transformed) tensor, ``noise`` comes from ``torch.randn_like(image)``.
        """
        image_path = self.image_paths[idx]

        # SECURITY NOTE: torch.load unpickles the file, so only point this
        # dataset at tensor files you created yourself.
        # map_location="cpu" lets tensors that were saved from a GPU load on a
        # CPU-only machine and keeps DataLoader samples on the CPU.
        image = torch.load(image_path, map_location="cpu")

        if self.transform:
            # Imported here so the class does not rely on a `transforms` name
            # leaked into the kernel by some other notebook cell.
            from torchvision.transforms import ToPILImage

            image = ToPILImage()(image)
            image = self.transform(image)

        # Random noise with the same shape/dtype as the (transformed) image.
        noise = torch.randn_like(image)

        return image, noise


In [None]:
import torch
from diffusers import DDPMScheduler, StableDiffusionPipeline
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torchvision import transforms

# --- Configuration ---------------------------------------------------------
MODEL_ID = "stabilityai/stable-diffusion-2-base"
DATASET_PATH = "../data/processed_sticker_dataset.parquet"
OUTPUT_DIR = "./fine_tuned_ldm"
DEVICE = "cpu"          # training is deliberately kept on the CPU
IMAGE_SIZE = 64         # side length (pixels) the stickers are resized to
LEARNING_RATE = 5e-5

# Load the pre-trained pipeline
model = StableDiffusionPipeline.from_pretrained(MODEL_ID)
model.to(DEVICE)

# Extract components: UNet, Text Encoder, VAE
unet = model.unet
vae = model.vae  # Variational Autoencoder (for encoding images to latents)
text_encoder = model.text_encoder  # Text Encoder

# Only the UNet is fine-tuned: freeze the VAE and text encoder so no autograd
# graph is built through them, and switch the UNet to training mode.
vae.requires_grad_(False)
text_encoder.requires_grad_(False)
vae.eval()
text_encoder.eval()
unet.train()

# Training-time noise scheduler, built from the pipeline's scheduler config so
# the beta schedule and prediction type match the pre-trained checkpoint.
noise_scheduler = DDPMScheduler.from_config(model.scheduler.config)
prediction_type = noise_scheduler.config.prediction_type
if prediction_type not in ("epsilon", "v_prediction"):
    raise ValueError(f"Unsupported prediction type: {prediction_type}")

# Dataset preprocessing. ToTensor yields values in [0, 1]; the VAE expects
# inputs in [-1, 1], hence the Normalize step.
transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5]),
])

dataset = StickerDataset(DATASET_PATH, transform=transform)
if len(dataset) == 0:
    raise ValueError(f"No samples found in {DATASET_PATH}")
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Optimizer
optimizer = AdamW(unet.parameters(), lr=LEARNING_RATE)

# Loss Function
criterion = nn.MSELoss()

# The dataset carries no captions, so condition on the empty prompt
# (unconditional fine-tuning). It is encoded once, outside the loop, instead
# of feeding the text encoder random token ids on every step.
with torch.no_grad():
    empty_prompt_ids = model.tokenizer(
        "",
        padding="max_length",
        max_length=model.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    ).input_ids.to(DEVICE)
    empty_prompt_embedding = text_encoder(empty_prompt_ids)[0]

# Training Loop
num_epochs = 5
for epoch in range(num_epochs):
    epoch_loss = 0.0
    num_batches = 0

    # The pixel-space noise returned by the dataset is not used: diffusion
    # noise has to be sampled in latent space, after VAE encoding.
    for images, _ in dataloader:
        images = images.to(DEVICE)
        batch_size = images.size(0)
        optimizer.zero_grad()

        # Encode the images into the scaled latent space the UNet was trained on.
        with torch.no_grad():
            latents = vae.encode(images).latent_dist.sample()
            latents = latents * vae.config.scaling_factor

        # Forward diffusion: pick a random timestep per sample and add the
        # corresponding amount of Gaussian noise to the clean latents.
        noise = torch.randn_like(latents)
        timesteps = torch.randint(
            0,
            noise_scheduler.config.num_train_timesteps,
            (batch_size,),
            dtype=torch.long,
            device=latents.device,
        )
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        encoder_hidden_states = empty_prompt_embedding.repeat(batch_size, 1, 1)

        noise_pred = unet(
            noisy_latents,
            timestep=timesteps,
            encoder_hidden_states=encoder_hidden_states,
        ).sample

        # The regression target depends on how the checkpoint was trained.
        if prediction_type == "epsilon":
            target = noise
        else:  # "v_prediction"
            target = noise_scheduler.get_velocity(latents, noise, timesteps)

        loss = criterion(noise_pred, target)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean loss over the epoch rather than only the last batch.
    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {epoch_loss / num_batches}")

# Save Fine-Tuned Model
unet.save_pretrained(OUTPUT_DIR)
print("✅ Fine-tuning complete! Model saved at './fine_tuned_ldm'.")
