In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from diffusers import UNet2DConditionModel, AutoencoderKL
from peft import LoraConfig, get_peft_model
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from torchvision import transforms
from torch.cuda.amp import autocast, GradScaler
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import torch
from diffusers import UNet2DConditionModel
from diffusers.loaders import AttnProcsLayers

### Prompt(GPT 4o): Fine tunning Stable Difussion model using Lora .

In [2]:
#  Enable cuDNN optimization
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("high")  # Optimize matmul precision

#  Ensure PyTorch uses GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

#  Free GPU memory
torch.cuda.empty_cache()
gc.collect()

Using device: cuda


0

In [None]:
#  Dataset Class
class EmojiDataset(Dataset):
    def __init__(self, parquet_file):
        self.data = pd.read_parquet(parquet_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_path = self.data.iloc[idx]["image_path"]
        image_tensor = torch.load(image_path).float() / 127.5 - 1  # Normalize to [-1,1]
        text_embedding = torch.tensor(self.data.iloc[idx]["combined_embedding"], dtype=torch.float32)
        return image_tensor, text_embedding

In [None]:
#  Load Dataset
parquet_file = "../data/processed_emoji_dataset.parquet"
dataset = EmojiDataset(parquet_file)
batch_size = 4  # Reduce batch size to free memory
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
#  Load Stable Diffusion VAE and UNet to GPU
vae = AutoencoderKL.from_pretrained("stabilityai/stable-diffusion-2", subfolder="vae").to(device, dtype=torch.float16)
unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="unet").to(device, dtype=torch.float16)

In [4]:
print(unet.config)  # Check image size

FrozenDict({'sample_size': 96, 'in_channels': 4, 'out_channels': 4, 'center_input_sample': False, 'flip_sin_to_cos': True, 'freq_shift': 0, 'down_block_types': ['CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'CrossAttnDownBlock2D', 'DownBlock2D'], 'mid_block_type': 'UNetMidBlock2DCrossAttn', 'up_block_types': ['UpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D', 'CrossAttnUpBlock2D'], 'only_cross_attention': False, 'block_out_channels': [320, 640, 1280, 1280], 'layers_per_block': 2, 'downsample_padding': 1, 'mid_block_scale_factor': 1, 'dropout': 0.0, 'act_fn': 'silu', 'norm_num_groups': 32, 'norm_eps': 1e-05, 'cross_attention_dim': 1024, 'transformer_layers_per_block': 1, 'reverse_transformer_layers_per_block': None, 'encoder_hid_dim': None, 'encoder_hid_dim_type': None, 'attention_head_dim': [5, 10, 20, 20], 'num_attention_heads': None, 'dual_cross_attention': False, 'use_linear_projection': True, 'class_embed_type': None, 'addition_embed_type': None, 'addition_time_embed_dim': 

In [None]:
#  Apply LoRA to UNet
lora_config = LoraConfig(
    r=4,  # LoRA rank
    lora_alpha=16,  # Scaling factor
    target_modules=["to_q", "to_k", "to_v"],  # Apply LoRA to attention layers
    lora_dropout=0.05,  # Dropout for regularization
    bias="none"
    )

unet = get_peft_model(unet, lora_config)
unet.print_trainable_parameters()  #  Print trainable parameters (should be very small)

trainable params: 630,272 || all params: 866,540,996 || trainable%: 0.0727


In [6]:
# Enable memory optimization
unet.enable_gradient_checkpointing()


# Embedding Projector (CLIP 512 → UNet 1024)
class EmbeddingProjector(nn.Module):
    def __init__(self, input_dim=512, output_dim=1024):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.fc(x)

embedding_projector = EmbeddingProjector().to(device, dtype=torch.float16)

In [7]:
# Freeze VAE
for param in vae.parameters():
    param.requires_grad = False  

# Define optimizer
optimizer = AdamW(filter(lambda p: p.requires_grad, unet.parameters()), lr=1e-4)
scaler = torch.amp.GradScaler()


In [None]:
# Training Loop
num_epochs = 20
losses = []
for epoch in range(num_epochs):
    epoch_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
    for images, embeddings in progress_bar:
        images = images.to(device, dtype=torch.float16, non_blocking=True)
        embeddings = embeddings.to(device, dtype=torch.float16, non_blocking=True)
        optimizer.zero_grad()

        # Project CLIP embeddings
        with torch.no_grad():
            projected_embeddings = embedding_projector(embeddings).unsqueeze(1)
        
        with torch.no_grad():
            latents = torch.utils.checkpoint.checkpoint(
                lambda x: vae.encode(x).latent_dist.sample() * 0.18215, images, use_reentrant=False
            )
        # Convert latents to bfloat16 to save memory
        latents = latents.to(torch.bfloat16)

        # Generate noise
        noise = torch.randn_like(latents, dtype=torch.bfloat16)
        timesteps = torch.randint(0, 1000, (latents.shape[0],), device=device).long()

        # Forward pass
        with torch.amp.autocast("cuda"):
            noise_pred = unet(latents, timesteps, encoder_hidden_states=projected_embeddings).sample
            loss = F.mse_loss(noise_pred, noise)

        # Backpropagation
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # Avoid NaN issues
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    avg_epoch_loss = epoch_loss / len(train_dataloader)
    losses.append(avg_epoch_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_epoch_loss:.4f}")


Epoch 1/20: 100%|██████████| 678/678 [1:00:53<00:00,  5.39s/it, loss=1.01] 


Epoch 1/20, Loss: 1.1623


Epoch 2/20:  80%|███████▉  | 541/678 [38:16<09:53,  4.33s/it, loss=1]    

In [None]:
# Save Model
torch.save({
    "unet": unet.state_dict(),
    "embedding_projector": embedding_projector.state_dict()
}, "emoji_generator.pth")
print("Model saved successfully!")


In [None]:
# Save LoRA Weights
unet.save_pretrained("lora_emoji_unet")
torch.save(embedding_projector.state_dict(), "embedding_projector.pth")
print("LoRA adapters saved successfully!")

# Plot Loss Curve
plt.plot(range(1, num_epochs + 1), losses, marker="o", linestyle="-")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training Loss Curve")
plt.grid()
plt.show()


In [None]:
# Free GPU memory before inference
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Load Trained LoRA Model
unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="unet").to(device)
unet = PeftModel.from_pretrained(unet, "lora_emoji_unet").to(device)
embedding_projector.load_state_dict(torch.load("embedding_projector.pth"))
unet.eval()
embedding_projector.eval()


In [None]:
# Generate Emoji from Text Prompt
text_description = "dog"
embedding_np = dataset.data.loc[dataset.data["text"] == text_description, "combined_embedding"].values[0]
text_embedding = torch.tensor(embedding_np, dtype=torch.float32).to(device)
projected_embedding = embedding_projector(text_embedding).unsqueeze(0).unsqueeze(0)

In [None]:
# Generate noise in latent space (fixed size 96x96)
latents = torch.randn(1, 4, 96, 96).to(device)
timesteps = torch.tensor([500], device=device).long()

# Generate Emoji
with torch.no_grad():
    denoised_latents = unet(latents, timesteps, encoder_hidden_states=projected_embedding).sample

# Scale and Decode Image
denoised_latents = denoised_latents / 0.18215
with torch.no_grad():
    decoded_image = vae.decode(denoised_latents).sample

# Post-process Image
decoded_image = (decoded_image.clamp(-1, 1) + 1) / 2
decoded_image = decoded_image.squeeze(0).permute(1, 2, 0).cpu().numpy()
decoded_image = (decoded_image * 255).astype(np.uint8)
emoji_image = Image.fromarray(decoded_image)

In [None]:
# Display Generated Emoji
plt.imshow(emoji_image)
plt.axis("off")
plt.show()


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from diffusers import UNet2DConditionModel, AutoencoderKL
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from torchvision import transforms
from torch.cuda.amp import autocast, GradScaler
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Enable cuDNN optimization
torch.backends.cudnn.benchmark = True
torch.set_float32_matmul_precision("high")  # Optimize matmul precision

# Free GPU memory
import gc
torch.cuda.empty_cache()
gc.collect()

In [None]:

# Dataset class using precomputed CLIP embeddings
class EmojiDataset(Dataset):
    def __init__(self, parquet_file):
        self.data = pd.read_parquet(parquet_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_path = self.data.iloc[idx]["image_path"]
        image_tensor = torch.load(image_path).float() / 127.5 - 1  # Normalize to [-1,1]
        text_embedding = torch.tensor(self.data.iloc[idx]["combined_embedding"], dtype=torch.float32)
        return image_tensor, text_embedding

In [None]:
# Load Dataset
parquet_file = "../data/processed_emoji_dataset.parquet"
dataset = EmojiDataset(parquet_file)
batch_size = 4  # Reduce batch size to free memory
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)

# Load Stable Diffusion VAE and UNet
device = "cuda"
vae = AutoencoderKL.from_pretrained("stabilityai/stable-diffusion-2", subfolder="vae").to(device)
unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="unet").to(device)


In [None]:

# Enable memory optimization
unet.enable_gradient_checkpointing()

# Try `torch.compile` for speed boost (PyTorch 2.0+)
try:
    unet = torch.compile(unet)
except:
    print("torch.compile is not available. Continuing without it.")

# Embedding Projector
class EmbeddingProjector(nn.Module):
    def __init__(self, input_dim=512, output_dim=1024):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.fc(x)


In [None]:
embedding_projector = EmbeddingProjector().to(device)

# Freeze all UNet layers except the last few
for param in unet.parameters():
    param.requires_grad = False  
for layer in list(unet.children())[-1:]:  
    for param in layer.parameters():
        param.requires_grad = True  

# Freeze VAE
for param in vae.parameters():
    param.requires_grad = False  

# Define optimizer
optimizer = AdamW(filter(lambda p: p.requires_grad, unet.parameters()), lr=1e-4)
scaler = torch.amp.GradScaler()


In [None]:

# Training Loop
num_epochs = 20
losses = []
for epoch in range(num_epochs):
    epoch_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
    for images, embeddings in progress_bar:
        images = images.to(device, non_blocking=True)
        embeddings = embeddings.to(device, non_blocking=True)
        optimizer.zero_grad()

        # Project CLIP embeddings
        with torch.no_grad():
            projected_embeddings = embedding_projector(embeddings).unsqueeze(1)

        # Encode images into latent space with checkpointing
        with torch.no_grad():
            latents = torch.utils.checkpoint.checkpoint(lambda x: vae.encode(x).latent_dist.sample() * 0.18215, images)
        
        # Convert latents to bfloat16 to save memory
        latents = latents.to(torch.bfloat16)

        # Generate noise
        noise = torch.randn_like(latents, dtype=torch.bfloat16)
        timesteps = torch.randint(0, 1000, (latents.shape[0],), device=device).long()

        # Forward pass
        with torch.amp.autocast("cuda"):
            noise_pred = unet(latents, timesteps, encoder_hidden_states=projected_embeddings).sample
            loss = F.mse_loss(noise_pred, noise)

        # Backpropagation
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # Avoid NaN issues
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    avg_epoch_loss = epoch_loss / len(train_dataloader)
    losses.append(avg_epoch_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_epoch_loss:.4f}")




In [None]:
# Save Model
torch.save({
    "unet": unet.state_dict(),
    "embedding_projector": embedding_projector.state_dict()
}, "emoji_generator.pth")
print("Model saved successfully!")



In [None]:
# Plot Loss Curve
plt.plot(range(1, num_epochs + 1), losses, marker="o", linestyle="-")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training Loss Curve")
plt.grid()
plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from diffusers import UNet2DConditionModel, AutoencoderKL
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from torchvision import transforms
from torch.cuda.amp import autocast, GradScaler
from PIL import Image
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Dataset class using precomputed CLIP embeddings
class EmojiDataset(Dataset):
    def __init__(self, parquet_file, transform=None):
        self.data = pd.read_parquet(parquet_file)
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_path = self.data.iloc[idx]["image_path"]
        image_tensor = torch.load(image_path)  # Shape: (3, 64, 64)
        image_tensor = image_tensor.float() / 255.0 * 2 - 1  # Normalize to [-1,1]

        text_embedding = torch.tensor(self.data.iloc[idx]["combined_embedding"], dtype=torch.float32)

        if self.transform:
            image_tensor = self.transform(image_tensor)

        return image_tensor, text_embedding

In [None]:
# Load Dataset
parquet_file = "../data/processed_emoji_dataset.parquet"
dataset = EmojiDataset(parquet_file)
batch_size = 8  # Adjust batch size to avoid GPU memory overflow
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Load Stable Diffusion VAE and UNet
vae = AutoencoderKL.from_pretrained("stabilityai/stable-diffusion-2", subfolder="vae").to("cuda")
unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="unet").to("cuda")

In [None]:
# Enable memory optimization
unet.enable_gradient_checkpointing()

# Embedding Projector (CLIP 768 → UNet 768, if needed)
class EmbeddingProjector(nn.Module):
    def __init__(self, input_dim=512, output_dim=1024):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.fc(x)

embedding_projector = EmbeddingProjector().to("cuda")

In [None]:
# Freeze all UNet layers except the last few for fine-tuning
for param in unet.parameters():
    param.requires_grad = False  
num_layers_to_train = 1  # Adjust as needed
for module in list(unet.children())[-num_layers_to_train:]:  
    for param in module.parameters():
        param.requires_grad = True  

# Freeze VAE
for param in vae.parameters():
    param.requires_grad = False  

# Define optimizer
optimizer = AdamW(filter(lambda p: p.requires_grad, unet.parameters()), lr=1e-4)
scaler = torch.amp.GradScaler()

In [None]:
import gc
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Training Loop
num_epochs = 20
losses = []
for epoch in range(num_epochs):
    epoch_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
    for images, embeddings in progress_bar:
        images, embeddings = images.to("cuda"), embeddings.to("cuda")
        optimizer.zero_grad()

        # Project CLIP embeddings
        with torch.no_grad():
            projected_embeddings = embedding_projector(embeddings).unsqueeze(1)

        # Encode images into latent space
        with torch.no_grad():
            latents = vae.encode(images).latent_dist.sample()
            latents = latents * 0.18215  # Scaling factor

        # Generate noise
        noise = torch.randn_like(latents)
        timesteps = torch.randint(0, 1000, (latents.shape[0],), device="cuda").long()

        # Forward pass
        with torch.amp.autocast("cuda"):
            noise_pred = unet(latents, timesteps, encoder_hidden_states=projected_embeddings).sample
            loss = F.mse_loss(noise_pred, noise)

        # Backpropagation
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    
    avg_epoch_loss = epoch_loss / len(train_dataloader)
    losses.append(avg_epoch_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_epoch_loss:.4f}")

In [None]:
# Save Model
torch.save({
    "unet": unet.state_dict(),
    "embedding_projector": embedding_projector.state_dict()
}, "emoji_generator.pth")
print("Model saved successfully!")

In [None]:
# Plot Loss Curve
plt.plot(range(1, num_epochs + 1), losses, marker="o", linestyle="-")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training Loss Curve")
plt.grid()
plt.show()

In [None]:
# Load Model
checkpoint = torch.load("emoji_generator.pth")
unet.load_state_dict(checkpoint["unet"])
embedding_projector.load_state_dict(checkpoint["embedding_projector"])
unet.eval()
embedding_projector.eval()

In [None]:
# Define a text prompt
text_description = "dog"
embedding_np = dataset.data.loc[dataset.data["text"] == text_description, "combined_embedding"].values[0]
text_embedding = torch.tensor(embedding_np, dtype=torch.float32).to("cuda")
projected_embedding = embedding_projector(text_embedding).unsqueeze(0).unsqueeze(0)

In [None]:
# Generate noise in latent space (fix size to 96x96)
latents = torch.randn(1, 4, 96, 96).to("cuda")
timesteps = torch.tensor([500], device="cuda").long()

In [None]:
# Generate emoji
with torch.no_grad():
    denoised_latents = unet(latents, timesteps, encoder_hidden_states=projected_embedding).sample

# Fix scaling factor before decoding
denoised_latents = denoised_latents / 0.18215  

with torch.no_grad():
    decoded_image = vae.decode(denoised_latents).sample

# Post-process image
decoded_image = (decoded_image.clamp(-1, 1) + 1) / 2
decoded_image = decoded_image.squeeze(0).permute(1, 2, 0).cpu().numpy()
decoded_image = (decoded_image * 255).astype(np.uint8)
emoji_image = Image.fromarray(decoded_image)

In [None]:
# Display Image
plt.imshow(emoji_image)
plt.axis("off")
plt.show()
