In [1]:
from load_data import get_sample
from PIL import Image
from stablediffusion import StableDiffusion
import torch
from vision_encoder import vision_encoder
from llm import ClipCaptionModel, generate2, generate_text_with_gumbel_softmax
import os
from transformers import GPT2Tokenizer
import numpy as np
from fashion_clip.fashion_clip import FashionCLIP
from mapping import get_gpt2_logits, map_prompt_to_clip


sd = StableDiffusion()
encoder = vision_encoder()
fclip = FashionCLIP('fashion-clip')

In [2]:
prefix_length = 10
current_directory = os.getcwd()
save_path = os.path.join(current_directory, "saved_models")
os.makedirs(save_path, exist_ok=True)

CPU = torch.device('cpu')
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = ClipCaptionModel(prefix_length)
model_path = os.path.join(save_path, 'fashion.pt')
model.load_state_dict(torch.load(model_path, map_location=CPU)) 
model = model.eval() 
model = model.to(device)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [3]:
pil_image, row = get_sample(0)

image_embeddings = fclip.encode_images([pil_image], batch_size=1)
image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, ord=2, axis=-1, keepdims=True)
image_embeddings = torch.tensor(image_embeddings).to(device)
        
prefix_embed = model.clip_project(image_embeddings).reshape(1, prefix_length, -1)
out = generate_text_with_gumbel_softmax(model, tokenizer, embed=prefix_embed)

100%|██████████| 1/1 [00:00<00:00,  1.22it/s]


In [5]:
combined_softmax_outputs = torch.cat(out[1], dim=0)
tokens = map_prompt_to_clip(combined_softmax_outputs)
tokens

  transformation_matrix = torch.sparse.FloatTensor(indices, values, torch.Size([gpt2_vocab_size, clip_vocab_size]))


tensor([[ 0.0019, -0.0032,  0.0020,  ...,  0.0029, -0.0022,  0.0089],
        [ 0.0019, -0.0032,  0.0020,  ...,  0.0029, -0.0022,  0.0089],
        [-0.0237, -0.0112, -0.0079,  ...,  0.0094, -0.0231, -0.0225],
        ...,
        [-0.0245, -0.0233,  0.0028,  ..., -0.0133,  0.1138,  0.0847],
        [ 0.0433, -0.0528,  0.0245,  ..., -0.0458,  0.0204,  0.0266],
        [-0.0167, -0.0156, -0.0126,  ...,  0.0102,  0.0208,  0.0044]],
       grad_fn=<MmBackward0>)

In [6]:
tokens.shape

torch.Size([67, 1024])

In [7]:
sd.requires_grad_(False) # Note: Running out of memory if require grad

output = sd.forward(out[0], tokens=tokens)
output  = (output.permute(1, 2, 0) * 255).to(torch.uint8).cpu().numpy()
output = Image.fromarray(output)
output.show()
loss = encoder.calc_loss([pil_image], [output])
print(loss)

tensor([[ 0.0019, -0.0032,  0.0020,  ...,  0.0029, -0.0022,  0.0089],
        [ 0.0019, -0.0032,  0.0020,  ...,  0.0029, -0.0022,  0.0089],
        [-0.0237, -0.0112, -0.0079,  ...,  0.0094, -0.0231, -0.0225],
        ...,
        [-0.0245, -0.0233,  0.0028,  ..., -0.0133,  0.1138,  0.0847],
        [ 0.0433, -0.0528,  0.0245,  ..., -0.0458,  0.0204,  0.0266],
        [-0.0167, -0.0156, -0.0126,  ...,  0.0102,  0.0208,  0.0044]],
       grad_fn=<MmBackward0>)
tensor([[[-0.3134, -0.4476, -0.0082,  ...,  0.2542, -0.0324, -0.2960],
         [-0.7473, -1.3658, -0.8512,  ...,  0.8571, -1.9439, -0.7137],
         [-0.3813, -0.9747,  1.3416,  ...,  0.9129, -1.6168, -1.2229],
         ...,
         [ 2.3418, -1.7429,  0.0412,  ..., -0.1670, -0.7596,  0.4923],
         [ 2.2561, -1.8095, -0.0347,  ..., -0.2477, -0.6511,  0.3659],
         [ 0.4041, -0.2987, -0.7762,  ..., -0.2733, -0.9029,  0.7142]]],
       device='cuda:0')
torch.Size([67, 1024])
torch.Size([1, 77, 1024])


  0%|          | 0/26 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.76 GiB. GPU 0 has a total capacty of 15.77 GiB of which 1.93 GiB is free. Process 20679 has 1.60 GiB memory in use. Process 13237 has 1.31 GiB memory in use. Including non-PyTorch memory, this process has 10.92 GiB memory in use. Of the allocated memory 10.25 GiB is allocated by PyTorch, and 301.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
pil_image, row = get_sample()
print(row['detail_desc'])
display(pil_image)

image_embeddings = fclip.encode_images([pil_image], batch_size=1)
image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, ord=2, axis=-1, keepdims=True)
image_embeddings = torch.tensor(image_embeddings).to(device)
        
prefix_embed = model.clip_project(image_embeddings).reshape(1, prefix_length, -1)
out = generate2(model, tokenizer, embed=prefix_embed)
print(out)
prompt = [out]
sd.requires_grad_(False) # Note: Running out of memory if require grad

output = sd.forward(prompt)
output  = (output.permute(1, 2, 0) * 255).to(torch.uint8).cpu().numpy()
output = Image.fromarray(output)
output.show()
loss = encoder.calc_loss([pil_image], [output])
print(loss)

In [9]:
import torch

# Ensure CUDA is available
if torch.cuda.is_available():
    # Select your CUDA device, generally 0 by default for single-GPU setups
    device = torch.device('cuda:0')
    
    # Get GPU details
    total_memory = torch.cuda.get_device_properties(device).total_memory
    allocated_memory = torch.cuda.memory_allocated(device)
    cached_memory = torch.cuda.memory_reserved(device)
    max_allocated_memory = torch.cuda.max_memory_allocated(device)
    max_cached_memory = torch.cuda.max_memory_reserved(device)

    # Convert bytes to gigabytes for easier interpretation
    gb_divisor = 1024**3

    print(f"Total Memory: {total_memory / gb_divisor:.2f} GB")
    print(f"Allocated Memory: {allocated_memory / gb_divisor:.2f} GB")
    print(f"Cached Memory: {cached_memory / gb_divisor:.2f} GB")
    print(f"Max Allocated Memory: {max_allocated_memory / gb_divisor:.2f} GB")
    print(f"Max Cached Memory: {max_cached_memory / gb_divisor:.2f} GB")
else:
    print("CUDA is not available. Check your installation or GPU availability.")


Total Memory: 15.77 GB
Allocated Memory: 10.25 GB
Cached Memory: 10.54 GB
Max Allocated Memory: 11.03 GB
Max Cached Memory: 11.71 GB
