In [1]:
import math

from imagen_pytorch.t5 import t5_encode_text
import torch, glob
from imagen_pytorch import Unet, Imagen, ImagenTrainer
from tqdm.notebook import tqdm
import torchvision
from pathlib import Path
import torchvision.transforms as T

def get_emb_tensor(files):
    """Encode the captions stored in text files with T5 and move them to the GPU.

    Args:
        files: iterable of paths to text files; the entire content of each
            file is used as one caption.

    Returns:
        Tuple ``(text_embeds, text_masks)`` as produced by ``t5_encode_text``
        with the 'google/t5-v1_1-base' model, both moved to 'cuda:0'.
    """
    texts = []
    for caption_path in files:
        # The context manager closes the handle even when read() raises.
        with open(caption_path, "r") as caption_file:
            texts.append(caption_file.read())
    text_embeds, text_masks = t5_encode_text(texts, name = 'google/t5-v1_1-base')
    text_embeds, text_masks = map(lambda t: t.to('cuda:0'), (text_embeds, text_masks))
    return text_embeds, text_masks


#thanks KyriaAnnwyn
def get_images_tensor(files):
    """Read image files into a single float batch on the GPU.

    Args:
        files: iterable of image paths readable by ``torchvision.io.read_image``.

    Returns:
        Tensor built by concatenating along dim 0 every image after it has
        been read as RGB, resized to 256x256, converted to float, mapped with
        ``x * 2 - 1`` and given a leading batch dimension; moved to 'cuda'.
    """
    preprocess = torch.nn.Sequential(
        T.Resize([256, 256]),
        T.ConvertImageDtype(torch.float),
    )

    def load_one(path):
        pixels = torchvision.io.read_image(path, torchvision.io.ImageReadMode.RGB)
        # NOTE(review): confirm the installed imagen-pytorch expects inputs
        # already mapped with x * 2 - 1; if it normalises internally this
        # would be applied twice.
        return (preprocess(pixels) * 2 - 1).unsqueeze(0)

    stacked = torch.cat([load_one(path) for path in files], dim=0)
    return stacked.to('cuda')


Downloading:   0%|          | 0.00/605 [00:00<?, ?B/s]

In [2]:
# Folder holding the training images (*.jpg) and their caption files.
dataset_path = "/content/dataset"

# A checkpoint is written every `checkpoint_rate` steps inside an epoch.
checkpoint_rate = 100

# Number of passes over the batches, repeated for every U-Net.
epochs = 2

# Number of images grouped into one training batch.
batch_ss = 64

# Passed to the trainer as `max_batch_size` (the author uses it for gradient
# accumulation to avoid OOM); keep it smaller than `batch_ss`.
batch_s = 8

# Forwarded to ImagenTrainer. Author's note: with EMA enabled, sampling
# produced only noise.
use_ema = False

# Training path to use: "imagen" calls the model directly, "trainer" goes
# through the ImagenTrainer wrapper.
meth = "trainer"





# First U-Net of the cascade: attention enabled on every level but the first.
unet1 = Unet(
    dim=32,
    cond_dim=512,
    dim_mults=(1, 2, 4, 8),
    num_resnet_blocks=3,
    layer_attns=(False, True, True, True),
)

# Second U-Net of the cascade: per-level resnet block counts, with attention
# and cross-attention enabled only on the last level.
unet2 = Unet(
    dim=32,
    cond_dim=512,
    dim_mults=(1, 2, 4, 8),
    num_resnet_blocks=(2, 4, 8, 8),
    layer_attns=(False, False, False, True),
    layer_cross_attns=(False, False, False, True),
)


# Cascade of the two U-Nets: unet1 is paired with image size 64 and the
# 'cosine' schedule, unet2 with image size 256 and the 'linear' schedule.
imagen = Imagen(
    unets=(unet1, unet2),
    image_sizes=(64, 256),
    beta_schedules=('cosine', 'linear'),
    timesteps=1000,
    cond_drop_prob=0.5,
).cuda()

# Trainer wrapper around the cascade; EMA is controlled by `use_ema`.
trainer = ImagenTrainer(imagen, use_ema=use_ema)



#feed images into imagen, training each unet in the cascade
#try to load last ckpt
# Resuming is best-effort: a missing or unreadable checkpoint must not stop
# the run, but the reason is reported instead of being silently discarded.
# Catching Exception (not a bare except) lets Ctrl-C / kernel interrupts through.
try:
  trainer.load('/content/checkpoint.pt')
  print("Loaded checkpoint")
except Exception as err:
  print(f"Checkpoint not loaded ({err!r}); continuing with the current weights")

# Collect the dataset images; sorted so batch composition is reproducible
# between runs (glob returns files in arbitrary order).
a = sorted(glob.glob(f"{dataset_path}/*.jpg"))
if not a:
  raise FileNotFoundError(f"No .jpg files found in {dataset_path}")

# Each batch is stored as ONE string of path stems, every stem followed by a
# single space. A stem is the image path without its last three characters
# ("jpg"), so it still ends with '.', and the training loop appends 'txt' or
# 'jpg' to it after splitting on ' '.
# Limitation of this encoding: paths that contain spaces are not supported.
#
# Plain slicing guarantees that every image is used exactly once, that no
# batch is larger than `batch_ss`, and that one-image datasets and
# `batch_ss == 1` work.
batches = [
  ''.join(path[:-3] + ' ' for path in a[start:start + batch_ss])
  for start in range(0, len(a), batch_ss)
]



# Train the U-Nets one after another; `i` is passed as `unet_number`.
for i in (1, 2):
    # Loss history for this U-Net, kept as plain floats for the running average.
    l_arr = []
    # Defined up front so the post-epoch checkpoint guard never reads an
    # unbound name; NaN means "nothing worth saving yet".
    loss = float('nan')
    for eps in range(1, epochs+1):

        with tqdm(total=len(batches)) as pbar:
            for step, e in enumerate(batches):

                # A batch entry is a string of space-terminated path stems;
                # the trailing empty piece produced by split() is dropped.
                batch = e.split(' ')[:-1]

                text_embeds, text_masks = get_emb_tensor([s + 'txt' for s in batch])
                images = get_images_tensor([s + 'jpg' for s in batch])

                if meth == 'imagen':
                    # NOTE(review): this branch only calls backward(); no
                    # optimizer step is visible in this notebook, so confirm
                    # that the weights are actually updated in this mode.
                    loss = imagen(images, text_embeds = text_embeds, text_masks = text_masks, unet_number = i)
                    loss.backward()
                else:
                    loss = trainer(images, text_embeds = text_embeds, text_masks = text_masks, unet_number = i, max_batch_size = batch_s)
                    trainer.update(unet_number = i)

                # Keep a Python float only: storing the object returned by the
                # 'imagen' branch would keep its autograd graph alive in l_arr.
                loss = float(loss)
                l_arr.append(loss)
                if step % checkpoint_rate == 0 and step !=0 and not math.isnan(loss):
                    trainer.save('/content/checkpoint.pt')
                pbar.set_description(f'Unet Num: {i}  Epoch: {eps}  Loss: {loss} Avg Loss: {round((sum(l_arr)/len(l_arr)), 3)}')
                pbar.update()
        # End-of-epoch checkpoint, skipped when the last loss is NaN.
        if not math.isnan(loss): trainer.save('/content/checkpoint.pt')



loading saved imagen at version 0.0.60, but current package version is 0.0.60
Loaded checkpoint


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/945M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/t5-v1_1-base were not used when initializing T5EncoderModel: ['decoder.block.5.layer.0.SelfAttention.k.weight', 'decoder.block.6.layer.2.layer_norm.weight', 'decoder.block.7.layer.1.EncDecAttention.o.weight', 'decoder.block.0.layer.1.EncDecAttention.q.weight', 'decoder.block.7.layer.0.SelfAttention.k.weight', 'decoder.block.6.layer.2.DenseReluDense.wo.weight', 'decoder.block.1.layer.0.SelfAttention.k.weight', 'decoder.block.9.layer.2.DenseReluDense.wi_1.weight', 'decoder.block.11.layer.1.EncDecAttention.q.weight', 'decoder.block.0.layer.0.SelfAttention.k.weight', 'decoder.block.7.layer.1.EncDecAttention.v.weight', 'decoder.block.10.layer.0.SelfAttention.q.weight', 'decoder.block.8.layer.1.EncDecAttention.v.weight', 'decoder.block.6.layer.1.EncDecAttention.o.weight', 'decoder.block.3.layer.0.SelfAttention.k.weight', 'decoder.block.11.layer.1.EncDecAttention.k.weight', 'decoder.block.10.layer.1.layer_norm.weight', 'decoder.block.3.layer.1.En

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 8.00 GiB total capacity; 7.10 GiB already allocated; 0 bytes free; 7.26 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [3]:
#Main code for inference
import torch
import torchvision
import torchvision.transforms as T
from PIL import Image

# Prompt to sample from; this caption is 000000012.txt in the dataset.
text = "Danger High Voltage Label"

# Number of diffusion timesteps used when sampling.
timesteps = 1000

# Rebuild the cascade around the existing unet1 / unet2 objects with the
# sampling timestep count.
imagen = Imagen(
    unets=(unet1, unet2),
    image_sizes=(64, 256),
    beta_schedules=('cosine', 'linear'),
    timesteps=timesteps,
    cond_drop_prob=0.5,
).cuda()

trainer = ImagenTrainer(imagen, use_ema = use_ema)

# Loading stays best-effort, but a failure is reported: otherwise sampling
# would silently run with whatever weights unet1 / unet2 currently hold.
# Catching Exception (not a bare except) lets Ctrl-C / kernel interrupts through.
try:
  trainer.load('/content/checkpoint.pt')
  print("Loaded checkpoint")
except Exception as err:
  print(f"WARNING: checkpoint not loaded ({err!r}); sampling uses the weights currently in memory")

# Sample images from the cascade for a list of prompts. Several prompts can
# be given at once, for example:
#   ['a puppy looking anxiously at a giant donut on the table',
#    'the milky way galaxy in the style of monet']
texts = [text]
print(texts)

img = trainer.sample(texts, cond_scale=2.)


RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 8.00 GiB total capacity; 7.14 GiB already allocated; 0 bytes free; 7.27 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [4]:
# Shape of the whole sampled batch and of its first element.
print(img.shape)
print(img[0].shape)

# Convert the first sample to a PIL image in RGB mode.
image = T.ToPILImage()(img[0]).convert("RGB")

# PIL's image.show() hands the picture to an external viewer and returns
# None, so nothing is rendered inline in the notebook; the image is drawn
# with matplotlib instead.
import matplotlib.pyplot as plt
plt.imshow(image)
plt.axis("off")
plt.show()

NameError: name 'img' is not defined

In [None]:
from torchvision.utils import save_image

# Write the first sampled tensor to disk as a PNG.
# Note: this rebinds `image` to the tensor img[0].
output_path = '/content/img1.png'
image = img[0]
save_image(image, output_path)