In [None]:
#Level 1
# !pip install --upgrade diffusers
# !pip install --upgrade transformers
# !pip install --upgrade tokenizers
# !pip install --upgrade datasets
from diffusers import StableDiffusionPipeline

In [None]:
import os

# Read the Hugging Face access token from the environment instead of
# committing it to the notebook. A token that was ever stored in this file
# should be treated as leaked and revoked in the Hugging Face account settings.
# When neither variable is set the value is None, which is passed through
# unchanged as `use_auth_token`.
access_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

# Load the full Stable Diffusion v1.5 pipeline and move it to the GPU.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", use_auth_token=access_token
).to("cuda")  # use revision='fp16' and torch_dtype=torch.float16 for low memory

In [None]:
import os

# Generate a single image from one prompt with the stock pipeline.
prompt = "a photo of a horse riding an astronaut on Mars"
image = pipe(prompt).images[0]

# Make sure the output directory exists so the save below does not fail on a
# fresh checkout.
os.makedirs("./images", exist_ok=True)
image.save("./images/horse_rides_astronaut.png")

In [None]:
# level 2
from torch import autocast
from PIL import Image, ImageDraw

In [None]:
def dummy(images, **kwargs):
    """Pass-through stand-in that is installed as ``pipe.safety_checker``.

    Returns the images untouched together with a ``False`` flag; any extra
    keyword arguments are accepted and ignored.
    """
    flagged = False
    return images, flagged

def image_grid(imgs, rows, cols):
    """Paste images into a single ``rows`` x ``cols`` grid, filled row by row.

    Args:
        imgs: Sequence of images exposing ``.size``; every tile is placed using
            the width and height of the first image.
        rows: Number of grid rows.
        cols: Number of grid columns.

    Returns:
        A new RGB image of size ``(cols * w, rows * h)``.

    Raises:
        AssertionError: If ``len(imgs)`` is not exactly ``rows * cols``.
    """
    assert len(imgs) == rows * cols, (
        f"expected {rows * cols} images for a {rows}x{cols} grid, got {len(imgs)}"
    )

    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols * w, rows * h))

    for i, img in enumerate(imgs):
        # Row-major placement: index -> (row, column) -> pixel offset.
        row, col = divmod(i, cols)
        grid.paste(img, box=(col * w, row * h))
    return grid

In [None]:
# Replace the pipeline's safety checker with the pass-through `dummy` callable.
pipe.safety_checker = dummy

# Same prompt repeated n_images times so one pipeline call yields a batch.
n_images = 3
prompts = [
    "masterpiece, best quality, a photo of a horse riding an astronaut, trending on artstation, photorealistic, qhd, rtx on, 8k"
] * n_images

with autocast("cuda"):
    images = pipe(prompts, num_inference_steps=28).images

# Size the single-row grid from n_images (previously a hardcoded 3) so that
# changing n_images does not trip the length assertion in image_grid.
image_grid(images, rows=1, cols=n_images)

In [None]:
#level 3
from diffusers import UNet2DConditionModel, StableDiffusionPipeline, AutoencoderKL, LMSDiscreteScheduler, DDIMScheduler
from transformers import CLIPTextModel, CLIPTokenizer
import torch
from torch.nn import functional as F
from torch import autocast
import numpy as np

from tqdm.auto import tqdm

In [None]:
# --- VAE -------------------------------------------------------------------
# One-time download/export, kept for reference:
# vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder='vae', use_auth_token=access_token)
# vae.save_pretrained('./models/vae.ckpt')
# NOTE(review): the export above writes to './models/vae.ckpt' while the load
# below reads './models/vae/' — confirm which directory actually exists.
vae = AutoencoderKL.from_pretrained('./models/vae/')
vae = vae.to("cuda")

# --- Tokenizer + text encoder ------------------------------------------------
# tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
# tokenizer.save_pretrained('./tokenizers/')
tokenizer = CLIPTokenizer.from_pretrained('./tokenizers/')

# text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to("cuda")
# text_encoder.save_pretrained('./models/text_encoder.pt')
# NOTE(review): same mismatch here — export path './models/text_encoder.pt'
# versus load path './models/text_encoder/'.
text_encoder = CLIPTextModel.from_pretrained('./models/text_encoder/')
text_encoder = text_encoder.to("cuda")

# --- UNet --------------------------------------------------------------------
# model = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder='unet', use_auth_token=access_token).to("cuda")
# model.save_pretrained('./models/sd_v1-5.ckpt')
model = UNet2DConditionModel.from_pretrained('./models/sd_v1-5.ckpt/')
model = model.to("cuda")

# --- Scheduler ---------------------------------------------------------------
scheduler = LMSDiscreteScheduler(
    num_train_timesteps=1000,
    beta_schedule='scaled_linear',
    beta_start=0.00085,
    beta_end=0.012,
)

In [None]:
# Display the loaded UNet (its repr) as the cell output.
model

In [None]:
# Display the UNet's configuration as the cell output.
model.config

In [None]:
def get_text_embeds(prompt):
  """Encode prompts plus matching empty prompts for classifier-free guidance.

  Args:
    prompt: A single prompt string or a list of prompt strings.

  Returns:
    The first output of `text_encoder` for the empty prompts followed by the
    first output for the given prompts, concatenated along dim 0 (so the
    leading dimension is twice the number of prompts).
  """
  # A bare string would make len(prompt) count characters and create one
  # unconditional row per character, so normalise it to a one-element list.
  if isinstance(prompt, str):
    prompt = [prompt]

  # Follow the encoder's own device rather than assuming a fixed one.
  device = text_encoder.device

  # Tokenize the prompts and an equal number of empty (unconditional) prompts,
  # all padded to the tokenizer's maximum length.
  text_input = tokenizer(
      prompt, padding='max_length', max_length=tokenizer.model_max_length,
      truncation=True, return_tensors='pt')
  uncond_input = tokenizer(
      [''] * len(prompt), padding='max_length',
      max_length=tokenizer.model_max_length, return_tensors='pt')

  with torch.no_grad():
    text_embeddings = text_encoder(text_input.input_ids.to(device))[0]
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(device))[0]

  # Unconditional first, conditional second: produce_latents splits the noise
  # prediction with chunk(2) in that order.
  return torch.cat([uncond_embeddings, text_embeddings])

# Smoke test: embed one prompt, then print the tensor followed by its shape.
test_embeds = get_text_embeds(['an amazingly cool anime character'])
print(test_embeds, test_embeds.shape, sep='\n')

In [None]:
def produce_latents(text_embeddings, height=512, width=512,
                    num_inference_steps=28, guidance_scale=11, latents=None,
                    return_all_latents=False):
  """Run the guided denoising loop with the module-level `scheduler` and `model`.

  Args:
    text_embeddings: Output of get_text_embeds (unconditional rows first, then
      prompt rows); the batch size is taken as shape[0] // 2.
    height, width: Target image size; the latent grid is height // 8 by
      width // 8.
    num_inference_steps: Number of scheduler steps.
    guidance_scale: Classifier-free guidance weight.
    latents: Optional starting latents. When None, Gaussian noise is drawn.
      Either way the latents are multiplied by scheduler.sigmas[0] first.
    return_all_latents: When True, return the starting latents and every
      intermediate result concatenated along dim 0 instead of only the last.

  Returns:
    The final latents, or the full history when return_all_latents is True.
  """
  if latents is None:
    latents = torch.randn((text_embeddings.shape[0] // 2, model.in_channels,
                           height // 8, width // 8))
  latents = latents.to("cuda")

  scheduler.set_timesteps(num_inference_steps)
  latents = latents * scheduler.sigmas[0]

  latent_hist = [latents]
  with autocast('cuda'):
    # Wrapping the timesteps directly (not enumerate) lets tqdm show a total.
    for t in tqdm(scheduler.timesteps):
      # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
      latent_model_input = torch.cat([latents] * 2)
      # Let the scheduler apply its own per-step input scaling for timestep t.
      latent_model_input = scheduler.scale_model_input(latent_model_input, t)

      # predict the noise residual
      with torch.no_grad():
        noise_pred = model(latent_model_input, t, encoder_hidden_states=text_embeddings)['sample']

      # perform guidance
      noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
      noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

      # compute the previous noisy sample x_t -> x_t-1
      # Pass the timestep value itself: current diffusers schedulers look the
      # step up from `scheduler.timesteps` and no longer accept a loop index.
      latents = scheduler.step(noise_pred, t, latents)['prev_sample']
      latent_hist.append(latents)

  if not return_all_latents:
    return latents

  all_latents = torch.cat(latent_hist, dim=0)
  return all_latents

# Smoke test: denoise from the test embeddings, then print the latents and their shape.
test_latents = produce_latents(test_embeds)
print(test_latents, test_latents.shape, sep='\n')

In [None]:
def decode_img_latents(latents):
  """Decode latents with the module-level `vae` into a list of PIL images.

  Args:
    latents: Batch of latents carrying the 0.18215 scaling that
      encode_img_latents applies; it is undone here before decoding.

  Returns:
    A list with one PIL image per batch entry.
  """
  latents = 1 / 0.18215 * latents

  with torch.no_grad():
    imgs = vae.decode(latents)['sample']

  # Map the decoder output from [-1, 1] to [0, 1] ...
  imgs = (imgs / 2 + 0.5).clamp(0, 1)
  # ... move channels last, and convert to float32 so reduced-precision
  # outputs can still be turned into a numpy array.
  imgs = imgs.detach().cpu().permute(0, 2, 3, 1).float().numpy()
  # Values are already in [0, 1], so scale straight to [0, 255]. The previous
  # `(imgs + 1.0) * 127.5` normalised a second time and squeezed every pixel
  # into the upper half of the range.
  imgs = (imgs * 255).round().astype(np.uint8)
  pil_images = [Image.fromarray(image) for image in imgs]
  return pil_images

# Decode the test latents and display the first resulting image.
imgs = decode_img_latents(test_latents)
imgs[0]

In [None]:
def prompt_to_img(prompts, height=512, width=512, num_inference_steps=28, guidance_scale=11, latents=None):
    """Full text-to-image path: prompts -> embeddings -> latents -> images.

    A single prompt string is treated as a batch of one. Returns whatever
    decode_img_latents produces for the denoised latents.
    """
    # Normalise a lone prompt into a one-element batch.
    prompt_batch = [prompts] if isinstance(prompts, str) else prompts

    embeddings = get_text_embeds(prompt_batch)
    denoised = produce_latents(
        embeddings,
        height=height,
        width=width,
        latents=latents,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    )
    return decode_img_latents(denoised)

In [None]:
# Four images from the same prompt in one batch; settings spelled out by name.
imgs = prompt_to_img(
    ['Super cool fantasty knight, intricate armor, 8k'] * 4,
    height=512,
    width=512,
    num_inference_steps=28,
    guidance_scale=11,
)

In [None]:
# Show the four generated images as a 2x2 grid.
image_grid(imgs, rows=2, cols=2)

In [None]:
# level 3.5 - similar images and img2img
def prompt_to_img(prompts, height=512, width=512, num_inference_steps=50,
                  guidance_scale=7.5, latents=None, return_all_latents=False,
                  batch_size=2):
  """Text-to-image with the latents decoded in slices of `batch_size`.

  A single prompt string is treated as a batch of one. When
  `return_all_latents` is set, every latent that produce_latents returns
  (including intermediates) is decoded. Returns a flat list of images.
  """
  prompt_batch = [prompts] if isinstance(prompts, str) else prompts

  # Prompts -> embeddings -> latents
  embeddings = get_text_embeds(prompt_batch)
  latents = produce_latents(
      embeddings,
      height=height,
      width=width,
      latents=latents,
      num_inference_steps=num_inference_steps,
      guidance_scale=guidance_scale,
      return_all_latents=return_all_latents,
  )

  # Latents -> images, one slice at a time
  decoded = []
  for start in tqdm(range(0, len(latents), batch_size)):
    stop = start + batch_size
    decoded += decode_img_latents(latents[start:stop])
  return decoded

In [None]:
prompt = 'Steampunk airship bursting through the clouds, cyberpunk art'

# Draw the starting noise up front so the same tensor can be reused
# (the perturbation cell further down starts from `latents`).
latents = torch.randn(1, model.in_channels, 512 // 8, 512 // 8)

airship_imgs = prompt_to_img(prompt, num_inference_steps=20, latents=latents)
img = airship_imgs[0]
img

In [None]:
def perturb_latents(latents, scale=0.1):
  """Blend fresh Gaussian noise into `latents` and re-standardise the result.

  The blend is (1 - scale) * latents + scale * noise; the output is shifted
  and divided so that its overall mean is 0 and its overall std is 1.
  """
  jitter = torch.randn_like(latents)
  mixed = latents * (1 - scale) + jitter * scale
  centered = mixed - mixed.mean()
  return centered / mixed.std()

In [None]:
# Nudge the earlier starting noise (40% fresh noise) and regenerate with the
# same prompt to get a similar-looking image.
new_latents = perturb_latents(latents, scale=0.4)
similar_imgs = prompt_to_img(prompt, num_inference_steps=20, latents=new_latents)
img = similar_imgs[0]
img

In [None]:
# Plain text-to-image sample; `img` is reused further down as the source image
# for the VAE round trip and the img2img run.
prompt = 'Upright squid'
img = prompt_to_img(prompt, num_inference_steps=30)[0]
img

In [None]:
def encode_img_latents(imgs):
  """Encode one image or a list/tuple of images into scaled VAE latents.

  Args:
    imgs: A single image or a list/tuple of images. Each is converted with
      np.array and is expected to be height x width x channels with values in
      0..255 — TODO confirm callers only pass 3-channel images.

  Returns:
    A sample from the VAE's latent distribution multiplied by 0.18215 (the
    factor decode_img_latents divides out again), computed without autograd.
  """
  # Tuples are batches too; only a lone image gets wrapped.
  if not isinstance(imgs, (list, tuple)):
    imgs = [imgs]

  # Stack to (batch, H, W, C), rescale 0..255 -> 0..1, move channels first,
  # then shift to the [-1, 1] range.
  img_arr = np.stack([np.array(img) for img in imgs], axis=0)
  img_arr = img_arr / 255.0
  img_arr = torch.from_numpy(img_arr).float().permute(0, 3, 1, 2)
  img_arr = 2 * (img_arr - 0.5)

  # Inference only: skip building an autograd graph through the encoder, and
  # follow the VAE's own device instead of assuming a fixed one.
  with torch.no_grad():
    latent_dists = vae.encode(img_arr.to(vae.device))
    latent_samples = latent_dists.latent_dist.sample() * 0.18215

  return latent_samples

In [None]:
# VAE round trip: encode `img` to latents, decode them back, and display the
# reconstruction. `img_latents` is reused by the img2img cell below.
img_latents = encode_img_latents([img])
dec_img = decode_img_latents(img_latents)[0]
dec_img

In [None]:
# New scheduler for img-to-img
# Rebinds the module-level `scheduler` used by produce_latents.
# clip_sample=False: the scheduler's default clamps the predicted clean sample
# to [-1, 1], which is meant for pixel-space models and distorts Stable
# Diffusion's unbounded latents.
# set_alpha_to_one=False: matches the scheduler settings Stable Diffusion
# checkpoints ship with.
scheduler = DDIMScheduler(
    beta_start=0.00085, beta_end=0.012,
    beta_schedule='scaled_linear', num_train_timesteps=1000,
    clip_sample=False, set_alpha_to_one=False)

In [None]:
def produce_latents(text_embeddings, height=512, width=512,
                    num_inference_steps=50, guidance_scale=7.5, latents=None,
                    return_all_latents=False, start_step=10):
  """Guided denoising loop that can start part-way through the schedule.

  Args:
    text_embeddings: Output of get_text_embeds (unconditional rows first, then
      prompt rows); the batch size is taken as shape[0] // 2.
    height, width: Target image size; the latent grid is height // 8 by
      width // 8.
    num_inference_steps: Number of scheduler steps.
    guidance_scale: Classifier-free guidance weight.
    latents: Optional starting latents (e.g. from encode_img_latents). When
      None, Gaussian noise is drawn.
    return_all_latents: When True, return the starting latents and every
      intermediate result concatenated along dim 0 instead of only the last.
    start_step: Index into scheduler.timesteps at which denoising begins. When
      greater than 0, the latents are first passed through scheduler.add_noise
      at that timestep and the earlier steps are skipped.

  Returns:
    The final latents, or the full history when return_all_latents is True.
  """
  if latents is None:
    latents = torch.randn((text_embeddings.shape[0] // 2, model.in_channels,
                           height // 8, width // 8))
  latents = latents.to("cuda")

  scheduler.set_timesteps(num_inference_steps)
  if start_step > 0:
    # One copy of the starting timestep per batch entry.
    start_timestep = scheduler.timesteps[start_step]
    start_timesteps = start_timestep.repeat(latents.shape[0]).long()

    noise = torch.randn_like(latents)
    latents = scheduler.add_noise(latents, noise, start_timesteps)

  latent_hist = [latents]
  with autocast('cuda'):
    # Iterate the remaining timesteps directly: the index was never used, and
    # without enumerate tqdm can show a total.
    for t in tqdm(scheduler.timesteps[start_step:]):
      # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
      latent_model_input = torch.cat([latents] * 2)

      # predict the noise residual
      with torch.no_grad():
        noise_pred = model(latent_model_input, t, encoder_hidden_states=text_embeddings)['sample']

      # perform guidance
      noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
      noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

      # compute the previous noisy sample x_t -> x_t-1
      latents = scheduler.step(noise_pred, t, latents)['prev_sample']
      latent_hist.append(latents)

  if not return_all_latents:
    return latents

  all_latents = torch.cat(latent_hist, dim=0)
  return all_latents

def prompt_to_img(prompts, height=512, width=512, num_inference_steps=50,
                  guidance_scale=7.5, latents=None, return_all_latents=False,
                  batch_size=2, start_step=0):
  """Text-to-image / img2img entry point with batched decoding.

  A single prompt string is treated as a batch of one. `start_step` is
  forwarded to produce_latents. The latents that come back are decoded in
  slices of `batch_size` and returned as one flat list of images.
  """
  prompt_batch = [prompts] if isinstance(prompts, str) else prompts

  # Prompts -> embeddings -> latents
  embeddings = get_text_embeds(prompt_batch)
  latents = produce_latents(
      embeddings,
      height=height,
      width=width,
      latents=latents,
      num_inference_steps=num_inference_steps,
      guidance_scale=guidance_scale,
      return_all_latents=return_all_latents,
      start_step=start_step,
  )

  # Latents -> images, one slice at a time
  decoded = []
  for start in tqdm(range(0, len(latents), batch_size)):
    stop = start + batch_size
    decoded += decode_img_latents(latents[start:stop])
  return decoded

In [None]:
# img2img: start from the encoded latents of the earlier image and a new
# prompt. With start_step=20 and 30 scheduler steps, only the timesteps from
# index 20 onward are run.
prompt = 'Squidward'
img = prompt_to_img(prompt, num_inference_steps=30, latents=img_latents,
                    start_step=20)[0]
img

In [None]:
# Level 4 - AUTOMATIC1111
#!git clone https://github.com/AUTOMATIC1111/stable-diffusion-webui
#https://rentry.org/voldy

In [None]:
#Level 5 - Deforum, Vid2Vid, Textual Inversion (Dreambooth), Negative Prompts, Fine-Tuning, etc.
