# Illustra: Multi-text to Image

Based on [CLIP](https://github.com/openai/CLIP) + FFT from [Lucent](https://github.com/greentfrapp/lucent) // made by [eps696](https://github.com/eps696) [Vadim Epstein]  
thanks to [Ryan Murdock](https://twitter.com/advadnoun), [Jonathan Fly](https://twitter.com/jonathanfly) for ideas

## Features 
* **continuously processes phrase lists** (e.g. illustrating lyrics)
* generates an [FFT-encoded](https://github.com/greentfrapp/lucent/blob/master/lucent/optvis/param/spatial.py) image (massive detailed textures, a la deepdream)
* fast convergence
* undemanding for RAM - fullHD/4K and above
* saving/loading FFT snapshots to resume processing
* selectable CLIP model


**Run the cell below after each session restart**

Check `resume` and upload a `.pt` file if you're resuming from saved params.

In [None]:
#@title General setup

import subprocess
CUDA_version = [s for s in subprocess.check_output(["nvcc", "--version"]).decode("UTF-8").split(", ") if s.startswith("release")][0].split(" ")[-1]
print("CUDA version:", CUDA_version)

if CUDA_version == "10.0":
    torch_version_suffix = "+cu100"
elif CUDA_version == "10.1":
    torch_version_suffix = "+cu101"
elif CUDA_version == "10.2":
    torch_version_suffix = ""
else:
    torch_version_suffix = "+cu110"

!pip install torch==1.7.1{torch_version_suffix} torchvision==0.8.2{torch_version_suffix} -f https://download.pytorch.org/whl/torch_stable.html ftfy regex

try: 
  !pip3 install googletrans==3.1.0a0
  from googletrans import Translator, constants
  translator = Translator()
except: pass
!pip install ftfy

!apt-get -qq install ffmpeg
from google.colab import drive
drive.mount('/G', force_remount=True)
gdir = !ls /G/
gdir = '/G/%s/' % str(gdir[0])
%cd $gdir
work_dir = 'illustra'
work_dir = gdir + work_dir + '/'
import os
os.makedirs(work_dir, exist_ok=True)
%cd $work_dir

# Standard-library and third-party modules used across the notebook.
import os
import io
import time
import math
# from math import exp
import random
import imageio
import numpy as np
import PIL
# from skimage import exposure
from base64 import b64encode
import shutil

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.autograd import Variable

from IPython.display import HTML, Image, display, clear_output
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import ipywidgets as ipy
# import glob
from google.colab import output, files

import warnings
# Suppress all Python warnings from here on.
warnings.filterwarnings("ignore")

# CLIP and pytorch-ssim are installed directly from their GitHub repositories.
!pip install git+https://github.com/openai/CLIP.git
import clip
!pip install git+https://github.com/Po-Hsun-Su/pytorch-ssim
import pytorch_ssim as ssim

# The helper modules below are imported from inside the cloned repository;
# note that the working directory stays in aphantasia/ afterwards.
!git clone https://github.com/eps696/aphantasia
%cd aphantasia/
from clip_fft import to_valid_rgb, fft_image, slice_imgs, checkout
from utils import pad_up_to, basename, file_list, img_list, img_read
from progress_bar import ProgressIPy as ProgressBar

# Clear the cell output produced so far (install and clone logs).
clear_output()

resume = False #@param {type:"boolean"}
if resume:
  resumed = files.upload()
  # files.upload() returns raw file content (bytes); deserialize the first
  # uploaded snapshot so that `params_pt` holds the saved params themselves.
  params_pt = torch.load(io.BytesIO(list(resumed.values())[0]))

def ema(base, next, step):
    """Blend `next` into `base`, giving `next` the weight 1 / (step + 1).

    If `step` is the number of values already averaged into `base`, the
    result is the running mean including `next`.
    """
    weight = 1. / (step + 1)
    remainder = 1. - weight
    return next * weight + base * remainder

def makevid(seq_dir, size=None):
  """Encode the numbered JPEG frames of `seq_dir` to MP4 and return an HTML video tag.

  Args:
    seq_dir: directory with frames named 000.jpg, 001.jpg, ...; the video is
      written next to it as `<seq_dir>.mp4`.
    size: optional value used for both the width and the height attribute.

  Returns:
    HTML markup embedding the encoded video as a base64 data URL.
  """
  out_sequence = seq_dir + '/%03d.jpg'
  out_video = seq_dir + '.mp4'
  # Argument list instead of a shell line, so paths with spaces reach ffmpeg intact.
  subprocess.run(['ffmpeg', '-y', '-v', 'warning', '-i', out_sequence, out_video])
  # Context manager closes the file handle as soon as the data is read.
  with open(out_video, 'rb') as video_file:
    data_url = "data:video/mp4;base64," + b64encode(video_file.read()).decode()
  wh = '' if size is None else 'width=%d height=%d' % (size, size)
  return """<video %s controls><source src="%s" type="video/mp4"></video>""" % (wh, data_url)

# List the GPUs visible to this runtime, then confirm that setup has finished.
!nvidia-smi -L
print('\nDone!')

In [None]:
#@title Upload text file

# When checked, every text line is translated to English before it is encoded.
translate = False #@param {type:"boolean"}
# Opens the Colab upload dialog; `uploaded` maps each file name to its raw content.
uploaded = files.upload()

Set the desired video resolution and `duration` (in sec).  
Select CLIP `model` (results do vary!). I prefer ViT for consistency.  

Set `overscan` to produce semi-seamlessly tileable texture (when off, it's more centered).  
Try adding `noise_scale` to explore some compositional aspects.  
Decrease `samples` if you face OOM (it's the main RAM eater).  
Increasing `steps` will elaborate details and make tones smoother, but may start throwing texts like graffiti (and will obviously take more time).  
`show_freq` controls preview frequency and doesn't affect the results (so set it higher to speed up the process). 

In [None]:
#@title Generate

# from google.colab import drive
# drive.mount('/content/GDrive')
# clipsDir = '/content/GDrive/MyDrive/T2I ' + dtNow.strftime("%Y-%m-%d %H%M")

# Remove a leftover `tempdir` entry from the current directory, if there is one.
!rm -rf tempdir

# Frame size in pixels and length of the final video in seconds.
sideX = 1280 #@param {type:"integer"}
sideY = 720 #@param {type:"integer"}
duration =  60#@param {type:"integer"}
#@markdown > Config
model = 'ViT-B/32' #@param ['ViT-B/32', 'RN101', 'RN50x4', 'RN50']
overscan = False #@param {type:"boolean"}
# How results carry over to the next text line: 'None' restarts every line from
# the same initial params, 'all' continues from a running average of the initial
# params and all results so far, 'last' from the mean of the initial params and
# the latest result.
keep = 'None' #@param ['None', 'all', 'last']
noise_scale = 0. #@param {type:"number"}
contrast = 1. #@param {type:"number"}
#@markdown > Training
steps = 200 #@param {type:"integer"}
samples = 200 #@param {type:"integer"}
learning_rate = .05 #@param {type:"number"}
show_freq = 10 #@param {type:"integer"}
# Frame rate used to convert `duration` into a number of video frames.
fps = 25

# Load the selected CLIP model; the second value returned by clip.load is not needed here.
model_clip, _ = clip.load(model)
# Passed to slice_imgs as the crop size — presumably the input resolution of the model.
modsize = 288 if model == 'RN50x4' else 224
# The ResNet variants run with a reduced number of samples (presumably to fit memory).
xmem = {'RN50':0.5, 'RN50x4':0.16, 'RN101':0.33}
if 'RN' in model:
  samples = int(samples * xmem[model])

norm_in = torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))

# Only the first uploaded file is used.
text_file = list(uploaded)[0]
raw_lines = list(uploaded.values())[0].decode().split('\n')
# Strip before filtering, so blank lines and '#' comments are skipped even when indented.
texts = [tt for tt in (raw.strip() for raw in raw_lines) if tt and not tt.startswith('#')]
print(' text file:', text_file)
print(' total lines:', len(texts))
if not texts:
  # Fail here with a clear message instead of a division by zero later on.
  raise ValueError('no usable text lines found in %s' % text_file)

# Results go to <work_dir>/<text file name>, suffixed with the model name for ResNets.
workdir = os.path.join(work_dir, basename(text_file))
if 'RN' in model.upper():
  workdir += '-%s' % model

# Output widget that receives the live preview.
outpic = ipy.Output()
outpic
  
def save_img(img, fname=None):
  """Convert a channel-first image to 8-bit channel-last and optionally save it.

  Values are scaled by 255 and clipped to 0..255. Nothing is written when
  `fname` is None; otherwise the picture is stored under `fname` and once
  more as 'result.jpg' in the current directory.
  """
  channels_last = np.transpose(np.array(img)[:, :, :], (1, 2, 0))
  frame = np.clip(channels_last * 255, 0, 255).astype(np.uint8)
  if fname is None:
    return
  for target in (fname, 'result.jpg'):
    imageio.imsave(target, np.array(frame))

def process(txt, num):
  """Optimize an FFT-parameterized image towards one text line and store the results.

  Args:
    txt: text line to illustrate (translated to English first if `translate` is set).
    num: zero-based index of the line; 0 creates the starting params, later
      calls start from the params stored in 'tmp.pt'.

  Side effects:
    Updates the global `params_start`; writes 'tmp.pt' and 'result.jpg' to the
    current directory, preview frames to a per-line folder under `workdir`, and
    `<name>.pt` plus `<name>-<steps>.jpg` to `workdir`.
  """
  global params_start
  if num==0: # initial step
    init_pt = params_pt if resume else None
    if isinstance(init_pt, (bytes, bytearray)):
      # an uploaded snapshot may still be raw file content -> deserialize it
      init_pt = torch.load(io.BytesIO(init_pt))
    # Pass the snapshot itself rather than the boolean flag. None is assumed to
    # request a fresh random init — TODO confirm against fft_image.
    params, image_f = fft_image([1, 3, sideY, sideX], resume=init_pt)
    params_start = params[0].detach().clone()
    torch.save(params_start, 'tmp.pt') # starting point for the following lines
  else: # further steps
    params, image_f = fft_image([1, 3, sideY, sideX], resume='tmp.pt')
  image_f = to_valid_rgb(image_f)
  optimizer = torch.optim.Adam(params, learning_rate)

  print(' ref text: ', txt)
  if translate:
    translator = Translator()
    txt = translator.translate(txt, dest='en').text
    print(' translated to:', txt)
  tx = clip.tokenize(txt).cuda()
  txt_enc = model_clip.encode_text(tx).detach().clone()
  # File-system friendly name: punctuation dropped, spaces turned into underscores.
  out_name = '%02d-%s' % (num, txt.translate(str.maketrans(dict.fromkeys(list("\n',—|!?/:;\\"), ""))).replace(' ', '_').replace('"', ''))
  tempdir = os.path.join(workdir, out_name)
  os.makedirs(tempdir, exist_ok=True)

  pbar = ProgressBar(steps)
  for i in range(steps):
    noise = noise_scale * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() if noise_scale > 0 else 0.
    img_out = image_f(noise)

    imgs_sliced = slice_imgs([img_out], samples, modsize, norm_in, overscan=overscan, micro=None)
    out_enc = model_clip.encode_image(imgs_sliced[-1])
    # Maximize the mean cosine similarity between the text and image encodings.
    loss = -torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
    del img_out, imgs_sliced, out_enc; torch.cuda.empty_cache()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if i % show_freq == 0:
      with torch.no_grad():
        img = image_f(contrast=contrast).cpu().numpy()[0]
      # Writes the numbered preview frame and refreshes 'result.jpg', which is
      # displayed below and later copied as the final picture of this line.
      save_img(img, os.path.join(tempdir, '%04d.jpg' % (i // show_freq)))
      outpic.clear_output()
      with outpic:
        display(Image('result.jpg'))
      del img

    pbar.upd()

  # Decide what the next line starts from (see `keep`); with 'None', 'tmp.pt'
  # keeps the starting params untouched.
  if keep == 'all':
    params_start = ema(params_start, params[0].detach(), num+1)
    torch.save(params_start, 'tmp.pt')
  elif keep == 'last':
    torch.save((params_start + params[0].detach()) / 2, 'tmp.pt')

  torch.save(params[0], '%s.pt' % os.path.join(workdir, out_name))
  frames = img_list(tempdir)
  if frames:  # empty only when no preview was rendered (e.g. steps == 0)
    shutil.copy(frames[-1], os.path.join(workdir, '%s-%d.jpg' % (out_name, steps)))

# Optimize one image per text line, in file order.
for i, txt in enumerate(texts):
    process(txt, i)

# Interpolation frames per transition. At least one, so every snapshot yields
# a frame even when there are more text lines than duration * fps.
vsteps = max(1, int(duration * fps / len(texts)))
tempdir = os.path.join(workdir, '_final')
os.makedirs(tempdir, exist_ok=True)

def read_pt(file):
  """Load the object stored at `file` with torch.load and return its `.cuda()` copy."""
  loaded = torch.load(file)
  return loaded.cuda()

print(' rendering complete piece')
# Saved snapshots in workdir; assumed to come back in text order — TODO confirm file_list sorts.
ptfiles = file_list(workdir, 'pt')
pbar = ProgressBar(vsteps * len(ptfiles))
for px, ptfile in enumerate(ptfiles):
  params1 = read_pt(ptfile)
  # the modulo wraps around, so the last snapshot blends back into the first one
  params2 = read_pt(ptfiles[(px+1) % len(ptfiles)])

  params, image_f = fft_image([1, 3, sideY, sideX], resume=params1)
  image_f = to_valid_rgb(image_f)

  for i in range(vsteps):
    with torch.no_grad():
      # sin^2 easing (1.5708 ~ pi/2): the factor rises smoothly from 0 towards 1;
      # the scaled difference is presumably added to the resumed params by image_f.
      img = image_f((params2 - params1) * math.sin(1.5708 * i/vsteps)**2)[0].permute(1,2,0)
      img = torch.clip(img*255, 0, 255).cpu().numpy().astype(np.uint8)
    imageio.imsave(os.path.join(tempdir, '%05d.jpg' % (px * vsteps + i)), img)
    pbar.upd()

# Argument list instead of a shell string: a proper path join for the frame
# pattern, no quoting problems, and an input rate matching the `fps` behind vsteps.
out_video = '%s.mp4' % os.path.join(work_dir, basename(text_file))
subprocess.run(['ffmpeg', '-v', 'warning', '-y', '-framerate', str(fps),
                '-i', os.path.join(tempdir, '%05d.jpg'), out_video])
# 'tmp.pt' is only a scratch file for passing params between text lines.
if os.path.exists('tmp.pt'): os.remove('tmp.pt')

