# Illustra: Multi-text to Image

Based on [CLIP](https://github.com/openai/CLIP) + FFT from [Lucent](https://github.com/greentfrapp/lucent) // made by [eps696](https://github.com/eps696) [Vadim Epstein]  
thanks to [Ryan Murdock](https://twitter.com/advadnoun), [Jonathan Fly](https://twitter.com/jonathanfly) for ideas

## Features 
* **continuously processes phrase lists** (e.g. illustrating lyrics)
* generates an [FFT-encoded](https://github.com/greentfrapp/lucent/blob/master/lucent/optvis/param/spatial.py) image (massive, detailed textures, a la deepdream)
* fast convergence
* low RAM requirements - works at Full HD/4K and above
* saving/loading FFT params to resume processing
* can use both CLIP models at once (ViT and RN50)


**Run the cell below after each session restart**

Check `resume` and upload the `.pt` file if you are resuming from saved params.

In [None]:
#@title General setup

import subprocess
# Ask nvcc for its version banner and keep the number that follows "release".
nvcc_banner = subprocess.check_output(["nvcc", "--version"]).decode("UTF-8")
release_fields = [field for field in nvcc_banner.split(", ") if field.startswith("release")]
CUDA_version = release_fields[0].split(" ")[-1]
print("CUDA version:", CUDA_version)

# Wheel suffix per CUDA release: 10.2 takes no suffix, and any release not
# listed here falls back to the cu110 build.
torch_version_suffix = {
    "10.0": "+cu100",
    "10.1": "+cu101",
    "10.2": "",
}.get(CUDA_version, "+cu110")

!pip install torch==1.7.1{torch_version_suffix} torchvision==0.8.2{torch_version_suffix} -f https://download.pytorch.org/whl/torch_stable.html ftfy regex

try: 
  !pip3 install googletrans==3.1.0a0
  from googletrans import Translator, constants
  translator = Translator()
except: pass
!pip install ftfy
!pip install ssim # not needed, to avoid import error

!apt-get -qq install ffmpeg
# Mount Google Drive at /G and work inside an `illustra` folder under the first
# entry listed there.
from google.colab import drive
drive.mount('/G', force_remount=True)
# `!ls` captures the listing; only its first entry is used as the Drive root.
gdir = !ls /G/
gdir = '/G/%s/' % str(gdir[0])
%cd $gdir
work_dir = 'illustra'
work_dir = gdir + work_dir + '/'
import os
os.makedirs(work_dir, exist_ok=True)
# NOTE: from here on, `!` commands and relative paths resolve against work_dir
# until the next %cd.
%cd $work_dir

import os
import io
import time
from math import exp
import random
import imageio
import numpy as np
import PIL
from skimage import exposure
from base64 import b64encode
import shutil

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.autograd import Variable

from IPython.display import HTML, Image, display, clear_output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import ipywidgets as ipy
# import glob
from google.colab import output, files

import warnings
warnings.filterwarnings("ignore")

!pip install git+https://github.com/openai/CLIP.git
import clip
model_vit, _ = clip.load('ViT-B/32')

!git clone https://github.com/eps696/aphantasia
%cd /content/aphantasia/
from clip_fft import to_valid_rgb, fft_image, slice_imgs, checkout
from utils import pad_up_to, basename, file_list, img_list, img_read
from progress_bar import ProgressIPy as ProgressBar

# Wipe the installation log from the cell output.
clear_output()

# Optional upload of previously saved params; only the first uploaded file is kept,
# as raw file content.
# NOTE(review): `params_pt` is not referenced anywhere else in the notebook as shown,
# so ticking `resume` currently has no effect on generation - confirm how it was
# meant to be passed to the image parameterisation.
resume = False #@param {type:"boolean"}
if resume:
  resumed = files.upload()
  params_pt = list(resumed.values())[0]

def makevid(seq_dir, size=None):
  """Encode the JPEG frames in `seq_dir` into an MP4 and return an HTML <video> tag for it.

  Frames are read as `seq_dir/000.jpg`, `seq_dir/001.jpg`, ...; the video is written
  next to the folder as `seq_dir + '.mp4'` and embedded in the returned markup as a
  base64 data URL.

  Args:
    seq_dir: folder holding the numbered frames (without a trailing slash).
    size: optional value used for both the width and the height attribute of the player.

  Returns:
    HTML string containing the <video> element.

  Raises:
    OSError: if no video file exists after the encoding attempt.
  """
  out_sequence = seq_dir + '/%03d.jpg'
  out_video = seq_dir + '.mp4'
  # Pass the arguments as a list instead of through a shell, so folder names with
  # spaces or shell metacharacters (they are derived from user text) reach ffmpeg intact.
  try:
    result = subprocess.run(['ffmpeg', '-y', '-v', 'warning', '-i', out_sequence, out_video],
                            stderr=subprocess.PIPE)
    if result.stderr:
      # Surface ffmpeg warnings/errors in the cell output.
      print(result.stderr.decode(errors='replace'))
  except OSError as err:
    # Same best-effort behaviour as a failed shell command: report and carry on.
    print(' could not run ffmpeg:', err)
  # Context manager closes the file handle once the bytes are read.
  with open(out_video, 'rb') as video_file:
    data_url = "data:video/mp4;base64," + b64encode(video_file.read()).decode()
  wh = '' if size is None else 'width=%d height=%d' % (size, size)
  return """<video %s controls><source src="%s" type="video/mp4"></video>""" % (wh, data_url)

# List the GPU(s) visible to this session, then signal that setup has finished.
!nvidia-smi -L
print('\nDone!')

In [None]:
#@title Upload text file

# When checked, each phrase is translated to English (via googletrans) before encoding.
translate = False #@param {type:"boolean"}
# Opens the upload dialog; the Generate cell reads only the first uploaded file
# (its name and its decoded content).
uploaded = files.upload()

Set the desired video resolution and `duration` (in sec). 

Try setting `noise_scale` > 0 (maybe 4~8) if the images are "getting stuck" over time.  
Set `overscan` to produce semi-seamlessly tileable texture (when off, it's centered).  
Turn on `dual_model` to optimize with both CLIP models at once (eats more RAM!).  
Decrease `samples` if you face OOM for higher resolutions.  
Increase `save_freq` for longer training. 

In [None]:
#@title Generate

# from google.colab import drive
# drive.mount('/content/GDrive')
# clipsDir = '/content/GDrive/MyDrive/T2I ' + dtNow.strftime("%Y-%m-%d %H%M")

# Remove any `tempdir` entry left in the current directory.
!rm -rf tempdir

# Output frame size in pixels and total video length in seconds.
sideX = 1280 #@param {type:"integer"}
sideY = 720 #@param {type:"integer"}
duration =  120#@param {type:"integer"}
#@markdown > Tweaks & tuning
overscan = True #@param {type:"boolean"}
noise_scale = 0. #@param {type:"number"}
dual_model = False #@param {type:"boolean"}
#@markdown > Training
samples = 200 #@param {type:"integer"}
save_freq = 1 #@param {type:"integer"}
# Not exposed as form fields: the Adam learning rate, and the frame rate used to
# budget optimisation steps (ffmpeg itself is invoked without an explicit rate).
learning_rate = .05
fps = 25

if dual_model is True:
  print(' using dual-model optimization')
  model_rn, _ = clip.load('RN50')
  # Two models are evaluated per step, so the sample count is halved.
  samples = samples // 2

# Per-channel mean / std normalisation handed to slice_imgs.
norm_in = torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))

# Only the first uploaded file is used.
text_file = list(uploaded)[0]
# 'utf-8-sig' is plain UTF-8 that also drops a leading byte-order mark, which would
# otherwise end up inside the first phrase (and its folder name).
texts = list(uploaded.values())[0].decode('utf-8-sig').split('\n')
# Strip first, then filter: blank lines and '#' comment lines are skipped even
# when the comment marker is preceded by whitespace.
texts = [tt.strip() for tt in texts]
texts = [tt for tt in texts if tt and not tt.startswith('#')]
print(' text file:', text_file)
print(' total lines:', len(texts))
if not texts:
  # Fail with a clear message instead of a ZeroDivisionError in the step budget below.
  raise ValueError('no usable text lines found in %s' % text_file)

workdir = os.path.join(work_dir, basename(text_file))
# Optimisation steps per phrase: the total frame budget (duration * fps) is split
# evenly between the phrases, with save_freq steps spent per saved frame.
steps = int(duration * fps * save_freq / len(texts))

# Image parameterisation: fft_image yields the parameters handed to the optimizer and
# a callable that renders them; to_valid_rgb wraps that callable.
# These are created once, so every phrase keeps optimising the same parameters.
# NOTE(review): `resume` / `params_pt` from the setup cell are not consulted here.
shape = [1, 3, sideY, sideX]
params, image_f = fft_image(shape)
image_f = to_valid_rgb(image_f)
optimizer = torch.optim.Adam(params, learning_rate)

# Output widget that checkout() redraws with the latest frame; the bare name shows it.
outpic = ipy.Output()
outpic
  
def save_img(img, fname=None):
  """Convert a 3-axis image array to 8-bit and write it to `fname` and to 'result.jpg'.

  The axes are reordered from (0, 1, 2) to (1, 2, 0) - presumably channels-first to
  channels-last - and the values are scaled by 255 and clipped to 0..255.
  Nothing is written when `fname` is None.
  """
  source = np.array(img)[:, :, :]
  reordered = np.transpose(source, (1, 2, 0))
  pixels = np.clip(reordered * 255, 0, 255).astype(np.uint8)
  if fname is None:
    return
  # The same picture goes to the requested file and to the fixed preview file.
  for target in (fname, 'result.jpg'):
    imageio.imsave(target, np.array(pixels))

def checkout(num, tempdir, pbar):
  """Render the current image, save it as frame `num` in `tempdir`, refresh the preview, tick `pbar`.

  NOTE: this definition replaces the `checkout` imported from clip_fft earlier in the notebook.
  """
  with torch.no_grad():  # rendering only, no gradients needed
    rendered = image_f().cpu().numpy()
  first_image = rendered[0]
  frame_path = os.path.join(tempdir, '%03d.jpg' % num)
  save_img(first_image, frame_path)
  # save_img also wrote the picture to result.jpg, which is the file shown in the widget.
  outpic.clear_output()
  with outpic:
    display(Image('result.jpg'))
  pbar.upd()

def train(i, txt_enc, tempdir, pbar):
  """Run one optimisation step towards `txt_enc`; save a frame every `save_freq` steps.

  Args:
    i: zero-based step index within the current phrase.
    txt_enc: text embedding to match (built in `process`).
    tempdir: folder receiving the saved frames.
    pbar: progress bar advanced whenever a frame is saved.
  """
  # Optional random perturbation handed to the image function (off unless noise_scale > 0).
  if noise_scale > 0:
    noise = noise_scale * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda()
  else:
    noise = 0.
  rendered = image_f(noise)

  # Encode the samples produced by slice_imgs with CLIP.
  slices = slice_imgs([rendered], samples, norm_in, overscan=overscan, micro=None)
  img_enc = model_vit.encode_image(slices[-1])
  if dual_model is True:  # both CLIP models: concatenate so the width matches txt_enc
    img_enc = torch.cat((img_enc, model_rn.encode_image(slices[-1])), 1)
  # Negative mean cosine similarity (scaled by 100): minimising it raises the similarity.
  loss = -100 * torch.cosine_similarity(txt_enc, img_enc, dim=-1).mean()
  # Drop the local references and release cached CUDA blocks before the backward
  # pass - presumably to keep peak memory down.
  del rendered, slices, img_enc
  torch.cuda.empty_cache()

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  frame_due = (i % save_freq == 0)
  if frame_due:
    checkout(i // save_freq, tempdir, pbar)

def process(txt, num):
  """Optimise the image towards one phrase, then render and show its frames as a video.

  The image parameters are module-level and are not reset here, so each phrase
  continues from the picture left by the previous one.

  Args:
    txt: the phrase to illustrate; translated to English first when `translate` is set.
    num: index of the phrase, used as the numeric prefix of its output folder.
  """
  print(' ref text: ', txt)
  if translate:
    translator = Translator()
    txt = translator.translate(txt, dest='en').text
    print(' translated to:', txt)
  tx = clip.tokenize(txt).cuda()
  txt_enc = model_vit.encode_text(tx).detach().clone()
  if dual_model is True:
    # Concatenate both text embeddings to mirror the image side in train().
    txt_enc = torch.cat((txt_enc, model_rn.encode_text(tx).detach().clone()), 1)
  # Folder name: index + phrase with punctuation removed and spaces turned into underscores.
  out_name = '%02d-%s' % (num, txt.translate(str.maketrans(dict.fromkeys(list("\n',—|!?/:;\\"), ""))).replace(' ', '_').replace('"', ''))
  tempdir = os.path.join(workdir, out_name)
  os.makedirs(tempdir, exist_ok=True)

  # One progress tick per saved frame.
  pbar = ProgressBar(steps // save_freq)
  for i in range(steps):
    train(i, txt_enc, tempdir, pbar)

  # shutil.copy(img_list(tempdir)[-1], os.path.join(workdir, '%s-%d.jpg' % (out_name, steps)))
  # os.system('ffmpeg -v warning -y -i %s\%%03d.jpg "%s.mp4"' % (tempdir, os.path.join(workdir, out_name)))
  # A bare expression inside a function is never rendered by the notebook (only
  # top-level cell expressions are), so the player has to be displayed explicitly.
  display(HTML(makevid(tempdir)))

# Illustrate the phrases in file order; the image parameters are shared, so each
# phrase starts from the result of the previous one.
for i, txt in enumerate(texts):
    process(txt, i)

# %cd $work_dir
# vid_list = ['file ' + v.replace('\\', '/') for v in file_list(workdir, 'mp4')]
# with open('dir.txt', 'w') as ff:
  # ff.write('\n'.join(vid_list))
# outname = basename(text_file) + '.mp4'
# !ffmpeg -y -v warning -f concat -i dir.txt -c:v copy $outname
# os.system('ffmpeg -y -v warning -f concat -i dir.txt -c:v copy %s.mp4' % basename(text_file))
# os.remove('dir.txt')
