In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from PIL import Image
from IPython.display import display
import torch as th
import torch.nn as nn
import os
import pandas as pd
#from stackedGAN.model_creation import create_clip_model
#from stackedGAN.download import load_checkpoint
#from stackedGAN.model_creation import (create_model_and_diffusion, model_and_diffusion_defaults, model_and_diffusion_defaults_upsampler)

# NAVIN: The import below is never used, so it is commented out.
#from glide_text2im.tokenizer.simple_tokenizer import SimpleTokenizer

#https://github.com/openai/glide-text2im/blob/main/notebooks/clip_guided.ipynb
#https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
!pip install git+https://github.com/openai/glide-text2im

# NAVIN: Corrected imports -- these come from the glide_text2im package instead of stackedGAN.
from glide_text2im.clip.model_creation import create_clip_model
from glide_text2im.download import load_checkpoint
from glide_text2im.model_creation import (create_model_and_diffusion, model_and_diffusion_defaults, model_and_diffusion_defaults_upsampler)


# Pick the compute device once: CUDA when PyTorch reports it as available,
# otherwise the CPU. `has_cuda` is reused later to switch fp16 on and off.
has_cuda = th.cuda.is_available()
device = th.device('cuda' if has_cuda else 'cpu')
print(device)

In [None]:
## Base Model
#@title Base Model
def _build_glide_stage(defaults_fn, respacing, checkpoint, report_label):
  """Create one diffusion stage, load its checkpoint and print its size.

  Args:
    defaults_fn: zero-argument callable returning the option dict for the stage.
    respacing: value stored under the 'timestep_respacing' option.
    checkpoint: checkpoint name handed to `load_checkpoint`.
    report_label: text printed in front of the parameter count.

  Returns:
    (options, model, diffusion) for the stage.
  """
  stage_options = defaults_fn()
  # Half precision is only requested when a CUDA device is in use.
  stage_options['use_fp16'] = has_cuda
  stage_options['timestep_respacing'] = respacing
  net, stage_diffusion = create_model_and_diffusion(**stage_options)
  net.eval()
  if has_cuda:
    net.convert_to_fp16()
  net.to(device)
  net.load_state_dict(load_checkpoint(checkpoint, device))
  print(report_label, sum(param.numel() for param in net.parameters()))
  return stage_options, net, stage_diffusion


options, model, diffusion = _build_glide_stage(
    model_and_diffusion_defaults, '100', 'base', 'total base parameters')

## Upsampler Model
options_up, model_up, diffusion_up = _build_glide_stage(
    model_and_diffusion_defaults_upsampler, 'fast27', 'upsample',
    'total upsampler parameters')

## CLIP Model
# The CLIP image and text encoders are loaded from two separate checkpoints.
clip_model = create_clip_model(device=device)
clip_model.image_encoder.load_state_dict(load_checkpoint('clip/image-enc', device))
clip_model.text_encoder.load_state_dict(load_checkpoint('clip/text-enc', device))

def images(batch: th.Tensor):
  """Tile a batch of images side by side and return them as one uint8 array.

  Values are mapped with (x + 1) * 127.5, rounded and clamped to [0, 255],
  so an input range of [-1, 1] covers the full 8-bit range.

  Args:
    batch: 4-D tensor indexed as (N, 3, H, W).

  Returns:
    NumPy uint8 array of shape (H, N * W, 3); image k occupies columns
    k * W .. (k + 1) * W - 1.
  """
  scaled = ((batch + 1) * 127.5).round().clamp(0, 255).to(th.uint8).cpu()
  # (N, C, H, W) -> (H, N, W, C) -> (H, N * W, C): images end up next to each other.
  tiled = scaled.permute(2, 0, 3, 1).reshape([batch.shape[2], -1, 3])
  # Convert straight from the tensor. This avoids relying on a module-level
  # `np` import (which only happens in a later cell) and drops a PIL
  # round-trip that merely copied the same pixels.
  return tiled.contiguous().numpy()

## Prompt
# Sampling parameters shared by the generation loop.
# Number of images sampled per caption.
batch_size = 1
# Scale handed to the CLIP guidance function.
guidance_scale = 3.0
# Multiplier applied to the upsampler's initial noise; it controls the
# sharpness of the 256x256 images. 1.0 is sharper but sometimes produces
# grainy artifacts.
upsample_temp = 0.997



In [None]:
import os
# Manual spot check: has the first synthetic image (index 0) been written yet?
# The generation loop runs the same test with str(i) in place of '0'.
# NOTE(review): Drive is mounted in a later cell, so on a fresh runtime this
# presumably evaluates to False -- confirm the intended cell order.
first_image_path = '/content/drive/My Drive/Colab Notebooks/_IMAGE CAPTIONING/SYNTHETIC_IMAGES/' + '0.jpg'
os.path.exists(first_image_path)

In [None]:
import pandas as pd
from google.colab import drive
import numpy as np
# Mounting the drive
drive.mount('/content/drive')

# Drive folder holding the caption CSV and the generated images.
data_root = '/content/drive/My Drive/Colab Notebooks/_IMAGE CAPTIONING/'
synthetic_dir = data_root + 'SYNTHETIC_IMAGES/'
# Upper bound on how many captions are rendered.
max_images = 10000

Images_Captions_df = pd.read_csv(data_root + 'IMAGES_CAPTIONS_DATA.csv')

# Create the output folder up front so the save at the end of each iteration
# cannot fail on a missing directory after a full sampling pass.
os.makedirs(synthetic_dir, exist_ok=True)


def _text_model_kwargs(net, text_ctx, prompt):
  """Tokenize `prompt` with `net.tokenizer` and batch it for sampling.

  Args:
    net: model exposing `tokenizer.encode` and `tokenizer.padded_tokens_and_mask`.
    text_ctx: length passed to `padded_tokens_and_mask`.
    prompt: caption text.

  Returns:
    dict with `tokens` and a boolean `mask`, each repeated `batch_size`
    times and placed on `device`.
  """
  tokens = net.tokenizer.encode(prompt)
  tokens, mask = net.tokenizer.padded_tokens_and_mask(tokens, text_ctx)
  return dict(
      tokens=th.tensor([tokens] * batch_size, device=device),
      mask=th.tensor([mask] * batch_size, dtype=th.bool, device=device),
  )


## Base Model Sample
# Stop at the end of the caption table: a CSV with fewer than `max_images`
# rows would otherwise raise a KeyError on the first missing index.
num_images = min(max_images, len(Images_Captions_df))
for i in range(num_images):
  out_path = synthetic_dir + str(i) + '.jpg'
  # Images that already exist are skipped, so an interrupted run can resume.
  if os.path.exists(out_path):
    continue
  prompt = str(Images_Captions_df['CAPTIONS'].iloc[i])

  # Stage 1: sample from the base model, guided by the CLIP model.
  model_kwargs = _text_model_kwargs(model, options['text_ctx'], prompt)
  cond_fn = clip_model.cond_fn([prompt] * batch_size, guidance_scale)
  model.del_cache()
  samples = diffusion.p_sample_loop(
      model,
      (batch_size, 3, options["image_size"], options["image_size"]),
      device=device,
      clip_denoised=True,
      progress=True,
      model_kwargs=model_kwargs,
      cond_fn=cond_fn,
  )
  model.del_cache()

  # Stage 2: sample from the upsampler, conditioned on the stage-1 output.
  model_kwargs = dict(
      # Low-res image to upsample, snapped to 8-bit levels and mapped back.
      low_res=((samples + 1) * 127.5).round() / 127.5 - 1,
      # Text tokens, encoded with the upsampler's own tokenizer and context size.
      **_text_model_kwargs(model_up, options_up['text_ctx'], prompt),
  )
  model_up.del_cache()
  up_shape = (batch_size, 3, options_up["image_size"], options_up["image_size"])
  up_samples = diffusion_up.ddim_sample_loop(
      model_up,
      up_shape,
      # Initial noise scaled by `upsample_temp`.
      noise=th.randn(up_shape, device=device) * upsample_temp,
      device=device,
      clip_denoised=True,
      progress=True,
      model_kwargs=model_kwargs,
      cond_fn=None,
  )[:batch_size]
  model_up.del_cache()

  # Save the output
  image_array = images(up_samples)
  image = Image.fromarray(image_array)
  image.save(out_path)

