In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [None]:
# Image captioning with visual attention
import tensorflow as tf
import collections
import random
import numpy as np
import pandas as pd
from google.colab import drive
import os
import time
import json


In [None]:
from PIL import Image
## Download and prepare the MS-COCO dataset
# Only the caption annotations (JSON files) are fetched here; the images are handled in a later cell.
annotation_folder = '/annotations/'
if not os.path.exists(os.path.abspath('.') + annotation_folder):
  # Download the archive into the working directory as 'captions.zip' and extract it (extract=True).
  annotation_zip = tf.keras.utils.get_file('captions.zip', cache_subdir=os.path.abspath('.'),
                                           origin='http://images.cocodataset.org/annotations/annotations_trainval2014.zip', extract=True)

  # Full path of "captions_train2014.json" inside the extracted folder.
  annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'

  # The zip itself is not needed once its contents are extracted.
  os.remove(annotation_zip)
else:
  # The annotations folder already exists (e.g. the cell is re-run): annotation_file must
  # still be defined, otherwise the caption-loading code fails with a NameError.
  annotation_file = os.path.abspath('.') + annotation_folder + 'captions_train2014.json'


In [None]:
print(os.path.abspath('.') + annotation_folder)
!pwd
print(annotation_zip)
!ls -al /content/annotations/

In [None]:
# Download image files (skipped when an extracted copy already sits in the working directory)
image_folder = '/train2014/'
if os.path.exists(os.path.abspath('.') + image_folder):
  # Already extracted by an earlier run: just point PATH at the existing folder.
  PATH = os.path.abspath('.') + image_folder
else:
  # Download and extract the archive, remember where the images landed, then delete the zip.
  image_zip = tf.keras.utils.get_file(
      'train2014.zip',
      cache_subdir=os.path.abspath('.'),
      origin='http://images.cocodataset.org/zips/train2014.zip',
      extract=True)
  PATH = os.path.dirname(image_zip) + image_folder
  os.remove(image_zip)

print("Location of the images:" + PATH)

In [None]:
# The images are expected under /content/train2014/ (PATH is printed by the download code).
# Uncomment to list every image file:
#!ls -al $PATH
# List the working directory to confirm what was downloaded and extracted.
!ls -al /content

In [None]:
# CAPTIONS PREPROCESSING
# Parse the caption annotation JSON.
with open(annotation_file, 'r') as f:
  annotations = json.load(f)

# Map each image path to the list of its captions. A defaultdict creates the
# empty list automatically the first time an image path is seen.
image_path_to_caption = collections.defaultdict(list)

for val in annotations['annotations']:
  # File name = 'COCO_train2014_' + image id zero-padded to 12 digits + '.jpg'.
  image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (val['image_id'])
  # Wrap the caption in explicit start/end markers.
  caption = f"<start> {val['caption']} <end>"
  image_path_to_caption[image_path].append(caption)

# Image paths only (the dictionary keys), put into random order.
image_paths = list(image_path_to_caption)
random.shuffle(image_paths)

In [None]:
# Peek at the last caption and image path left behind by the grouping loop.
print(caption)
print(image_path)
# Number of distinct image paths that received at least one caption.
print(len(image_path_to_caption.keys()))
# Captions stored for one specific image path (.get returns None when the path is not a key).
print(image_path_to_caption.get('/content/train2014/COCO_train2014_000000133071.jpg'))

In [None]:
# Keep only the first 800 image paths of the shuffled list.
# Each selected image contributes all of its captions, so the number of
# (image, caption) examples is larger than the number of images.
train_image_paths = image_paths[:800]
train_captions = []
img_name_vector = []

print("Total Images:", len(image_paths), sep="")
print("Selected Images:", len(train_image_paths), sep="")

for image_path in train_image_paths:
  caption_list = image_path_to_caption[image_path]
  # Append this image's captions to the flat caption list ...
  train_captions += caption_list
  # ... and repeat the image path once per caption so both lists stay aligned.
  img_name_vector += [image_path] * len(caption_list)

print("Training Captions SIZE:", len(train_captions), sep="")
print("Training IMAGE SIZE:", len(img_name_vector), sep="")

In [None]:
## Preprocess the images using InceptionV3
def load_image(image_path):
  """Read a JPEG file and prepare it as InceptionV3 input.

  Args:
    image_path: path of the JPEG file to read.

  Returns:
    A tuple (image, image_path): the image decoded with 3 channels, resized to
    299x299 and passed through the InceptionV3 preprocess_input function,
    together with the unchanged path.
  """
  raw_bytes = tf.io.read_file(image_path)
  decoded = tf.io.decode_jpeg(raw_bytes, channels=3)
  resized = tf.keras.layers.Resizing(299, 299)(decoded)
  return tf.keras.applications.inception_v3.preprocess_input(resized), image_path

In [None]:
## Initialize InceptionV3 and load the pretrained Imagenet weights
# InceptionV3 is a convolutional neural network for image analysis and object detection.
# Here a tf.keras model is built whose output is the last layer of the InceptionV3
# network loaded without its classification head (include_top=False).
# The shape of the output of this layer is 8x8x2048.
# The goal is to run the selected images through the pretrained network once and save
# the resulting feature vectors to disk, because doing this during training could
# become a bottleneck. The last convolutional output is used because the captioning
# model applies attention over image locations.

image_model = tf.keras.applications.InceptionV3(include_top=False, weights='imagenet')   # pretrained network without the classifier head
new_input = image_model.input     # reuse the pretrained network's own input tensor as the extractor's input
hidden_layer = image_model.layers[-1].output   # output of the last layer becomes the extractor's output
image_features_extract_model = tf.keras.Model(new_input, hidden_layer)

In [None]:
## Caching the features extracted from InceptionV3
from tqdm import tqdm

# Reminder: img_name_vector holds one image path per caption (paths repeat),
# train_captions holds the captions themselves.
# Unique image paths, sorted.
encode_train = sorted(set(img_name_vector))

# from_tensor_slices creates one dataset element per entry of encode_train
# (https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_tensor_slices);
# every path is then loaded with load_image and the results are grouped in batches of 16.
image_dataset = (
    tf.data.Dataset.from_tensor_slices(encode_train)
    .map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(16)
)

print("RANDOMLY PICKED 10K Image Dataset WHICH IS EXTENDED TO MATCH CAPTIONS COUNT:", len(img_name_vector), sep="")
print("Image Dataset (UNIQUE IMAGES):", len(encode_train), sep="")
print("Image Dataset SLICED (image_dataset):", len(image_dataset), sep="")

# len(image_dataset) counts BATCHES, not images: it is the number of unique images
# divided by the batch size of 16, rounded up (e.g. 10000 images -> 625 batches).

In [None]:
import cv2
from matplotlib import pyplot as plt

# cv2.imread flags:
#   cv2.IMREAD_COLOR     : load a colour image, ignoring any transparency (default flag).
#   cv2.IMREAD_GRAYSCALE : load the image in grayscale mode.
#   cv2.IMREAD_UNCHANGED : load the image as stored, including the alpha channel.
# The named flags can be used instead of the integers 1, 0 and -1.
img_bgr = cv2.imread(encode_train[0], cv2.IMREAD_COLOR)
if img_bgr is None:
  # cv2.imread signals an unreadable/missing file by returning None instead of raising.
  raise FileNotFoundError("Could not read image: " + encode_train[0])

# OpenCV stores channels as BGR while matplotlib expects RGB; convert so the
# preview is not shown with red and blue swapped.
img_color = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

plt.imshow(img_color)
plt.axis("off")
plt.show()

In [None]:
# image_dataset yields BATCHES of (preprocessed images, image paths). For every image
# the extractor output is computed and written next to the image file; the listing
# further down shows the result as <image name>.jpg.npy.
# .npy is NumPy's standard binary format for persisting a single array on disk.

for img, path in tqdm(image_dataset):
  # Run the whole batch through the feature extractor.
  batch_features = image_features_extract_model(img)
  # Flatten the two spatial axes into one: (batch, height, width, channels) -> (batch, height*width, channels).
  batch_features = tf.reshape(batch_features, (batch_features.shape[0], -1, batch_features.shape[3]))

  # Save one feature array per image, using the image path as the file name.
  for bf, p in zip(batch_features, path):
    path_of_feature = p.numpy().decode("utf-8")
    np.save(path_of_feature, bf.numpy())

# After this loop every unique selected image has a cached feature file. The progress
# bar counts batches, so the number of images processed is (up to) batches x 16.

In [None]:
# Example of the expected listing for one processed image:
#-rw-r--r-- 1 root root  183030 Jun 10 19:43 COCO_train2014_000000581582.jpg
#-rw-r--r-- 1 root root  524416 Jun 10 22:16 COCO_train2014_000000581582.jpg.npy
# Every image that went through the feature-extraction loop has a companion .npy file.
!ls -al /content/train2014

In [None]:
# Inspect one cached feature file. The images are sampled randomly, so a hard-coded
# file name is not guaranteed to exist; use the first image that was actually
# processed and show a readable summary instead of dumping raw binary with `cat`.
sample_features = np.load(encode_train[0] + '.npy')
print(encode_train[0] + '.npy', sample_features.shape, sample_features.dtype)

In [None]:
## Preprocess and tokenize the captions
# One dataset element per caption string in train_captions.
caption_dataset = tf.data.Dataset.from_tensor_slices(train_captions)

def standardize(inputs):
  """Lower-case the captions and remove punctuation.

  '<' and '>' are deliberately not in the removed set so that the '<start>'
  and '<end>' markers survive as tokens.

  Args:
    inputs: string tensor of captions.

  Returns:
    String tensor of the same shape with punctuation characters removed.
  """
  inputs = tf.strings.lower(inputs)
  # The punctuation characters must sit inside [...] to form a character class.
  # Without the brackets the pattern only matches that exact literal sequence
  # (and contains a mid-pattern '$' anchor), so nothing was ever stripped.
  return tf.strings.regex_replace(inputs, r"[!\"#$%&\(\)\*\+.,-/:;=?@\[\\\]^_`{|}~]", "")

# Max word count for a caption (every tokenized caption has exactly this length).
max_length = 50
# Use the top 7000 words for a vocabulary.
vocabulary_size = 7000

# Text vectorizer that applies `standardize` to each caption before tokenizing it.
tokenizer = tf.keras.layers.TextVectorization(max_tokens=vocabulary_size, standardize=standardize, output_sequence_length=max_length)

# Learn the vocabulary from the caption data.
tokenizer.adapt(caption_dataset)
# Create the tokenized vectors: one integer sequence per caption, in caption order.
cap_vector = caption_dataset.map(lambda x: tokenizer(x))



In [None]:
# Create mappings for words to indices and indices to words.
# Both lookups share the tokenizer's vocabulary; invert=True flips the direction.
word_to_index = tf.keras.layers.StringLookup(mask_token="", vocabulary=tokenizer.get_vocabulary())
index_to_word = tf.keras.layers.StringLookup(mask_token="", vocabulary=tokenizer.get_vocabulary(), invert=True)

In [None]:
## Split the data into training and testing
# img_name_vector (one image path per caption) and cap_vector (one token vector per
# caption) are aligned, so zipping them regroups the caption vectors by image.
img_to_cap_vector = collections.defaultdict(list)
for img, cap in zip(img_name_vector, cap_vector):
  img_to_cap_vector[img].append(cap)

# Split by IMAGE (not by caption) so all captions of an image land on the same side:
# shuffle the image keys and cut the list at 80%.
img_keys = list(img_to_cap_vector.keys())
random.shuffle(img_keys)
slice_index = int(len(img_keys)*0.8)
img_name_train_keys = img_keys[:slice_index]
img_name_val_keys = img_keys[slice_index:]

# training set: repeat each image path once per caption vector
img_name_train, cap_train = [], []
for imgt in img_name_train_keys:
  capt_len = len(img_to_cap_vector[imgt])
  img_name_train += [imgt] * capt_len
  cap_train += img_to_cap_vector[imgt]

# validation set: same flattening for the remaining images
img_name_val, cap_val = [], []
for imgv in img_name_val_keys:
  capv_len = len(img_to_cap_vector[imgv])
  img_name_val += [imgv] * capv_len
  cap_val += img_to_cap_vector[imgv]

# Displayed as the cell output: sizes of the four aligned lists.
len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)



In [None]:
## STORY SO FAR
## - A random subset of images was selected together with all of their captions.
## - For every selected image the InceptionV3 features were extracted and saved as a .npy file.
## - Every caption was converted to a fixed-length vector of token ids.
## - The images (with their captions) were split 80/20 into training and validation sets.
## NOTE(review): earlier comments quote 10K images / 50K captions; the actual counts depend
## on how many images are selected in the subset step — confirm against the printed sizes.

## Create a tf.data dataset for training
BATCH_SIZE = 64
BUFFER_SIZE = 1000
embedding_dim = 256
units = 512
# Floor division: a trailing partial batch is not counted here.
num_steps = len(img_name_train) // BATCH_SIZE
# Shape of the vector extracted from InceptionV3 is (64, 2048)
# These two variables represent that vector shape
features_shape = 2048
attention_features_shape = 64

# Load the numpy files
def map_func(img_name, cap):
  """Load the cached feature array that belongs to an image.

  Args:
    img_name: image path as UTF-8 encoded bytes.
    cap: caption vector for that image; passed through untouched.

  Returns:
    A tuple (features, cap) where features is the array stored in
    '<image path>.npy'.
  """
  feature_file = img_name.decode('utf-8') + '.npy'
  return np.load(feature_file), cap

# Training input pipeline:
#   1. one element per (image path, caption vector) pair,
#   2. map_func (run through tf.numpy_function, in parallel) swaps the path for the cached features,
#   3. shuffle, batch and prefetch.
dataset = (
    tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))
    .map(lambda item1, item2: tf.numpy_function(map_func, [item1, item2], [tf.float32, tf.int64]),
         num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
class BahdanauAttention(tf.keras.Model):
  """Additive attention over the encoded image locations.

  Scores every image location against the decoder hidden state and returns
  the attention-weighted sum of the image features.
  """

  def __init__(self, units):
    """Create the three dense layers; `units` is the width of the scoring layer."""
    super().__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, features, hidden):
    """Compute the context vector and the attention weights.

    Args:
      features: CNN_Encoder output, shape (batch_size, 64, embedding_dim).
      hidden: decoder hidden state, shape (batch_size, hidden_size).

    Returns:
      (context_vector, attention_weights): context_vector is `features` summed
      over the location axis after weighting; attention_weights has shape
      (batch_size, 64, 1).
    """
    # Add a length-1 axis so the hidden state broadcasts over the 64 locations.
    query = tf.expand_dims(hidden, 1)
    # (batch_size, 64, units)
    energy = tf.nn.tanh(self.W1(features) + self.W2(query))
    # One unnormalized score per location, normalized with a softmax over the location axis.
    attention_weights = tf.nn.softmax(self.V(energy), axis=1)
    # Weighted sum over locations removes axis 1.
    context_vector = tf.reduce_sum(attention_weights * features, axis=1)
    return context_vector, attention_weights

class CNN_Encoder(tf.keras.Model):
  """Projects the pre-extracted image features to the embedding size.

  The convolutional features were already computed and cached, so this
  encoder only applies a fully connected layer followed by a ReLU.
  """

  def __init__(self, embedding_dim):
    """Create the projection layer with `embedding_dim` output units."""
    super().__init__()
    # Output shape after fc == (batch_size, 64, embedding_dim)
    self.fc = tf.keras.layers.Dense(embedding_dim)

  def call(self, x):
    """Return relu(fc(x))."""
    return tf.nn.relu(self.fc(x))

class RNN_Decoder(tf.keras.Model):
  """GRU decoder that predicts the next caption token with visual attention."""

  def __init__(self, embedding_dim, units, vocab_size):
    """Build the decoder layers.

    Args:
      embedding_dim: size of the token embedding.
      units: size of the GRU state (also used for the attention and fc1 layers).
      vocab_size: number of tokens; size of the output logits.
    """
    super(RNN_Decoder, self).__init__()
    self.units = units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    self.fc1 = tf.keras.layers.Dense(self.units)
    self.fc2 = tf.keras.layers.Dense(vocab_size)
    self.attention = BahdanauAttention(self.units)

  def call(self, x, features, hidden):
    """Run one decoding step.

    Args:
      x: ids of the previous tokens, shape (batch_size, 1).
      features: encoder output for the images.
      hidden: decoder state from the previous step, shape (batch_size, units).

    Returns:
      (logits, state, attention_weights): logits over the vocabulary for the
      next token, the new GRU state, and the attention weights used.
    """
    # Attention is a separate model; it looks at the image through the previous state.
    context_vector, attention_weights = self.attention(features, hidden)
    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)
    # Prepend the context vector on the feature axis: (batch_size, 1, context size + embedding_dim)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    # Feed the previous state into the GRU. Without initial_state the GRU starts
    # from zeros at every step and the decoder cannot remember earlier tokens.
    output, state = self.gru(x, initial_state=hidden)
    # (batch_size, 1, units)
    x = self.fc1(output)
    # Merge the batch and time axes: (batch_size * 1, units)
    x = tf.reshape(x, (-1, x.shape[2]))
    # Logits: (batch_size * 1, vocab_size)
    x = self.fc2(x)
    return x, state, attention_weights

  def reset_state(self, batch_size):
    """Return an all-zero initial state of shape (batch_size, units)."""
    return tf.zeros((batch_size, self.units))

In [None]:
# Model, optimizer, loss and metric objects used by the training code.
encoder = CNN_Encoder(embedding_dim)
decoder = RNN_Decoder(embedding_dim, units, tokenizer.vocabulary_size())
# tf.keras.optimizers is a module, not a callable (calling it raises
# "TypeError: 'module' object is not callable"); instantiate a concrete optimizer instead.
#optimizer = tf.keras.optimizers()
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so padding can be masked out afterwards.
train_loss = tf.keras.losses.SparseCategoricalCrossentropy(name='train_loss', from_logits=True, reduction='none')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
test_loss = tf.keras.losses.SparseCategoricalCrossentropy(name='test_loss', from_logits=True, reduction='none')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

def train_loss_function(real, pred):
  """Mean training cross-entropy with padded positions zeroed out.

  Args:
    real: target token ids; id 0 marks padding.
    pred: predicted logits for those targets.

  Returns:
    Scalar: mean over all positions of the per-example loss, where padded
    positions contribute 0.
  """
  per_example = train_loss(real, pred)
  not_padding = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)
  return tf.reduce_mean(per_example * not_padding)

def train_accuracy_function(real, pred):
  """Masked training accuracy for one decoding step.

  The previous version called the metric object, which returns the running
  accuracy over every call so far as a single scalar, and multiplied that
  scalar by the padding mask. The result was not the accuracy of this batch.
  The accuracy is now computed per example, using the same masking convention
  as train_loss_function (padded positions contribute 0 to the mean).

  Args:
    real: target token ids; id 0 marks padding.
    pred: predicted logits for those targets, vocabulary on the last axis.

  Returns:
    Scalar float32 tensor: mean over all positions of (prediction correct AND
    position not padded).
  """
  mask = tf.cast(tf.math.not_equal(real, 0), tf.float32)
  predicted_ids = tf.argmax(pred, axis=-1, output_type=tf.int64)
  hits = tf.cast(tf.math.equal(tf.cast(real, tf.int64), predicted_ids), tf.float32)
  # Keep the metric object's running state updated (as before), ignoring padding.
  train_accuracy.update_state(real, pred, sample_weight=mask)
  return tf.reduce_mean(hits * mask)

def test_loss_function(real, pred):
  """Mean evaluation cross-entropy with padded positions zeroed out.

  Args:
    real: target token ids; id 0 marks padding.
    pred: predicted logits for those targets.

  Returns:
    Scalar: mean over all positions of the per-example loss, where padded
    positions contribute 0.
  """
  per_example = test_loss(real, pred)
  not_padding = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)
  return tf.reduce_mean(per_example * not_padding)

def test_accuracy_function(real, pred):
  """Masked evaluation accuracy for one decoding step.

  The previous version called the metric object, which returns the running
  accuracy over every call so far as a single scalar, and multiplied that
  scalar by the padding mask. The result was not the accuracy of this batch.
  The accuracy is now computed per example, using the same masking convention
  as test_loss_function (padded positions contribute 0 to the mean).

  Args:
    real: target token ids; id 0 marks padding.
    pred: predicted logits for those targets, vocabulary on the last axis.

  Returns:
    Scalar float32 tensor: mean over all positions of (prediction correct AND
    position not padded).
  """
  mask = tf.cast(tf.math.not_equal(real, 0), tf.float32)
  predicted_ids = tf.argmax(pred, axis=-1, output_type=tf.int64)
  hits = tf.cast(tf.math.equal(tf.cast(real, tf.int64), predicted_ids), tf.float32)
  # Keep the metric object's running state updated (as before), ignoring padding.
  test_accuracy.update_state(real, pred, sample_weight=mask)
  return tf.reduce_mean(hits * mask)

## Checkpoint
# Track encoder, decoder and optimizer in one checkpoint; keep at most the 5 newest.
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
start_epoch = 0

if ckpt_manager.latest_checkpoint:
  # The number after the last '-' in the checkpoint name is used as the epoch to resume from.
  # NOTE(review): this is only correct if that number equals the number of completed
  # epochs — verify against how the training loop calls ckpt_manager.save().
  start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
  # restoring the latest checkpoint in checkpoint_path
  ckpt.restore(ckpt_manager.latest_checkpoint)


In [None]:
# These lists live outside the training-loop cell on purpose: re-running the
# training cell then appends to them instead of resetting the history.
loss_plot = []
accuracy_plot = []

@tf.function
def train_step(img_tensor, target):
  """Run one optimization step on a batch using teacher forcing.

  Args:
    img_tensor: batch of cached image features.
    target: batch of tokenized captions, shape (batch_size, sequence_length).

  Returns:
    (loss, total_loss, accuracy, total_accuracy): the loss and accuracy summed
    over the decoding steps, each followed by the same value divided by the
    sequence length.
  """
  batch_size = target.shape[0]
  seq_len = target.shape[1]
  loss = 0
  accuracy = 0

  # The hidden state is re-initialized for every batch because captions of
  # different images are unrelated.
  hidden = decoder.reset_state(batch_size=batch_size)
  dec_input = tf.expand_dims([word_to_index('<start>')] * batch_size, 1)

  with tf.GradientTape() as tape:
    features = encoder(img_tensor)

    # Position 0 is the first decoder input; predictions start at position 1.
    for step in range(1, seq_len):
      expected = target[:, step]
      predictions, hidden, _ = decoder(dec_input, features, hidden)
      loss += train_loss_function(expected, predictions)
      accuracy += train_accuracy_function(expected, predictions)
      # Teacher forcing: the true token, not the prediction, is the next input.
      dec_input = tf.expand_dims(expected, 1)

  total_loss = (loss / int(seq_len))
  total_accuracy = (accuracy / int(seq_len))

  # Gradients are taken on the summed loss, over encoder and decoder weights together.
  trainable_variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, trainable_variables)
  optimizer.apply_gradients(zip(gradients, trainable_variables))
  return loss, total_loss, accuracy, total_accuracy

In [None]:
# Show the dataset object (element structure/types) as a quick sanity check.
print(dataset)
#print(tf.shape(dataset))

In [None]:
EPOCHS = 100
for epoch in range(start_epoch, EPOCHS):
  start = time.time()
  total_loss = 0
  total_accuracy = 0
  # Count the batches that were really processed: the dataset also yields a final
  # partial batch, which the floor-based num_steps does not account for.
  batches_seen = 0

  for (batch, (img_tensor, target)) in enumerate(dataset):
    batch_loss, t_loss, batch_accuracy, t_accuracy = train_step(img_tensor, target)
    total_loss += t_loss
    total_accuracy += t_accuracy
    batches_seen += 1

    if batch % 100 == 0:
      # batch_loss / batch_accuracy are sums over the decoding steps; divide by the sequence length.
      average_batch_loss = batch_loss.numpy()/int(target.shape[1])
      average_batch_accuracy = batch_accuracy.numpy()/int(target.shape[1])
      print(f'Epoch {epoch+1} Batch {batch} Loss {average_batch_loss:.4f}')
      print(f'Epoch {epoch+1} Batch {batch} Accuracy {average_batch_accuracy:.4f}')

  # Average over the batches seen in this epoch (guarding against an empty dataset).
  steps_this_epoch = max(batches_seen, 1)
  epoch_loss = total_loss / steps_this_epoch
  epoch_accuracy = total_accuracy / steps_this_epoch

  # storing the epoch end values to plot later
  loss_plot.append(epoch_loss)
  accuracy_plot.append(epoch_accuracy)

  if epoch % 5 == 0:
    # Number the checkpoint with the count of completed epochs. The resume logic
    # parses this number as start_epoch; the default numbering counts saves, which
    # would restart training from the wrong epoch.
    ckpt_manager.save(checkpoint_number=epoch + 1)

  print(f'Epoch {epoch+1} Loss {epoch_loss:.6f}')
  print(f'Epoch {epoch+1} Accuracy {epoch_accuracy:.6f}')
  print(f'Time taken for 1 epoch {time.time()-start:.2f} sec\n')

# One figure per recorded history: per-epoch loss first, then per-epoch accuracy.
for series, y_label, plot_title in (
    (loss_plot, 'Loss', 'Loss Plot'),
    (accuracy_plot, 'Accuracy', 'Accuracy Plot'),
):
  plt.plot(series)
  plt.xlabel('Epochs')
  plt.ylabel(y_label)
  plt.title(plot_title)
  plt.show()



In [None]:
## Caption!
def evaluate(image):
  """Generate a caption for one image by sampling from the decoder.

  Args:
    image: path of the image file to caption.

  Returns:
    (result, attention_plot): result is the list of generated words (ending
    with '<end>' if it was produced within max_length steps); attention_plot
    holds one row of attention weights per generated word.
  """
  attention_plot = np.zeros((max_length, attention_features_shape))
  hidden = decoder.reset_state(batch_size=1)

  # Preprocess the image and extract its features exactly as during caching.
  temp_input = tf.expand_dims(load_image(image)[0], 0)
  img_tensor_val = image_features_extract_model(temp_input)
  img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))
  features = encoder(img_tensor_val)

  dec_input = tf.expand_dims([word_to_index('<start>')], 0)
  result = []

  for i in range(max_length):
    predictions, hidden, attention_weights = decoder(dec_input, features, hidden)
    attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()
    # Sample the next token from the predicted distribution (not argmax).
    predicted_id = tf.random.categorical(predictions, 1)[0][0].numpy()
    predicted_word = tf.compat.as_text(index_to_word(predicted_id).numpy())
    result.append(predicted_word)

    if predicted_word == '<end>':
      break
    # The sampled token is the input of the next step.
    dec_input = tf.expand_dims([predicted_id], 0)

  # Always return only the rows that were filled. Previously the early '<end>'
  # exit returned the full max_length matrix while the fall-through trimmed it.
  return result, attention_plot[:len(result), :]

def plot_attention(image, result, attention_plot):
  """Show, for every generated word, the attention map laid over the image.

  Args:
    image: path of the image file.
    result: list of generated words.
    attention_plot: one row of attention weights per word in `result`.
  """
  temp_image = np.array(Image.open(image))
  fig = plt.figure(figsize=(10, 10))
  len_result = len(result)
  # Square grid with at least 2 cells per side; the size does not change between words.
  grid_size = max(int(np.ceil(len_result/2)), 2)

  for idx, word in enumerate(result):
    # The attention weights of one word, viewed as an 8x8 map.
    heatmap = np.resize(attention_plot[idx], (8, 8))
    ax = fig.add_subplot(grid_size, grid_size, idx+1)
    ax.set_title(word)
    shown = ax.imshow(temp_image)
    # Stretch the map over the same extent as the image and blend it in.
    ax.imshow(heatmap, cmap='gray', alpha=0.6, extent=shown.get_extent())

  plt.tight_layout()
  plt.show()

In [None]:
# captions on the validation set
# Pick one random (image, caption) pair from the validation lists.
rid = np.random.randint(0, len(img_name_val))
image = img_name_val[rid]
# Turn the token ids back into words, skipping id 0 (padding).
real_caption = ' '.join([tf.compat.as_text(index_to_word(i).numpy()) for i in cap_val[rid] if i not in [0]])
result, attention_plot = evaluate(image)
print('Real Caption:', real_caption)
print('Prediction Caption:', ' '.join(result))
plot_attention(image, result, attention_plot)

# Last expression of the cell: displays the image itself.
Image.open(image)


In [None]:
# Caption an image that is not part of the dataset.
image_url = 'https://tensorflow.org/images/surf.jpg'
# Last four characters of the URL, i.e. '.jpg' for this URL.
image_extension = image_url[-4:]
# Download the image and keep the local file path.
image_path = tf.keras.utils.get_file('image'+image_extension, origin=image_url)
result, attention_plot = evaluate(image_path)
print('Prediction Caption:', ' '.join(result))
plot_attention(image_path, result, attention_plot)

# opening the image
Image.open(image_path)

In [None]:
### GENERATING SYNTHETIC IMAGES USING TEXT
## train_image_paths - contains the paths of the images selected for training
# Mount Google Drive so the exported CSV outlives the Colab session.
drive.mount('/content/drive', force_remount=True)
# Keep only the FIRST caption of every selected image.
captions_list = []
for image_path in train_image_paths:
  caption = image_path_to_caption[image_path][0]
  captions_list.append(caption)

# One row per selected image: its path and its first caption.
Images_Captions_df = pd.DataFrame({'ORIGINAL IMAGES': train_image_paths, 'CAPTIONS': captions_list})
Images_Captions_df.to_csv('/content/drive/My Drive/Colab Notebooks/_IMAGE CAPTIONING/IMAGES_CAPTIONS_DATA.csv')


### Use the "10 Generate Synthetic Images.ipynb" notebook to generate the synthetic images.

In [None]:
## Base Model
#@title Base Model
# NOTE(review): model_and_diffusion_defaults, model_and_diffusion_defaults_upsampler,
# create_model_and_diffusion, create_clip_model, load_checkpoint, has_cuda, device and th
# are not defined or imported anywhere in the visible part of this notebook — presumably
# they come from the separate synthetic-image notebook; confirm before running this cell.
options = model_and_diffusion_defaults()
options['use_fp16'] = has_cuda
options['timestep_respacing'] = '100'
model,diffusion = create_model_and_diffusion(**options)
model.eval()

if has_cuda:
  model.convert_to_fp16 ( )

model.to(device)
model.load_state_dict(load_checkpoint('base',device))
print('total base parameters', sum(x.numel() for x in model.parameters()))

## Upsampler Model
# Same construction steps as the base model, with the upsampler defaults and checkpoint.
options_up = model_and_diffusion_defaults_upsampler()
options_up['use_fp16'] = has_cuda
options_up['timestep_respacing'] = 'fast27'
model_up,diffusion_up = create_model_and_diffusion(**options_up)
model_up.eval()

if has_cuda :
  model_up.convert_to_fp16()

model_up.to(device)
model_up.load_state_dict(load_checkpoint('upsample', device))
print('total upsampler parameters', sum(x.numel() for x in model_up.parameters()))

## CLIP Model
# Image and text encoder weights are loaded from separate checkpoints.
clip_model = create_clip_model(device = device)
clip_model.image_encoder.load_state_dict(load_checkpoint('clip/image-enc', device))
clip_model.text_encoder.load_state_dict(load_checkpoint('clip/text-enc', device))

def images(batch:th.Tensor):
  """Convert a batch of generated samples into one uint8 image array.

  Args:
    batch: tensor indexed as (sample, channel, height, width) with values
      nominally in [-1, 1]; values outside that range are clamped.

  Returns:
    numpy uint8 array of shape (height, samples * width, 3) with the samples
    placed side by side.
  """
  # Map [-1, 1] to [0, 255], round and clamp, then move to CPU as bytes.
  scaled = ((batch+1)*127.5).round().clamp(0,255).to(th.uint8).cpu()
  # (sample, channel, height, width) -> (height, sample, width, channel), then merge sample and width.
  reshaped = scaled.permute(2,0,3,1).reshape([batch.shape[2],-1,3])
  # Copy straight into a numpy array; the former detour through a PIL image
  # produced the same array at the cost of an extra conversion.
  return np.array(reshaped.numpy())

## Prompt
# Sampling parameters
# Number of images generated per prompt.
batch_size = 1
# Scale passed to the CLIP guidance function.
guidance_scale = 3.0
# Tune this parameter to control the sharpness of 256x256 images.
# A value of 1.0 is sharper, but sometimes results in grainy artifacts.
upsample_temp = 0.997



In [None]:
## Base Model Sample
# Generate one synthetic image per caption row. Iterate over the rows that actually
# exist: a hard-coded range(0, 10000) raises a KeyError as soon as the DataFrame has
# fewer rows than that (it has one row per selected training image).
for i in range(len(Images_Captions_df)):
  # .iloc selects by position, so this also works if the index is not 0..n-1.
  prompt = str(Images_Captions_df['CAPTIONS'].iloc[i])
  # NOTE(review): captions exported by this notebook still contain the '<start>' / '<end>'
  # markers — confirm whether they should be stripped before being used as a prompt.

  # Create the text tokens to feed to the base model.
  tokens = model.tokenizer.encode(prompt)
  tokens, mask = model.tokenizer.padded_tokens_and_mask(tokens, options['text_ctx'])

  # Pack the tokens together into model kwargs.
  model_kwargs = dict(
      tokens = th.tensor([tokens]*batch_size, device = device),
      mask = th.tensor([mask]*batch_size, dtype = th.bool, device = device),
  )
  # Setup guidance function for CLIP model.
  cond_fn = clip_model.cond_fn([prompt]*batch_size, guidance_scale)

  # Sample from the base model.
  model.del_cache()
  samples = diffusion.p_sample_loop(model, (batch_size, 3, options["image_size"], options["image_size"]),
                                    device = device, clip_denoised = True, progress = True, model_kwargs = model_kwargs, cond_fn = cond_fn,)
  model.del_cache()

  # Tokenize the prompt again, this time for the upsampler model.
  tokens = model_up.tokenizer.encode(prompt)
  tokens, mask = model_up.tokenizer.padded_tokens_and_mask(tokens, options_up['text_ctx'])

  # Create the model conditioning dict.
  model_kwargs = dict(
      # Low-res image to upsample, quantized to the 256 representable pixel levels.
      low_res = ((samples+1)*127.5).round()/127.5 - 1,
      # Text tokens
      tokens = th.tensor([tokens]*batch_size, device = device),
      mask = th.tensor([mask]*batch_size, dtype = th.bool, device = device))

  # Sample from the upsampler model, starting from noise scaled by upsample_temp.
  model_up.del_cache()
  up_shape = (batch_size, 3, options_up["image_size"], options_up["image_size"])
  up_samples = diffusion_up.ddim_sample_loop(
      model_up, up_shape,
      noise = th.randn(up_shape, device = device)*upsample_temp,
      device = device, clip_denoised = True, progress = True,
      model_kwargs = model_kwargs, cond_fn = None)[:batch_size]
  model_up.del_cache()

  # Save the output, named after the row position of its caption.
  image_array = images(up_samples)
  image = Image.fromarray(image_array)
  image.save('/content/drive/My Drive/Colab Notebooks/_IMAGE CAPTIONING/SYNTHETIC_IMAGES/'+str(i)+'.jpg')

