# Link to the other notebook :
## Preliminary coding is finished; the model just needs to be trained now
https://colab.research.google.com/drive/148yiVLIHWIv9r9GNDLALyG-9EgL7aUFT?usp=sharing

(Shift to the above notebook)  
(Don't use the current one)

## Image GPT
 * Paper Link : https://cdn.openai.com/papers/Generative_Pretraining_from_Pixels_V2.pdf

 * Here, we will try to improve the transformer so that it works satisfactorily at lower model complexity (fewer parameters)

 * Even the smallest model iGPT-S has 76M parameters, so we can try bringing it down to say 30M or less.

 * Datasets used will be mainly :
  - Tiny ImageNet
  - CIFAR 10 & CIFAR 100

 * The model's code is present below but utils.py and run.py are yet to be covered.

 * This is just a preliminary sketch, please continue further

### Thanks for your cooperation

## PS :
GPT - 2 Paper : https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf  
GPT Paper : https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf  

<hr />

Also have a look at this paper : https://arxiv.org/pdf/1802.05751.pdf

### The Transformer Architecture  

<img src="https://miro.medium.com/max/1838/1*BHzGVskWGS_3jEcYYi6miQ.png" width=500 />

## The GPT-2 Decoder Only Architecture  
<img src="https://i.stack.imgur.com/Kb8Gq.png" height=500 />

## Attention :  
<img src="https://ryanong.co.uk/wp-content/uploads/2020/01/Attention-formula.png" width=500 />  
<br />

Q = Query  
K = Key  
V = Value  

d_k = Dimension of the key vectors (per attention head)  

Transformer Architecture details : [Ari Seff's Video](https://www.youtube.com/watch?v=XSSTuhyAmnI&list=PLcwMiqcoGw1WiEobtKcPOU7A4iePwOZH_&index=17&t=421s)

In [None]:
%tensorflow_version 2.x

In [None]:
%%writefile model.py

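# NOTE: tensorflow.contrib (which provides HParams, imported below) is deprecated and
# is not available in TensorFlow 2.x. An alternative still has to be found.

# Update : A possible temporary replacement is the HParams API described at this link :
# https://www.tensorflow.org/tensorboard/hyperparameter_tuning_with_hparams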

import numpy as np
import tensorflow as tf
from tensorflow.contrib.training import HParams

# Default Hyperparameters
def default_hparams():
    """Return the baseline hyperparameters wrapped in an ``HParams`` object.

    The defaults describe a 12-layer, 12-head model with 768-wide embeddings
    and 1024 context positions. ``n_vocab`` is 0 here, presumably as a
    placeholder that the caller overrides.
    """
    defaults = dict(
        n_vocab=0,
        n_ctx=1024,
        n_embd=768,
        n_head=12,
        n_layer=12,
    )
    return HParams(**defaults)

def shape_list(x):
    """Return the shape of ``x`` as a list, preferring statically known sizes.

    Dimensions known at graph-construction time come back as plain Python
    values; every dimension that is statically ``None`` is replaced by the
    matching element of the dynamic ``tf.shape(x)`` tensor.
    """
    known_dims = x.shape.as_list()
    runtime_dims = tf.shape(x)
    dims = []
    for axis, size in enumerate(known_dims):
        dims.append(runtime_dims[axis] if size is None else size)
    return dims

def softmax(x, axis=-1):
    """Compute the softmax of ``x`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating; this does
    not change the result but keeps the inputs to ``tf.exp`` non-positive.
    """
    shifted = x - tf.reduce_max(x, axis=axis, keepdims=True)
    exponentials = tf.exp(shifted)
    normalizer = tf.reduce_sum(exponentials, axis=axis, keepdims=True)
    return exponentials / normalizer

def gelu(x):
  """Gaussian Error Linear Unit, tanh-based approximation.

  Computes 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))).

  Paper Link : https://arxiv.org/pdf/1606.08415v3.pdf
  https://medium.com/@shoray.goel/gelu-gaussian-error-linear-unit-4ec59fb2e47c
  """
  cubic_term = x + 0.044715*tf.pow(x, 3)
  tanh_arg = np.sqrt(2/np.pi)*cubic_term
  return 0.5*x*(1 + tf.tanh(tanh_arg))

def gelu2(x):
  """Sigmoid-based approximation of GELU: x * sigmoid(1.702 * x).

  Link : https://paperswithcode.com/method/gelu
  """
  gate = tf.sigmoid(1.702 * x)
  return x * gate

def norm(x, scope, *, axis=-1, epsilon=1e-5):
  """Scale ``x`` to unit root-mean-square along ``axis`` and apply a learned gain.

  Note that, unlike a standard layer norm, the mean is NOT subtracted: the
  input is only divided by sqrt(mean(x^2) + epsilon). The "diagonal affine
  transform" is the element-wise multiplication by the gain vector ``g``;
  there is no bias term.

  Args:
    x: input tensor; the size of ``axis`` must be statically known because it
      is used as the shape of the gain variable.
    scope: variable-scope name under which the gain variable ``g`` lives.
    axis: axis to normalize over (keyword-only, default: last axis).
    epsilon: small constant added to the mean square for numerical stability
      (keyword-only).

  Returns:
    A tensor of the same shape as ``x``.
  """
  with tf.compat.v1.variable_scope(scope):
    # as_list() returns plain ints whether shape entries are Dimension
    # objects (TF1 behaviour) or ints (TF2 behaviour); `.value` only exists
    # on the former.
    n_state = x.shape.as_list()[axis]

    # get_variable returns the existing variable 'g' in this scope if there
    # is one, otherwise it creates it (initialised to all ones).
    # https://www.tensorflow.org/api_docs/python/tf/compat/v1/get_variable
    g = tf.compat.v1.get_variable('g', [n_state], initializer=tf.constant_initializer(1))

    s = tf.reduce_mean(tf.square(x), axis=axis, keepdims=True)  # mean square
    x = x*tf.math.rsqrt(s + epsilon)   # element-wise x / sqrt(mean square + eps)
    x = x*g                            # learned per-feature gain
    return x

def split_states(x, n):
  """Split the last axis of ``x`` (size m) into two axes of sizes n and m // n."""
  *leading, width = shape_list(x)
  target_shape = leading + [n, width//n]
  return tf.reshape(x, target_shape)

def merge_states(x):
  """Collapse the last two axes of ``x`` into a single axis (inverse of split_states)."""
  *leading, rows, cols = shape_list(x)
  merged = rows*cols
  return tf.reshape(x, leading + [merged])

# scope is a string, tf.compat.v1.variable_scope(scope) will automatically concat this string
# to all the variable names in that block to prevent name overlap
def conv1d(x, scope, nf, *, w_init_stdev=0.02):
  """Apply a learned linear map (without bias) to the last axis of ``x``.

  All leading axes are flattened, the result is multiplied by a weight matrix
  of shape [nx, nf] (nx = size of the last axis of ``x``) and the leading axes
  are restored, so the output has shape [..., nf].

  Args:
    x: input tensor.
    scope: variable-scope name; it prefixes the weight variable 'w' so that
      different layers do not clash.
    nf: size of the output's last axis.
    w_init_stdev: standard deviation of the normal weight initializer
      (keyword-only).
  """
  with tf.compat.v1.variable_scope(scope):
    *leading, nx = shape_list(x)

    weight_init = tf.random_normal_initializer(stddev=w_init_stdev)
    w = tf.compat.v1.get_variable('w', [nx, nf], initializer=weight_init)

    flat_x = tf.reshape(x, [-1, nx])
    flat_w = tf.reshape(w, [-1, nf])
    projected = tf.matmul(flat_x, flat_w)
    return tf.reshape(projected, leading + [nf])

def attention_mask(nd, ns, *, dtype):
  """Build an [nd, ns] mask with 1's in the lower triangle, counted from the lower right corner.

  Entry (i, j) is 1 when i >= j - ns + nd and 0 otherwise.

  Per the original note, this is the same as
  tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce
  garbage on TPUs.

  Example : attention_mask(2, 4, dtype=tf.uint8) returns [[1 1 1 0]
                                                          [1 1 1 1]]
  """
  rows = tf.range(nd)[:, None]
  cols = tf.range(ns)
  shifted_cols = cols - ns + nd
  allowed = rows >= shifted_cols
  return tf.cast(allowed, dtype)


def attn(x, scope, n_state, *, past, hparams):
  """Multi-head self-attention; returns the attention output and the present keys/values.

  Args:
    x: rank-3 tensor [batch, sequence, features].
    scope: variable-scope name for the projection weights.
    n_state: size of the last axis of every projection weight. The einsum
      contractions below pair it with the feature axis of ``x``, so it has to
      equal the feature width of ``x`` (``block`` passes exactly that). It
      must be divisible by ``hparams.n_head``.
    past: None, or a rank-5 tensor [batch, 2, heads, sequence, features]
      (2 = [k, v]) of cached keys/values that are prepended along the
      sequence axis (keyword-only).
    hparams: hyperparameters; ``n_head``, ``n_layer`` and ``bert`` are read
      here (keyword-only).

  Returns:
    (a, present): ``a`` is the attention output [batch, sequence, n_state];
    ``present`` stacks this call's keys and values along axis 1, giving
    [batch, 2, heads, sequence, features // heads].
  """
  assert x.shape.ndims == 3   # [batch, sequence, features]
  assert n_state % hparams.n_head == 0

  if past is not None:
    assert past.shape.ndims == 5  # [batch, 2, heads, sequence, features], where 2 is [k, v]

  def mask_attn_weights(w):
    # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
    _, _, nd, ns = shape_list(w)
    b = attention_mask(nd, ns, dtype=w.dtype)
    b = tf.reshape(b, [1, 1, nd, ns])
    # Keep allowed scores, push disallowed ones to -1e10 so softmax ignores them
    w = w*b - tf.cast(1e10, w.dtype)*(1-b)
    return w

  def multihead_attn(q, k, v):
    # q, k, v have shape [batch, heads, sequence, features]
    w = tf.matmul(q, k, transpose_b=True)   # QK^T

    # as_list() yields a plain int under both TF1 and TF2 shape semantics,
    # whereas `.value` only exists on TF1 Dimension objects.
    head_width = v.shape.as_list()[-1]
    w = w * tf.math.rsqrt(tf.cast(head_width, w.dtype))  # QK^T/sqrt(d_k)

    if not hparams.bert:
      w = mask_attn_weights(w)   # autoregressive mode: no attending to later positions
    w = softmax(w)
    return tf.matmul(w, v)

  with tf.compat.v1.variable_scope(scope):
    *start, nx = shape_list(x)

    def projection(name, stddev):
      # One weight of shape [heads, features per head, n_state]
      return tf.compat.v1.get_variable(
          name, [hparams.n_head, nx // hparams.n_head, n_state],
          initializer=tf.random_normal_initializer(stddev=stddev))

    # The weight matrices for K, Q, V (created in this order)
    wk = projection("k_proj", 1.0/np.sqrt(n_state))
    wq = projection("q_proj", 1.0/np.sqrt(n_state))
    wv = projection("v_proj", 1.0/np.sqrt(n_state))

    # Einstein summation : https://www.tensorflow.org/api_docs/python/tf/einsum
    # (matrix multiplication in index notation, e.g. C_ik = sum_j A_ij*B_jk is "ij,jk->ik")
    # b = batch, s = sequence, f = feature, h = head, e = per-head embedding
    k = tf.einsum("bsf,hef->bhse", x, wk)
    q = tf.einsum("bsf,hef->bhse", x, wq)
    v = tf.einsum("bsf,hef->bhse", x, wv)

    # Only the keys/values computed in this call are returned as `present`
    present = tf.stack([k, v], axis=1)

    if past is not None:
      pk, pv = tf.unstack(past, axis=1)
      k = tf.concat([pk, k], axis=-2)
      v = tf.concat([pv, v], axis=-2)

    a = multihead_attn(q, k, v)   # Calculation of attention

    # Output projection merging the heads back into one feature axis;
    # its initializer is additionally scaled down by the number of layers.
    wc = projection("c_proj", 1.0/np.sqrt(n_state*hparams.n_layer))

    a = tf.einsum("bhse,hef->bsf", a, wc)
    return a, present

def mlp(x, scope, n_state, *, hparams):
  """The feed forward layer inside a block.

  Projects the last axis of ``x`` to width ``n_state``, applies ``gelu2`` and
  projects back to the original width.

  Args:
    x: input tensor whose last axis has a statically known size.
    scope: variable-scope name for the two projections ('c_fc', 'c_proj').
    n_state: width of the hidden layer.
    hparams: accepted for a uniform signature; not read here (keyword-only).

  Returns:
    A tensor with the same last-axis size as ``x``.
  """
  with tf.compat.v1.variable_scope(scope):
    # as_list() gives a plain int under both TF1 and TF2 shape semantics
    # (`.value` only exists on TF1 Dimension objects).
    nx = x.shape.as_list()[-1]
    h = gelu2(conv1d(x, 'c_fc', n_state))
    h2 = conv1d(h, 'c_proj', nx)
    return h2

# One block of decoder. See the figure above
def block(x, scope, *, past, hparams):
  """One decoder block: normalized attention and feed-forward, each with a residual add.

  Args:
    x: input tensor [batch, sequence, features]; the feature size must be
      statically known.
    scope: variable-scope name for this block.
    past: cached keys/values for this layer, or None (keyword-only).
    hparams: hyperparameters forwarded to ``attn`` and ``mlp`` (keyword-only).

  Returns:
    (x, present): the block output (same shape as the input) and the
    keys/values produced by the attention layer.
  """
  with tf.compat.v1.variable_scope(scope):
    # as_list() gives a plain int under both TF1 and TF2 shape semantics
    # (`.value` only exists on TF1 Dimension objects).
    nx = x.shape.as_list()[-1]

    # Attention on the normalized input, added back to the un-normalized
    # input (residual connection)
    a, present = attn(norm(x, 'ln_1'), 'attn', nx, past=past, hparams=hparams)
    x = x+a

    # Feed forward layer (hidden width 4*nx) along with residual connection
    m = mlp(norm(x, 'ln_2'), 'mlp', nx*4, hparams=hparams)
    x = x+m

    return x, present

def past_shape(*, hparams, batch_size=None, sequence=None):
  """Return the shape of the key/value cache as a list.

  Layout: [batch, layer, 2 (keys and values), head, sequence, features per head],
  where features per head is n_embd // n_head.
  """
  per_head_width = hparams.n_embd // hparams.n_head
  shape = [batch_size, hparams.n_layer]
  shape += [2, hparams.n_head, sequence, per_head_width]
  return shape

def expand_tile(value, size):
  """Prepend a new leading axis of length ``size`` by repeating ``value`` along it.

  tf.tile documentation : https://www.tensorflow.org/api_docs/python/tf/tile
  """
  tensor = tf.convert_to_tensor(value, name='value')
  rank = tensor.shape.ndims
  with_new_axis = tf.expand_dims(tensor, axis=0)
  multiples = [size] + [1]*rank   # repeat only along the new axis
  return tf.tile(with_new_axis, multiples)

def positions_for(tokens, past_length):
  """Return position indices of shape [batch, steps] for ``tokens``.

  Every row holds past_length, past_length + 1, ..., one value per step
  (second axis of ``tokens``).
  """
  n_rows = tf.shape(tokens)[0]
  n_steps = tf.shape(tokens)[1]
  step_positions = past_length + tf.range(n_steps)
  return expand_tile(step_positions, n_rows)

def model(hparams, X, Y=None, past=None, scope='model', reuse=False):
  """Build the transformer graph with its generative and classification outputs.

  Args:
    hparams: hyperparameters. Read here: n_ctx, n_embd, n_vocab, n_layer,
      bert, bert_mask_prob, clf (the blocks read further fields).
    X: integer token ids of shape [batch, sequence].
    Y: class labels of shape [batch, classes], presumably one-hot (they are
      compared through argmax). When None, the classification head is skipped
      and only the generative entries are returned.
    past: optional key/value cache laid out as in ``past_shape``; it is
      unstacked along axis 1 into one entry per layer.
    scope: name of the enclosing variable scope.
    reuse: forwarded to the variable scopes so that several towers can share
      the same variables.

  Returns:
    dict with 'present', 'gen_logits' and 'gen_loss', plus 'clf_loss' and
    'accuracy' (in percent) when ``Y`` is given.
  """
  with tf.compat.v1.variable_scope(scope, reuse=reuse):
    results = {}
    batch, sequence = shape_list(X)

    if hparams.bert:
      # M is 1.0 with probability (1 - bert_mask_prob) and 0.0 otherwise;
      # inputs at the zero positions are blanked out further below.
      M = tf.greater(tf.random.uniform([batch, sequence]), hparams.bert_mask_prob)
      M = tf.cast(M, tf.float32)

    # wpe  = Position Embedding Matrix  (one row per position in the context)
    # wte  = Token Embedding Matrix     (one row per vocabulary entry)
    # wtet = output projection          (maps the final hidden states back to
    #                                    one logit per vocabulary entry)
    wpe = tf.compat.v1.get_variable('wpe', [hparams.n_ctx, hparams.n_embd],
                          initializer=tf.random_normal_initializer(stddev=0.01))
    wte = tf.compat.v1.get_variable('wte', [hparams.n_vocab, hparams.n_embd],
                          initializer=tf.random_normal_initializer(stddev=0.02))
    wtet = tf.compat.v1.get_variable('wtet', [hparams.n_vocab, hparams.n_embd],
                          initializer=tf.random_normal_initializer(stddev=0.0))

    past_length = 0 if past is None else tf.shape(past)[-2]

    # Look up one embedding row per token id
    h = tf.gather(wte, X)

    if hparams.bert:
      h = h*tf.expand_dims(M, 2)
    else:
      # sos = learned "start of sentence" embedding. It is prepended and the
      # last token is dropped, so position t only sees tokens before t.
      sos = tf.compat.v1.get_variable('sos', [hparams.n_embd],
                            initializer=tf.random_normal_initializer(stddev=0.02))

      sos_tok = tf.ones([batch, 1, hparams.n_embd], dtype=tf.float32)*sos
      h = tf.concat([sos_tok, h[:, :-1, :]], axis=1)

    h += tf.gather(wpe, positions_for(X, past_length))
    # h is now the processed input ready to be fed into the transformer decoder

    # Transformer
    presents = []
    pasts = tf.unstack(past, axis=1) if past is not None else [None]*hparams.n_layer

    assert len(pasts) == hparams.n_layer
    # A separate loop name keeps the `past` argument from being shadowed
    for layer, layer_past in enumerate(pasts):
      h, present = block(h, 'h%d' % layer, past=layer_past, hparams=hparams)
      presents.append(present)

    results['present'] = tf.stack(presents, axis=1)
    h = norm(h, 'ln_f')

    # Generative loss. Do tokens < n predict token n?
    # Flatten the hidden states, project them onto the vocabulary and compare
    # the resulting logits with the input tokens.
    h_flat = tf.reshape(h, [batch*sequence, hparams.n_embd])
    gen_logits = tf.matmul(h_flat, wtet, transpose_b=True)
    gen_logits = tf.reshape(gen_logits, [batch, sequence, hparams.n_vocab])
    results['gen_logits'] = gen_logits

    gen_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=gen_logits, labels=X)

    if hparams.bert:
      # Average only over the masked-out positions of every row.
      # NOTE(review): a row without any masked position divides by zero here.
      IM = 1.0 - M
      gen_losses = tf.reduce_sum(gen_losses*IM, axis=1)/tf.reduce_sum(IM, axis=1)
    results['gen_loss'] = tf.reduce_mean(gen_losses)

    # Without labels there is nothing to classify (previously Y=None crashed
    # in shape_list(Y)).
    if Y is None:
      return results

    # Classification loss.
    with tf.compat.v1.variable_scope('clf', reuse=reuse):
      classes = shape_list(Y)[1]
      if hparams.clf:
        wclf = tf.compat.v1.get_variable('wclf', [classes, hparams.n_embd],
                              initializer=tf.random_normal_initializer(stddev=0.0))
      else:
        wclf = tf.zeros([classes, hparams.n_embd], dtype=tf.float32)

    h = tf.reduce_mean(h, axis=1)   # average pool over sequence
    clf_logits = tf.matmul(h, wclf, transpose_b=True)
    # The compat.v1 path is available under both TF1 and TF2; the bare
    # tf.nn name of this op is missing from the TF2 namespace.
    clf_losses = tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(logits=clf_logits, labels=Y)

    results['clf_loss'] = tf.reduce_mean(clf_losses)

    correct = tf.equal(tf.argmax(clf_logits, -1), tf.argmax(Y, -1))
    results['accuracy'] = tf.reduce_mean(tf.cast(correct, tf.float32))*100.0

    return results

Writing model.py


In [None]:
%%writefile utils.py

import os
import json
import time
import pickle
import subprocess
import math
from tqdm import tqdm

import numpy as np
import tensorflow as tf

# Some basic utilities

def iter_data(*datas, n_batch=128, truncate=False, verbose=False, max_batches=float("inf")):
    """Yield consecutive mini-batches sliced from one or more parallel sequences.

    Args:
        *datas: sliceable sequences; the length of the first one determines
            the number of items.
        n_batch: number of items per batch.
        truncate: when True, a trailing partial batch is dropped.
        verbose: when True, a tqdm progress bar is shown.
        max_batches: upper bound on the number of batches produced.

    Yields:
        The slice itself when a single sequence was given, otherwise a
        generator producing one slice per sequence (it can be unpacked once).
    """
    n = len(datas[0])
    if truncate:
        n = (n//n_batch)*n_batch
    n = min(n, max_batches*n_batch)
    n_batches = 0
    # Ceiling division so that a final partial batch is counted by the bar
    total = -(-n//n_batch)
    for i in tqdm(range(0, n, n_batch), total=total, disable=not verbose, ncols=80, leave=False):
        if n_batches >= max_batches:
            # A bare return ends the generator; raising StopIteration inside
            # a generator is turned into a RuntimeError (PEP 479).
            return
        if len(datas) == 1:
            yield datas[0][i:i+n_batch]
        else:
            yield (d[i:i+n_batch] for d in datas)
        n_batches += 1

def squared_euclidean_distance(a, b):
    """Return the matrix of squared Euclidean distances between rows of ``a`` and rows of ``b``.

    Uses the expansion |a - b|^2 = |a|^2 - 2*a.b + |b|^2, so the result has
    one row per row of ``a`` and one column per row of ``b``.
    """
    b_t = tf.transpose(b)
    a_sq = tf.reduce_sum(tf.square(a), axis=1, keepdims=True)
    b_sq = tf.reduce_sum(tf.square(b_t), axis=0, keepdims=True)
    cross = tf.matmul(a, b_t)
    return a_sq - 2*cross + b_sq

def color_quantize(x, np_clusters):
    """Map every 3-component pixel of ``x`` to the index of its nearest cluster centre.

    ``x`` is flattened to rows of 3 values; ``np_clusters`` is wrapped in a
    non-trainable float32 variable. Returns one index per pixel.
    """
    centers = tf.Variable(np_clusters, dtype=tf.float32, trainable=False)
    pixels = tf.reshape(x, [-1, 3])
    distances = squared_euclidean_distance(pixels, centers)
    nearest = tf.argmin(distances, 1)
    return nearest

def count_parameters():
    """Return the total number of scalar parameters over all trainable variables."""
    total_parameters = 0
    for variable in tf.compat.v1.trainable_variables():
        variable_parameters = 1
        # as_list() yields plain ints under both TF1 and TF2 shape semantics;
        # `.value` only exists on TF1 Dimension objects.
        for dim in variable.get_shape().as_list():
            variable_parameters *= dim
        total_parameters += variable_parameters
    return total_parameters

Writing utils.py


In [None]:
%%writefile run.py

# Running and Evaluating the model

import argparse
import json
import math
import os
import random
import sys
import time

import numpy as np
import tensorflow as tf

from imageio import imwrite
from scipy.special import softmax
from tensorflow.contrib.training import HParams
from tqdm import tqdm

from model import model
from utils import iter_data, count_parameters


""" This function won't be used just adding for the sake of completeness"""
# """
def parse_arguments():
  """Parse the command line of run.py, echo the parsed values as JSON and return them."""
  # (flag, add_argument keyword arguments), in the order they are registered
  option_table = [
      # data and I/O
      ("--data_path", dict(type=str, default="/root/downloads/imagenet")),
      ("--ckpt_path", dict(type=str, default="/root/downloads/model.ckpt-1000000")),
      ("--color_cluster_path", dict(type=str, default="/root/downloads/kmeans_centers.npy")),
      ("--save_dir", dict(type=str, default="/root/save/")),
      # model
      ("--n_embd", dict(type=int, default=512)),
      ("--n_head", dict(type=int, default=8)),
      ("--n_layer", dict(type=int, default=24)),
      ("--n_px", dict(type=int, default=32, help="image height or width in pixels")),
      ("--n_vocab", dict(type=int, default=512, help="possible values for each pixel")),
      ("--bert", dict(action="store_true", help="use the bert objective (default: autoregressive)")),
      ("--bert_mask_prob", dict(type=float, default=0.15)),
      ("--clf", dict(action="store_true", help="add a learnable classification head")),
      # parallelism
      ("--n_sub_batch", dict(type=int, default=8, help="per-gpu batch_size")),
      ("--n_gpu", dict(type=int, default=8, help="number of gpus to distribute training across")),
      # mode
      ("--eval", dict(action="store_true", help="evaluates the model, requires a checkpoint and dataset")),
      ("--sample", dict(action="store_true", help="samples the model, requires a checkpoint and clusters")),
      # reproducibility
      ("--seed", dict(type=int, default=42, help="seed for random, np, tf")),
  ]

  parser = argparse.ArgumentParser()
  for flag, options in option_table:
    parser.add_argument(flag, **options)

  args = parser.parse_args()
  print("input args:\n", json.dumps(vars(args), indent=4, separators=(",", ":")))

  return args
# """

def set_seed(seed):
  """Seed Python's ``random``, NumPy and TensorFlow (graph-level, v1 API) with ``seed``."""
  seeders = (random.seed, np.random.seed, tf.compat.v1.set_random_seed)
  for apply_seed in seeders:
    apply_seed(seed)

def load_data(data_path):
  """Load the train / validation / test arrays stored next to ``data_path``.

  Files are expected at '<data_path>_<split>.npy' for the splits
  trX, trY, vaX, vaY, teX and teY.

  Returns:
    ((trX, trY), (vaX, vaY), (teX, teY))
  """
  def read(split):
    return np.load(f'{data_path}_{split}.npy')

  train = (read('trX'), read('trY'))
  valid = (read('vaX'), read('vaY'))
  test = (read('teX'), read('teY'))
  return train, valid, test

def set_hparams(args):
    """Translate the parsed command-line arguments into an ``HParams`` object.

    The context length is the number of pixels of a square image
    (n_px * n_px); every other field is copied from ``args`` unchanged.
    """
    settings = dict(
        n_ctx=args.n_px*args.n_px,
        n_embd=args.n_embd,
        n_head=args.n_head,
        n_layer=args.n_layer,
        n_vocab=args.n_vocab,
        bert=args.bert,
        bert_mask_prob=args.bert_mask_prob,
        clf=args.clf,
    )
    return HParams(**settings)


def create_model(x, y, n_gpu, hparams):
    """Build one model tower per GPU and collect the per-tower outputs.

    Tower 0 creates the variables; all later towers reuse them. The total
    loss of a tower is gen_loss + clf_loss when ``hparams.clf`` is set and
    gen_loss alone otherwise.

    Args:
        x, y: per-GPU input and label tensors, indexed by GPU number.
        n_gpu: number of towers to build.
        hparams: hyperparameters forwarded to ``model``.

    Returns:
        (trainable_params, gen_logits, gen_loss, clf_loss, tot_loss, accuracy);
        every entry after the first is a list with one element per tower.
    """
    gen_logits, gen_loss, clf_loss, tot_loss, accuracy = [], [], [], [], []
    trainable_params = None

    for gpu_index in range(n_gpu):
        with tf.device("/gpu:%d" % gpu_index):
            outputs = model(hparams, x[gpu_index], y[gpu_index], reuse=gpu_index > 0)

            gen_logits.append(outputs["gen_logits"])
            gen_loss.append(outputs["gen_loss"])
            clf_loss.append(outputs["clf_loss"])

            tower_loss = outputs["gen_loss"]
            if hparams.clf:
                tower_loss = outputs["gen_loss"] + outputs["clf_loss"]
            tot_loss.append(tower_loss)

            accuracy.append(outputs["accuracy"])

            if gpu_index == 0:
                # Captured after the first tower, which created the variables
                trainable_params = tf.compat.v1.trainable_variables()
                print("trainable parameters:", count_parameters())

    return trainable_params, gen_logits, gen_loss, clf_loss, tot_loss, accuracy


def reduce_mean(gen_loss, clf_loss, tot_loss, accuracy, n_gpu):
    """Average every per-tower metric list into its element 0, in place.

    After the call, index 0 of each list holds the mean over the first
    ``n_gpu`` entries; the other entries are left untouched. Nothing is
    returned.
    """
    with tf.device("/gpu:0"):
        metric_lists = (gen_loss, clf_loss, tot_loss, accuracy)
        for tower in range(1, n_gpu):
            for values in metric_lists:
                values[0] += values[tower]
        for values in metric_lists:
            values[0] /= n_gpu


def evaluate(sess, evX, evY, X, Y, gen_loss, clf_loss, accuracy, n_batch, desc, permute=False):
    """Run the averaged metrics over a dataset and print their mean per metric.

    The data is consumed in full batches of ``n_batch`` (a trailing partial
    batch is dropped). ``permute`` is accepted but not used.
    """
    fetches = [gen_loss[0], clf_loss[0], accuracy[0]]
    batches = iter_data(evX, evY, n_batch=n_batch, truncate=True, verbose=True)
    metrics = [sess.run(fetches, {X: xmb, Y: ymb}) for xmb, ymb in batches]
    eval_gen_loss, eval_clf_loss, eval_accuracy = (np.mean(column) for column in zip(*metrics))
    print(f"{desc} gen: {eval_gen_loss:.4f} clf: {eval_clf_loss:.4f} acc: {eval_accuracy:.2f}")


# naive sampler without caching
def sample(sess, X, gen_logits, n_sub_batch, n_gpu, n_px, n_vocab, clusters, save_dir):
    """Sample images pixel by pixel and write them to ``save_dir`` as PNG files.

    Naive sampler without caching: the whole model is re-run once per pixel
    position.

    Args:
        sess: session used to evaluate ``gen_logits``.
        X: input placeholder fed with the partially filled samples.
        gen_logits: list with one logits tensor per GPU.
        n_sub_batch: number of samples per GPU.
        n_gpu: number of GPUs / logits tensors.
        n_px: image height and width in pixels.
        n_vocab: number of possible token values per pixel.
        clusters: array mapping a token to its 3 colour components; the
            dequantization below assumes values in [-1, 1] (TODO confirm).
        save_dir: directory receiving sample_<i>.png; it must already exist.
    """
    samples = np.zeros([n_gpu * n_sub_batch, n_px * n_px], dtype=np.int32)

    for i in tqdm(range(n_px * n_px), ncols=80, leave=False):
        np_gen_logits = sess.run(gen_logits, {X: samples})
        for j in range(n_gpu):
            p = softmax(np_gen_logits[j][:, i, :], axis=-1)  # logits to probabilities
            for k in range(n_sub_batch):
                c = np.random.choice(n_vocab, p=p[k])  # choose based on probabilities
                samples[j * n_sub_batch + k, i] = c

    # dequantize: token -> colour, rescaled to 0..255.
    # The image side comes from n_px instead of a hard-coded 32.
    images = [np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [n_px, n_px, 3]).astype(np.uint8) for s in samples]

    # write to png, using the save_dir argument rather than a global `args`
    for i, image in enumerate(images):
        imwrite(os.path.join(save_dir, f"sample_{i}.png"), image)


def main(args):
    """Build the multi-GPU graph, restore the checkpoint, then evaluate and/or sample.

    The number of classes is derived from the suffix of ``args.data_path``
    ("cifar10" or "imagenet"); any other path raises ValueError.
    """
    set_seed(args.seed)

    n_batch = args.n_sub_batch * args.n_gpu

    class_counts = {"cifar10": 10, "imagenet": 1000}
    for suffix, count in class_counts.items():
        if args.data_path.endswith(suffix):
            n_class = count
            break
    else:
        raise ValueError("Dataset not supported.")

    n_pixels = args.n_px * args.n_px
    X = tf.compat.v1.placeholder(tf.int32, [n_batch, n_pixels])
    Y = tf.compat.v1.placeholder(tf.float32, [n_batch, n_class])

    # One slice of the batch per GPU
    x = tf.split(X, args.n_gpu, 0)
    y = tf.split(Y, args.n_gpu, 0)

    hparams = set_hparams(args)
    trainable_params, gen_logits, gen_loss, clf_loss, tot_loss, accuracy = create_model(x, y, args.n_gpu, hparams)
    reduce_mean(gen_loss, clf_loss, tot_loss, accuracy, args.n_gpu)

    # Variables with 'clf' in their name are not restored from the checkpoint
    restorable = [tp for tp in trainable_params if 'clf' not in tp.name]
    saver = tf.compat.v1.train.Saver(var_list=restorable)

    session_config = tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    with tf.compat.v1.Session(config=session_config) as sess:
        sess.run(tf.compat.v1.global_variables_initializer())

        saver.restore(sess, args.ckpt_path)

        if args.eval:
            (trX, trY), (vaX, vaY), (teX, teY) = load_data(args.data_path)
            # The train metrics use only as many examples as the validation set has
            evaluate(sess, trX[:len(vaX)], trY[:len(vaY)], X, Y, gen_loss, clf_loss, accuracy, n_batch, "train")
            evaluate(sess, vaX, vaY, X, Y, gen_loss, clf_loss, accuracy, n_batch, "valid")
            evaluate(sess, teX, teY, X, Y, gen_loss, clf_loss, accuracy, n_batch, "test")

        if args.sample:
            if not os.path.exists(args.save_dir):
                os.makedirs(args.save_dir)
            clusters = np.load(args.color_cluster_path)
            sample(sess, X, gen_logits, args.n_sub_batch, args.n_gpu, args.n_px, args.n_vocab, clusters, args.save_dir)


# Script entry point: parse the command line and run main() only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    args = parse_arguments()
    main(args)

Writing run.py


In [None]:
%%writefile download.py

# Data Retrieval
# This part should be removed later on

import argparse
import json
import os
import sys
import requests
from tqdm import tqdm

def parse_arguments():
    """Parse the command line of download.py, echo the parsed values as JSON and return them."""
    # (flag, add_argument keyword arguments), in the order they are registered
    option_table = [
        ("--download_dir", dict(type=str, default="/root/downloads/")),
        ("--bert", dict(action="store_true", help="download a bert model (default: ar)")),
        ("--model", dict(type=str, choices=["s", "m", "l"], help="parameter counts are s:76M, m:455M, l:1362M")),
        ("--ckpt", dict(type=str, choices=["131000", "262000", "524000", "1000000"])),
        ("--clusters", dict(action="store_true", help="download the color clusters file")),
        ("--dataset", dict(type=str, choices=["imagenet", "cifar10"])),
    ]

    parser = argparse.ArgumentParser()
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    args = parser.parse_args()
    print("input args:\n", json.dumps(vars(args), indent=4, separators=(",", ":")))
    return args

def main(args):
    """Download the requested checkpoint shards, colour clusters and/or dataset files.

    Args:
        args: parsed arguments providing download_dir, bert, model, ckpt,
            clusters and dataset.

    Raises:
        requests.HTTPError: when the server answers a download request with
            an error status (previously the error body was silently saved as
            if it were the requested file).
    """
    os.makedirs(args.download_dir, exist_ok=True)

    urls = []

    # download the checkpoint
    if args.model and args.ckpt:
        base_url = f"https://openaipublic.blob.core.windows.net/image-gpt/checkpoints/igpt-{args.model}{'-bert' if args.bert else ''}/{args.ckpt}"

        size_to_shards = {"s": 32, "m": 32, "l": 64}
        shards = size_to_shards[args.model]

        for filename in [f"model.ckpt-{args.ckpt}.data-{i:05d}-of-{shards:05d}" for i in range(shards)]:
            urls.append(f"{base_url}/{filename}")
        urls.append(f"{base_url}/model.ckpt-{args.ckpt}.index")
        urls.append(f"{base_url}/model.ckpt-{args.ckpt}.meta")

    # download the color clusters file
    if args.clusters:
        urls.append("https://openaipublic.blob.core.windows.net/image-gpt/color-clusters/kmeans_centers.npy")

    # download color clustered dataset
    if args.dataset:
        for split in ["trX", "trY", "vaX", "vaY", "teX", "teY"]:
            urls.append(f"https://openaipublic.blob.core.windows.net/image-gpt/datasets/{args.dataset}_{split}.npy")

    # run the download
    # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
    chunk_size = 1000
    for url in urls:
        filename = url.split("/")[-1]
        # The context manager releases the connection even if writing fails
        with requests.get(url, stream=True) as r:
            # Fail before creating the file when the server reports an error
            r.raise_for_status()

            # The header may be absent (e.g. chunked transfer); tqdm accepts
            # total=None and then shows a bar without a known total.
            content_length = r.headers.get("content-length")
            file_size = int(content_length) if content_length is not None else None

            with open(os.path.join(args.download_dir, filename), "wb") as f:
                with tqdm(ncols=80, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
                        # The last chunk is usually shorter than chunk_size
                        pbar.update(len(chunk))

# Script entry point: parse the command line and start the downloads only
# when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    args = parse_arguments()
    main(args)

Writing download.py


Code Ready.  
Rough Work below

In [None]:
!python3 download.py --model s --ckpt 262000

input args:
 {
    "download_dir":"/root/downloads/",
    "bert":false,
    "model":"s",
    "ckpt":"262000",
    "clusters":false,
    "dataset":null
}
Fetching model.ckpt-262000.data-00000-of-00032: 1.00kit [00:00, 819kit/s]       
Fetching model.ckpt-262000.data-00001-of-00032: 31.5Mit [00:06, 5.20Mit/s]      
Fetching model.ckpt-262000.data-00002-of-00032: 28.3Mit [00:04, 5.82Mit/s]      
Fetching model.ckpt-262000.data-00003-of-00032: 28.3Mit [00:05, 5.12Mit/s]      
Fetching model.ckpt-262000.data-00004-of-00032: 28.3Mit [00:05, 5.63Mit/s]      
Fetching model.ckpt-262000.data-00005-of-00032: 31.5Mit [00:05, 5.36Mit/s]      
Fetching model.ckpt-262000.data-00006-of-00032: 30.4Mit [00:05, 5.32Mit/s]      
Fetching model.ckpt-262000.data-00007-of-00032: 28.3Mit [00:04, 6.02Mit/s]      
Fetching model.ckpt-262000.data-00008-of-00032: 31.5Mit [00:05, 5.42Mit/s]      
Fetching model.ckpt-262000.data-00009-of-00032: 29.4Mit [00:06, 4.57Mit/s]      
Fetching model.ckpt-262000.data-00010

In [None]:
!python3 download.py --dataset cifar10

input args:
 {
    "download_dir":"/root/downloads/",
    "bert":false,
    "model":null,
    "ckpt":null,
    "clusters":false,
    "dataset":"cifar10"
}
Fetching cifar10_trX.npy: 184Mit [00:30, 6.07Mit/s]                             
Fetching cifar10_trY.npy: 1.80Mit [00:01, 1.24Mit/s]                            
Fetching cifar10_vaX.npy: 20.5Mit [00:03, 5.81Mit/s]                            
Fetching cifar10_vaY.npy: 201kit [00:00, 284kit/s]                              
Fetching cifar10_teX.npy: 41.0Mit [00:07, 5.50Mit/s]                            
Fetching cifar10_teY.npy: 401kit [00:00, 443kit/s]                              


In [None]:
!python3 download.py --clusters

input args:
 {
    "download_dir":"/root/downloads/",
    "bert":false,
    "model":null,
    "ckpt":null,
    "clusters":true,
    "dataset":null
}
Fetching kmeans_centers.npy: 7.00kit [00:00, 5.15Mit/s]                         


In [None]:
!ls /root/downloads

cifar10_teX.npy			       model.ckpt-262000.data-00014-of-00032
cifar10_teY.npy			       model.ckpt-262000.data-00015-of-00032
cifar10_trX.npy			       model.ckpt-262000.data-00016-of-00032
cifar10_trY.npy			       model.ckpt-262000.data-00017-of-00032
cifar10_vaX.npy			       model.ckpt-262000.data-00018-of-00032
cifar10_vaY.npy			       model.ckpt-262000.data-00019-of-00032
kmeans_centers.npy		       model.ckpt-262000.data-00020-of-00032
model.ckpt-262000.data-00000-of-00032  model.ckpt-262000.data-00021-of-00032
model.ckpt-262000.data-00001-of-00032  model.ckpt-262000.data-00022-of-00032
model.ckpt-262000.data-00002-of-00032  model.ckpt-262000.data-00023-of-00032
model.ckpt-262000.data-00003-of-00032  model.ckpt-262000.data-00024-of-00032
model.ckpt-262000.data-00004-of-00032  model.ckpt-262000.data-00025-of-00032
model.ckpt-262000.data-00005-of-00032  model.ckpt-262000.data-00026-of-00032
model.ckpt-262000.data-00006-of-00032  model.ckpt-262000.data-00027-of-00032
model.ckpt-262000.

In [None]:
!python3 run.py --eval --ckpt_path /root/downloads/model.ckpt-262000 --data_path /root/downloads/cifar10 --n_embd 512 --n_head 8 --n_layer 24

Traceback (most recent call last):
  File "run.py", line 17, in <module>
    from tensorflow.contrib.training import HParams
ModuleNotFoundError: No module named 'tensorflow.contrib'


In [None]:
!CUDA_VISIBLE_DEVICES=0 python3 run.py --sample --ckpt_path /root/downloads/model.ckpt-262000 --data_path /root/downloads/cifar10 --n_embd 512 --n_head 8 --n_layer 24 --n_gpu 1

input args:
 {
    "data_path":"/root/downloads/cifar10",
    "ckpt_path":"/root/downloads/model.ckpt-262000",
    "color_cluster_path":"/root/downloads/kmeans_centers.npy",
    "save_dir":"/root/save/",
    "n_embd":512,
    "n_head":8,
    "n_layer":24,
    "n_px":32,
    "n_vocab":512,
    "bert":false,
    "bert_mask_prob":0.15,
    "clf":false,
    "n_sub_batch":8,
    "n_gpu":1,
    "eval":false,
    "sample":true,
    "seed":42
}
trainable parameters: 76571648
2021-11-02 15:10:45.565334: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2199995000 Hz
2021-11-02 15:10:45.565575: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x56080091cd80 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-11-02 15:10:45.565613: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2021-11-02 15:10:45.569377: I tensorflow/stream_executor/platform/default/dso_loader.cc:44]

In [None]:
from google.colab import files

!zip -r /root/save.zip /root/save
files.download('/root/save.zip')

  adding: root/save/ (stored 0%)
  adding: root/save/sample_3.png (stored 0%)
  adding: root/save/sample_5.png (stored 0%)
  adding: root/save/sample_2.png (stored 0%)
  adding: root/save/sample_0.png (stored 0%)
  adding: root/save/sample_1.png (stored 0%)
  adding: root/save/sample_7.png (stored 0%)
  adding: root/save/sample_6.png (stored 0%)
  adding: root/save/sample_4.png (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>