In [1]:
# Number GPUs by PCI bus position so that the index below is interpreted in PCI-bus order.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
# Restrict CUDA in this kernel to the single GPU with index 5.
%env CUDA_VISIBLE_DEVICES=5

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=5


In [2]:
# Enable the IPython autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:

from tensorflow.keras import mixed_precision

# Install 'mixed_float16' as the global Keras dtype policy and report the
# compute / variable dtypes that the policy exposes.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)
for template, dtype_name in (
    ('Compute dtype: %s', policy.compute_dtype),
    ('Variable dtype: %s', policy.variable_dtype),
):
    print(template % dtype_name)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA RTX A4000, compute capability 8.6
Compute dtype: float16
Variable dtype: float32


In [4]:
import omegaconf
import numpy as np
import matplotlib.pyplot as plt
from ganime.data.experimental import ImageDataset, VideoDataset
from ganime.visualization.videos import display_videos
from ganime.visualization.images import display_images
from ganime.model.vqgan_clean.net2net import Net2Net
import tensorflow as tf
import tensorflow_addons as tfa
from datetime import datetime
from tqdm import tqdm
from pyprojroot.pyprojroot import here
#tf.get_logger().setLevel('ERROR')

In [5]:
# Turn on memory growth for every GPU TensorFlow can see.
visible_gpus = tf.config.list_physical_devices("GPU")
for gpu in visible_gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [6]:
# Distribution strategy whose scope is used below when creating, building and calling the model.
strategy = tf.distribute.MultiWorkerMirroredStrategy()

INFO:tensorflow:Single-worker MultiWorkerMirroredStrategy with local_devices = ('/device:GPU:0',), communication = CommunicationImplementation.AUTO


2022-06-07 03:16:41.482880: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-07 03:16:41.861911: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14252 MB memory:  -> device: 0, name: NVIDIA RTX A4000, pci bus id: 0000:a1:00.0, compute capability: 8.6


In [7]:
# Load the experiment configuration; `here(...)` resolves the path via pyprojroot.
cfg = omegaconf.OmegaConf.load(here("configs/moving_mnist_image_transformer.yaml"))
#cfg = omegaconf.OmegaConf.load(here("configs/default_transformer.yaml"))
# Number of videos per batch used by the input pipeline below.
batch_size = 16

In [8]:
# Number of videos assumed to be in the dataset (also used as the shuffle buffer size).
dataset_length = 10000
# Count whole batches only: the pipeline batches with drop_remainder=True, so a
# trailing partial batch never exists. Floor division also keeps this an int
# instead of a float that every consumer has to wrap in int().
num_batch = dataset_length // batch_size

In [9]:
def preprocess(element):
    """Turn one video tensor into an (inputs, targets) training pair.

    The tensor is reshaped so its last axis has size 3 (the first three
    dimensions are kept as they are), cast to the global policy's compute
    dtype and divided by 255.

    Returns:
        A tuple ``(first_last_frame, y)`` where ``first_last_frame`` is the
        first and the last slice along axis 0 concatenated along axis 0, and
        ``y`` is every slice along axis 0 except the first one.
    """
    dims = tf.shape(element)
    video = tf.reshape(element, (dims[0], dims[1], dims[2], 3))
    video = tf.cast(video, policy.compute_dtype) / 255.0

    # Conditioning input: the two end points of the sequence.
    endpoints = tf.concat([video[0:1, ...], video[-1:, ...]], axis=0)
    # Target: everything after the first slice.
    targets = video[1:, ...]

    return endpoints, targets

In [10]:
# Fixed seed so the shuffled order (and therefore the train/validation/test
# split taken from it with take/skip) is identical on every iteration.
SHUFFLE_SEED = 42

dataset = VideoDataset("../../../data/moving_mnist_tfrecords").load()
# The dataset is split with take()/skip() AFTER this shuffle. With
# reshuffle_each_iteration=True the order changes every time the dataset is
# iterated, so the splits would be re-drawn each epoch and samples would leak
# between train, validation and test. Shuffle once with a fixed order instead.
# NOTE(review): this assumes VideoDataset.load() yields records in a
# deterministic order - confirm. To still reshuffle training data per epoch,
# shuffle the training split itself after it has been carved out.
dataset = (
    dataset
    .shuffle(dataset_length, seed=SHUFFLE_SEED, reshuffle_each_iteration=False)
    .map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

In [11]:
# Split sizes, counted in batches: 80% train, 10% validation, 10% test
# (each truncated towards zero).
train_size, validation_size, test_size = (
    int(num_batch * fraction) for fraction in (0.8, 0.1, 0.1)
)

In [12]:
# Carve consecutive windows of batches out of `dataset`.
# These windows are disjoint only if `dataset` yields its batches in the same
# order on every iteration (i.e. the upstream shuffle must not reshuffle).
train_ds = dataset.take(train_size)
validation_ds = dataset.skip(train_size).take(validation_size)
# Use test_size here: the original took validation_size, leaving test_size unused.
test_ds = dataset.skip(train_size + validation_size).take(test_size)

In [13]:
# Pull the first batch of the training and validation splits as NumPy data;
# they are reused below for building the model and for the video callback.
train_sample_data, validation_sample_data = (
    next(split.as_numpy_iterator()) for split in (train_ds, validation_ds)
)

In [14]:
# Shape of the target half of the sample batch (element 1 of the (inputs, targets) pair).
train_sample_data[1].shape

(16, 19, 64, 64, 3)

In [15]:
# Instantiate Net2Net inside the strategy scope, passing the "model" section
# of the loaded config as keyword arguments.
with strategy.scope():
    model = Net2Net(**cfg["model"])

Working with z of shape (1, 128, 16, 16) = 32768 dimensions.


2022-06-07 03:16:46.211685: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8101


VQLPIPSWithDiscriminator running with hinge loss.


# Plot model.scheduled_lrs over 500 epochs' worth of steps (int(num_batch) steps
# per epoch), with the x axis rescaled to run from 0 to 500.
# NOTE(review): presumably scheduled_lrs(step) is the learning rate at that step - confirm.
plotted_epochs = 500
total_steps = int(num_batch) * plotted_epochs
lrs = [model.scheduled_lrs(step) for step in range(total_steps)]
xs = np.linspace(0, plotted_epochs, len(lrs))
plt.plot(xs, lrs)

In [16]:
from ganime.utils.callbacks import TensorboardVideo, get_logdir
import os

# Per-run log directory; TensorBoard logs, video summaries and checkpoints all live under it.
logdir = get_logdir("../../../logs/ganime/", experiment_name="transformer_mnist_video")
checkpoint_path = os.path.join(logdir, "checkpoint", "checkpoint")

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
# Custom callback fed with the fixed train / validation sample batches.
tensorboard_video_callback = TensorboardVideo(
    logdir,
    train_sample_data,
    validation_sample_data,
)
# Stop when val_loss has not improved by at least 0.001 for 50 epochs and
# roll back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", min_delta=0.001, patience=50, restore_best_weights=True
)
# Keep only the weights of the best model according to val_loss.
checkpointing = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
)
callbacks = [
    tensorboard_callback,
    early_stopping,
    checkpointing,
    tensorboard_video_callback,
]

In [17]:
#with strategy.scope():
#    model.compile(optimizer=tfa.optimizers.AdamW(
#        learning_rate=1e-3, weight_decay=1e-4
#    ))

In [18]:
# Build both sub-models with the same input shape: an unspecified batch axis
# followed by the sample inputs' shape from axis 2 onwards.
with strategy.scope():
    stage_input_shape = (None, *train_sample_data[0].shape[2:])
    for stage_model in (model.first_stage_model, model.cond_stage_model):
        stage_model.build(input_shape=stage_input_shape)

In [19]:
from pynvml import *


def print_gpu_utilization(device_index=5):
    """Print how much memory is currently in use on one GPU, in MB.

    The figure comes from NVML's memory info for the device, so it covers the
    whole device, not only this process.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 5, the value
            that used to be hard-coded here and the same index this notebook
            exports in CUDA_VISIBLE_DEVICES.
            NOTE(review): NVML and CUDA indices are assumed to agree because
            CUDA_DEVICE_ORDER is set to PCI_BUS_ID - confirm on this machine.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Balance the nvmlInit() above so repeated calls do not leave NVML
        # initialised after each call.
        nvmlShutdown()

In [20]:
# GPU memory in use before the first forward pass through the model.
print_gpu_utilization()

GPU memory occupied: 1673 MB.


In [21]:
# Run the model once on the sample inputs (element 0 of the sample batch).
# NOTE(review): presumably this also creates any weights not yet built, so
# that model.summary() below can report parameter counts - confirm.
with strategy.scope():
    video = model(train_sample_data[0])

2022-06-07 03:16:56.718542: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


In [22]:
# Print the per-layer and total parameter counts.
model.summary()

Model: "net2_net"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gpt (GPT)                   multiple                  474818641 
                                                                 
 vqgan (VQGAN)               multiple                  20186956  
                                                                 
Total params: 495,005,599
Trainable params: 495,003,797
Non-trainable params: 1,802
_________________________________________________________________


In [23]:
# GPU memory in use after the forward pass, for comparison with the earlier reading.
print_gpu_utilization()

GPU memory occupied: 5513 MB.


In [None]:
# steps_per_epoch * epochs = 1000 training steps, but train_ds is finite
# (train_size batches). When steps_per_epoch is given Keras keeps consuming the
# same iterator across epochs, so without repeat() the data runs out and
# training is interrupted before the requested number of epochs.
# repeat() makes the training stream endless; steps_per_epoch bounds each epoch.
model.fit(
    train_ds.repeat(),
    validation_data=validation_ds,
    steps_per_epoch=10,
    epochs=100,
    callbacks=callbacks,
)

Epoch 1/100






2022-06-07 03:34:27.472697: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:766] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Did not find a shardable source, walked to a node which is not a dataset: name: "FlatMapDataset/_9"
op: "FlatMapDataset"
input: "PrefetchDataset/_8"
attr {
  key: "Targuments"
  value {
    list {
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: -2
  }
}
attr {
  key: "f"
  value {
    func {
      name: "__inference_Dataset_flat_map_slice_batch_indices_329551"
    }
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\022FlatMapDataset:139"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
        dim {
          size: -1
        }
      }
    }
  }
}
attr {
  key: "output_types"
  value {
    list {
      type: DT_INT64
    }
  }
}
. Consider either turning off auto-sharding or switching the auto_shard_policy to DATA to shard this dataset. 

In [None]:
# tf.data.Dataset objects cannot be indexed (train_ds[0] raises TypeError).
# Use the training sample batch fetched earlier: element 0 holds the inputs.
display_videos(model.process_video(train_sample_data[0].astype(np.float32))[0], 1, 3)

In [None]:
# tf.data.Dataset objects cannot be indexed (validation_ds[0] raises TypeError).
# Use the validation sample batch fetched earlier: element 0 holds the inputs.
display_videos(model.process_video(validation_sample_data[0].astype(np.float32))[0], 1, 3)

In [None]:
# tf.data.Dataset objects cannot be indexed (test_ds[0] raises TypeError).
# Fetch the first test batch through an iterator; element 0 holds the inputs.
test_sample_data = next(test_ds.as_numpy_iterator())
display_videos(model.process_video(test_sample_data[0].astype(np.float32))[0], 1, 3)