### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [None]:
nvidia-smi -i 0,1 --query-gpu=gpu_bus_id,power.draw,utilization.gpu,memory.used --format=csv,nounits --loop-ms=1000 > ./gpu_stats/tf32_2GPUs.csv

In [84]:
# Report the TensorFlow version available in this notebook kernel.
import tensorflow as tf
print(tf.__version__)

2.8.1


Test:
- Baseline (tf-32)
- Precision
    - tf32 (default)
    - float32
    - mixed
    - bfloat
- Batch size
    - 64 (default)
    - 96 (from paper)
    - 128
    - max (power of 2)
    - max (non power of 2) 
    - Test with tf32 and fp32
- different number of GPUs
    - 1
    - 2
    - 4
    - 8
    - 16 (2 nodes)
- different GPUs (repeat tests)
    - A100
    - V100
    - A10
    - best of all

# Total model flops

In [3]:
import tensorflow as tf
import numpy as np
from gan_models import generator_model, discriminator_model

def get_flops(model, model_inputs) -> float:
    """
    Estimate the float-operation count of a Keras model for one sample.

    The model is traced into a concrete function, its variables are frozen
    into constants, and ``tf.compat.v1.profiler`` counts the float operations
    of the resulting graph.

    Args:
        model: A ``tf.keras.Model`` or ``tf.keras.Sequential`` instance.
        model_inputs: Iterable of example inputs. Only ``shape[1:]`` and
            ``dtype`` of each element are used; the leading dimension is
            replaced by a batch size of 1.

    Returns:
        Half of the profiler's ``total_float_ops``, as a float. This is a raw
        operation count: it is NOT scaled by 1e9, so it is not in GFLOPs.
        NOTE(review): the division by 2 presumably counts each multiply-add
        pair as a single operation -- confirm the intended unit.

    Raises:
        ValueError: If ``model`` is neither a ``tf.keras.Model`` nor a
            ``tf.keras.Sequential``.
    """
    if not isinstance(
        model, (tf.keras.models.Sequential, tf.keras.models.Model)
    ):
        raise ValueError(
            "Calculating FLOPS is only supported for "
            "`tf.keras.Model` and `tf.keras.Sequential` instances."
        )

    # Imported locally: this is a private TensorFlow module that is only
    # needed by this helper.
    from tensorflow.python.framework.convert_to_constants import (
        convert_variables_to_constants_v2_as_graph,
    )

    # Profile a single sample: keep every dimension except the first.
    batch_size = 1
    inputs = [
        tf.TensorSpec([batch_size] + inp.shape[1:], inp.dtype)
        for inp in model_inputs
    ]

    # Freeze the traced model so the profiler sees the operations of a
    # plain forward call with constant weights.
    real_model = tf.function(model).get_concrete_function(inputs)
    frozen_func, _ = convert_variables_to_constants_v2_as_graph(real_model)

    # Count float operations only, and suppress the profiler's own report.
    run_meta = tf.compat.v1.RunMetadata()
    opts = (
        tf.compat.v1.profiler.ProfileOptionBuilder(
            tf.compat.v1.profiler.ProfileOptionBuilder.float_operation()
        )
        .with_empty_output()
        .build()
    )

    flops = tf.compat.v1.profiler.profile(
        graph=frozen_func.graph, run_meta=run_meta, cmd="scope", options=opts
    )

    # Clear the v1 default graph once profiling is done.
    tf.compat.v1.reset_default_graph()

    # Raw count halved; no conversion to GFLOPs is applied here.
    return flops.total_float_ops / 2
    
def forward_backward():
    """
    Profile the float operations of the generator including gradient ops.

    Builds ``generator_model(256, dformat="channels_first")`` inside the v1
    default graph, adds the gradients of the model output with respect to its
    trainable weights, and asks ``tf.compat.v1.profiler`` for the total float
    operations of that graph. The total is also printed.

    Returns:
        A ``(for_flop, total_flop)`` tuple. ``for_flop`` is never updated and
        is therefore always 0; ``total_flop`` is the profiler's
        ``total_float_ops`` for the whole graph.
    """
    for_flop = 0
    total_flop = 0
    session = tf.compat.v1.Session()
    graph = tf.compat.v1.get_default_graph()

    try:
        with graph.as_default(), session.as_default():
            # model = tf.keras.applications.ResNet50()  # change your model here
            model = generator_model(256, dformat="channels_first")

            x = tf.constant(np.random.randn(1, 256))

            # The result of tf.gradients is not needed; the call is kept
            # because it adds the gradient ops to the graph being profiled.
            tf.gradients(model([x]), model.trainable_weights)

            run_meta = tf.compat.v1.RunMetadata()
            opts = tf.compat.v1.profiler.ProfileOptionBuilder.float_operation()

            # Profile the graph that now holds the model and gradient ops.
            flops = tf.compat.v1.profiler.profile(
                graph=graph, run_meta=run_meta, cmd='op', options=opts
            )

            total_flop = flops.total_float_ops
            print(total_flop)
    finally:
        # Release the session's resources even if building/profiling fails.
        session.close()

    return for_flop, total_flop
    
    
    
#Usage

# Usage example: build both GAN networks and print the get_flops result for each.
if __name__ =="__main__":
    # Alternative model to profile (disabled):
    #image_model = tf.keras.applications.EfficientNetB0(include_top=False, weights=None)

    # Example generator input: one sample of 256 values (generator_model is built with 256 below).
    x = tf.constant(np.random.randn(1, 256))
    # NOTE(review): `noise` is not used anywhere else in this cell.
    noise = np.random.normal(0, 1, (1, 256)).astype(np.float32)
    # Example discriminator input; the shape matches the (None, 1, 51, 51, 25)
    # input layer shown in the recorded model summary.
    y = tf.constant(np.random.randn(1, 1, 51 , 51, 25))

    #print(x.shape)

    model_g = generator_model(256, dformat="channels_first") #Model(inputs=[latent], outputs=[fake_image], name='Generator')
    model_d = discriminator_model(dformat="channels_first")
    #model.summary()
    # get_flops only reads shape[1:] and dtype from the example inputs.
    print('Generator FLOPS = ', get_flops(model_g,[x]))
    print('Discriminator FLOPS = ', get_flops(model_d,[y]))

    # Forward + backward profiling is available but disabled here.
    #forward_backward()

    #print(get_flops(model, [x]))

Model: "Discriminator_base"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1, 51, 51, 25)]   0         
                                                                 
 conv3d_7 (Conv3D)           (None, 16, 51, 51, 25)    2896      
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 16, 51, 51, 25)    0         
                                                                 
 dropout (Dropout)           (None, 16, 51, 51, 25)    0         
                                                                 
 zero_padding3d_5 (ZeroPaddi  (None, 16, 51, 51, 27)   0         
 ng3D)                                                           
                                                                 
 conv3d_8 (Conv3D)           (None, 8, 47, 46, 22)     23048     
                                                

2023-04-14 11:24:43.788462: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 8
2023-04-14 11:24:43.788634: I tensorflow/core/grappler/clusters/single_machine.cc:358] Starting new session
2023-04-14 11:24:43.839681: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38214 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:0f:00.0, compute capability: 8.0
2023-04-14 11:24:43.841254: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38214 MB memory:  -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:15:00.0, compute capability: 8.0
2023-04-14 11:24:43.842818: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 38214 MB memory:  -> device: 2, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:

Generator FLOPS =  3063004077.5
Discriminator FLOPS =  1896724708.0


2023-04-14 11:24:44.098994: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 8
2023-04-14 11:24:44.099141: I tensorflow/core/grappler/clusters/single_machine.cc:358] Starting new session
2023-04-14 11:24:44.150072: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38214 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:0f:00.0, compute capability: 8.0
2023-04-14 11:24:44.151646: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 38214 MB memory:  -> device: 1, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:15:00.0, compute capability: 8.0
2023-04-14 11:24:44.153212: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 38214 MB memory:  -> device: 2, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:

# Theoretical calculation of FLOPs

$ \mathrm{ConvFLOPs} = 2 \times \mathrm{NumberKernel} \times \mathrm{ShapeKernel} \times \mathrm{OutputShape} $

# Baseline

In [8]:
# Baseline run: one GPU, gan_main.py launched with --profiling and --use_tf32.
import tensorflow as tf
import sys, os
# Expose only GPU 0; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# GPU count as seen by this notebook kernel's TensorFlow (not by the subprocess).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling --use_tf32

Num GPUs Available:  1
True
64
Number of devices: 1
Searching in : /home/datascience/tfrecordsprepro/*.tfrecords
Found 28 files. 
Initialization time is 0.00034737586975097656 seconds
Loading Data
Epoch 1 of 60
Number of Batches:  1952
Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
FLOP =  2151692188712
Average per batch was:  0.7735849380493164
Time taken by batch 6  was 1.0761363506317139 seconds.
Time taken by epoch0 was 39.44963765144348 seconds.

Testing for epoch 0:
(64, 256)
FLOP =  1512471336710
Average per batch was:  0.12301487922668457


# Results baseline

Batch size = 64

Number of GPUs = 1

FLOPS Training = 2151692188712

Time training = 0.77 +- 0.01

FLOPS Testing = 1512471336710

Time Testing = 0.121 +- 0.001

GPU Memory = 17921 MiB

GPU percentage = 100%

GPU Power = between 210 and 310 (Normally around 240-260)


# Precision

## Float32

In [4]:
# Float32 run: same launch as the TF32 baseline but without --use_tf32.
import tensorflow as tf
import sys, os
# Expose only GPU 0; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# GPU count as seen by this notebook kernel's TensorFlow (not by the subprocess).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling 

Num GPUs Available:  1
2023-04-17 09:58:52.455110: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-17 09:58:53.505032: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38214 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:0f:00.0, compute capability: 8.0
2023-04-17 09:58:53.514180: I tensorflow/core/common_runtime/direct_session.cc:370] Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:0f:00.0, compute capability: 8.0

False
64
2023-04-17 09:58:53.540760: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:loc

## Results float32

Batch size = 64

Number of GPUs = 1

FLOPS Training = 2151692188712

Time training = 1.46 +- 0.01

FLOPS Testing = 1512471336710

Time Testing = 0.272 +- 0.001

GPU Memory = 9729 MiB

GPU percentage = 100%

GPU Power = between 210 and 260 (Normally around 250)

## Mixed Float16

In [5]:
# Mixed-precision run: --use_tf32 plus --use_precision 'mixed_float16' on one GPU.
import tensorflow as tf
import sys, os
# Expose only GPU 0; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# GPU count as seen by this notebook kernel's TensorFlow (not by the subprocess).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling --use_tf32 --use_precision 'mixed_float16'

Num GPUs Available:  1
2023-04-17 10:00:10.850275: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-17 10:00:13.349460: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38214 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:0f:00.0, compute capability: 8.0
2023-04-17 10:00:13.359069: I tensorflow/core/common_runtime/direct_session.cc:370] Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:0f:00.0, compute capability: 8.0

True
64
2023-04-17 10:00:13.389124: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:loca

## Results Mixed Float16

Batch size = 64

Number of GPUs = 1

FLOPS Training = 2151692188712

Time training = 0.69 +- 0.01

FLOPS Testing = 1512471336710

Time Testing = 0.085 +- 0.001

GPU Memory = 9729 MiB

GPU percentage = 100%

GPU Power = between 230 and 280 (Normally around 250)

## Mixed BFloat16

In [83]:
# Mixed-precision run: --use_tf32 plus --use_precision 'mixed_bfloat16' on one GPU.
import tensorflow as tf
import sys, os
# Expose only GPU 0; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# GPU count as seen by this notebook kernel's TensorFlow (not by the subprocess).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling --use_tf32 --use_precision 'mixed_bfloat16'

Num GPUs Available:  1
2023-04-12 14:57:23.923602: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-12 14:57:25.937240: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38214 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:0f:00.0, compute capability: 8.0
2023-04-12 14:57:25.946061: I tensorflow/core/common_runtime/direct_session.cc:370] Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: NVIDIA A100-SXM4-40GB, pci bus id: 0000:0f:00.0, compute capability: 8.0

2023-04-12 14:57:25.975026: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/re

## Results Mixed BFloat16

# Batch Size tf32

## 96

In [7]:
# TF32 run with --batchsize 96 on one GPU.
import tensorflow as tf
import sys, os
# Expose only GPU 0; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Silence TensorFlow's native (C++) log messages in the subprocess output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# GPU count as seen by this notebook kernel's TensorFlow (not by the subprocess).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --batchsize 96 --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling --use_tf32

Num GPUs Available:  1
True
96
Number of devices: 1
Searching in : /home/datascience/tfrecordsprepro/*.tfrecords
Found 28 files. 
Initialization time is 0.000438690185546875 seconds
Loading Data
Epoch 1 of 60
Number of Batches:  1301
Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
FLOP =  3227529574280
Average per batch was:  1.2198906898498536
Time taken by batch 6  was 1.5227434635162354 seconds.
Time taken by epoch0 was 51.18376302719116 seconds.

Testing for epoch 0:
(96, 256)
FLOP =  2268707005062
Average per batch was:  0.18020858764648437


17921

## 128

In [16]:
# TF32 run with --batchsize 128 on one GPU.
import tensorflow as tf
import sys, os
# Expose only GPU 0; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Silence TensorFlow's native (C++) log messages in the subprocess output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# GPU count as seen by this notebook kernel's TensorFlow (not by the subprocess).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --batchsize 128 --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling --use_tf32

Num GPUs Available:  1
True
128
Number of devices: 1
Searching in : /home/datascience/tfrecordsprepro/*.tfrecords
Found 28 files. 
Initialization time is 0.0003972053527832031 seconds
Loading Data
Epoch 1 of 60
Number of Batches:  976
Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
FLOP =  4303366959848
Average per batch was:  1.567598009109497
Time taken by batch 6  was 1.8706345558166504 seconds.
Time taken by epoch0 was 64.55832934379578 seconds.

Testing for epoch 0:
(128, 256)
FLOP =  3024942673414
Average per batch was:  0.24149599075317382


17921 / 34305

## 256

In [15]:
# TF32 run with --batchsize 256 on one GPU.
import tensorflow as tf
import sys, os
# Expose only GPU 0; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Silence TensorFlow's native (C++) log messages in the subprocess output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# GPU count as seen by this notebook kernel's TensorFlow (not by the subprocess).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --batchsize 256 --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling --use_tf32

Num GPUs Available:  1
True
256
Number of devices: 1
Searching in : /home/datascience/tfrecordsprepro/*.tfrecords
Found 28 files. 
Initialization time is 0.00036644935607910156 seconds
Loading Data
Epoch 1 of 60
Number of Batches:  488
Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
FLOP =  8606716502120
Average per batch was:  3.1962491989135744
Time taken by batch 6  was 3.4644761085510254 seconds.
Time taken by epoch0 was 108.58399653434753 seconds.

Testing for epoch 0:
(256, 256)
FLOP =  6049885346822
Average per batch was:  0.4699376583099365


34305

## 512

In [14]:
# TF32 run with --batchsize 512 on one GPU.
# NOTE: the recorded run of this cell ended in a ResourceExhaustedError.
import tensorflow as tf
import sys, os
# Expose only GPU 0; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Silence TensorFlow's native (C++) log messages in the subprocess output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# GPU count as seen by this notebook kernel's TensorFlow (not by the subprocess).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --batchsize 512 --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling --use_tf32

Num GPUs Available:  1
True
512
Number of devices: 1
Searching in : /home/datascience/tfrecordsprepro/*.tfrecords
Found 28 files. 
Initialization time is 0.00033164024353027344 seconds
Loading Data
Epoch 1 of 60
Number of Batches:  244
Traceback (most recent call last):
  File "gan_main.py", line 781, in <module>
    main_gan()
  File "gan_main.py", line 391, in main_gan
    real_batch_loss, fake_batch_loss, gen_losses = distributed_train_step(
  File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/tensorflow/python/eager/execute.py", line 54, in quick_execute
    tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.ResourceExhaustedError: Graph execution error:

Detected at node 'gradient_tape/Disc

# Batch Size float32

## 96

In [17]:
# Float32 run (no --use_tf32) with --batchsize 96 on one GPU.
import tensorflow as tf
import sys, os
# Expose only GPU 0; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Silence TensorFlow's native (C++) log messages in the subprocess output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# GPU count as seen by this notebook kernel's TensorFlow (not by the subprocess).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --batchsize 96 --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling

Num GPUs Available:  1
False
96
Number of devices: 1
Searching in : /home/datascience/tfrecordsprepro/*.tfrecords
Found 28 files. 
Initialization time is 0.0004143714904785156 seconds
Loading Data
Epoch 1 of 60
Number of Batches:  1301
Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
FLOP =  3227529574280
Average per batch was:  2.1983806610107424
Time taken by batch 6  was 2.5031728744506836 seconds.
Time taken by epoch0 was 46.40627479553223 seconds.

Testing for epoch 0:
(96, 256)
FLOP =  2268707005062
Average per batch was:  0.4226221561431885


17921

## 128

In [65]:
# Float32 run (no --use_tf32) with --batchsize 128 on one GPU.
import tensorflow as tf
import sys, os
# Expose only GPU 0; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Silence TensorFlow's native (C++) log messages in the subprocess output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# GPU count as seen by this notebook kernel's TensorFlow (not by the subprocess).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --batchsize 128 --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling

Num GPUs Available:  1
False
128
Number of devices: 1
Searching in : /home/datascience/tfrecordsprepro/*.tfrecords
Found 28 files. 
Initialization time is 0.0003814697265625 seconds
Loading Data
Epoch 1 of 60
Number of Batches:  976
Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
FLOP =  4303366959848
Average per batch was:  2.8210868358612062
Time taken by batch 6  was 3.1255416870117188 seconds.
Time taken by epoch0 was 55.360952615737915 seconds.

Testing for epoch 0:
(128, 256)
FLOP =  3024942673414
Average per batch was:  0.5526898860931396


17921

## 256

In [66]:
# Float32 run (no --use_tf32) with --batchsize 256 on one GPU.
import tensorflow as tf
import sys, os
# Expose only GPU 0; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Silence TensorFlow's native (C++) log messages in the subprocess output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# GPU count as seen by this notebook kernel's TensorFlow (not by the subprocess).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --batchsize 256 --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling

Num GPUs Available:  1
False
256
Number of devices: 1
Searching in : /home/datascience/tfrecordsprepro/*.tfrecords
Found 28 files. 
Initialization time is 0.0003867149353027344 seconds
Loading Data
Epoch 1 of 60
Number of Batches:  488
Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
FLOP =  8606716502120
Average per batch was:  5.5624189376831055
Time taken by batch 6  was 5.854866981506348 seconds.
Time taken by epoch0 was 93.62191557884216 seconds.

Testing for epoch 0:
(256, 256)
FLOP =  6049885346822
Average per batch was:  1.0930707454681396


34305

## 512

In [67]:
# Float32 run (no --use_tf32) with --batchsize 512 on one GPU.
# NOTE: the recorded run of this cell ended in a ResourceExhaustedError.
import tensorflow as tf
import sys, os
# Expose only GPU 0; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
# Silence TensorFlow's native (C++) log messages in the subprocess output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# GPU count as seen by this notebook kernel's TensorFlow (not by the subprocess).
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --batchsize 512 --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling

Num GPUs Available:  1
False
512
Number of devices: 1
Searching in : /home/datascience/tfrecordsprepro/*.tfrecords
Found 28 files. 
Initialization time is 0.00038051605224609375 seconds
Loading Data
Epoch 1 of 60
Number of Batches:  244
Traceback (most recent call last):
  File "gan_main.py", line 781, in <module>
    main_gan()
  File "gan_main.py", line 391, in main_gan
    real_batch_loss, fake_batch_loss, gen_losses = distributed_train_step(
  File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/home/datascience/conda/tensorflow28_p38_gpu_v1/lib/python3.8/site-packages/tensorflow/python/eager/execute.py", line 54, in quick_execute
    tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.ResourceExhaustedError: Graph execution error:

Detected at node 'gradient_tape/Dis

# Number of GPUs

## 2

In [69]:
# TF32 run with --batchsize 64 on two GPUs (devices 0 and 1).
import tensorflow as tf
import sys, os
# Expose GPUs 0 and 1; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
# Silence TensorFlow's native (C++) log messages in the subprocess output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): the recorded output prints 1 here while gan_main.py reports
# "Number of devices: 2". This count comes from the notebook kernel's
# TensorFlow, which presumably fixed its device list before
# CUDA_VISIBLE_DEVICES was changed -- confirm before relying on it.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --batchsize 64 --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling --use_tf32

Num GPUs Available:  1
True
64
Number of devices: 2
Searching in : /home/datascience/tfrecordsprepro/*.tfrecords
Found 28 files. 
Initialization time is 0.00038123130798339844 seconds
Loading Data
Epoch 1 of 60
Number of Batches:  976
Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
FLOP =  4303384377418
Average per batch was:  0.7977048873901367
Time taken by batch 6  was 1.5197126865386963 seconds.
Time taken by epoch0 was 61.18161749839783 seconds.

Testing for epoch 0:
(64, 256)
(64, 256)
FLOP =  3024942673428
Average per batch was:  0.12733020782470703


## 4

In [72]:
# TF32 run with --batchsize 64 on four GPUs (devices 0-3).
import tensorflow as tf
import sys, os
# Expose GPUs 0-3; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"
# Silence TensorFlow's native (C++) log messages in the subprocess output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): the recorded output prints 1 here while gan_main.py reports
# "Number of devices: 4". This count comes from the notebook kernel's
# TensorFlow, which presumably fixed its device list before
# CUDA_VISIBLE_DEVICES was changed -- confirm before relying on it.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --batchsize 64 --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling --use_tf32

Num GPUs Available:  1
True
64
Number of devices: 4
Searching in : /home/datascience/tfrecordsprepro/*.tfrecords
Found 28 files. 
Initialization time is 0.0003292560577392578 seconds
Loading Data
Epoch 1 of 60
Number of Batches:  488
Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
FLOP =  8606768754878
Average per batch was:  0.8313645362854004
Time taken by batch 6  was 2.3093295097351074 seconds.
Time taken by epoch0 was 99.28882384300232 seconds.

Testing for epoch 0:
(64, 256)
(64, 256)
(64, 256)
(64, 256)
FLOP =  6049885346864
Average per batch was:  0.12891016006469727


## 8

In [71]:
# TF32 run with --batchsize 64 on eight GPUs (devices 0-7).
import tensorflow as tf
import sys, os
# Expose GPUs 0-7; the `!python3` subprocess below inherits this environment.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3,4,5,6,7"
# Silence TensorFlow's native (C++) log messages in the subprocess output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): the recorded output prints 1 here while gan_main.py reports
# "Number of devices: 8". This count comes from the notebook kernel's
# TensorFlow, which presumably fixed its device list before
# CUDA_VISIBLE_DEVICES was changed -- confirm before relying on it.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!python3 gan_main.py --batchsize 64 --datapath '/home/datascience/tfrecordsprepro/*.tfrecords' --outpath './' --profiling --use_tf32

Num GPUs Available:  1
True
64
Number of devices: 8
Searching in : /home/datascience/tfrecordsprepro/*.tfrecords
Found 28 files. 
Initialization time is 0.0003495216369628906 seconds
Loading Data
Epoch 1 of 60
Number of Batches:  244
Instructions for updating:
Use `tf.compat.v1.graph_util.tensor_shape_from_node_def_name`
FLOP =  17213537509990
Average per batch was:  0.8756521701812744
Time taken by batch 6  was 3.901090383529663 seconds.
Time taken by epoch0 was 172.69893503189087 seconds.

Testing for epoch 0:
(64, 256)
(64, 256)
(64, 256)
(64, 256)
(64, 256)
(64, 256)
(64, 256)
(64, 256)
FLOP =  12099770693736
Average per batch was:  0.13445558547973632


In [14]:
!python gan_main.py -h

usage: gan_main.py [-h] [--multi_node MULTI_NODE]
                   [--workers WORKERS [WORKERS ...]] [--index INDEX]
                   [--use_gs USE_GS] [--datapath DATAPATH] [--outpath OUTPATH]
                   [--nbepochs NBEPOCHS] [--batchsize BATCHSIZE]
                   [--use_gpus USE_GPUS]
                   [--GLOBAL_BATCH_SIZE GLOBAL_BATCH_SIZE]
                   [--nb_epochs NB_EPOCHS] [--batch_size BATCH_SIZE]
                   [--latent_size LATENT_SIZE] [--verbose VERBOSE]
                   [--nEvents NEVENTS] [--ascale ASCALE] [--yscale YSCALE]
                   [--xscale XSCALE] [--xpower XPOWER] [--angscale ANGSCALE]
                   [--analyse ANALYSE] [--dformat DFORMAT] [--thresh THRESH]
                   [--angtype ANGTYPE] [--particle PARTICLE] [--warm WARM]
                   [--lr LR] [--events_per_file EVENTS_PER_FILE] [--name NAME]
                   [--g_weights G_WEIGHTS] [--d_weights D_WEIGHTS]
                   [--tlab TLAB] [--profiling] [--u

In [57]:
# Sanity check of the dtypes a Keras layer uses under the mixed_bfloat16 policy.
import tensorflow as tf
# Turn off TF32 execution, then make mixed_bfloat16 the global Keras policy.
tf.config.experimental.enable_tensor_float_32_execution(False)
tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')

# The layer is created after the policy change; the recorded output shows
# compute dtype bfloat16 and variable dtype float32.
layer = tf.keras.layers.Conv2D(filters=4, kernel_size=2)
print(layer.compute_dtype)
print(layer.variable_dtype)

# Confirm the TF32 switch took effect (recorded output: False).
print(tf.config.experimental.tensor_float_32_execution_enabled())


bfloat16
float32
False


In [None]:
class Args:
    # Plain attribute container; every setting is a class-level default.
    data = './data/penn'   # data location
    model = 'LSTM'         # model identifier
    emsize = 200
    nhid = 200


# Single shared settings object used by the rest of the notebook.
args = Args()

In [63]:
import pandas as pd

# Summarize GPU power draw from an nvidia-smi CSV log, looking only at
# samples taken while the GPU was busy.
#
# Expected layout (per the parsing below): column 0 is "power.draw [W]" and
# column 1 is "utilization.gpu [%]", each value carrying a two-character unit
# suffix that is stripped before conversion.
# NOTE(review): this assumes the log was written WITH units -- confirm against
# the nvidia-smi command that produced the file.
GPU_STATS_CSV = '/home/datascience/gpu_stats/tf32_bs96.csv'
MIN_UTILIZATION = 90  # percent; samples below this are ignored

df = pd.read_csv(GPU_STATS_CSV, header=None)

# The file is read without a header, so the first row holds the column titles.
samples = df.iloc[1:]

power_values = [
    float(power[:-2])
    for power, utilization in zip(samples[0], samples[1])
    if int(utilization[:-2]) >= MIN_UTILIZATION
]

if power_values:
    print(max(power_values))
    print(min(power_values))
    print(sum(power_values)/len(power_values))
else:
    # Avoid max()/min() on an empty list and a division by zero.
    print(f"No samples with GPU utilization >= {MIN_UTILIZATION}% in {GPU_STATS_CSV}")
            

304.44
75.02
224.8809090909091
