In [1]:
# cuDF - GPU DataFrame
# cudf is a GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.
# The cudf.pandas extension is loaded here, ahead of the `import pandas` in the next cell.
# NOTE(review): presumably this ordering is what lets pandas calls be GPU-accelerated - confirm against the cuDF docs.
%load_ext cudf.pandas

#=============================
# Autoreload: pick up edits to external Python modules without restarting the kernel
%load_ext autoreload
%autoreload 2
#=============================

# Reduce TensorFlow's native log output; set before TensorFlow is imported in the next cell.
# NOTE(review): level '2' is expected to hide INFO and WARNING messages only - confirm.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [2]:
# Importing libraries
#=============================
## pip install gputil
## pip install tensorflow[and-cuda]

import tensorflow as tf
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback
import GPUtil
import pandas as pd
import os
import datetime
import requests
from bs4 import BeautifulSoup
import subprocess

# Schema of the GPU telemetry table. `data` accumulates the rows (plain lists
# in this column order) that are later turned into a DataFrame.
columns = [
    'Epoch',
    'FreeMemoryMB',
    'GPUUtilization',
    'Temperature',
    'FanSpeed',
]
data = list()

def get_gpu_fan_speed(gpu_index=None, timeout=10):
    """Query the GPU fan speed through ``nvidia-smi``.

    Args:
        gpu_index: Optional index of a single GPU to query. When ``None``
            (the default) every GPU is queried and the result holds one
            line per GPU.
        timeout: Seconds to wait for ``nvidia-smi`` before giving up, so a
            hung driver tool cannot block the caller indefinitely.

    Returns:
        The stripped ``nvidia-smi`` output as a string (fan speed without
        units), or a string starting with ``"Error: "`` if the command could
        not be run, failed, or timed out. The function never raises.
    """
    command = ['nvidia-smi', '--query-gpu=fan.speed', '--format=csv,noheader,nounits']
    if gpu_index is not None:
        # Restrict the query to one device instead of listing all of them.
        command.append(f'--id={gpu_index}')
    try:
        return subprocess.check_output(command, text=True, timeout=timeout).strip()
    except Exception as e:
        # Best-effort telemetry: report the failure in-band rather than
        # aborting whatever is being monitored.
        return f"Error: {str(e)}"

class GPUMonitor(Callback):
    """Keras callback that records GPU telemetry at the end of every epoch.

    For each GPU reported by ``GPUtil.getGPUs()`` one row
    ``[epoch, free_memory, utilization_percent, temperature, fan_speed]``
    is appended to the module-level ``data`` list.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Append one telemetry row per GPU for the epoch that just finished.

        Args:
            epoch: Epoch index passed in by Keras.
            logs: Metrics dict passed in by Keras; not used here.
        """
        gpus = GPUtil.getGPUs()
        if not gpus:
            return

        # Query the fan speed once per epoch instead of once per GPU: the
        # command reports every GPU in a single call, one line each.
        fan_report = get_gpu_fan_speed()
        fan_speeds = [line.strip() for line in fan_report.splitlines()]
        if fan_report.startswith("Error:") or len(fan_speeds) != len(gpus):
            # Cannot map lines to devices (error text or unexpected output),
            # so store the raw report for every GPU.
            fan_speeds = [fan_report] * len(gpus)

        # NOTE(review): assumes GPUtil lists GPUs in the same order as
        # nvidia-smi - confirm on multi-GPU hosts.
        for gpu, fan_speed in zip(gpus, fan_speeds):
            free_memory = gpu.memoryFree
            utilization = gpu.load * 100  # GPUtil load is a fraction; store percent
            temperature = gpu.temperature
            data.append([epoch, free_memory, utilization, temperature, fan_speed])

# Load the MNIST dataset
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Normalize the pixel values from [0, 255] to [0, 1]
x_train, x_test = x_train / 255.0, x_test / 255.0

# Build a Sequential model with 30 hidden layers (deliberately deep to load the GPU)
model = Sequential([
    Flatten(input_shape=(28, 28))
] + [Dense(128, activation='relu') for _ in range(30)] + [Dense(10, activation='softmax')])

# Compile the model
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model for 100 epochs with GPU Monitoring
gpu_monitor = GPUMonitor()
history = model.fit(x_train, y_train, epochs=100, validation_split=0.2, callbacks=[gpu_monitor])

# Evaluate the model
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2)
print('\nTest accuracy:', test_acc)

# Report the TensorFlow, cuDNN, CUDA and NVIDIA driver versions.
# The CUDA/cuDNN versions come from TensorFlow's build info; `.get` is used
# because these keys may be missing (e.g. on a build without CUDA).
tf_version = tf.__version__
build_info = tf.sysconfig.get_build_info()
cuda_version = build_info.get('cuda_version', 'not available')
cudnn_version = build_info.get('cudnn_version', 'not available')
# nvidia-smi's `driver_version` field is the driver release, not the CUDA version,
# so it is reported under its own label.
driver_version = os.popen('nvidia-smi --query-gpu=driver_version --format=csv,noheader,nounits').read().strip()
print(f'TensorFlow version: {tf_version}')
print(f'CuDNN version: {cudnn_version}')
print(f'CUDA version: {cuda_version}')
print(f'NVIDIA driver version: {driver_version}')
# Search for updated versions of libraries and drivers
def get_latest_version(package_name, timeout=10):
    """Look up the latest released version of a project on PyPI.

    Uses the PyPI JSON API (``/pypi/<project>/json``) rather than scraping
    the HTML project page, whose markup is not a stable interface.

    Args:
        package_name: PyPI project name, e.g. ``"tensorflow"``.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The latest version string, or a string starting with ``"Error: "``
        if the request fails, the project does not exist, or the response
        has an unexpected shape. The function never raises.
    """
    try:
        url = f"https://pypi.org/pypi/{package_name}/json"
        response = requests.get(url, timeout=timeout)
        # Turn 404s (unknown project) and other HTTP failures into an error
        # string instead of trying to parse an error page.
        response.raise_for_status()
        latest_version = response.json()["info"]["version"]
        return latest_version
    except Exception as e:
        return f"Error: {str(e)}"

# Names passed to get_latest_version(), one lookup per entry.
# NOTE(review): several entries ('pytorch', 'cuda', 'cudnn', 'nvidia-drivers')
# may not be the PyPI project names of the intended software - verify.
packages_to_check = [
    'tensorflow',
    'pytorch',
    'keras',
    'cuda',
    'cudnn',
    'tensorrt',
    'nvidia-drivers',
]

for package in packages_to_check:
    latest_version = get_latest_version(package)
    print(f'Latest {package} version: {latest_version}')

# Assemble the rows collected during training into a table
df = pd.DataFrame(data=data, columns=columns)

# Persist the table as a CSV whose name carries the current local timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
report_filename = f'TF_Stress_Test_{timestamp}.csv'
df.to_csv(report_filename, index=False)

# Tell the user where the report went
print(f'Report saved as: {report_filename}')


2023-12-15 04:41:11.683514: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-15 04:41:11.683551: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-15 04:41:11.684035: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Epoch 1/100


I0000 00:00:1702590075.096424   53260 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
 182/1500 [==>...........................] - ETA: 8s - loss: 0.4702 - accuracy: 0.8858