In [None]:
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
import numpy as np

In [None]:
# Memory figures below are byte counts: the plotting cells divide them by
# 2**20 and label the axis in MiB.

# CIFAR-100 run
cifar_predict_mem = 25_146_949_120
cifar_measure_mem = 23_366_852_608
cifar_measure_bs = 13_466  # presumably the batch size used for the measurement -- confirm

# ImageNet run
imagenet_predict_mem = 25_120_424_448
imagenet_measure_mem = 24_362_376_704
imagenet_measure_bs = 859  # presumably the batch size used for the measurement -- confirm

# Memory ceiling; drawn as the red dashed reference line in the plots.
mem_max = 25_147_867_136

In [None]:
# Relative error of the prediction with respect to the measurement:
# (predicted - measured) / measured. A positive value means the prediction
# is larger than the measured usage.
cifar_gap = cifar_predict_mem - cifar_measure_mem
imagenet_gap = imagenet_predict_mem - imagenet_measure_mem

cifar_error = cifar_gap / cifar_measure_mem
imagenet_error = imagenet_gap / imagenet_measure_mem

# The ' .3f' format spec (note the space flag) reserves a sign column, so
# positive values are printed with one extra leading space.
print(
    f'cifar relative error: {cifar_error: .3f}',
    f'imagenet relative error: {imagenet_error: .3f}',
    sep='\n',
)

In [None]:
# Grouped bar chart of predicted vs. measured memory usage per dataset, with
# the memory ceiling (mem_max) drawn as a dashed reference line.
species = ['CIFAR-100', 'ImageNet']
# Inputs are byte counts; dividing by 2**20 converts them to MiB for the y-axis.
mem = {
    'prediction': [i / 2**20 for i in [cifar_predict_mem, imagenet_predict_mem]],
    'measurement': [i / 2**20 for i in [cifar_measure_mem, imagenet_measure_mem]],
}
x = np.arange(len(species))
width = 0.25

fig, ax = plt.subplots()

# One bar per series inside each dataset group; every further series is
# shifted right by one bar width.
for series_idx, (attribute, value) in enumerate(mem.items()):
    offset = width * series_idx
    rects = ax.bar(x + offset, value, width, label=attribute)
    ax.bar_label(rects)

GPU_MiB = mem_max / 2**20
ax.axhline(y=GPU_MiB, color='red', linestyle='--')
# Blended transform: x uses the transform of the first y tick label, y uses
# data coordinates, so the red value label is placed at the height of the line.
trans = transforms.blended_transform_factory(ax.get_yticklabels()[0].get_transform(), ax.transData)
ax.text(0, GPU_MiB, f"{GPU_MiB:.0f}", color="red", transform=trans, ha="right", va="center")

ax.set_ylabel('Memory Usage (MiB)')
ax.set_xlabel('Dataset')
ax.set_title('Memory Usage for Training ResNet-18')
# Centre each tick under its group of bars. The centre depends on the number
# of series (len(mem)), not on the number of datasets: the first bar is centred
# at x and the last at x + width * (len(mem) - 1).
ax.set_xticks(x + width * (len(mem) - 1) / 2, species)
ax.legend()

fig.savefig('mem_usage_gap.png')

In [None]:
# Print version of the memory-usage figure: rendered at 300 dpi, without a
# title, and saved with a transparent background.
species = ['CIFAR-100', 'ImageNet']
# Inputs are byte counts; dividing by 2**20 converts them to MiB for the y-axis.
mem = {
    'prediction': [i / 2**20 for i in [cifar_predict_mem, imagenet_predict_mem]],
    'measurement': [i / 2**20 for i in [cifar_measure_mem, imagenet_measure_mem]],
}
x = np.arange(len(species))
width = 0.25

fig, ax = plt.subplots(dpi=300)

# One bar per series inside each dataset group; every further series is
# shifted right by one bar width.
for series_idx, (attribute, value) in enumerate(mem.items()):
    offset = width * series_idx
    rects = ax.bar(x + offset, value, width, label=attribute)
    ax.bar_label(rects)

GPU_MiB = mem_max / 2**20
ax.axhline(y=GPU_MiB, color='red', linestyle='--')
# Blended transform: x uses the transform of the first y tick label, y uses
# data coordinates, so the red value label is placed at the height of the line.
trans = transforms.blended_transform_factory(ax.get_yticklabels()[0].get_transform(), ax.transData)
ax.text(0, GPU_MiB, f"{GPU_MiB:.0f}", color="red", transform=trans, ha="right", va="center")

ax.set_ylabel('Memory Usage (MiB)')
ax.set_xlabel('Dataset')
# No title is set on the print version of the figure.
# Centre each tick under its group of bars. The centre depends on the number
# of series (len(mem)), not on the number of datasets: the first bar is centred
# at x and the last at x + width * (len(mem) - 1).
ax.set_xticks(x + width * (len(mem) - 1) / 2, species)
ax.legend()

# NOTE(review): 'mem_usage_gap.png' is also the file name used by the titled,
# non-transparent version of this figure saved earlier in the notebook, so
# running this cell overwrites that file -- confirm this is intended.
fig.savefig('mem_usage_gap.png', transparent=True)

Dear Professor Nguyen,

I apologize for writing to you again.
As I have not received a response in about a week, I am unsure whether my previous email reached you.
Therefore, I am resending it below.

In our previous meeting, we discussed how to automate the configuration of training settings.
In my work, the maximum batch size $B_{max}$ currently requires manual input from the user.
You suggested that I explore a method to automate this parameter.

To address this, I propose integrating a program to calculate $B_{max}$ before training begins.
As we know, PyTorch will crash with an out-of-memory error if training tries to allocate more GPU memory than is currently free.
In practice, users often determine $B_{max}$ through trial and error, which is also the approach I followed in my paper.
After careful consideration, I believe it is feasible to automate $B_{max}$ by profiling memory usage.

Assuming that memory usage scales linearly with batch size $B$, we can compute the approximate total memory usage $M(B)$ using the following equation:
$$M(B) = P + B \times (A \times L)$$
Here, $P$ is the memory required for model parameters, $A$ is the memory used for activations per sample per layer, and $L$ is the number of layers in the model.
Our goal is to find the maximum batch size $B_{max}$ such that $M(B_{max}) < M_{GPU}$, where $M_{GPU}$ is the total GPU memory.
To achieve this, we can measure memory usage by running two batches with different sizes and calculating the memory usage gap.
Using this gap, we can estimate the memory usage per sample, infer $M(B_{max})$, and determine $B_{max}$ with minimal additional computational overhead.

I conducted an experiment to validate this approach, and the results of memory usage are shown in the attached figure.
The discrepancy between predicted and measured values is likely due to PyTorch's memory management mechanisms.
As the measured values are consistently lower than the predicted ones, the computed batch size $B_{comp}$ is smaller than the actual maximum batch size $B_{max}$.
Although $B_{comp}$ is slightly smaller than the optimal value, it is a conservative and practical choice.
Based on this observation, we can reasonably conclude that $B_{comp}$ is both reliable and applicable in practice.

I hope this approach meets your expectations, and I would greatly appreciate any further suggestions you may have.
Thank you for your time and consideration.

Sincerely,
Kuan-Wei Lu