In [1]:
import os

import pandas as pd
from tqdm.notebook import tqdm_notebook
os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["HF_HOME"] = "D:/models/Huggingface"
from max_batch_size_calculation import get_example_prompt
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoConfig
import torch
from utils import clear_cuda
import logging
from math import floor

In [2]:
model_name = 'meta-llama/Llama-3.1-8B-Instruct'

In [3]:
torch.cuda.set_per_process_memory_fraction(1., 0)

In [4]:
device_memory = torch.cuda.get_device_properties(0).total_memory
print(f'Available GPU Memory: {device_memory/(1024 ** 3):.4f} GB')

Available GPU Memory: 23.9878 GB


In [5]:
# 1. Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    r"D:\models\Huggingface\hub\models--meta-llama--Llama-3.1-8B-Instruct"
)

# 2. Load model with FlashAttention-2 enabled
model = AutoModelForCausalLM.from_pretrained(
    r"D:\models\Huggingface\hub\models--meta-llama--Llama-3.1-8B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="cuda:0"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
tokenizer.model_max_length

131072

In [7]:
logging.basicConfig(level=logging.WARNING)

In [5]:
num_input_tokens: int = 1024 - 256
num_output_tokens: int = 256
gpu_max_memory: float = device_memory
cuda_overhead: int|float = 0 # 1e9 is equivalent to 1 GB
flash_attn: bool = False

In [9]:
1024 - 256

768

In [10]:
input_lengths  = [1024,2048,4096,8192, 1537, 2347, 5139, 7133]
output_lengths = [128, 256, 512, 1024, 2048, 158, 289, 548, 1132, 2079]

In [11]:
seq_len_output_len_pairs = [(i, o) for i in input_lengths for o in output_lengths]

In [10]:
attention_implementation_mapping = {
    "google/gemma-2-9b-it": ("GQA", 2),
    "meta-llama/Llama-3.1-8B-Instruct": ("GQA", 8),
}

parameter_number_mapping = {
    "meta-llama/Llama-3.1-8B-Instruct": 8.03 * 10**9,
    "google/gemma-2-9b-it": 9.24 * 10**9,
}
def get_vram_relevant_model_information(
    model_name: str,
) -> tuple[int, int, int, str, int, int, int]:
    """
    Retrieves relevant model information regarding VRAM usage for a given model.
    This function extracts details such as the total number of parameters,
    head dimensions, hidden size, number of layers, attention type,
    and the number of groups for the specified model.

    :param model_name: Name of the pre-trained model.
    :return: A tuple containing the number of parameters, head dimensions,
             hidden size, number of layers, attention type, number of groups, d_ffn and num_heads.
    """
    config = AutoConfig.from_pretrained(model_name)
    num_params = parameter_number_mapping[model_name]
    hidden_size = config.hidden_size
    num_layers = config.num_hidden_layers
    attention_type, num_groups = attention_implementation_mapping[model_name]
    d_ffn = config.intermediate_size
    num_heads = config.num_attention_heads
    return (
        num_params,
        hidden_size,
        num_layers,
        attention_type,
        num_groups,
        d_ffn,
        num_heads,
    )
def calculate_kv_cache_size(num_layers:int, sequence_length:int, head_dimension:int,n_groups:int, k_precision:int, v_precision:int):
    """

    :param num_layers:
    :param sequence_length:
    :param head_dimension:
    :param n_groups:
    :param k_precision:
    :param v_precision:
    :return:
    """
    # head_dimension * n_groups = group dimension
    return num_layers * sequence_length * head_dimension * n_groups * (k_precision + v_precision)


def calculate_activation_memory(sequence_length:int,ffn_dim:int, precision:int) -> int:
    """

    :param sequence_length:
    :param ffn_dim:
    :return:
    """
    # 9 is for 9 Flops per Swish Activation (1 for the multiplication, 8 for the sigmoid)
    return 9 * sequence_length * ffn_dim * precision

def calculate_theoretical_batchsize(
    num_parameters: int,
    hidden_size: int,
    maximum_memory: int,
    max_num_input_tokens,
    max_num_output_tokens: int,
    num_layers: int,
    d_ffn:int,
    n_heads:int,
    precision: int = 16,
    gqa_groups: int = 1,
    cuda_overhead: int = 8e9,
    **kwargs,
) -> int:
    """
    Calculate the theoretical batch size for a model inference operation given system and
    model constraints such as available memory, model parameters and implementation details.

    :param num_parameters:
        Total number of parameters in the model.

    :param hidden_size:
        Dimension of the hidden layers used in the model.

    :param head_dimension:
        Dimension of each attention head in the model.

    :param maximum_memory:
        Total available memory for the operation, measured in bytes.

    :param max_num_input_tokens:
        Maximum number of input tokens the model can process during inference.

    :param max_num_output_tokens:
        Maximum number of output tokens the model can produce during inference.

    :param num_layers:
        Total number of layers in the transformer model.

    :param use_flash_attn:
        Whether to use flash attention for reducing memory usage.

    :param precision:
        Precision in bits for both model parameters and computation (e.g., 16 for FP16).

    :param attn_implementation:
        Attention implementation strategy used in the model. Possible values are:
        - "MHA" for Multi-Headed Attention,
        - "MQA" for Multi-Query Attention,
        - Other grouped-query attention (GQA) variations.

    :param gqa_groups:
        Number of groups to use in grouped-query attention (GQA) setup.

    :param cuda_overhead:
        Memory overhead in bytes due to CUDA-specific artifacts, such as driver buffers and
        workspace fragmentation during operation.

    :param kwargs:
        Additional keyword arguments, including:
        - k_precision: Precision for Key (K) attention cache values, measured in bits.
        - v_precision: Precision for Value (V) attention cache values, measured in bits.

    :return:
        Theoretical maximum batch size for the inference operation.
    """
    byte_constant: int = 8  # Number of bits per byte
    bytes_per_parameter = precision / byte_constant
    model_weight_size = num_parameters * bytes_per_parameter
    context_length = max_num_input_tokens + max_num_output_tokens
    head_size = hidden_size//n_heads

    if (k_precision := kwargs.get("k_precision")) is None:
        k_bytes = bytes_per_parameter
    else:
        k_bytes = k_precision / byte_constant
    if (v_precision := kwargs.get("v_precision")) is None:
        v_bytes = bytes_per_parameter
    else:
        v_bytes = v_precision / byte_constant

    kv_cache_size = calculate_kv_cache_size(num_layers, context_length, head_size, gqa_groups,k_bytes, v_bytes)

    # Activation memory during inference
    # Flash attention significantly reduces memory footprint


    # During inference, activations are much smaller than during training
    activation_memory_inference = calculate_activation_memory(context_length, d_ffn, bytes_per_parameter)
    # subtract overhead from memory
    usable_memory = maximum_memory - cuda_overhead
    # calculate possible batchsize with maximum memory
    return floor(
        (usable_memory - model_weight_size)
        / (kv_cache_size + activation_memory_inference)
    )


def calculate_theoretical_max_batchsize(
    model_name: str,
    gpu_max_memory: int | float,
    max_input_tokens: int,
    max_output_tokens: int,
    use_flash_attn=True,
    precision: int = 16,
    k_precision=None,
    v_precision=None,
    cuda_overhead: int = 8e9,
) -> int:
    """
    Calculates the theoretical maximum batch size that can fit into the GPU memory based
    on the model's specifications and other parameters.

    This function uses various model and configuration details, such as the number of
    parameters, hidden dimensions, memory constraints, token limits and precision,
    to compute the highest possible batch size that can be handled without exceeding
    the available GPU memory.

    :param model_name: The name of the model for which the batch size will be calculated.
    :param gpu_max_memory: The maximum GPU memory available, in bytes. Accepts int or float.
    :param max_input_tokens: The maximum number of input tokens per sample.
    :param max_output_tokens: The maximum number of output tokens per sample.
    :param use_flash_attn: A flag indicating whether to use flash attention for reduced memory usage. Default is True.
    :param precision: The numerical precision (e.g., 16-bit or 32-bit) to be used
        while performing calculations. Defaults to 16.
    :param k_precision: The precision for the keys in attention projection. Defaults to None.
    :param v_precision: The precision for the values in attention projection. Defaults to None.
    :param cuda_overhead: Estimated CUDA memory overhead in bytes, which accounts
        for additional memory consumed by CUDA processes. Defaults to 8e9 (1 GB).
    :return: The maximum theoretical batch size that can fit into available GPU memory.
    """
    num_params, hidden_size, num_layers, attention_type, num_groups, d_ffn, num_heads= get_vram_relevant_model_information(model_name)
    return calculate_theoretical_batchsize(
        num_params,
        hidden_size,
        gpu_max_memory,
        max_input_tokens,
        max_output_tokens,
        num_layers,
        d_ffn,
        num_heads,
        use_flash_attn=use_flash_attn,
        precision=precision,
        k_precision=k_precision,
        v_precision=v_precision,
        attn_implementation=attention_type,
        gqa_groups=num_groups,
        cuda_overhead=cuda_overhead,
    )


In [13]:
example_prompt = get_example_prompt(model_name, num_input_tokens)

In [14]:
max_bs = calculate_theoretical_max_batchsize(model_name, gpu_max_memory, num_input_tokens, num_output_tokens, cuda_overhead=cuda_overhead, use_flash_attn=False)
print(f'Approximated Max batch size: {max_bs}')

Approximated Max batch size: 23


In [15]:
test_bs = max_bs - 1

In [16]:
data = {'input_length': [], 'output_length': [], 'max_batch_size': [], 'calculated_max_batch_size': []}
with torch.inference_mode():
    conn = pipeline('text-generation',model, tokenizer=tokenizer, batch_size=10**4, torch_dtype=torch.bfloat16)
    conn.tokenizer.pad_token_id = 0 #
    for sequence_length, output_length in tqdm_notebook(seq_len_output_len_pairs):
        example_prompt = get_example_prompt(model_name, sequence_length)
        calculated_bs = calculate_theoretical_max_batchsize(model_name, gpu_max_memory, sequence_length, output_length, cuda_overhead=cuda_overhead,use_flash_attn=False)
        test_bs = calculated_bs - 1
        clear_cuda()
        while 1:
            try:
                conn([example_prompt for _ in range(test_bs)], max_new_tokens=output_length, batch_size=test_bs)
                test_bs += 1
                clear_cuda()
            except Exception as ex:
                if test_bs >= calculated_bs:
                    data['input_length'].append(sequence_length)
                    data['output_length'].append(output_length)
                    data['max_batch_size'].append(test_bs)
                else:
                    print(f"Calculated batch size {calculated_bs} was too large for input length {sequence_length} and output length {output_length}")
                break


Device set to use cuda:0


  0%|          | 0/80 [00:00<?, ?it/s]

  return isinstance(obj, torch.Tensor)
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 f

In [17]:
for k,v in data.items():
    print(f'{k}: {len(v)}')

input_length: 80
output_length: 80
max_batch_size: 80
calculated_max_batch_size: 0


In [18]:
data.pop('calculated_max_batch_size')

[]

In [7]:
import pandas as pd
df_batchsize = pd.DataFrame(data)
df_batchsize.to_csv('batchsize_experiments.csv', index=False)

NameError: name 'data' is not defined

In [8]:
df_batchsize = pd.read_csv('batchsize_experiments.csv')

In [11]:
df_batchsize['calculated_max_batch_size'] = df_batchsize.apply(lambda x: calculate_theoretical_max_batchsize(model_name, gpu_max_memory, max_input_tokens=x['input_length'], max_output_tokens=x['output_length'], use_flash_attn=False, cuda_overhead=cuda_overhead), axis=1)

In [12]:
# We got this results by plugging in the input sequence length in the tool and increasing batch size until it doe snot fit anymore (according to the tool) then took the largest batch size that could fit a sequence of that length
apxml_results = {1024:12, 2048: 6, 4096:2, 8192:1} # https://apxml.com/tools/vram-calculator
asmirnov_results = {1024:25, 2048:8, 4096:2, 8192:0, 1537:13, 2347:6, 5139:1, 7133:0} # https://vram.asmirnov.xyz/

In [13]:
df_batchsize['calculated_max_batch_size_apxml'] = df_batchsize['input_length'].apply(lambda x: apxml_results.get(x))
df_batchsize['calculated_max_batch_size_asmirnov'] = df_batchsize['input_length'].apply(lambda x: asmirnov_results[x])

In [14]:
df_batchsize['delta_computation'] = df_batchsize['max_batch_size'] - df_batchsize['calculated_max_batch_size']
df_batchsize['delta_apxml'] = df_batchsize['max_batch_size'] - df_batchsize['calculated_max_batch_size_apxml']
df_batchsize['delta_asmirnov'] = df_batchsize['max_batch_size'] - df_batchsize['calculated_max_batch_size_asmirnov']

In [15]:
df_batchsize['delta_computation'].min(), df_batchsize['delta_computation'].max()

(np.int64(2), np.int64(16))

In [16]:
df_batchsize['delta_apxml'].min(), df_batchsize['delta_apxml'].max()

(np.float64(3.0), np.float64(19.0))

In [17]:
df_batchsize['delta_asmirnov'].min(), df_batchsize['delta_asmirnov'].max()

(np.int64(-9), np.int64(8))

In [18]:
mean_delta_computation = df_batchsize['delta_computation'].abs().mean()
mean_delta_apxml = df_batchsize['delta_apxml'].abs().mean()
mean_delta_asmirnov = df_batchsize['delta_asmirnov'].abs().mean()
mean_delta_computation, mean_delta_apxml, mean_delta_asmirnov

(np.float64(5.475), np.float64(8.225), np.float64(5.9875))

In [19]:
(df_batchsize['delta_computation'].abs() / df_batchsize['max_batch_size']).mean() * 100

np.float64(45.20206875327681)

In [20]:
(df_batchsize['delta_asmirnov'].abs() / df_batchsize['max_batch_size']).mean() * 100

np.float64(64.74178990299949)

In [21]:
(df_batchsize['delta_apxml'].abs() / df_batchsize['max_batch_size']).mean() * 100

np.float64(65.53535440716367)

In [22]:
df_batchsize.to_csv('batchsize_experiments_with_calculation.csv', index=False)