In [None]:
from transformers import AutoConfig
import torch
from transformers.models.llama.configuration_llama import LlamaConfig
from utils import ComputeClient
import time
from utils import get_example_prompt
import pandas as pd
import random

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
# Make `device` the default for tensors that are created without an explicit device.
torch.set_default_device(device)

In [5]:
# Location of the model files; the Hugging Face Hub id is kept in the trailing comment.
# Raw string literal: the Windows path contains `\m`, `\H` and `\h`, which are invalid
# escape sequences in a normal string literal and emit a SyntaxWarning on recent Python.
# NOTE(review): this is a machine-specific absolute path — consider making it configurable.
model_name = r"D:\models\Huggingface\hub\models--meta-llama--Llama-3.1-8B-Instruct"  #'meta-llama/Llama-3.1-8B-Instruct'

In [6]:
# Load the model's configuration (hyperparameters) from `model_name`; the FLOP estimates below read it.
model_config = AutoConfig.from_pretrained(model_name)

In [35]:
# Tokenization cost is assumed to scale linearly with sequence length.
# Since the exact multiplicative constant and additive bias are unknown,
# we assume a zero bias and guess the multiplicative constant.
# NOTE(review): this comment originally cited a conservatively large constant (200),
# but the value assigned below is 1 — confirm which one is intended.
# Either way, tokenization FLOPs are expected to be negligible compared to
# the quadratic and higher-order operations in transformer layers.
guessed_tokenization_constant = 1
flash_attention_version = 3  # 3 is the only Flash Attention Version implemented on SGLang
# Multiplier applied to GPU_FLOPS per Flash Attention version to obtain the effective throughput.
# NOTE(review): the source of these efficiency figures is not recorded here — add a reference.
flash_attn_efficiencies = {1: .325, 2: .615, 3: .75}
GPU_FLOPS = 165.2 * 10 ** 12  # FLOPS of Nvidia RTX 4090 in float16
# Presumably the GPU memory bandwidth in bytes/s, written here as 1008 GiB/s (1024 ** 3).
# NOTE(review): vendor bandwidth figures are usually decimal GB/s (10 ** 9) — confirm the unit.
GPU_BANDWIDTH = 1008 * 1024 ** 3

In [None]:
# Client whose `call_llm` method is used below to send prompts for the timing measurements.
# NOTE(review): ComputeClient comes from the project-local `utils` module; its endpoint/settings are not visible here.
client = ComputeClient()

In [None]:
def calculate_ffn_flops(batch_size: int, sequence_length: int, d_i: int, d_m: int) -> int:
    """
    Estimate the FLOPs of one Feed-Forward Network (FFN) layer with a SwiGLU activation.

    Implemented formula:
        FLOP_{FFN} = batch_size * sequence_length * d_inner * (10 + 5 * d_model)

    NOTE(review): an earlier docstring stated ``3 * d_model``; the code uses ``5 * d_model``,
    which also matches the ``5 * d_i`` term in ``calculate_transformer_block_flops`` — confirm
    the derivation.

    :param batch_size: Number of input sequences processed in parallel.
    :param sequence_length: Number of tokens per sequence.
    :param d_i: Inner dimension of the FFN (d_inner).
    :param d_m: Hidden dimension of the model (d_model).
    :return: Estimated number of FLOPs for the FFN layer.
    """
    token_count = batch_size * sequence_length
    flops_per_inner_unit = 10 + 5 * d_m
    return token_count * d_i * flops_per_inner_unit


def calculate_rms_flops(batch_size: int, sequence_length: int, d_m: int) -> int:
    """
    Estimate the FLOPs of a single RMSNorm operation applied to a whole batch.

    Implemented formula:
        FLOP_{RMS}^{Batch} = batch_size * sequence_length * (4 * d_model + 3)

    :param batch_size: Number of input sequences processed in parallel.
    :param sequence_length: Number of tokens per sequence.
    :param d_m: Hidden dimension of the model (d_model).
    :return: Estimated FLOPs for one RMSNorm over the batch.
    """
    flops_per_token = 4 * d_m + 3
    return batch_size * sequence_length * flops_per_token


def calculate_transformer_block_flops(batch_size: int, sequence_length: int, d_i: int, d_m: int, n_g: int,
                                      d_h: int) -> int:
    """
    Estimate the total FLOPs of one transformer block (as used for LLaMA 3.1 8B).

    The estimate is meant to combine the following components:
        F_TB = 2 * F_skip + 2 * F_RMS + F_RoPE + F_Projection + F_Attention + F_FFN

    Implemented closed form:
        F_TB = batch_size * sequence_length * (
            d_m * (2 * d_m * n_g + 6 * d_h * n_g + 5 * d_i + 9)
            + n_g * d_h * (4 * sequence_length - 1)
            + 10 * d_i + 5 * n_g * sequence_length + 3
        )

    NOTE(review): an earlier docstring showed a different simplified expression; the formula
    above is the one the code evaluates — confirm it against the derivation.

    :param batch_size: Number of sequences in the batch.
    :param sequence_length: Number of tokens per sequence.
    :param d_i: Inner FFN dimension.
    :param d_m: Model (hidden) dimension.
    :param n_g: Number of attention groups (GQA setup).
    :param d_h: Dimension of each attention head.
    :return: Estimated number of FLOPs for a single transformer block.
    """
    # Everything that carries an overall factor of d_model.
    model_dim_terms = d_m * (2 * d_m * n_g + 6 * d_h * n_g + 5 * d_i + 9)
    # Group/head term that grows with the sequence length.
    grouped_head_terms = n_g * d_h * (4 * sequence_length - 1)
    per_token_flops = model_dim_terms + grouped_head_terms + 10 * d_i + 5 * n_g * sequence_length + 3
    return batch_size * sequence_length * per_token_flops


def calculate_final_linear_flops(batch_size: int, sequence_length: int, d_m: int, vocab_size: int) -> int:
    """
    Estimate the FLOPs of the final linear projection from hidden states to vocabulary logits.

    Implemented formula:
        FLOP = batch_size * (2 * sequence_length * d_model * vocab_size)

    :param batch_size: Number of sequences processed in parallel.
    :param sequence_length: Number of tokens per sequence.
    :param d_m: Hidden dimension of the model (d_model).
    :param vocab_size: Size of the vocabulary used for prediction.
    :return: Estimated FLOPs for the final linear transformation.
    """
    flops_per_sequence = 2 * sequence_length * d_m * vocab_size
    return batch_size * flops_per_sequence


def calculate_final_softmax_flops(batch_size: int, vocab_size: int) -> int:
    """
    Estimate the FLOPs of the final softmax, counted once per sequence (last token only).

    The cost model is 5 * vocab_size FLOPs per softmax call; the original author attributes
    this figure to the TensorFlow Profiler.

    :param batch_size: Number of sequences processed in parallel.
    :param vocab_size: Number of logits per token (typically the vocabulary size).
    :return: Estimated FLOPs for the softmax over the final token of every sequence.
    """
    flops_per_sequence = 5 * vocab_size
    return batch_size * flops_per_sequence


def calculate_theoretical_flop_count(input_shape: tuple[int, int], model_config: LlamaConfig,
                                     tokenization_constant: int) -> int:
    """
    Estimate the total theoretical number of floating-point operations (FLOPs) required to
    process one batch of sequences with a LLaMA-style transformer architecture.

    The estimate is the sum of:
        - Tokenization (``tokenization_constant`` FLOPs per token, over all sequences)
        - All transformer blocks (see ``calculate_transformer_block_flops``)
        - Final linear projection to logits
        - Final softmax operation (counted once per sequence)

    :param input_shape: Tuple ``(batch_size, sequence_length)``: the number of sequences
                        processed in parallel and the number of tokens per sequence.
    :param model_config: Configuration object containing model hyperparameters. Attributes read:
        - vocab_size (int): Size of the vocabulary used for output predictions.
        - num_hidden_layers (int): Number of transformer blocks in the model.
        - hidden_size (int): Dimensionality of the model's hidden states (d_model).
        - intermediate_size (int): Dimensionality of the FFN's inner layer (d_inner).
        - num_key_value_heads (int): Number of key/value attention groups (n_g, for GQA).
        - head_dim (int, optional): Dimension of each attention head. When absent or None,
          ``hidden_size // num_attention_heads`` is used instead.
    :param tokenization_constant: Linear multiplier for tokenization cost (only linearity is
                                  known; the true constant and bias are unknown).
    :return: Total estimated FLOP count as an integer for one batch of sequences.
    """
    b = input_shape[0]
    s_L = input_shape[1]
    d_m = model_config.hidden_size
    d_i = model_config.intermediate_size
    n_g = model_config.num_key_value_heads
    # Not every config object exposes `head_dim`; fall back to the per-head split of d_model.
    d_h = getattr(model_config, 'head_dim', None)
    if d_h is None:
        d_h = d_m // model_config.num_attention_heads
    L_v = model_config.vocab_size
    n_blocks = model_config.num_hidden_layers

    # Tokenization is per token, so it scales with the batch size like every other term.
    f_tokenization = tokenization_constant * b * s_L
    f_transformer_total = n_blocks * calculate_transformer_block_flops(b, s_L, d_i, d_m, n_g, d_h)
    f_final_linear = calculate_final_linear_flops(b, s_L, d_m, L_v)
    f_final_softmax = calculate_final_softmax_flops(b, L_v)

    return int(f_tokenization + f_transformer_total + f_final_linear + f_final_softmax)


In [None]:
def permute_string(s: str) -> str:
    """
    Return the characters of ``s`` in a random order.

    The shuffle uses the module-level ``random`` generator, so the result depends on its state.

    :param s: String whose characters are shuffled.
    :return: New string containing the same characters as ``s`` in shuffled order.
    """
    shuffled = [*s]
    random.shuffle(shuffled)
    return ''.join(shuffled)

In [11]:
def create_sequence(length: int, pattern: str = '$§') -> str:
    """
    Build a string of exactly ``length`` characters by repeating ``pattern``.

    Note that ``length`` counts characters, not tokens.

    :param length: Number of characters in the result; values <= 0 yield an empty string.
    :param pattern: Non-empty string that is repeated and then truncated to ``length``.
    :return: The repeated pattern cut to ``length`` characters.
    :raises ValueError: If ``pattern`` is empty (it could never fill the requested length).
    """
    if not pattern:
        # Previously this surfaced as an opaque ZeroDivisionError from the floor division below.
        raise ValueError("pattern must be a non-empty string")
    if length <= 0:
        return ''
    # One repetition more than the floor division guarantees at least `length` characters.
    repeats = length // len(pattern) + 1
    return (pattern * repeats)[:length]

In [45]:
# Prompt lengths to benchmark: 1 and powers of two from 128 to 4096, followed by five irregular lengths.
sequence_lengths = [1, 128, 256, 512, 1024, 2048, 4096] + [1366, 5264, 4677, 7151, 5875]


In [89]:
# Peak throughput scaled by the assumed efficiency of the Flash Attention version in use.
effective_flops = GPU_FLOPS * flash_attn_efficiencies[flash_attention_version]
data = {'sequence_length': [], 'calculated_flops': [], 'estimated_inference_time': [], 'inference_time': [],
        'time_delta': [], 'delta_percentage': []}
for seq_len in sequence_lengths:
    # NOTE(review): `seq_len` is the prompt length in characters, but the FLOP estimate treats it
    # as the token count — confirm the tokenizer maps this pattern 1:1 to tokens.
    input_sequence = create_sequence(seq_len)
    estimated_flops = calculate_theoretical_flop_count((1, seq_len), model_config, guessed_tokenization_constant)
    estimated_time = estimated_flops / effective_flops

    # perf_counter is monotonic and high-resolution; time.time() follows the adjustable wall clock.
    t_0 = time.perf_counter()
    client.call_llm('', input_sequence, {'max_new_tokens': 1, 'temperature': 0})
    time_taken = time.perf_counter() - t_0
    delta = time_taken - estimated_time

    # Record the row only after the measurement succeeded so all columns keep the same length.
    data['sequence_length'].append(seq_len)
    data['calculated_flops'].append(estimated_flops)
    data['estimated_inference_time'].append(estimated_time)
    data['time_delta'].append(delta)
    data['inference_time'].append(time_taken)
    # Absolute error relative to the measured time (a fraction, not multiplied by 100).
    data['delta_percentage'].append(abs(delta / time_taken))

    # Unrelated request between measurements, intended to keep the next run from reusing cached state.
    client.call_llm('This is sent to overwrite your cache', 'This should make the cache useless.',
                    {'max_new_tokens': 1, 'temperature': 0})



NameError: name 'sequence_lengths' is not defined

In [50]:
# Build the results table and drop its first row (the first measured sequence length).
# NOTE(review): presumably the first request is treated as a warm-up — confirm.
df = pd.DataFrame(data).iloc[1:]

In [51]:
# Display the benchmark results (estimated vs. measured inference time per sequence length).
df

Unnamed: 0,sequence_length,calculated_flops,estimated_inference_time,inference_time,time_delta,delta_percentage
1,128,1032973880576,0.016674,0.169113,0.152439,0.901401
2,256,2074537054464,0.033487,0.176942,0.143455,0.810744
3,512,4183433206016,0.067529,0.158545,0.091016,0.574069
4,1024,8504304724224,0.137277,0.158224,0.020947,0.132387
5,2048,17558364621056,0.283428,0.158125,-0.125303,0.792434
6,4096,37315751856384,0.602353,0.184976,-0.417376,2.256377
7,1366,11467075744944,0.185102,0.194239,0.009137,0.04704
8,5264,49568325125504,0.800134,0.181893,-0.618241,3.398926
9,4677,43321164415656,0.699292,0.188178,-0.511115,2.716125
10,7151,70874568101624,1.144061,0.167521,-0.97654,5.829358


In [81]:
# Sequence length used for the single-point sanity checks in the following cells.
seq_len = 4096

In [82]:
# Theoretical FLOP count for a single sequence (batch size 1) of length `seq_len`.
n_flops = calculate_theoretical_flop_count((1,seq_len), model_config, guessed_tokenization_constant)
n_flops

83513358407936

In [83]:
# Rough reference figure to compare `n_flops` against; the second output is their ratio.
# NOTE(review): 8.03e9 looks like the model's parameter count and 2000 like
# 2 FLOPs per parameter times 1000 tokens — confirm; `seq_len` above is 4096.
n_approx = 2000 * 8.03 * 1e9
n_approx, n_flops/n_approx

(16059999999999.998, 5.200084583308594)

In [84]:
# Time for the rough FLOP figure (same expression as `n_approx`) at GPU_FLOPS, with no efficiency factor.
t_pf = (2000 * 8.03 * 1e9 / GPU_FLOPS)
t_pf

0.09721549636803872

In [85]:
# Adds a second term to `t_pf`: 2 * 8.03e9 divided by GPU_BANDWIDTH.
# NOTE(review): presumably 2 bytes per parameter (16-bit weights) read once from memory — confirm.
t_tot = t_pf + (2 * 8.03 * 1e9 / GPU_BANDWIDTH)
t_tot

0.11205383024534486

In [86]:
# NOTE(review): this divides a FLOP count by GPU_BANDWIDTH; the 0.08 factor is unexplained
# (possibly an assumed bytes-per-FLOP ratio) — confirm its meaning and source.
estimated_memory_time = .08 * n_flops / GPU_BANDWIDTH
estimated_memory_time

6.17284729898922

In [None]:
# Time estimate from dividing `n_flops` by GPU_FLOPS (no Flash Attention efficiency factor applied).
estimated_compute_time = n_flops / GPU_FLOPS
estimated_compute_time