In [1]:
import numpy as np
import torch
from dataclasses import dataclass

In [2]:
@dataclass
class ModelArgs:
    """Container for the model hyperparameters used by the cells in this notebook.

    All fields have defaults, so ``ModelArgs()`` is valid; individual values can be
    overridden by keyword (e.g. ``ModelArgs(n_kv_heads=8)``).
    """

    dim: int = 4096  # model width; the per-head size is derived elsewhere as dim // n_heads
    n_layers: int = 32
    n_heads: int = 32  # number of query heads
    n_kv_heads: int | None = None  # number of key/value heads for grouped-query attention; queries per KV head (n_rep) = n_heads // n_kv_heads
    vocab_size: int = -1  # placeholder; defined later by the tokenizer
    multiple_of: int = 256  # make the SwiGLU hidden layer size a multiple of a large power of 2
    ffn_dim_multiplier: float | None = None
    norm_eps: float = 1e-5

    max_batch_size: int = 32
    max_seq_len: int = 2048


In [3]:
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    """
    Build the table of unit-modulus complex rotations used by rotary position embeddings.

    Entry ``[m, i]`` of the result equals ``cos(m * w_i) + 1j * sin(m * w_i)`` where
    ``w_i = theta ** (-2 * i / dim)``; it is the rotation applied at position ``m`` to
    the ``i``-th pair of features.

    Args:
        dim (int): Size of the feature vector that will be rotated. Features are taken
            two at a time, so ``dim // 2`` frequencies are produced.
        end (int): Number of positions to precompute (positions ``0 .. end - 1``).
        theta (float, optional): Base of the geometric frequency progression.
            Defaults to 10000.0.

    Returns:
        torch.Tensor: complex64 tensor of shape ``(end, dim // 2)``.

    """
    # Exponent 2i/dim for each feature pair i = 0 .. dim//2 - 1.
    pair_exponent = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim  # [dim/2]
    # Per-pair rotation angle theta_i = theta ** (-2i/dim).
    inv_freq = 1.0 / (theta ** pair_exponent)  # [dim/2]
    # Token positions m = [0, 1, ..., end - 1].
    positions = torch.arange(end, device=inv_freq.device)  # type: ignore  [end]
    # Angle m * theta_i for every (position, pair) combination.
    angles = torch.outer(positions, inv_freq).float()  # type: ignore  [end, dim/2]
    # Polar form with modulus 1: an angle a becomes cos(a) + sin(a)j.
    unit_modulus = torch.ones_like(angles)
    return torch.polar(unit_modulus, angles)  # complex64  [end, dim/2]

In [4]:
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    """
    Reshape frequency tensor for broadcasting it with another tensor.

    This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
    for the purpose of broadcasting the frequency tensor during element-wise operations.

    Args:
        freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
        x (torch.Tensor): Target tensor for broadcasting compatibility.

    Returns:
        torch.Tensor: Reshaped frequency tensor.

    Raises:
        AssertionError: If the frequency tensor doesn't match the expected shape.
        AssertionError: If the target tensor 'x' doesn't have the expected number of dimensions.

    """
    # (bs, seqlen, n_local_heads, head_dim/2, 2)
    ndim = x.ndim
    assert 0 <= 1 < ndim
    # (seqlen, head_dim/2)
    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
    # (1, seqlen, 1, head_dim/2)
    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
    return freqs_cis.view(*shape)

In [5]:
def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Rotate query and key tensors with a precomputed rotary position table.

    Adjacent features along the last dimension are paired into complex numbers,
    multiplied by the matching entry of ``freqs_cis``, and unpacked back to real values.
    The computation runs in float32 and the results are cast back to the dtype of the
    corresponding input.

    Args:
        xq (torch.Tensor): Query tensor with positions on dimension 1 and an even-sized
            last dimension, e.g. ``(bs, seqlen, n_heads, head_dim)``.
        xk (torch.Tensor): Key tensor laid out like ``xq``. It may differ in sizes that
            the reshaped table broadcasts over, e.g. ``(bs, seqlen, n_kv_heads, head_dim)``.
        freqs_cis (torch.Tensor): Complex rotation table of shape
            ``(xq.shape[1], xq.shape[-1] // 2)``.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: Rotated query and key tensors, each with the
        shape and dtype of its input.

    Raises:
        AssertionError: Propagated from ``reshape_for_broadcast`` when ``freqs_cis`` does
            not match the position and feature-pair sizes of ``xq``.

    """
    # Move to the complex domain: (..., head_dim) -> (..., head_dim/2, 2) -> complex (..., head_dim/2).
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    # (seqlen, head_dim/2) -> broadcastable view, e.g. (1, seqlen, 1, head_dim/2) for 4-D input.
    # The view is built from xq_ and reused for xk_, which therefore has to broadcast against it.
    freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
    # Complex multiplication performs the rotation; view_as_real appends a trailing axis of
    # size 2, and flatten(-2) folds it back into the feature axis. Using -2 instead of a
    # hard-coded 3 keeps the output shape equal to the input shape for any input rank.
    xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(-2)
    xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(-2)
    return xq_out.type_as(xq), xk_out.type_as(xk)

In [6]:
# Instantiate the config with 8 key/value heads; every other field keeps its default (n_heads = 32).
params = ModelArgs(n_kv_heads=8)
params

ModelArgs(dim=4096, n_layers=32, n_heads=32, n_kv_heads=8, vocab_size=-1, multiple_of=256, ffn_dim_multiplier=None, norm_eps=1e-05, max_batch_size=32, max_seq_len=2048)

In [7]:
# Build the rotary table for a single head (size dim // n_heads) over max_seq_len positions.
freqs_cis = precompute_freqs_cis(dim=params.dim // params.n_heads, end=params.max_seq_len)
freqs_cis.shape

torch.Size([2048, 64])

In [8]:
# Peek at the first 4 positions x first 4 feature pairs; position 0 is 1+0j everywhere (no rotation).
freqs_cis[:4, :4]

tensor([[ 1.0000+0.0000j,  1.0000+0.0000j,  1.0000+0.0000j,  1.0000+0.0000j],
        [ 0.5403+0.8415j,  0.6479+0.7617j,  0.7318+0.6816j,  0.7965+0.6047j],
        [-0.4161+0.9093j, -0.1604+0.9870j,  0.0709+0.9975j,  0.2687+0.9632j],
        [-0.9900+0.1411j, -0.8558+0.5173j, -0.6279+0.7783j, -0.3685+0.9296j]])

In [9]:
# Seed the generator so the random demo inputs are identical on every Restart & Run All.
torch.manual_seed(0)
# Per-head feature size, shared by queries and keys.
head_dim = params.dim // params.n_heads
# Queries use n_heads heads; keys use the n_kv_heads count set on params.
xq = torch.randn(params.max_batch_size, params.max_seq_len, params.n_heads, head_dim)
xk = torch.randn(params.max_batch_size, params.max_seq_len, params.n_kv_heads, head_dim)
xq.shape, xk.shape

(torch.Size([32, 2048, 32, 128]), torch.Size([32, 2048, 8, 128]))

In [10]:
# Rotate the queries and keys with the precomputed table, then display the output shapes.
xq_, xk_ = apply_rotary_emb(xq=xq, xk=xk, freqs_cis=freqs_cis)
xq_.shape, xk_.shape

(torch.Size([32, 2048, 32, 128]), torch.Size([32, 2048, 8, 128]))