In [2]:
from importlib.metadata import version
import torch

print("TORCH VERSION :", version("torch"))

# Select the compute device by preference: CUDA, then Apple's MPS backend,
# then plain CPU.
# The MPS probe lives under `torch.backends` (plural). The previous spelling,
# `torch.backend`, does not exist; it only went unnoticed because the CUDA
# branch short-circuited it, and it raised AttributeError on any machine
# without CUDA.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Device  : ", device.upper())

TORCH VERSION : 2.2.2
Device  :  CUDA


In [3]:
# Reproducibility: seed the default generator and the CUDA generator with the
# same fixed value.
torch.manual_seed(123)
torch.cuda.manual_seed(123)

# Name of the reduced-precision dtype: "bfloat16" when a CUDA device reports
# bf16 support, otherwise "float16".
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    dtype = "bfloat16"
else:
    dtype = "float16"

# Allow TF32 for cuDNN and for CUDA matmuls.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

In [4]:
from dataclasses import dataclass
from typing import Any

import torch.nn.functional as F
from torch import nn

In [5]:
# Default llama 2 params
@dataclass
class ModelArgs:
    """Model hyperparameters, with defaults the author labels as Llama 2 values.

    Every default is a plain scalar matching its annotation. Earlier revisions
    wrapped several defaults in one-element tuples (e.g. ``(32,)``) through a
    stray trailing comma, so the attributes were tuples instead of numbers or
    booleans.

    Only annotated class attributes become dataclass fields, so
    ``max_seq_len`` and ``mlp_hidden_size`` are annotated here to make them
    settable through the constructor.
    """

    vocab_size: int = 32000  # llama2 tokenizer has 32k vocab size
    # NOTE(review): "emebdding_dim" is a misspelling of "embedding_dim". The
    # name is kept as-is because code outside this cell may reference it.
    emebdding_dim: int = 4096

    rms_norm_eps: float = 1e-05

    rope_scaling: float = 1.0
    rope_theta: float = 10000.0

    attention_bias: bool = False
    attention_dropout: float = 0.0
    num_attention_heads: int = 32
    num_key_value_heads: int = 32
    use_cache: bool = True
    use_sliding_window: bool = True

    mlp_dropout: float = 0.0

    num_layers: int = 32

    # The two fields below used to be un-annotated class attributes. They are
    # declared last so the positional order of the pre-existing constructor
    # arguments is unchanged.
    max_seq_len: int = 2048
    mlp_hidden_size: Any = None  # set some lambda function or scaling factor