In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True)
config = AutoConfig.from_pretrained("./")
model = AutoModelForCausalLM.from_pretrained("./pytorch_model.bin", config=config)
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32256, 2048)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaLinearScalingRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  

In [4]:
prompt = "Generate sql query to find the number of students whose name is john?"
inputs = tokenizer(prompt, return_tensors="pt")
generate_ids = model.generate(inputs.input_ids, max_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


"Generate sql query to find the number of students whose name is john?\n\n\nSELECT COUNT(*) \nFROM students \nWHERE name = 'john';\n\n\n\n\n\n\n\n\n\n\n\n\n"

In [7]:
def write_model_details_to_file(model, filename="architecture.txt"):
    total_params = 0
    total_layers = 0
    lines = []
    lines.append("Model Layers and Parameters:\n\n")
    for name, param in model.named_parameters():
        total_layers += 1
        num_params = param.numel()
        total_params += num_params
        lines.append(f"Layer {total_layers}: {name}\n")
        lines.append(f" - Shape: {tuple(param.shape)}\n")
        lines.append(f" - Data type: {param.dtype}\n")
        lines.append(f" - Number of parameters: {num_params}\n\n")
    lines.append(f"Total number of layers: {total_layers}\n")
    lines.append(f"Total number of parameters: {total_params}\n")
    lines.append(f"Model data-type precision: {next(model.parameters()).dtype}\n")
    
    with open(filename, 'w') as f:
        f.writelines(lines)
    print(f"Model details have been written to {filename}")

In [5]:
def print_model_details(model):
    total_params = 0
    total_layers = 0
    print("Model Layers and Parameters:\n")
    for name, param in model.named_parameters():
        total_layers += 1
        num_params = param.numel()
        total_params += num_params
        print(f"Layer {total_layers}: {name}")
        print(f" - Shape: {tuple(param.shape)}")
        print(f" - Data type: {param.dtype}")
        print(f" - Number of parameters: {num_params}\n")
    print(f"Total number of layers: {total_layers}")
    print(f"Total number of parameters: {total_params}")
    print(f"Model data-type precision: {next(model.parameters()).dtype}")

In [None]:
def write_model_details_and_vram_estimate(model, filename="architecture.txt"):
    total_params = 0
    total_layers = 0
    lines = []
    lines.append("Model Layers and Parameters:\n\n")
    for name, param in model.named_parameters():
        total_layers += 1
        num_params = param.numel()
        total_params += num_params
        lines.append(f"Layer {total_layers}: {name}\n")
        lines.append(f" - Shape: {tuple(param.shape)}\n")
        lines.append(f" - Data type: {param.dtype}\n")
        lines.append(f" - Number of parameters: {num_params}\n\n")
    lines.append(f"Total number of layers: {total_layers}\n")
    lines.append(f"Total number of parameters: {total_params}\n")
    data_type = next(model.parameters()).dtype
    lines.append(f"Model data-type precision: {data_type}\n\n")
    
    if data_type == torch.float32:
        bytes_per_param = 4
    elif data_type == torch.float16 or data_type == torch.bfloat16:
        bytes_per_param = 2
    elif data_type == torch.int8:
        bytes_per_param = 1
    else:
        bytes_per_param = 4  

    total_memory_bytes = total_params * bytes_per_param
    total_memory_gb = total_memory_bytes / (1024 ** 3)

    lines.append("Estimated VRAM Required to Hold Model Parameters:\n\n")
    lines.append(f" - Data type: {data_type}\n")
    lines.append(f" - Bytes per parameter: {bytes_per_param} bytes\n")
    lines.append(f" - Total memory required for parameters: {total_memory_bytes} bytes\n")
    lines.append(f" - Total memory required for parameters: {total_memory_gb:.2f} GB\n\n")

    # Include considerations for training
    lines.append("Additional VRAM Considerations for Training:\n\n")
    lines.append("During training, additional memory is required for gradients, optimizer states, and activations.\n")
    lines.append(f" - Memory for gradients: similar to parameters (~{total_memory_gb:.2f} GB)\n")
    lines.append(" - Memory for optimizer states: depends on optimizer (e.g., Adam may require up to twice the parameter memory)\n")
    lines.append(" - Memory for activations: depends on batch size, sequence length, and model architecture\n")
    lines.append(" - Other overheads: CUDA context, framework buffers, etc.\n\n")
    lines.append("Approximate VRAM Required for Training (excluding activations and overheads):\n")
    total_training_memory_gb = total_memory_gb * 3  # Parameters + Gradients + Optimizer States
    lines.append(f" - Estimated total memory for parameters, gradients, and optimizer states: {total_training_memory_gb:.2f} GB\n")
    lines.append("Note: Activation memory and other overheads can significantly increase total VRAM usage.\n")

    with open(filename, 'w') as f:
        f.writelines(lines)
    print(f"Model details and VRAM estimation have been written to {filename}")


In [17]:
write_model_details_and_vram_estimate(model,"architectureBeforeQLoRA.txt")

Model details and VRAM estimation have been written to architectureBeforeQLoRA.txt
