In [None]:
## Hello Nemotron: this is the first part of Lesson 1. We will run Nemotron locally with Hugging Face Transformers.

In [None]:
## First, we make sure we can see the local cache where Nemotron is stored (we are using the Hugging Face caching system).

In [1]:
import os
# Show the HF_HOME value this kernel sees (prints None when the variable is unset).
print("HF_HOME =", os.environ.get("HF_HOME"))

# IPython shell escapes: print the working directory, then the first lines
# (`head`) of a long listing of each directory of interest.
!pwd
!ls -la /data | head
!ls -la /data/hf | head
# NOTE(review): if HF_HOME is unset or empty this presumably lists the current
# directory instead of the cache - compare against the value printed above.
!ls -la $HF_HOME | head

HF_HOME = /data/hf
/workspace/labs/lesson_01
total 20
drwxrwxr-x 5 ubuntu ubuntu 4096 Dec 29 05:36 .
drwxr-xr-x 1 root   root   4096 Dec 29 12:04 ..
drwxrwxr-x 6 ubuntu ubuntu 4096 Dec 29 10:39 hf
drwxrwxr-x 3 ubuntu ubuntu 4096 Nov 29 00:59 models
drwxrwxr-x 3 ubuntu ubuntu 4096 Nov 29 05:35 vllm_cache
total 32
drwxrwxr-x 6 ubuntu ubuntu 4096 Dec 29 10:39 .
drwxrwxr-x 5 ubuntu ubuntu 4096 Dec 29 05:36 ..
drwxrwxr-x 4 ubuntu ubuntu 4096 Dec 29 05:47 hub
drwxr-xr-x 3 ubuntu ubuntu 4096 Dec 29 10:39 modules
-rw-rw-r-- 1 ubuntu ubuntu   60 Dec 29 05:46 stored_tokens
-rw-rw-r-- 1 ubuntu ubuntu   37 Dec 29 05:46 token
drwxr-xr-x 4 ubuntu ubuntu 4096 Dec 29 10:39 transformers
drwxrwxr-x 4 ubuntu ubuntu 4096 Dec 29 05:47 xet
total 32
drwxrwxr-x 6 ubuntu ubuntu 4096 Dec 29 10:39 .
drwxrwxr-x 5 ubuntu ubuntu 4096 Dec 29 05:36 ..
drwxrwxr-x 4 ubuntu ubuntu 4096 Dec 29 05:47 hub
drwxr-xr-x 3 ubuntu ubuntu 4096 Dec 29 10:39 modules
-rw-rw-r-- 1 ubuntu ubuntu   60 Dec 29 05:46 stored_tokens
-rw-rw-

In [None]:
## Now we ensure that transformers will load the model weights from the local Hugging Face cache (HF Hub cache) without re-downloading them.

In [2]:
import os
from pathlib import Path
from huggingface_hub import snapshot_download

model_id = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"

print("HF_HOME =", os.environ.get("HF_HOME"))
print("HF_HUB_CACHE =", os.environ.get("HF_HUB_CACHE"))

# Offline lookup: with local_files_only=True nothing is downloaded, and the call
# raises if no cached snapshot of the repo can be found.
# NOTE(review): a cached snapshot directory is not by itself proof that every
# file made it to disk, which is why the shards are checked explicitly below.
snap_dir = snapshot_download(
    repo_id=model_id,
    local_files_only=True,
)

snap_path = Path(snap_dir)
print("Snapshot dir:", snap_path)

# Collect the weight shards. The expected total is read from the "-of-NNNNN"
# suffix of the file names rather than hard-coded, so the check stays valid
# for other checkpoints / revisions.
shards = sorted(snap_path.glob("model-*-of-*.safetensors"))
print("Shard count:", len(shards))
print("First shard:", shards[0] if shards else None)

# Actually prove the snapshot is complete instead of only printing a number.
assert shards, f"no model-*-of-*.safetensors shards found in {snap_path}"
expected_shards = int(shards[0].stem.rsplit("-of-", 1)[1])
assert len(shards) == expected_shards, (
    f"expected {expected_shards} shards, found {len(shards)} in {snap_path}"
)

# Path.exists() follows symlinks, so a link whose blob is missing is reported here.
unreadable = [p.name for p in shards if not p.exists()]
assert not unreadable, f"shards with missing blobs: {unreadable}"

# Optional: show whether these are symlinks into blobs (typical HF cache layout)
print("First shard is_symlink:", shards[0].is_symlink())
if shards[0].is_symlink():
    print("first shard ->", os.readlink(shards[0]))
        



HF_HOME = /data/hf
HF_HUB_CACHE = /data/hf/hub
Snapshot dir: /data/hf/hub/models--nvidia--NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/snapshots/2e43387afd60157064e5bef4e9a583f887c6dfdd
Shard count: 13
First shard: /data/hf/hub/models--nvidia--NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/snapshots/2e43387afd60157064e5bef4e9a583f887c6dfdd/model-00001-of-00013.safetensors
First shard is_symlink: True
first shard -> ../../blobs/4c77b0f1717f1fb11791fb62fc57ca56f59fd1427ac466849ef9705ac90729ea


In [None]:
## Now we load the tokenizer and model from the local HF cache and generate a short completion

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
revision = "2e43387afd60157064e5bef4e9a583f887c6dfdd"  # your cached snapshot
cache_dir = "/data/hf/hub"

# local_files_only=True keeps both loads offline; revision pins the exact
# snapshot so a different cached commit is never picked up silently.
# trust_remote_code=True lets the repo's own Python code run in this kernel.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    use_fast=True,
    revision=revision,
    cache_dir=cache_dir,
    local_files_only=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    dtype=torch.bfloat16,
    device_map="cuda:0",
    revision=revision,
    cache_dir=cache_dir,
    local_files_only=True,
)

messages = [{"role":"user","content":"Hello Nemotron. In one sentence, who are you?"}]

# Ask explicitly for a dict-like result so the attention mask comes back with the
# token ids and the cell does not depend on the library's default return type
# (NOTE(review): that default may differ between Transformers versions - confirm).
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    enable_thinking=False,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Later cells reuse `input_ids` as a plain tensor, so keep exposing it under that name.
input_ids = inputs["input_ids"]

# Pass the attention mask and a pad token id explicitly instead of letting
# generate() guess them; inference_mode avoids autograd bookkeeping.
with torch.inference_mode():
    out = model.generate(
        **inputs,
        max_new_tokens=60,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

# out[0] holds prompt + completion, so the decoded text echoes the prompt as well.
print(tokenizer.decode(out[0], skip_special_tokens=True))


Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

NemotronH requires an initialized `NemotronHHybridDynamicCache` to return a cache. None was provided, so no cache will be returned.


system

user
Hello Nemotron. In one sentence, who are you?
assistant
<think></think>I am Nemotron, a large language model created by NVIDIA to assist with reasoning, creativity, and problem-solving across a wide range of tasks.


In [None]:
## We will now rerun with more verbosity to see if we can pick up more data about what the model is doing.

In [12]:
## We run a new model.generate() call using the input_ids defined in the previous run (assuming everything is still alive in the notebook kernel), so we are effectively using the same prompt as before. We first run generation and capture the result and the timing while it runs. Then we analyze the captured results (token counts, decoded text, logprobs, etc.).

In [23]:
import time
import torch

# Reuse existing: `model`, `tokenizer`, `input_ids`
eos = tokenizer.eos_token_id

# Peak-memory counter restarts from the current allocation level, so memory
# that is already resident (e.g. the loaded weights) still counts toward it.
torch.cuda.reset_peak_memory_stats()

# CUDA work is queued asynchronously: drain anything still pending from earlier
# cells so the timer below measures generate() only.
torch.cuda.synchronize()
t0 = time.perf_counter()

with torch.inference_mode():
    gen = model.generate(
        input_ids,
        max_new_tokens=500,
        do_sample=False,
        eos_token_id=eos,
        pad_token_id=eos,
        return_dict_in_generate=True,  # structured result instead of a bare tensor
        output_scores=True,            # keep per-step scores for the top-5 report
    )

# Wait for the GPU to finish before stopping the clock.
torch.cuda.synchronize()
t1 = time.perf_counter()

# gen.sequences holds prompt + completion; slice the prompt off.
seq = gen.sequences[0]
prompt_tokens = input_ids.shape[-1]
generated = seq[prompt_tokens:]

# Trim at first EOS
if eos is not None:
    eos_pos = (generated == eos).nonzero(as_tuple=True)[0]
    if eos_pos.numel() > 0:
        generated = generated[: eos_pos[0]]

text = tokenizer.decode(generated.detach().to("cpu").tolist(),
skip_special_tokens=True)

# new_tokens is counted after trimming, so the EOS token itself is excluded.
new_tokens = int(generated.numel())
dt = t1 - t0
peak_gb = torch.cuda.max_memory_allocated() / (1024**3)

print(text)
print(f"prompt_tokens={prompt_tokens} new_tokens={new_tokens} time_s={dt:.3f} tok_per_s={(new_tokens/dt if dt>0 else float('inf')):.2f} peak_cuda_mem_gb={peak_gb:.2f}")

# Top-5 alternatives for the first generated token
if gen.scores:
    # First generation step, first (only) sequence in the batch; upcast before
    # softmax so the probabilities are not computed in reduced precision.
    logits0 = gen.scores[0][0].float()
    probs0 = torch.softmax(logits0, dim=-1)
    top = torch.topk(probs0, k=min(5, probs0.numel()))
    ids = top.indices.tolist()
    ps = top.values.tolist()
    toks = [tokenizer.decode([i], skip_special_tokens=False) for i in ids]
    print("top5_first_token:", list(zip(ids, ps, toks)))


I am Nemotron, a large language model created by NVIDIA to assist with reasoning, creativity, and problem-solving across a wide range of tasks.
prompt_tokens=28 new_tokens=31 time_s=27.629 tok_per_s=1.12 peak_cuda_mem_gb=58.94


In [18]:
import torch

# One plain forward pass over the prompt, without autograd tracking, so we can
# see which device the resulting logits live on.
with torch.inference_mode():
    outputs = model(input_ids)

print("outputs.logits.device =", outputs.logits.device)

# `gen` only exists if the generate cell has been run in this kernel, and its
# scores are only populated when that call asked for them.
if "gen" not in globals() or not getattr(gen, "scores", None):
    print("gen.scores not available (run generate with output_scores=True)")
else:
    print("gen.scores[0].device =", gen.scores[0].device)

outputs.logits.device = cuda:0
gen.scores[0].device = cuda:0


NameError: name 'outputs' is not defined

type(out) = <class 'transformers.generation.utils.GenerateDecoderOnlyOutput'>
has sequences = True
seq dtype/device/shape = torch.int64 cuda:0 (60,)
seq min/max = 0 4472591304561587027
bad ids count = 30
first bad ids = [4454694472195620972, 4472591304561587027, 4470358711844355279, 4438422632114747494, 4439308907195912046, 4442371755751307659, 4438863235618130594, 4442021299304623625, 4438912799541253756, 4441276582037614266, 4458430367888893160, 4464418449943726864, 4451499798209831319, 4432637246734100849, 4429952767616152071, 4453393556659185601, 4438361892677319790, 4452046182466844564, 4455312754200068214, 4435918799314393832]
