In [None]:
## We will now build a script which loads the model, runs the same prompt multiple times under different decoding settings and saves the results

In [None]:
#first we check to ensure we are aimed at the right cache and it is visible

In [1]:
import os
print("HF_HOME =", os.environ.get("HF_HOME"))

!pwd
!ls -la /data | head
!ls -la /data/hf | head
!ls -la $HF_HOME | head

HF_HOME = /data/hf
/workspace/labs/lesson_01
total 20
drwxrwxr-x 5 ubuntu ubuntu 4096 Dec 29 05:36 .
drwxr-xr-x 1 root   root   4096 Jan  1 13:14 ..
drwxrwxr-x 6 ubuntu ubuntu 4096 Dec 29 10:39 hf
drwxrwxr-x 3 ubuntu ubuntu 4096 Nov 29 00:59 models
drwxrwxr-x 3 ubuntu ubuntu 4096 Nov 29 05:35 vllm_cache
total 32
drwxrwxr-x 6 ubuntu ubuntu 4096 Dec 29 10:39 .
drwxrwxr-x 5 ubuntu ubuntu 4096 Dec 29 05:36 ..
drwxrwxr-x 4 ubuntu ubuntu 4096 Dec 29 05:47 hub
drwxr-xr-x 3 ubuntu ubuntu 4096 Dec 29 10:39 modules
-rw-rw-r-- 1 ubuntu ubuntu   60 Dec 29 05:46 stored_tokens
-rw-rw-r-- 1 ubuntu ubuntu   37 Dec 29 05:46 token
drwxr-xr-x 4 ubuntu ubuntu 4096 Dec 29 10:39 transformers
drwxrwxr-x 4 ubuntu ubuntu 4096 Dec 29 05:47 xet
total 32
drwxrwxr-x 6 ubuntu ubuntu 4096 Dec 29 10:39 .
drwxrwxr-x 5 ubuntu ubuntu 4096 Dec 29 05:36 ..
drwxrwxr-x 4 ubuntu ubuntu 4096 Dec 29 05:47 hub
drwxr-xr-x 3 ubuntu ubuntu 4096 Dec 29 10:39 modules
-rw-rw-r-- 1 ubuntu ubuntu   60 Dec 29 05:46 stored_tokens
-rw-rw-

In [None]:
#then we do a checkl to ensure that thtransformers will indeed load the model weights locally from the offline hugging face cache

In [2]:
import os
from pathlib import Path
from huggingface_hub import snapshot_download

model_id = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"

print("HF_HOME =", os.environ.get("HF_HOME"))
print("HF_HUB_CACHE =", os.environ.get("HF_HUB_CACHE"))

#This forces an offline check: It will fail if the files arent already cached.
snap_dir = snapshot_download(
    repo_id=model_id,
    local_files_only=True,
)

snap_path = Path(snap_dir)
print("Snapshot dir:", snap_path)

# Prove the 13 shards are present in this snapshot.
shards = sorted(snap_path.glob("model-*-of-00013.safetensors"))
print("Shard count:", len(shards))
print("First shard:", shards[0] if shards else None)

# Optional: show whether these are symlinks into blobs (typical HF cache layout)
if shards:
    print("First shard is_symlink:", shards[0].is_symlink())
    if shards[0].is_symlink():
        print("first shard ->", os.readlink(shards[0]))
        

HF_HOME = /data/hf
HF_HUB_CACHE = /data/hf/hub
Snapshot dir: /data/hf/hub/models--nvidia--NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/snapshots/2e43387afd60157064e5bef4e9a583f887c6dfdd
Shard count: 13
First shard: /data/hf/hub/models--nvidia--NVIDIA-Nemotron-3-Nano-30B-A3B-BF16/snapshots/2e43387afd60157064e5bef4e9a583f887c6dfdd/model-00001-of-00013.safetensors
First shard is_symlink: True
first shard -> ../../blobs/4c77b0f1717f1fb11791fb62fc57ca56f59fd1427ac466849ef9705ac90729ea


In [None]:
#Now we load the model and tokenizer

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
revision = "2e43387afd60157064e5bef4e9a583f887c6dfdd"  # your cached snapshot
cache_dir = "/data/hf/hub"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    use_fast=True,
    revision=revision,
    cache_dir=cache_dir,
    local_files_only=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    dtype=torch.bfloat16,
    device_map="cuda:0",
    revision=revision,
    cache_dir=cache_dir,
    local_files_only=True,
)

print("Loaded tokenizer + model on", model.device)



Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

Loaded tokenizer + model on cuda:0


In [None]:
# Now first we will explore the impact of do_sample which is a knob we can set to true or false and allows for a greedy pick of the top token each step or instead a random token from the distribution. 

In [6]:
messages = [{"role": "user", "content": "Describe cyanobacteria photoprotective mechanisms"}]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    enable_thinking=False, #on by default in nemotron
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

with torch.inference_mode(): #disables autograd makes it faster
    out = model.generate(
        input_ids,
        max_new_tokens=250,
        do_sample=False, # for this experiment we toggle this on and off
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

prompt_len = input_ids.shape[-1]
gen_ids = out[0, prompt_len:].detach().to("cpu").tolist() #only decode the answer, as oppsoed to teh prompt and answer- send the token ids to the cpu to be turned into text
print(tokenizer.decode(gen_ids, skip_special_tokens=True))



Cyanobacteria, as oxygenic photosynthetic prokaryotes, face intense light stress in aquatic and terrestrial environments. To protect themselves from photodamage while maintaining efficient photosynthesis, they employ a sophisticated array of **photoprotective mechanisms**. These mechanisms primarily focus on **dissipating excess excitation energy as heat** to prevent the formation of harmful reactive oxygen species (ROS) that can damage cellular components (DNA, proteins, lipids). Below is a structured description of key mechanisms, emphasizing their biological context and functional interplay:

---

### **1. Photoprotective Pigment Systems & Energy Dissipation**
   - **Carotenoids (e.g., β-carotene, zeaxanthin)**:  
     - Serve dual roles:  
       (a) **Accessory pigments** that broaden light absorption (complementing chlorophyll *a* and *b*).  
       (b) **Quenching singlet oxygen (¹O₂)** generated during light reactions.  
     - *Mechanism*: Carotenoids safely dissipate excess e

In [7]:
# There is indeed a difference. hopwever in order to truly quantify what the difference is at this level we will 
# run the experiemnt with 5 trials of each and then do a diversity similarity statstic 
# to see how mucht the outcomes changed within and between eachother. 
# we also set a seed to control for randomness

In [12]:
import re
import torch

N_RUNS = 5
BASE_SEED = 1234 #we set a seed to control for randomness

def normalize(text: str) -> str:
    return re.sub(r"\s+", " ", text.strip())

def jaccard(a: str, b: str) -> float:
    wa = set(re.findall(r"[a-zA-Z]+", a.lower()))
    wb = set(re.findall(r"[a-zA-Z]+", b.lower()))
    return len(wa & wb) / max(1, len(wa | wb))

def generate_once(do_sample: bool, seed: int):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    with torch.inference_mode():
        out = model.generate(
            input_ids,
            max_new_tokens=250,
            do_sample=do_sample,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    prompt_len = input_ids.shape[-1]
    gen_ids = out[0, prompt_len:].detach().to("cpu").tolist()
    return normalize(tokenizer.decode(gen_ids, skip_special_tokens=True))

def pairwise_stats(texts_a, texts_b):
    sims = []
    for a in texts_a:
        for b in texts_b:
            sims.append(jaccard(a, b))
    return {"min": min(sims), "mean": sum(sims)/len(sims), "max": max(sims)}

# Run
seeds = [BASE_SEED + i for i in range(N_RUNS)]
greedy = [generate_once(False, s) for s in seeds]
sample = [generate_once(True, s) for s in seeds]

print("Unique outputs:")
print("greedy:", len(set(greedy)), "of", N_RUNS)
print("sample:", len(set(sample)), "of", N_RUNS)

print("\nWithin-set similarity (Jaccard):")
print("greedy×greedy:", pairwise_stats(greedy, greedy))
print("sample×sample:", pairwise_stats(sample, sample))

print("\nBetween-set similarity (Jaccard):")
print("greedy×sample:", pairwise_stats(greedy, sample))

print("\n--- Outputs (greedy) ---")
for i, (s, t) in enumerate(zip(seeds, greedy), 1):
    print(f"\n[greedy {i} seed={s}]\n{t}")

print("\n--- Outputs (sample) ---")
for i, (s, t) in enumerate(zip(seeds, sample), 1):
    print(f"\n[sample {i} seed={s}]\n{t}")

Unique outputs:
greedy: 1 of 5
sample: 5 of 5

Within-set similarity (Jaccard):
greedy×greedy: {'min': 1.0, 'mean': 1.0, 'max': 1.0}
sample×sample: {'min': 0.143646408839779, 'mean': 0.35636687966727054, 'max': 1.0}

Between-set similarity (Jaccard):
greedy×sample: {'min': 0.19767441860465115, 'mean': 0.2175417088889349, 'max': 0.25153374233128833}

--- Outputs (greedy) ---

[greedy 1 seed=1234]
Cyanobacteria, as oxygenic photosynthetic prokaryotes, face intense light stress in aquatic and terrestrial environments. To protect themselves from photodamage while maintaining efficient photosynthesis, they employ a sophisticated array of **photoprotective mechanisms**. These mechanisms primarily focus on **dissipating excess excitation energy as heat** to prevent the formation of harmful reactive oxygen species (ROS) that can damage cellular components (DNA, proteins, lipids). Below is a structured description of key mechanisms, emphasizing their biological context and functional interplay:

In [None]:
# sampling is usually paired with temperature and a constraint like top_p or top_k or min_p to trim the tail of unlikely tokens before picking. As we previosuly saw that when sampling is true and all tokens were allowed, even very werid ones had a nonzero chance of beign selected. 

In [15]:
# We will keep do_sample=True and choose a various temperatures:
# 0.2, 0.7, 1.0, 1.3, 1.8
# for each temperature we will rerun 3 times. (3 seeds)
# We will measure within -temp variability and between temp shifts (different T)

In [16]:
import re
import torch

TEMPS = [0.2, 0.7, 1.0, 1.3, 1.8]  # 5 temperature settings
RUNS_PER_TEMP = 3
BASE_SEED = 1234

def normalize(text: str) -> str:
    return re.sub(r"\s+", " ", text.strip())

def jaccard(a: str, b: str) -> float:
    wa = set(re.findall(r"[a-zA-Z]+", a.lower()))
    wb = set(re.findall(r"[a-zA-Z]+", b.lower()))
    return len(wa & wb) / max(1, len(wa | wb))

def generate_once(seed: int, temperature: float):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    with torch.inference_mode():
        out = model.generate(
            input_ids,
            max_new_tokens=80,
            do_sample=True,
            temperature=temperature,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )

    prompt_len = input_ids.shape[-1]
    gen_ids = out[0, prompt_len:].detach().to("cpu").tolist()
    return normalize(tokenizer.decode(gen_ids, skip_special_tokens=True))

def pairwise_stats(texts_a, texts_b):
    sims = []
    for a in texts_a:
        for b in texts_b:
            sims.append(jaccard(a, b))
    return {"min": min(sims), "mean": sum(sims) / len(sims), "max": max(sims)}

results = {}  # temperature -> list[str]

seed_counter = 0
for T in TEMPS:
    texts = []
    for _ in range(RUNS_PER_TEMP):
        seed = BASE_SEED + seed_counter
        seed_counter += 1
        texts.append(generate_once(seed=seed, temperature=T))
    results[T] = texts

print("Unique outputs per temperature:")
for T, texts in results.items():
    print(f"T={T}: {len(set(texts))} of {RUNS_PER_TEMP}")

print("\nWithin-temp similarity (Jaccard):")
for T, texts in results.items():
    print(f"T={T}:", pairwise_stats(texts, texts))

print("\nBetween-temp similarity (Jaccard), temp i vs i+1:")
for (T1, T2) in zip(TEMPS, TEMPS[1:]):
    print(f"{T1} vs {T2}:", pairwise_stats(results[T1], results[T2]))

print("\n--- Outputs ---")
seed_counter = 0
for T in TEMPS:
    print(f"\n===== temperature={T} =====")
    for i, text in enumerate(results[T], 1):
        seed = BASE_SEED + seed_counter
        seed_counter += 1
        print(f"\n[run {i} seed={seed}]\n{text}")


Unique outputs per temperature:
T=0.2: 3 of 3
T=0.7: 3 of 3
T=1.0: 3 of 3
T=1.3: 3 of 3
T=1.8: 3 of 3

Within-temp similarity (Jaccard):
T=0.2: {'min': 0.2289156626506024, 'mean': 0.6099530396719152, 'max': 1.0}
T=0.7: {'min': 0.17045454545454544, 'mean': 0.49382319095647464, 'max': 1.0}
T=1.0: {'min': 0.15384615384615385, 'mean': 0.47799160704712257, 'max': 1.0}
T=1.3: {'min': 0.1744186046511628, 'mean': 0.4583675330597355, 'max': 1.0}
T=1.8: {'min': 0.11956521739130435, 'mean': 0.41641829530028285, 'max': 1.0}

Between-temp similarity (Jaccard), temp i vs i+1:
0.2 vs 0.7: {'min': 0.11458333333333333, 'mean': 0.29572568683514566, 'max': 0.5625}
0.7 vs 1.0: {'min': 0.15294117647058825, 'mean': 0.24569283080079599, 'max': 0.37681159420289856}
1.0 vs 1.3: {'min': 0.11363636363636363, 'mean': 0.18418958611470856, 'max': 0.39705882352941174}
1.3 vs 1.8: {'min': 0.09375, 'mean': 0.15536525735440446, 'max': 0.21839080459770116}

--- Outputs ---

===== temperature=0.2 =====

[run 1 seed=1234]

In [18]:
# Sampling Filters
#Now we will see how influencing which tokens are allowed to be 
#selected influences the outputs. 

In [19]:
#Top_K
# We pick the K most likely tokens at each 
#step, renormalize probabilities to sum to one for those remaining K tokens and 
#sample from that reduced set. 
#Top_K=1 shoudl technically be the same as when sampling=False becasue we only allow the single 
#best token to proceed into selection

In [21]:
import re
import torch

TOP_KS = [1, 2, 3, 4, 5]
RUNS_PER_K = 3
TEMPERATURE = 0.5
BASE_SEED = 1234

def normalize(text: str) -> str:
    return re.sub(r"\s+", " ", text.strip())

def jaccard(a: str, b: str) -> float:
    wa = set(re.findall(r"[a-zA-Z]+", a.lower()))
    wb = set(re.findall(r"[a-zA-Z]+", b.lower()))
    return len(wa & wb) / max(1, len(wa | wb))

def generate_once(seed: int, do_sample: bool, temperature: float | None = None,
top_k: int | None = None):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    gen_kwargs = dict(
        max_new_tokens=80,
        do_sample=do_sample,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    if temperature is not None:
        gen_kwargs["temperature"] = temperature
    if top_k is not None:
        gen_kwargs["top_k"] = top_k

    with torch.inference_mode():
        out = model.generate(input_ids, **gen_kwargs)

    prompt_len = input_ids.shape[-1]
    gen_ids = out[0, prompt_len:].detach().to("cpu").tolist()
    return normalize(tokenizer.decode(gen_ids, skip_special_tokens=True))

def pairwise_stats(texts_a, texts_b):
    sims = []
    for a in texts_a:
        for b in texts_b:
            sims.append(jaccard(a, b))
    return {"min": min(sims), "mean": sum(sims) / len(sims), "max": max(sims)}

results = {}  # label -> list[str]
meta = {}     # label -> list[seed]

seed_counter = 0

# Control: greedy (do_sample=False)
label = "control_greedy"
seeds = [BASE_SEED + seed_counter]  # one run is enough; it should be deterministic
seed_counter += 1
results[label] = [generate_once(seed=seeds[0], do_sample=False)]
meta[label] = seeds

# top_k experiments (do_sample=True)
for k in TOP_KS:
    label = f"top_k={k}"
    seeds = []
    texts = []
    for _ in range(RUNS_PER_K):
        seed = BASE_SEED + seed_counter
        seed_counter += 1
        seeds.append(seed)
        texts.append(generate_once(seed=seed, do_sample=True,
temperature=TEMPERATURE, top_k=k))
    results[label] = texts
    meta[label] = seeds

print("Unique outputs per setting:")
for label, texts in results.items():
    print(f"{label}: {len(set(texts))} of {len(texts)}")

print("\nWithin-setting similarity (Jaccard):")
for label, texts in results.items():
    print(f"{label}:", pairwise_stats(texts, texts))

print("\nBetween-setting similarity (Jaccard), adjacent top_k:")
for k1, k2 in zip(TOP_KS, TOP_KS[1:]):
    a = results[f"top_k={k1}"]
    b = results[f"top_k={k2}"]
    print(f"top_k={k1} vs top_k={k2}:", pairwise_stats(a, b))

print("\n--- Outputs ---")
for label, texts in results.items():
    print(f"\n===== {label} =====")
    for i, text in enumerate(texts, 1):
        seed = meta[label][i - 1]
        print(f"\n[run {i} seed={seed}]\n{text}")


Unique outputs per setting:
control_greedy: 1 of 1
top_k=1: 3 of 3
top_k=2: 3 of 3
top_k=3: 3 of 3
top_k=4: 3 of 3
top_k=5: 3 of 3

Within-setting similarity (Jaccard):
control_greedy: {'min': 1.0, 'mean': 1.0, 'max': 1.0}
top_k=1: {'min': 0.5454545454545454, 'mean': 0.7834534066151713, 'max': 1.0}
top_k=2: {'min': 0.17647058823529413, 'mean': 0.4765152643335815, 'max': 1.0}
top_k=3: {'min': 0.1518987341772152, 'mean': 0.5054729908992984, 'max': 1.0}
top_k=4: {'min': 0.125, 'mean': 0.4977115355757069, 'max': 1.0}
top_k=5: {'min': 0.2597402597402597, 'mean': 0.5247228158620563, 'max': 1.0}

Between-setting similarity (Jaccard), adjacent top_k:
top_k=1 vs top_k=2: {'min': 0.24358974358974358, 'mean': 0.3454948582551978, 'max': 0.576271186440678}
top_k=2 vs top_k=3: {'min': 0.17647058823529413, 'mean': 0.2963473401609485, 'max': 0.5901639344262295}
top_k=3 vs top_k=4: {'min': 0.14102564102564102, 'mean': 0.2893887693178497, 'max': 0.6031746031746031}
top_k=4 vs top_k=5: {'min': 0.13924050

In [23]:
# We observe interesting influence at various levels. Indeed we see mopre variation with higher top k and therefore a prbability of options availabel to be sampled
#However there is an interesting phenomeon in the top_k=1 and control greedy runs int aht they should technically
#be deterministic, and yet we witness very small and slight differences
# Next we need to diverge and figure out where that nondeterminsim is coming from.
# we set up accordingly: 
#Greedy (do_sample=False) repeated with different seeds
#do_sample=True, top_k=1 repeated with the same seed
# Compare greedy vs do_sample=True, top_k=1




In [25]:
import torch

def set_seed(seed: int):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def gen_text(**kwargs):
    with torch.inference_mode():
        out = model.generate(
            input_ids,
            max_new_tokens=80,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            **kwargs,
        )
    gen_ids = out[0, input_ids.shape[-1]:].detach().cpu().tolist()
    return tokenizer.decode(gen_ids, skip_special_tokens=True)

SEEDS = [111, 222, 333]

print("1) Greedy, SAME seed repeated (should match exactly):")
for i in range(3):
    set_seed(999)
    print(f"\nrepeat={i+1} seed=999\n{gen_text(do_sample=False)}")

print("\n2) Greedy, DIFFERENT seeds (should still match exactly):")
for s in SEEDS:
    set_seed(s)
    print(f"\nseed={s}\n{gen_text(do_sample=False)}")

print("\n3) top_k=1, SAME seed repeated (should match; if not, nondeterminism in sampling path):")
for i in range(3):
    set_seed(999)
    print(f"\nrepeat={i+1} seed=999\n{gen_text(do_sample=True, temperature=0.5,
top_k=1)}")

print("\n4) top_k=1, DIFFERENT seeds (should still match; seed shouldn't matter with k=1):")
for s in SEEDS:
    set_seed(s)
    print(f"\nseed={s}\n{gen_text(do_sample=True, temperature=0.5, top_k=1)}")


1) Greedy, SAME seed repeated (should match exactly):

repeat=1 seed=999
Cyanobacteria, as oxygenic photosynthetic prokaryotes, face intense light stress in aquatic and terrestrial environments. To protect themselves from photodamage while maintaining efficient photosynthesis, they employ a sophisticated array of **photoprotective mechanisms**. These mechanisms primarily focus on **dissipating excess excitation energy as heat** to prevent the formation of harmful reactive oxygen species (ROS) that can damage cellular components

repeat=2 seed=999
Cyanobacteria, as oxygenic photosynthetic prokaryotes, face intense light stress in aquatic and terrestrial environments. To protect themselves from photodamage while maintaining efficient photosynthesis, they employ a sophisticated array of **photoprotective mechanisms**. These mechanisms primarily focus on **dissipating excess excitation energy as heat** to prevent the formation of harmful reactive oxygen species (ROS) that can damage cellul

In [27]:
# Interestingly we still see this apprent nondeterministic behaviour. This is
#possible if there are ties between logits, which with tok_k=1 could mean both are survivors
# of the filtering. 
#we will test if there is a tie by revealing how many tokens survive the top_k=1 filter at the first token position wwhere
#two runs diverge.

In [29]:
import torch

def set_seed(seed: int):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def gen_with_scores(seed: int):
    set_seed(seed)
    return model.generate(
        input_ids,
        max_new_tokens=80,
        do_sample=True,
        temperature=0.5,
        top_k=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True,
        output_scores=True,
    )

g1 = gen_with_scores(111)
g2 = gen_with_scores(222)

seq1 = g1.sequences[0]
seq2 = g2.sequences[0]
prompt_len = input_ids.shape[-1]

# find first differing generated token
diff_pos = None
for j in range(min(seq1.numel(), seq2.numel()) - prompt_len):
    if int(seq1[prompt_len + j]) != int(seq2[prompt_len + j]):
        diff_pos = j
        break

print("first_diff_generated_step =", diff_pos)  # 0 means first generated token differs

if diff_pos is not None:
    tok1 = int(seq1[prompt_len + diff_pos])
    tok2 = int(seq2[prompt_len + diff_pos])
    print("seed111 token:", tok1, repr(tokenizer.decode([tok1])))
    print("seed222 token:", tok2, repr(tokenizer.decode([tok2])))

    # scores[j] corresponds to the distribution used to sample generated token at step j
    scores = g1.scores[diff_pos][0]  # take seed111's processed scores at that step
    finite = torch.isfinite(scores)
    allowed = int(finite.sum().item())
    print("allowed_tokens_after_top_k=1_at_that_step =", allowed)

first_diff_generated_step = 15
seed111 token: 7500 ' stress'
seed222 token: 11915 ' exposure'
allowed_tokens_after_top_k=1_at_that_step = 2


In [30]:
# We see that top_k=1 is not alwasy equivalent to Greedy do_sample=False because
# of ties. Specifically, due to precision, two tokens can score the same
# after temperature processing and as a consequence Transformers keeps both of them when sampling


In [38]:
# Top_p
# Now we investigate the filtering technioque of top_p (nucleus sampling
# After temperature is applied, we sort the tokens by probability
# highest to lowest 
# and then we walk down that list and add up the prbabilities : p1 + p2...+p15
#that is cumulative p 
# we then take the smallest batch of those probabilities (which add up to eh cumulative) such
#that it is equal to or greater than the top_p we set. 
# That then leaves us with a smaller set of tokens, 
# we renormalize that smaller set such that they sum back to 1. 
#we then sample from there. 
# temperature strongly effects the peaking of the distribution and therefore has a strong impact on the mass each probability puts forth in terms of the cumualative mass
# and therefore how many tokens are requried to surpass your top_p value. 

In [34]:
import re
import torch

TOP_PS = [0.01, 0.1, 0.3, 0.5, 0.8]
RUNS_PER_P = 3
TEMPERATURE = 0.5
BASE_SEED = 1234

def normalize(text: str) -> str:
    return re.sub(r"\s+", " ", text.strip())

def jaccard(a: str, b: str) -> float:
    wa = set(re.findall(r"[a-zA-Z]+", a.lower()))
    wb = set(re.findall(r"[a-zA-Z]+", b.lower()))
    return len(wa & wb) / max(1, len(wa | wb))

def generate_once(seed: int, do_sample: bool, temperature: float | None = None,
top_p: float | None = None):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    gen_kwargs = dict(
        max_new_tokens=80,
        do_sample=do_sample,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    if temperature is not None:
        gen_kwargs["temperature"] = temperature
    if top_p is not None:
        gen_kwargs["top_p"] = top_p

    with torch.inference_mode():
        out = model.generate(input_ids, **gen_kwargs)

    prompt_len = input_ids.shape[-1]
    gen_ids = out[0, prompt_len:].detach().to("cpu").tolist()
    return normalize(tokenizer.decode(gen_ids, skip_special_tokens=True))

def pairwise_stats(texts_a, texts_b):
    sims = []
    for a in texts_a:
        for b in texts_b:
            sims.append(jaccard(a, b))
    return {"min": min(sims), "mean": sum(sims) / len(sims), "max": max(sims)}

results = {}  # label -> list[str]
meta = {}     # label -> list[seed]

seed_counter = 0

# Control: greedy (do_sample=False)
label = "control_greedy"
seeds = [BASE_SEED + seed_counter]  # one run is enough; it should be deterministic
seed_counter += 1
results[label] = [generate_once(seed=seeds[0], do_sample=False)]
meta[label] = seeds

# top_p experiments (do_sample=True)
for p in TOP_PS:
    label = f"top_p={p}"
    seeds = []
    texts = []
    for _ in range(RUNS_PER_P):
        seed = BASE_SEED + seed_counter
        seed_counter += 1
        seeds.append(seed)
        texts.append(generate_once(seed=seed, do_sample=True,
temperature=TEMPERATURE, top_p=p))
    results[label] = texts
    meta[label] = seeds

print("Unique outputs per setting:")
for label, texts in results.items():
    print(f"{label}: {len(set(texts))} of {len(texts)}")

print("\nWithin-setting similarity (Jaccard):")
for label, texts in results.items():
    print(f"{label}:", pairwise_stats(texts, texts))

print("\nBetween-setting similarity (Jaccard), adjacent top_p:")
for p1, p2 in zip(TOP_PS, TOP_PS[1:]):
    a = results[f"top_p={p1}"]
    b = results[f"top_p={p2}"]
    print(f"top_p={p1} vs top_p={p2}:", pairwise_stats(a, b))

print("\n--- Outputs ---")
for label, texts in results.items():
    print(f"\n===== {label} =====")
    for i, text in enumerate(texts, 1):
        seed = meta[label][i - 1]
        print(f"\n[run {i} seed={seed}]\n{text}")

Unique outputs per setting:
control_greedy: 1 of 1
top_p=0.01: 1 of 3
top_p=0.1: 1 of 3
top_p=0.3: 1 of 3
top_p=0.5: 2 of 3
top_p=0.8: 3 of 3

Within-setting similarity (Jaccard):
control_greedy: {'min': 1.0, 'mean': 1.0, 'max': 1.0}
top_p=0.01: {'min': 1.0, 'mean': 1.0, 'max': 1.0}
top_p=0.1: {'min': 1.0, 'mean': 1.0, 'max': 1.0}
top_p=0.3: {'min': 1.0, 'mean': 1.0, 'max': 1.0}
top_p=0.5: {'min': 0.578125, 'mean': 0.8125, 'max': 1.0}
top_p=0.8: {'min': 0.37142857142857144, 'mean': 0.6853589196872778, 'max': 1.0}

Between-setting similarity (Jaccard), adjacent top_p:
top_p=0.01 vs top_p=0.1: {'min': 1.0, 'mean': 1.0, 'max': 1.0}
top_p=0.1 vs top_p=0.3: {'min': 1.0, 'mean': 1.0, 'max': 1.0}
top_p=0.3 vs top_p=0.5: {'min': 0.5454545454545454, 'mean': 0.6403659233847914, 'max': 0.8301886792452831}
top_p=0.5 vs top_p=0.8: {'min': 0.23170731707317074, 'mean': 0.43212590369069054, 'max': 0.6896551724137931}

--- Outputs ---

===== control_greedy =====

[run 1 seed=1234]
Cyanobacteria, as oxy

In [44]:
# we can verify empiracly how many tokens survive 
# top_p filtering at each generated step. 
# this gives us an idea of what gets filtered for each top_p size and where it does. 
# It is likely that given there were no branches for p<0.5 that the nucleus 
# remained size of 1 for every generation step. 
# However this is not guarenteed, as it is possible that we just got lucky
#and a peaky enough branch was selected each time

In [37]:
import torch

def nucleus_sizes(top_p: float, seed: int = 1234, steps: int = 40):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    gen = model.generate(
        input_ids,
        max_new_tokens=steps,
        do_sample=True,
        temperature=0.5,
        top_p=top_p,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True,
        output_scores=True,
    )

    sizes = [int(torch.isfinite(s[0]).sum().item()) for s in gen.scores]
    return sizes

for p in [0.05, 0.1, 0.3, 0.5, 0.8]:
    sizes = nucleus_sizes(p)
    print(f"top_p={p} nucleus_sizes(first 20 steps) =", sizes[:20], "min/mean/max =", min(sizes), sum(sizes)/len(sizes), max(sizes))

top_p=0.05 nucleus_sizes(first 20 steps) = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] min/mean/max = 1 1.0 1
top_p=0.1 nucleus_sizes(first 20 steps) = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] min/mean/max = 1 1.0 1
top_p=0.3 nucleus_sizes(first 20 steps) = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] min/mean/max = 1 1.0 1
top_p=0.5 nucleus_sizes(first 20 steps) = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1] min/mean/max = 1 1.075 2
top_p=0.8 nucleus_sizes(first 20 steps) = [2, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1] min/mean/max = 1 1.25 3


In [39]:
# min_p
# min_p takes a different appraoch. Again we first apply temperature
# Multiply the p_max (the probability that is greatest in the distribution)
# by our chosen value for min_p
# we then lose the tail end of the distribution which is less than that value
# in other words, we keep the tokens whose temperature normalized probabilities
#a re greater than or equal to the compute threshold (min_p * p_max)
# we then renormalize to this smaller batch and sample accordingly
# fundamentally we are keeping tokens whose probability exists in a relative ffactor to the best 
# (most probable) token
# The bigger our min_p value, the greater likelihood that max_p * min_p is close to max_p and therefore remvoing more and more of the
# tail of the distribution

In [40]:
import re
import torch

MIN_PS = [0.01, 0.1, 0.3, 0.5, 0.8]
RUNS_PER_P = 3
TEMPERATURE = 0.5
BASE_SEED = 1234

def normalize(text: str) -> str:
    return re.sub(r"\s+", " ", text.strip())

def jaccard(a: str, b: str) -> float:
    wa = set(re.findall(r"[a-zA-Z]+", a.lower()))
    wb = set(re.findall(r"[a-zA-Z]+", b.lower()))
    return len(wa & wb) / max(1, len(wa | wb))

def generate_once(
    seed: int,
    do_sample: bool,
    temperature: float | None = None,
    min_p: float | None = None,
):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    gen_kwargs = dict(
        max_new_tokens=80,
        do_sample=do_sample,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )
    if temperature is not None:
        gen_kwargs["temperature"] = temperature
    if min_p is not None:
        gen_kwargs["min_p"] = min_p

    with torch.inference_mode():
        out = model.generate(input_ids, **gen_kwargs)

    prompt_len = input_ids.shape[-1]
    gen_ids = out[0, prompt_len:].detach().to("cpu").tolist()
    return normalize(tokenizer.decode(gen_ids, skip_special_tokens=True))

def pairwise_stats(texts_a, texts_b):
    sims = []
    for a in texts_a:
        for b in texts_b:
            sims.append(jaccard(a, b))
    return {"min": min(sims), "mean": sum(sims) / len(sims), "max": max(sims)}

results = {}  # label -> list[str]
meta = {}     # label -> list[seed]

seed_counter = 0

# Control: greedy (do_sample=False)
label = "control_greedy"
seeds = [BASE_SEED + seed_counter]
seed_counter += 1
results[label] = [generate_once(seed=seeds[0], do_sample=False)]
meta[label] = seeds

# min_p experiments (do_sample=True)
for p in MIN_PS:
    label = f"min_p={p}"
    seeds = []
    texts = []
    for _ in range(RUNS_PER_P):
        seed = BASE_SEED + seed_counter
        seed_counter += 1
        seeds.append(seed)
        texts.append(generate_once(seed=seed, do_sample=True,
temperature=TEMPERATURE, min_p=p))
    results[label] = texts
    meta[label] = seeds

print("Unique outputs per setting:")
for label, texts in results.items():
    print(f"{label}: {len(set(texts))} of {len(texts)}")

print("\nWithin-setting similarity (Jaccard):")
for label, texts in results.items():
    print(f"{label}:", pairwise_stats(texts, texts))

print("\nBetween-setting similarity (Jaccard), adjacent min_p:")
for p1, p2 in zip(MIN_PS, MIN_PS[1:]):
    a = results[f"min_p={p1}"]
    b = results[f"min_p={p2}"]
    print(f"min_p={p1} vs min_p={p2}:", pairwise_stats(a, b))

print("\n--- Outputs ---")
for label, texts in results.items():
    print(f"\n===== {label} =====")
    for i, text in enumerate(texts, 1):
        seed = meta[label][i - 1]
        print(f"\n[run {i} seed={seed}]\n{text}")


Unique outputs per setting:
control_greedy: 1 of 1
min_p=0.01: 3 of 3
min_p=0.1: 3 of 3
min_p=0.3: 3 of 3
min_p=0.5: 2 of 3
min_p=0.8: 3 of 3

Within-setting similarity (Jaccard):
control_greedy: {'min': 1.0, 'mean': 1.0, 'max': 1.0}
min_p=0.01: {'min': 0.2236842105263158, 'mean': 0.5695729222045012, 'max': 1.0}
min_p=0.1: {'min': 0.225, 'mean': 0.496343779677113, 'max': 1.0}
min_p=0.3: {'min': 0.375, 'mean': 0.6050580431177446, 'max': 1.0}
min_p=0.5: {'min': 0.45588235294117646, 'mean': 0.7581699346405228, 'max': 1.0}
min_p=0.8: {'min': 0.5373134328358209, 'mean': 0.744583360255002, 'max': 1.0}

Between-setting similarity (Jaccard), adjacent min_p:
min_p=0.01 vs min_p=0.1: {'min': 0.15, 'mean': 0.27724663262683347, 'max': 0.4307692307692308}
min_p=0.1 vs min_p=0.3: {'min': 0.18518518518518517, 'mean': 0.32819757520353154, 'max': 0.6666666666666666}
min_p=0.3 vs min_p=0.5: {'min': 0.3835616438356164, 'mean': 0.45480920705694, 'max': 0.7454545454545455}
min_p=0.5 vs min_p=0.8: {'min': 0

In [41]:
# We can once again eimpirically test where the branches occur per generation
# This also allows us to see the effect of bringing min_p closer to one and collapsing
# upon the argmax, singel most probabilistic token, whcih we see in the greedy control

In [42]:
import torch

def minp_sizes(min_p: float, seed: int = 1234, steps: int = 40):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    gen = model.generate(
        input_ids,
        max_new_tokens=steps,
        do_sample=True,
        temperature=0.5,
        min_p=min_p,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True,
        output_scores=True,
    )
    sizes = [int(torch.isfinite(s[0]).sum().item()) for s in gen.scores]
    return sizes

for p in [0.01, 0.1, 0.3, 0.5, 0.8]:
    sizes = minp_sizes(p)
    print(f"min_p={p} min/mean/max =", min(sizes), sum(sizes)/len(sizes),
max(sizes))


min_p=0.01 min/mean/max = 1 1.6 8
min_p=0.1 min/mean/max = 1 1.35 4
min_p=0.3 min/mean/max = 1 1.125 2
min_p=0.5 min/mean/max = 1 1.05 2
min_p=0.8 min/mean/max = 1 1.075 2


In [43]:
# Now we move on to max_new_tokens, a setting which specifies how many new 
# tokens can be generated after the proceeding prompt is encoded. 

In [46]:
messages = [{"role": "user", "content": "Describe cyanobacteria photoprotective mechanisms"}]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    enable_thinking=False, #on by default in nemotron
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

with torch.inference_mode():
    out = model.generate(
        input_ids,
        max_new_tokens=250, #the focus point of this experiment, determines how many new tokens we can add
        do_sample=True, 
        temperature=0.8, #sharper than baseline (baseline is 1.0)
        top_k=15, #default is 50, we accept only the top 15(can be more or less due to tieing
        top_p=0.9, #order those 15 (that we determined with top_k = 15) and take the first batch that add up to probability of 0.9
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

prompt_len = input_ids.shape[-1]
gen_ids = out[0, prompt_len:].detach().to("cpu").tolist() 
print(tokenizer.decode(gen_ids, skip_special_tokens=True))

Cyanobacteria, like plants and algae, employ sophisticated photoprotective mechanisms to cope with excess light energy, which can cause oxidative damage to photosynthetic apparatus. These mechanisms are crucial for survival in variable light environments, especially under high light conditions that can lead to photodamage. Here's a detailed description of their key photoprotective strategies:

### 1. **Pigment Composition and Organization**
   - **Light-Harvesting Antennas (Phycobilisomes):** 
     Cyanobacteria use phycobilisomes (PBS) as their primary light-harvesting complexes, composed of phycobiliproteins (e.g., phycoerythrin, phycocyanin) and linker proteins. These antennas efficiently capture light but can become over-excited under high light.
     - **Photoprotective Role:** PBS structures are dynamically regulated. Under high light, cyanobacteria may reduce PBS size or alter their composition to limit light capture, preventing overexcitation of photosystem II (PSII).
     - **

In [47]:
# Next, we similarly look at min_new tokens. 
# This forces the model to produce at least that many new tokens after the prompt is encoded
# before it is allowed to stop. 
# genrally paired with max_new_tokens (as without max, it could run for a long time until eos is produced)

In [48]:
messages = [{"role": "user", "content": "Describe cyanobacteria photoprotective mechanisms"}]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    enable_thinking=False, #on by default in nemotron
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

with torch.inference_mode():
    out = model.generate(
        input_ids,
        max_new_tokens=50,
        min_new_tokens=20,#the focus point of this experiment, determines minimum tokens before eos
        do_sample=True, 
        temperature=0.8, #sharper than baseline (baseline is 1.0)
        top_k=15, #default is 50, we accept only the top 15(can be more or less due to tieing
        top_p=0.9, #order those 15 (that we determined with top_k = 15) and take the first batch that add up to probability of 0.9
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

prompt_len = input_ids.shape[-1]
gen_ids = out[0, prompt_len:].detach().to("cpu").tolist() 
print(tokenizer.decode(gen_ids, skip_special_tokens=True))

Cyanobacteria, as oxygenic photosynthetic prokaryotes, face intense light exposure in aquatic and terrestrial environments, which can lead to **photoinhibition** (damage to the photosynthetic apparatus) and **photodamage** (oxidative stress


In [49]:
# Now we will take a look at eos_token_id
# This is the end of sequence token, like a stop codon in DNA
# If the model generates the eos token ID , Transfomrers treats it as a stop signal and stops
# generationfor that ssequence (unless we override with min_new_tokens)

In [50]:
# We can inspect what eos is currently set to right now by default


In [52]:
tokenizer.eos_token, tokenizer.eos_token_id,
model.generation_config.eos_token_id


[2, 11]

In [None]:
# eos_token is the special end token as text. a string. 
# eos_token_id is the integer ID that the tokenizer reserves for that token
# the string is treated as an atomic token and maps to one ID - different from normal
# text which may map to multiple id's
# model.generation_config.eos_token_id can be a list of ids (any of these tokens ends generation)
# we can also modify/add/override special tokens after loading the model
# Even can override it per call

#In our case we do not have a signle eos, but a list of validd "stop tokens"

# we can see what they actually are by doing this: 

In [53]:
print("tokenizer.eos_token      =", tokenizer.eos_token)
print("tokenizer.eos_token_id   =", tokenizer.eos_token_id)
print("generation eos_token_id  =", model.generation_config.eos_token_id)

for tid in model.generation_config.eos_token_id:
    print(tid, repr(tokenizer.decode([tid], skip_special_tokens=False)))

tokenizer.eos_token      = <|im_end|>
tokenizer.eos_token_id   = 11
generation eos_token_id  = [2, 11]
2 '</s>'
11 '<|im_end|>'


In [55]:
# Our tokenizers official EOS token is <|im_end|> with an id of 11
# The models generation defaults say stop if either id 2 (</s>) or id 11 (<|im_end|>
# If we edit our model generate such that model.generate(..., eos_token_id=tokenizer.eos_token_id)
# we would be setting it such that it will only stop on id 11
# if we instead do model.generate(...,eos_token_id=model.generation_config.eos_token_id)
# we would stop on either 2 or 11. 

In [57]:
# Now we take a look at pad_token_id
# pad_token_id is a token ID used for padding and filling shorter sequences so that 
# a batch has the same length
# many operations expect rectangular tensors (same legnth alogn sequence dimension)
# useful in beam search where some sequences may end ealrier than others
# if one sequence hits eops early and the others keep generating
# Transformers pads the finished ones out to the same length to keep everything in one tensor
# pad_token_id tells transformers which token id to use for the padding
# We can see what it is with:

In [58]:
print("tokenizer.pad_token =", tokenizer.pad_token)
print("tokenizer.pad_token_id =", tokenizer.pad_token_id)

print("model.config.pad_token_id =", getattr(model.config, "pad_token_id",
None))
print("model.generation_config.pad_token_id =", getattr(model.generation_config,
"pad_token_id", None))

tokenizer.pad_token = None
tokenizer.pad_token_id = None
model.config.pad_token_id = 0
model.generation_config.pad_token_id = 0


In [59]:
# There is no defined padding token so pad_token_id is none.
# However, the model itself uses token id 0 as the pad
# we can inspect what id 0 is:

In [60]:
print(tokenizer.convert_ids_to_tokens(0))
print(repr(tokenizer.decode([0], skip_special_tokens=False)))
print(0 in tokenizer.all_special_ids, tokenizer.all_special_ids)

<unk>
'<unk>'
True [1, 11, 0]


In [61]:
# so pad with id 0 means pad with <unk> 
# 0 decodes to <unk> and its listed as a special id [1, 11, 0]

In [62]:
# Just as a reminder lets decode all teh special tokens:


In [63]:
for tid in [0, 1, 2, 11]:
    print(tid, tokenizer.convert_ids_to_tokens(tid),
repr(tokenizer.decode([tid], skip_special_tokens=False)))

0 <unk> '<unk>'
1 <s> '<s>'
2 </s> '</s>'
11 <|im_end|> '<|im_end|>'


In [64]:
# This maps to 
# 0 = <unk> unknown
# 1 = <s> start-of-sequence
# 2 = </s> end-of-sequence
# 11 = <|im_end|> end-of-message - tokenizer EOS

In [65]:
# <unk> isnt necessarily a safe token to have as padding becasue it can smemantically interfere
# with the models attention unless it is given an attention_mask explicitely. 
# A safe fallback is to use pad_token_id=tokenizer.eos_token_id  
# it is usually safe to use the eos token id as padding becasue models are typically trained seeign teh eos 
# near the ned. 
# Later when we discuss attention_mask with can pad with <unk> with no worries
# this becomes more relevent when we discuss batching

In [30]:
messages = [{"role": "user", "content": "Describe cyanobacteria photoprotective mechanisms"}]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    enable_thinking=True, #on by default in nemotron
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

with torch.inference_mode():
    out = model.generate(
        input_ids,
        max_new_tokens=120,
        min_new_tokens=20,#determines minimum tokens before eos
        do_sample=True, 
        temperature=0.3, #sharper than baseline (baseline is 1.0)
        top_k=15, #default is 50, we accept only the top 15(can be more or less due to tieing
        top_p=0.9, #order those 15 (that we determined with top_k = 15) and take the first batch that add up to probability of 0.9
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id, #safe as we are not doing batching yet
    )

prompt_len = input_ids.shape[-1]
gen_ids = out[0, prompt_len:].detach().to("cpu").tolist() 
print(tokenizer.decode(gen_ids, skip_special_tokens=True))

The user asks: "Describe cyanobacterial photoprotective mechanisms". They want a description. Provide detailed explanation of photoprotection in cyanobacteria: mechanisms like non-photochemical quenching (NPQ), state transitions, xanthophyll cycle (though cyanobacteria have different pigments), protective pigments like scytonemin, mycosporine-like amino acids, carotenoids, phycobiliproteins, PSII repair, cyclic electron flow, antioxidant systems, etc. Also mention acclimation to high light, structural changes, etc. Provide thorough answer. No disallowed content. So


In [9]:
# The remainder of this tutorial will focus on Beam Searching
# Beam Searcher is a deterministc decoding algorithm which keeps the top
# num_beams partial continuations (beams) at every generation step. 
# these beams are ranked by their cumulative log-probability 
# which is also hindered by a length penalty (or other paramter)
# Like no_sampling (Greedy) beam searching is deterministic. 
# It therefore doesnt sample a distribution of probable next tokens
# It aquires the next probable token for each beam after testing cumulative logprob
# Beam searching always retains the top num_beams, irrespective on whose original pool
# the pootential beam came from. During generation X for example, if
# there are num_beams = 8, all 8 new beams could come from generations 7's 
# beam number 5 for example.
# if the other beams cumulative logprob isnt good enough
# important to note is that there are 2 pools
# active beams whicha re alwasy kept at num_beams and finished beams whicha re stored
# seperately as finished beams (have had an eos).
# there may be 2 beams in finished beams, but there would still be 3 beams in 
# active beams if num_beams=3

In [12]:
# Early stopping is a phenomenon in which beam search ends as a consequence of
# defined criteria (before max_new_tokens)
# The goal is to say "can any unfinished beam, beat the finished beam"
# Finisheed beams are those which have ended with EOS and are stored as a complete 
# Beamhypothesis
# To stop early, the algorithm therefore requires an upper bound on how
# good an unfinished beam could possibly become if it was extended further. 
# Huggingface uses a current score and length normalization
# (length_penalty) to determine this. 
# If early_stopping = True, then as soon as num_beams = the num finished candidates
# the search ends (this means active beams that are still searching cease).



In [18]:
messages = [{"role": "user", "content": "Describe cyanobacteria photoprotective mechanisms"}]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    enable_thinking=False, #on by default in nemotron
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

with torch.inference_mode():
    out = model.generate(
        input_ids,
        max_new_tokens=50,
        min_new_tokens=20,#determines minimum tokens before eos
        do_sample=False, 
        num_beams=5,
        num_return_sequences=5, # so we can see the top beams produced
        early_stopping=True, # once num_beams = finished beams , stop
        length_penalty=1.0, # default
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id, #safe as we are not doing batching yet
    )

prompt_len = input_ids.shape[-1]
# So we can decode and see all outputs
for i in range(out.shape[0]):
    gen_ids = out[i, prompt_len:].detach().cpu().tolist()
    print(f"\n--- sequence {i+1} ---")
    print(tokenizer.decode(gen_ids, skip_special_tokens=True))


--- sequence 1 ---
Cyanobacteria, as oxygenic photosynthetic prokaryotes, face intense light stress in aquatic and terrestrial environments. To protect themselves from photodamage while maintaining efficient photosynthesis, they employ a sophisticated array of **photoprotective mechanisms**. These mechanisms primarily

--- sequence 2 ---
Cyanobacteria, as oxygenic photosynthetic prokaryotes, face intense light stress in aquatic and terrestrial environments. To protect themselves from photodamage while maintaining efficient photosynthesis, they employ a sophisticated array of **photoprotective mechanisms**. These can be

--- sequence 3 ---
Cyanobacteria, as oxygenic photosynthetic prokaryotes, face intense light stress in aquatic and terrestrial environments. To protect themselves from photodamage while maintaining efficient photosynthesis, they employ a sophisticated array of **photoprotective mechanisms**, primarily centered on

--- sequence 4 ---
Cyanobacteria, as photosynthetic pro

In [None]:
# Important to not eis that we will not see any
# changes in outputs if stopping conditions of eos are not met
# In other words, in order for early stopping to matter, eos conditions must be hit
# Below I have changed the prompt to directly specify 1 sentences. 

In [14]:
# If early_stopping=False, The search does not end when we collect num_beams
# compelted sequences, instead a new check emerges which fundamentally asksd the question:
# "can any unfinished beam still win?"
# if the check says no, then it stops, if it says true, it porocedes until it
# says no.
# Once num_beams = finished beams HF computes a upper bound on how good the
# best active beam could possibly get given the scoring 
# scheme (length penalty, etc)
# If the best case active score cant beat the worst of the finished beams
# then it stops, otherwise the process continues.
# this process is considered heuristic in that the decoder cant actually
# know if a token will emerge that makes an active beam beat a finished beam
# so instead it uses the scoring normalization (length penalty, default 1)
# The length penalty normalizes all beams scores so that active beams can be compared
# to finished beams. 
# once the num_beams = finished beams (we can think of these as limited
# slots tha can be filled) , 
# the heuristic check occurs 
# if an active beam has a best possible case score such that it surpasses the worst case in the
# finished pool (both ebing normalzied via length penalty), the generation procede


In [21]:
messages = [{"role": "user", "content": "Describe cyanobacteria photoprotective mechanisms in 1 sentences"}]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    enable_thinking=False, #on by default in nemotron
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

with torch.inference_mode():
    out = model.generate(
        input_ids,
        max_new_tokens=100, # higher means more opportunity for eos
        do_sample=False, 
        num_beams=5,
        num_return_sequences=5, # so we can see the top beams produced
        early_stopping=False, # Heuristic, can current normalized scores beat finished scores
        length_penalty=1.0, # default
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id, #safe as we are not doing batching yet
    )

prompt_len = input_ids.shape[-1]
# So we can decode and see all outputs
for i in range(out.shape[0]):
    gen_ids = out[i, prompt_len:].detach().cpu().tolist()
    print(f"\n--- sequence {i+1} ---")
    print(tokenizer.decode(gen_ids, skip_special_tokens=True))


--- sequence 1 ---
Cyanobacteria protect themselves from excess light by employing photoprotective mechanisms such as non‑photochemical quenching (NPQ), where excess energy is safely dissipated as heat via carotenoid‑dependent pathways and the xanthophyll cycle, alongside the production of photoprotective pigments (e.g., scytonemin, mycosporine-like amino acids) and robust antioxidant systems that scavenge reactive oxygen species.

--- sequence 2 ---
Cyanobacteria protect themselves from excess light by employing photoprotective mechanisms such as non‑photochemical quenching (NPQ), where excess energy is safely dissipated as heat via carotenoid‑dependent pathways and the xanthophyll cycle, alongside the production of photoprotective pigments (e.g., scytonemin and mycosporine‑like amino acids) and robust antioxidant systems that scavenge reactive oxygen species.

--- sequence 3 ---
Cyanobacteria protect themselves from excess light by employing photoprotective mechanisms such as non‑ph

In [15]:
# Early stopping='never' is the most conservative beam search stop
# Fundamentally, while early_stopping=false uses the current length of the beam, early_stopping
# =never uses the max length of the beams
# specifically it says:
# After num_beams=finished beams, do not stop until we determine
# there are no active beams which can ever beat the worst finished beam under
# the scoring function (this time it uses length peenalty based on 
# the max_length that we set, as oppsoed to teh current length of the beams
# "assume logprob of all remaining tokens until max_length is 0 (probability of 1)
# (a quick aside is that max_length and max_new_tokens are different knobs , but we can set
# them to be equal

In [23]:
messages = [{"role": "user", "content": "Describe cyanobacteria photoprotective mechanisms in 1 sentence"}]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    enable_thinking=False, #on by default in nemotron
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

with torch.inference_mode():
    out = model.generate(
        input_ids,
        max_new_tokens=100, # higher means more opportunity for eos
        do_sample=False, 
        num_beams=5,
        num_return_sequences=5, # so we can see the top beams produced
        early_stopping='never', # Heuristic, can max length normalized scores beat finished scores
        length_penalty=1.0, # default
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id, #safe as we are not doing batching yet
    )

prompt_len = input_ids.shape[-1]
# So we can decode and see all outputs
for i in range(out.shape[0]):
    gen_ids = out[i, prompt_len:].detach().cpu().tolist()
    print(f"\n--- sequence {i+1} ---")
    print(tokenizer.decode(gen_ids, skip_special_tokens=True))


--- sequence 1 ---
Cyanobacteria protect themselves from excess light by employing photoprotective mechanisms such as non‑photochemical quenching (NPQ) and carotenoid‑mediated energy dissipation, which safely dissipate surplus excitation energy as heat to prevent oxidative damage.

--- sequence 2 ---
Cyanobacteria protect themselves from excess light by employing photoprotective mechanisms such as non‑photochemical quenching (NPQ) and carotenoid‑mediated energy dissipation, which safely dissipate surplus excitation energy as heat.

--- sequence 3 ---
Cyanobacteria protect themselves from excess light by employing photoprotective mechanisms such as non‑photochemical quenching (NPQ) and carotenoid‑mediated energy dissipation, which safely dissipate surplus excitation energy as heat and prevent oxidative damage.

--- sequence 4 ---
Cyanobacteria protect themselves from excess light by employing photoprotective mechanisms such as non‑photochemical quenching (NPQ) and carotenoid‑mediated e

In [24]:
# We will now return internal bookkeeping to really get a feel for what is 
# happening under the hood for each of these calls. 

In [41]:
import torch
from transformers import StoppingCriteria, StoppingCriteriaList

class BeamPrefixTracer(StoppingCriteria):
    def __init__(self, tokenizer, num_beams, max_steps: int = 25, tail_tokens: int = 12):
        self.tokenizer = tokenizer
        self.num_beams = num_beams
        self.max_steps = max_steps
        self.tail_tokens = tail_tokens
        self.step = 0
        self.records = []  # list of beam tail strings per step

    def __call__(self, input_ids, scores, **kwargs):
        if self.step == 0:
            print("beam rows (input_ids.shape[0]) =", input_ids.shape[0])
        if self.step < self.max_steps:
            rows = input_ids[: self.num_beams]  # keep only active beams
            tails = rows[:, -self.tail_tokens:].detach().cpu().tolist()
            tail_texts = [self.tokenizer.decode(t, skip_special_tokens=False) for t in tails]
            self.records.append(tail_texts)
        self.step += 1
        return False  # never stop early ourselves

messages = [{"role": "user", "content": "Describe cyanobacteria photoprotective mechanisms in 1 sentence"}]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    enable_thinking=False,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

NUM_BEAMS = 5
prompt_len = input_ids.shape[-1]
tracer = BeamPrefixTracer(tokenizer, NUM_BEAMS, max_steps=25, tail_tokens=14)

with torch.inference_mode():
    gen = model.generate(
        input_ids,
        max_new_tokens=100,
        do_sample=False,
        num_beams=NUM_BEAMS,
        num_return_sequences=NUM_BEAMS,
        early_stopping=False,
        length_penalty=1.0,
        return_dict_in_generate=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        stopping_criteria=StoppingCriteriaList([tracer]),
    )

# We print the new statistics
print("sequences shape:", tuple(gen.sequences.shape))
print("has beam_indices:", getattr(gen, "beam_indices", None) is not None)

print("\n=== Beam evolution (tail of each active beam per step) ===")
for step, tails in enumerate(tracer.records):
    print(f"\nstep {step}")
    for b, t in enumerate(tails):
        print(f"beam_row {b}: {t!r}")

# Decode all outputs
for i in range(gen.sequences.shape[0]):
    gen_ids = gen.sequences[i, prompt_len:].detach().cpu().tolist()
    print(f"\n--- sequence {i+1} ---")
    print(tokenizer.decode(gen_ids, skip_special_tokens=True))

beam rows (input_ids.shape[0]) = 10
sequences shape: (5, 82)
has beam_indices: True

=== Beam evolution (tail of each active beam per step) ===

step 0
beam_row 0: ' mechanisms in 1 sentence<|im_end|>\n<|im_start|>assistant\n<think></think>C'
beam_row 1: ' mechanisms in 1 sentence<|im_end|>\n<|im_start|>assistant\n<think></think>They'
beam_row 2: ' mechanisms in 1 sentence<|im_end|>\n<|im_start|>assistant\n<think></think>"'
beam_row 3: ' mechanisms in 1 sentence<|im_end|>\n<|im_start|>assistant\n<think></think>**'
beam_row 4: ' mechanisms in 1 sentence<|im_end|>\n<|im_start|>assistant\n<think></think>The'

step 1
beam_row 0: ' in 1 sentence<|im_end|>\n<|im_start|>assistant\n<think></think>Cyan'
beam_row 1: ' in 1 sentence<|im_end|>\n<|im_start|>assistant\n<think></think>They employ'
beam_row 2: ' in 1 sentence<|im_end|>\n<|im_start|>assistant\n<think></think>"C'
beam_row 3: ' in 1 sentence<|im_end|>\n<|im_start|>assistant\n<think></think>**C'
beam_row 4: ' in 1 sentence<|im_end|>\n<|im

In [42]:
# In chat style tasks it is hard to reap the full benefits of beam
# search and the knobs therewithin. 
# Its optimal use really come in when we are dealing with longer and structurally constained 
# outputs

In [43]:
# Now we will look at ngram, repetition penalties,
# and ngram sizes

In [44]:
# an ngram is a sequence of ntokens, like a codon
# 1-gram (one token)
# 2-gram (two token)
# etc. 
# We can tune transformers to restrict repition on the ngram scale
# Specifically preventing ngrams of a certain size 
# no_repeat_ngram_size=N

In [45]:
messages = [{"role": "user", "content": "Describe cyanobacteria photoprotective mechanisms"}]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    enable_thinking=False, #on by default in nemotron
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

with torch.inference_mode():
    out = model.generate(
        input_ids,
        max_new_tokens=250, 
        do_sample=True, 
        temperature=0.8,
        top_k=15, 
        top_p=0.9, 
        no_repeat_ngram_size=2, # We can tune the ngram size here
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

prompt_len = input_ids.shape[-1]
gen_ids = out[0, prompt_len:].detach().to("cpu").tolist() 
print(tokenizer.decode(gen_ids, skip_special_tokens=True))

Cyanobacteria, as oxygenic photosynthetic prokaryotes, face intense sunlight that can cause oxidative damage to their photosynthetic machinery. To protect against this, they employ a sophisticated array of **photoprotection mechanisms**, primarily focused on **dissipating excess energy as heat** and **preventing the formation of harmful reactive oxygen species (ROS)**. Below is a structured breakdown of the key mechanisms, emphasizing their biological significance and molecular basis:

---

### **1. Non-Photochemical Quenching (NPQ) – The Primary Defense**
   - **Purpose**: Safely dissipate excess excitation energy from chlorophyll as harmless heat, preventing ROS generation.
   **Key Components & Processes**:
   * **Light-Harvesting Complexes (LHCs)**: 
     - Cyanobacterial LHC-like complexes (e.g., **CP43**, **LhcX**, and the **PE/PC** antenna system) absorb light energy. Under high light, these complexes become over-excited.
     * **"Quencher" Molecules**: 

       *   **(Xanthoph

In [46]:
# The concept is quite intuitive. Heres the same 
# code again but with a different size ngram restriction


In [47]:
messages = [{"role": "user", "content": "Describe cyanobacteria photoprotective mechanisms"}]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    enable_thinking=False, #on by default in nemotron
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

with torch.inference_mode():
    out = model.generate(
        input_ids,
        max_new_tokens=250, 
        do_sample=True, 
        temperature=0.8,
        top_k=15, 
        top_p=0.9, 
        no_repeat_ngram_size=3, # We can tune the ngram size here
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

prompt_len = input_ids.shape[-1]
gen_ids = out[0, prompt_len:].detach().to("cpu").tolist() 
print(tokenizer.decode(gen_ids, skip_special_tokens=True))

Cyanobacteria, like plants and algae, employ a sophisticated suite of **photoprotective strategies** to shield their photosynthetic machinery (primarily Photosystem II, PSII) from oxidative damage caused by excess light energy. These mechanisms are critical for survival in high-light environments (e.g., surface waters, shallow lakes) where absorbed light energy can exceed the capacity for safe energy dissipation. Below is a structured overview of their key photoprotection mechanisms, integrating molecular, physiological, and structural adaptations:

---

### **1. Non-Photochemical Quenching (NPQ) in Cyanobacteria**
   - **Mechanism**: 
     - In cyanobacteria, NPQ is primarily driven by **energy dissipation within the phycobilisome (PBS) antenna complexes**. When light intensity exceeds photosynthetic demand, excess energy is safely dissipated as heat via conformational changes in phycocyanin (a major PBS pigment).
     - This process is mediated by the **orange carotenoid protein (OCP

In [48]:
#the ngram thing is again something hard to notice unless we are doing a really huge output.
# it seems like a fudnamantlally import thing to set though so one inst dissapointed with
# repetition later on
# it is indeed a critical guardrail, important to leave on for longer generations
# However, asn alternative exists which isnt as harsh. repetition_penalty



In [49]:
# repetition_penalty as opposed to banning repetition of a certain size, instead
# penalizes it. It is a soft restriction, reducing the probability
# of repeating words by modifying logits
# In practice, transformers adjusts the logits for tokens
# that have appeared before so they are less attractive
# 1.0 means no penalty and is on by default
# greater than 1.0 means penalize repeats more
# less than 1.0 encourages repetition

# Specifically, logits are divided by the penalty if they
# occured more than zero times
# and are multiplied by the penalty if they appear less than 0
# this thereofre encourages or decourages specific logits depending how you set
# the value of the penalty

In [50]:
# Below we will see what happens when we run the same p[rompt and settings 
# but with reepetion_penalty set less than one, encouraging repetion. 


In [54]:
messages = [{"role": "user", "content": "Describe cyanobacteria photoprotective mechanisms"}]

input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    enable_thinking=False, #on by default in nemotron
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

with torch.inference_mode():
    out = model.generate(
        input_ids,
        max_new_tokens=250,
        min_new_tokens=40,
        do_sample=True, 
        temperature=0.8,
        top_k=15, 
        top_p=0.9, 
        repetition_penalty=0.7, # Encourages lots of repeats since logits tthat have occured are divided by a small number = big and logits that havent occured are multiplied by a small number = small
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
    )

prompt_len = input_ids.shape[-1]
gen_ids = out[0, prompt_len:].detach().to("cpu").tolist() 
print(tokenizer.decode(gen_ids, skip_special_tokens=True))

Cyanobacteria, as phototrophic cyanobacteria, employ a sophisticated, multi-layered photoprotective mechanisms to cope with intense, rapidly fluctuating, and potentially photodamaging intense sunlight, especially to prevent photodamage to the photosystems and the photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystems, especially photosystem