In [16]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the Nemotron checkpoint.
# trust_remote_code=True executes Python code from the model repository, so
# the load is pinned to a fixed commit (the same snapshot hash used when the
# model itself is loaded) to keep that code from changing between runs.
tok = AutoTokenizer.from_pretrained(
    "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
    trust_remote_code=True,
    revision="2e43387afd60157064e5bef4e9a583f887c6dfdd",
)

# Tokenize one sentence and show each stage of the round trip:
# raw text -> token IDs -> token strings -> decoded text.
text = "Cyanobacteria produce photoprotective pigments"
enc = tok(text)

print("Text:", text)
print("Token IDs:", enc["input_ids"])
print("Tokens:", tok.convert_ids_to_tokens(enc["input_ids"]))
print("Decoded back:", tok.decode(enc["input_ids"]))

Text: Cyanobacteria produce photoprotective pigments
Token IDs: [1067, 10978, 70475, 9470, 7024, 25227, 67532, 95592]
Tokens: ['C', 'yan', 'obacteria', 'Ġproduce', 'Ġphot', 'oprote', 'ctive', 'Ġpigments']
Decoded back: Cyanobacteria produce photoprotective pigments


In [3]:
# Notice that not every word is a token and not every token is a word:
# sometimes an odd combination of characters is a single token, and sometimes
# only a letter or part of a word is a token.


In [4]:
# A tokenizer is the component that converts text into the token IDs that
# the model operates on.
# AutoTokenizer is a transformers helper that automatically loads the
# correct tokenizer class and files for a given model, so we don't have to
# know what the tokenizer class is.


In [5]:
# In the last lab we spent some time discussing EOS (end-of-sequence) tokens.
# Here we will try to force the model to stop unexpectedly: we set EOS to a
# specific token of our choice, run the model in a setting where that token
# has a high probability of being produced, and observe what happens.


In [6]:
# First, let's identify the token ID associated with the word we want to use as EOS ("cell").

In [17]:
from transformers import AutoTokenizer

# Reuse the tokenizer `tok` loaded in the first tokenization cell rather than
# reloading the identical tokenizer; only the input text changes here.
text = "cell"
enc = tok(text)

# Same round trip as before: raw text -> token IDs -> token strings -> text.
print("Text:", text)
print("Token IDs:", enc["input_ids"])
print("Tokens:", tok.convert_ids_to_tokens(enc["input_ids"]))
print("Decoded back:", tok.decode(enc["input_ids"]))

Text: cell
Token IDs: [19187]
Tokens: ['cell']
Decoded back: cell


In [10]:
# Luckily, the word "cell" is exactly one token long. Another key
# word, such as "scytonemin", is split into four tokens if you check!

In [11]:
# We can take a look again at the special tokens 
# associated with our model currently: 

In [15]:
#load model

In [18]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint identity and on-disk location.
model_id = "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16"
revision = "2e43387afd60157064e5bef4e9a583f887c6dfdd"  # your cached snapshot
cache_dir = "/data/hf/hub"

# Keyword arguments shared by the tokenizer and the model load, so both are
# read from the same pinned snapshot in the local cache (no network access).
shared_load_kwargs = dict(
    trust_remote_code=True,
    revision=revision,
    cache_dir=cache_dir,
    local_files_only=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, **shared_load_kwargs)

# Load the weights in bfloat16 and place the whole model on the first GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
    device_map="cuda:0",
    **shared_load_kwargs,
)

print("Loaded tokenizer + model on", model.device)

Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

Loaded tokenizer + model on cuda:0


In [19]:
# Show which end-of-sequence (EOS) tokens are currently configured: the
# tokenizer's own EOS token, and the ID(s) the generation config stops on.
generation_eos = model.generation_config.eos_token_id

print("tokenizer.eos_token      =", tokenizer.eos_token)
print("tokenizer.eos_token_id   =", tokenizer.eos_token_id)
print("generation eos_token_id  =", generation_eos)

# generation_config.eos_token_id may be a single int, a list of ints, or
# None; normalise it to a list so the loop below works in every case.
if generation_eos is None:
    generation_eos_ids = []
elif isinstance(generation_eos, int):
    generation_eos_ids = [generation_eos]
else:
    generation_eos_ids = list(generation_eos)

# Decode each stop ID back to its string form (special tokens kept visible).
for tid in generation_eos_ids:
    print(tid, repr(tokenizer.decode([tid], skip_special_tokens=False)))

tokenizer.eos_token      = <|im_end|>
tokenizer.eos_token_id   = 11
generation eos_token_id  = [2, 11]
2 '</s>'
11 '<|im_end|>'


In [21]:
# Experiment: override the stop token with an ordinary vocabulary token and
# see whether generation halts when the corresponding word shows up.
SEED = 0            # fixed seed so the sampled output is reproducible
STOP_WORD = "cell"  # word whose token we (mis)use as the EOS token

# Look the stop token up from the tokenizer instead of hard-coding its ID,
# and verify the word really is a single token (otherwise it cannot be used
# as one eos_token_id).
stop_word_ids = tokenizer.encode(STOP_WORD, add_special_tokens=False)
assert len(stop_word_ids) == 1, (
    f"{STOP_WORD!r} encodes to {len(stop_word_ids)} tokens; expected exactly 1"
)
stop_token_id = stop_word_ids[0]

messages = [{"role": "user", "content": "Describe cyanobacteria photoprotective mechanisms in relation to practecting  the prokaryotic system. Contained in a phospholipid bilayer"}]

# Render the chat prompt to token IDs and move them to the model's device.
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    enable_thinking=False,  # on by default in nemotron
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# Seed right before sampling so re-running this cell gives the same text.
torch.manual_seed(SEED)

with torch.inference_mode():
    out = model.generate(
        input_ids,
        max_new_tokens=250,
        min_new_tokens=40,  # the stop token cannot end generation before this many new tokens
        do_sample=True,
        temperature=0.8,
        top_k=15,
        top_p=0.9,
        repetition_penalty=1.2,
        eos_token_id=stop_token_id,  # token for STOP_WORD replaces the usual EOS
        pad_token_id=tokenizer.eos_token_id,
    )

# Keep only the newly generated tokens (drop the prompt) and decode them.
prompt_len = input_ids.shape[-1]
gen_ids = out[0, prompt_len:].detach().to("cpu").tolist()
print(tokenizer.decode(gen_ids, skip_special_tokens=True))

## Cyanobacterial Photoprotection – How Prokaryotes Keep Their Light‑Harvest Machinery Safe  

Cyanobacteria are photoautotrophic bacteria that live at the interface of light and water, where they must harvest photons for photosynthesis while avoiding the damaging effects of excess excitation energy (EEE).  Unlike eukaryotes whose photosynthetic apparatus is housed inside chloroplasts bounded by double membranes, cyanobacteria have their photosystems embedded directly into **a single, continuous plasma membrane** that also contains all other essential bioenergetic complexes (respiratory chains, transporters, etc.). This “prokaryotic” organization imposes unique constraints on how protective strategies can be assembled, deployed, and coordinated.

Below we walk through the main photoprotective pathways that cyanobacteria employ, linking each mechanism back to its structural context within the cell envelope. The focus is always on *how* the architecture—a lipid bilayer dotted with pigmen

In [22]:
# This experiment provides valuable insight. By setting eos_token_id we
# command "stop if token ID 19187 is produced", not "stop if the word
# 'cell' is produced".
# The token is the fundamental unit and, as we have seen, even though the
# word "cell" is present in our output, it was not generated as token ID 19187:
# the same letters can be produced by different token IDs (for example, when
# the word follows a space or is part of a longer token).