In [1]:
import os

In [2]:
os.environ["HF_HOME"] = os.path.join(os.getcwd(), "model_cache")

In [3]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
model_name = os.getenv("MODEL_NAME", "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = model.to(device)

Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.


In [7]:
text = "Hi"

In [8]:
input_encoding = tokenizer.apply_chat_template(
    conversation=[{"role": "user", "content": "Hi"}], return_tensors="pt", add_generation_prompt=True, return_dict=True,
)

preds = model.generate(
    input_encoding["input_ids"].to(device),
    max_new_tokens=100, 
    # do_sample=True, 
    temperature=0.1, 
    top_p=0.95, 
    repetition_penalty=1.2,
    pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
).squeeze()
answer = tokenizer.decode(preds[input_encoding["input_ids"].shape[1]:], skip_special_tokens=True)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [9]:
input_encoding

{'input_ids': tensor([[151646, 151644,  13048, 151645, 151648,    198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [10]:
tokenizer.decode(input_encoding["input_ids"][0])

'<｜begin▁of▁sentence｜><｜User｜>Hi<｜Assistant｜><think>\n'

In [11]:
answer

'Alright, the user just said "Hi." That\'s a friendly greeting. I should respond in a warm and welcoming manner.\n\nI want to make sure they feel comfortable asking anything. Maybe add an emoji to keep it light and approachable.\n\nLet me think of something simple but polite like, "Hello! How can I assist you today?" Yeah, that sounds good.\n</think>\n\nHello! How can I assist you today?'

In [12]:
input_encoding = tokenizer(
    [text],
    # padding="longest",
    # max_length=20,
    # truncation=True,
    return_tensors="pt",
)

preds = model.generate(
    input_encoding["input_ids"].to(device), 
    max_new_tokens=100, 
    # do_sample=True, 
    temperature=0.1, 
    top_p=0.95, 
    repetition_penalty=1.2,
    pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
).squeeze()[len(input_encoding["input_ids"]):]
answer = tokenizer.decode(preds, skip_special_tokens=True)

In [13]:
answer

'Hi, I\'m trying to solve this problem: "Given a positive integer n and an array A of length n. You have two arrays X and Y such that for each i from 1 to n-1, the sum of elements in X is equal to the sum of elements in Y." Wait, no, sorry, maybe it\'s different.\n\nWait, let me read again:\n\n"Given a positive integer n and an array A of length n. You have two arrays X and Y such that'

In [14]:
text_1 = tokenizer.apply_chat_template(
    conversation=[{"role": "user", "content": "Hi"}], return_tensors="pt", add_generation_prompt=True, tokenize=False
)

text_2 = tokenizer.apply_chat_template(
    conversation=[{"role": "user", "content": "Hi what are you doing"}], return_tensors="pt", add_generation_prompt=True, tokenize=False
)

In [22]:
input_encoding_1 = tokenizer(
    [text_1], return_tensors="pt", padding="max_length", max_length=20, truncation=True,
)

In [23]:
input_encoding_2 = tokenizer(
    [text_2], return_tensors="pt", padding="max_length", max_length=20, truncation=True,
)

In [24]:
model(input_ids=input_encoding_1["input_ids"].to(device), attention_mask=input_encoding_1["attention_mask"].to(device), labels=input_encoding_2["input_ids"].to(device)).loss

tensor(9.3753, device='cuda:0', grad_fn=<NllLossBackward0>)

In [31]:
text = "Hi"
edit = "Hi what are you doing"

In [32]:
prompt = [text]
label = [edit]

In [33]:
mask_token = -100

In [36]:
tokenizer.apply_chat_template(
    conversation=[{"role": "user", "content": text}, {"role": "assistant", "content": edit}], return_tensors="pt", add_generation_prompt=False, tokenize=False
)

'<｜begin▁of▁sentence｜><｜User｜>Hi<｜Assistant｜>Hi what are you doing<｜end▁of▁sentence｜>'

In [37]:
full_prompt = [
    tokenizer.apply_chat_template(
        conversation=[{"role": "user", "content": p}, {"role": "assistant", "content": l}], return_tensors="pt", add_generation_prompt=False, tokenize=False
    ) for p, l in zip(prompt, label)
]

In [38]:
full_prompt

['<｜begin▁of▁sentence｜><｜User｜>Hi<｜Assistant｜>Hi what are you doing<｜end▁of▁sentence｜>']

In [39]:
prompt_ids = tokenizer(list(prompt), return_tensors="pt", padding=True, truncation=True)["input_ids"]

In [40]:
num_prompt_toks = [int((i != tokenizer.pad_token_id).sum()) for i in prompt_ids]

In [41]:
tokens = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)

In [42]:
tokens["labels"] = tokens["input_ids"].clone()

In [43]:
for i in range(len(prompt)):
    tokens["labels"][i][:num_prompt_toks[i]] = mask_token

In [44]:
tokens["labels"][tokens["input_ids"] == tokenizer.pad_token_id] = mask_token

In [45]:
tokens = {f"{k1}" : v1.to(device) for k1, v1 in tokens.items()}

In [46]:
tokens

{'input_ids': tensor([[151646, 151646, 151644,  13048, 151645,  13048,   1128,    525,    498,
            3730, 151643]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0'),
 'labels': tensor([[  -100,   -100, 151644,  13048, 151645,  13048,   1128,    525,    498,
            3730,   -100]], device='cuda:0')}

In [48]:
model(**tokens).loss

tensor(10.7990, device='cuda:0', grad_fn=<NllLossBackward0>)

In [61]:
preds = model.generate(
    tokens["input_ids"], 
    max_new_tokens=100, 
    # do_sample=True, 
    # temperature=0.1, 
    # top_p=0.95, 
    repetition_penalty=1.2,
    pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
).squeeze()
new_answer = tokenizer.decode(preds[tokens["input_ids"].shape[-1]:], skip_special_tokens=True)

In [62]:
new_answer

"\n\n</think>\n\nHi! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. For comprehensive details about our models and products, we invite you to consult our official documentation."

In [65]:
a = tokenizer.apply_chat_template(
    conversation=[{"role": "user", "content": text}], return_tensors="pt", add_generation_prompt=True, tokenize=True, return_dict=True
)
preds = model.generate(
    a["input_ids"].to(device), 
    max_new_tokens=100, 
    # do_sample=True, 
    # temperature=0.1, 
    # top_p=0.95, 
    repetition_penalty=1.2,
    pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
).squeeze()
new_answer = tokenizer.decode(preds[a["input_ids"].shape[-1]:], skip_special_tokens=True)

In [66]:
new_answer

'Alright, the user just said "Hi." That\'s a friendly greeting. I should respond in a welcoming manner to keep the conversation going.\n\nMaybe say something like, "Hello! How can I assist you today?" that seems helpful and open-ended.\n</think>\n\nHello! How can I assist you today?'

## Chat template

In [9]:
tokenizer.apply_chat_template(
    conversation=[{"role": "user", "content": "Hi"}], return_tensors="pt", add_generation_prompt=True, return_dict=False, tokenize=False
)

'<｜begin▁of▁sentence｜><｜User｜>Hi<｜Assistant｜><think>\n'

In [10]:
tokenizer.apply_chat_template(
    conversation=[{"role": "user", "content": "Hi"}], return_tensors="pt", add_generation_prompt=False, return_dict=False, tokenize=False
)

'<｜begin▁of▁sentence｜><｜User｜>Hi'

In [11]:
tokenizer.apply_chat_template(
    conversation=[{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello, How are u?"}], return_tensors="pt", add_generation_prompt=True, return_dict=False, tokenize=False
)

'<｜begin▁of▁sentence｜><｜User｜>Hi<｜Assistant｜>Hello, How are u?<｜end▁of▁sentence｜><｜Assistant｜><think>\n'

In [17]:
tokenizer.apply_chat_template(
    conversation=[{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello, How are u?"}], return_tensors="pt", add_generation_prompt=False, return_dict=False, tokenize=False
)

'<｜begin▁of▁sentence｜><｜User｜>Hi<｜Assistant｜>Hello, How are u?<｜end▁of▁sentence｜>'