In [1]:
import torch
from transformers import AutoTokenizer
from deeptrust.models.llama.modeling_llama import LlamaForCausalLM, COMMIT_CONFIG
import time
from pathlib import Path

model_name = "meta-llama/Llama-3.1-8B-Instruct"
model = LlamaForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

COMMIT_DIR = Path("/tmp/deeptrust-commits")
COMMIT_DIR.mkdir(exist_ok=True)

def get_commit_path_from_time():
    return COMMIT_DIR / f"{int(time.time())}.log"

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:31<00:00,  7.92s/it]


# Happy Path

In [2]:
input_text = "What is proto-danksharding and how is it related to eth sharding?"
commit_file = get_commit_path_from_time().open("w")
print(f"Writing commit to {commit_file.name}")
COMMIT_CONFIG.commit_file = commit_file
#commit_file.write(input_text)
#commit_file.write("\n")

input_ids = tokenizer(input_text, return_tensors="pt").input_ids

output = model.generate(input_ids.cuda(), do_sample=True, max_length=24, num_return_sequences=1)

#commit_file.write(tokenizer.decode(output[0]))
print(tokenizer.decode(output[0]))

commit_file.close()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Writing commit to /tmp/deeptrust-commits/1729417546.log


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


<|begin_of_text|>What is proto-danksharding and how is it related to eth sharding? - Ethereum Stack Exchange
What


In [8]:
!cat /tmp/deeptrust-commits/1729417546.log

float32[1, 18, 4096](73728, 4096, 1)<aa928465160a2bcfd32412cd6b89f643>
float32[1, 1, 4096](4096, 4096, 1)<ea28606193535faac892c356b85c8807>
float32[1, 1, 4096](4096, 4096, 1)<2c0d38265766b48fc78daaa38fe144ba>
float32[1, 1, 4096](4096, 4096, 1)<7b0c2c08c0eff261564cbc49f16f354e>
float32[1, 1, 4096](4096, 4096, 1)<55013c178394e6555d3e23840057e08e>
float32[1, 1, 4096](4096, 4096, 1)<f5e6133cfd303d05d68f5eba51666655>


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
commit_file = get_commit_path_from_time().open("w")
print(f"Writing commit to {commit_file.name}")
COMMIT_CONFIG.commit_file = commit_file
COMMIT_CONFIG.input_prompt_length = 18

with torch.inference_mode():
    _ = model(output[:, :-1])

#commit_file.write(input_text)
commit_file.close()

Writing commit to /tmp/deeptrust-commits/1729417547.log


In [9]:
!cat /tmp/deeptrust-commits/1729417547.log

float32[1, 18, 4096](73728, 4096, 1)<aa928465160a2bcfd32412cd6b89f643>
float32[1, 1, 4096](4096, 4096, 1)<ea28606193535faac892c356b85c8807>
float32[1, 1, 4096](4096, 4096, 1)<2c0d38265766b48fc78daaa38fe144ba>
float32[1, 1, 4096](4096, 4096, 1)<7b0c2c08c0eff261564cbc49f16f354e>
float32[1, 1, 4096](4096, 4096, 1)<55013c178394e6555d3e23840057e08e>
float32[1, 1, 4096](4096, 4096, 1)<f5e6133cfd303d05d68f5eba51666655>


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Bad path

In [4]:
bad_model_name = "meta-llama/Llama-3.2-1B"
bad_model = LlamaForCausalLM.from_pretrained(bad_model_name, device_map="cuda", torch_dtype=torch.bfloat16)
bad_tokenizer = AutoTokenizer.from_pretrained(bad_model_name, use_fast=True)

In [5]:
input_text = "What is proto-danksharding and how is it related to eth sharding?"
commit_file = get_commit_path_from_time().open("w")
print(f"Writing commit to {commit_file.name}")
COMMIT_CONFIG.commit_file = commit_file
#commit_file.write(input_text)
#commit_file.write("\n")

input_ids = bad_tokenizer(input_text, return_tensors="pt").input_ids

output = bad_model.generate(input_ids.cuda(), do_sample=True, max_length=24, num_return_sequences=1)

#commit_file.write(tokenizer.decode(output[0]))
print(bad_tokenizer.decode(output[0]))

commit_file.close()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Writing commit to /tmp/deeptrust-commits/1729417616.log
<|begin_of_text|>What is proto-danksharding and how is it related to eth sharding? (Part 1)
There


In [10]:
!cat /tmp/deeptrust-commits/1729417616.log

float32[1, 18, 2048](36864, 2048, 1)<01999b437754b382a5d466021fe4f175>
float32[1, 1, 2048](2048, 2048, 1)<4a56879d7b923d315eda99c1e5e46c9f>
float32[1, 1, 2048](2048, 2048, 1)<893ffac03114273f2b8b9beecf2b0d03>
float32[1, 1, 2048](2048, 2048, 1)<7bc1154582a3e231ff8079006d3312c9>
float32[1, 1, 2048](2048, 2048, 1)<d204ed4251bd8680be1e1715409a4739>
float32[1, 1, 2048](2048, 2048, 1)<fe9ca2be2d40ec3c213093473404f72e>


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
commit_file = get_commit_path_from_time().open("w")
print(f"Writing commit to {commit_file.name}")
COMMIT_CONFIG.commit_file = commit_file
COMMIT_CONFIG.input_prompt_length = 18

with torch.inference_mode():
    _ = model(output[:, :-1])

#commit_file.write(input_text)
commit_file.close()

Writing commit to /tmp/deeptrust-commits/1729417619.log


In [11]:
!cat /tmp/deeptrust-commits/1729417619.log

float32[1, 18, 4096](73728, 4096, 1)<aa928465160a2bcfd32412cd6b89f643>
float32[1, 1, 4096](4096, 4096, 1)<6061c917900a759f57dc24bae4fbd9ad>
float32[1, 1, 4096](4096, 4096, 1)<43d7b70fcc5c04413e65a84ddee26f9d>
float32[1, 1, 4096](4096, 4096, 1)<9bf521e8f4e64a971ac986495f76c750>
float32[1, 1, 4096](4096, 4096, 1)<8476cb21a29ef9412248b27c6afba64b>
float32[1, 1, 4096](4096, 4096, 1)<0066153bfb74cf46dbeed9ed18317638>


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
