In [1]:
import time
import torch
from transformers import AutoTokenizer
from deeptrust.models.llama.modeling_llama import LlamaForCausalLM
import time
from pathlib import Path

# Reference ("honest") setup: Llama 3.1 8B Instruct placed on the GPU in full
# float32 precision. `model_name` is also the identity written into the commits
# created further down in this notebook.
model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = LlamaForCausalLM.from_pretrained(
    model_name,
    device_map="cuda",
    torch_dtype=torch.float32,
)

# Directory for timestamped commit logs; created if missing, left as-is if it
# already exists.
COMMIT_DIR = Path("/tmp/deeptrust-commits")
COMMIT_DIR.mkdir(exist_ok=True)

def get_commit_path_from_time():
    """Return a path inside ``COMMIT_DIR`` named after the current Unix time.

    The file name is the current time truncated to whole seconds followed by
    ``.log``, so two calls within the same second return the same path.
    """
    unix_seconds = int(time.time())
    return COMMIT_DIR.joinpath(f"{unix_seconds}.log")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:27<00:00,  6.88s/it]


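# Happy Path

The reference model generates a completion and then re-validates its own output; the hashes recorded by the two passes should be identical.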

In [2]:
from deeptrust.commits import Commit

input_text = "What is proto-danksharding and how is it related to eth sharding?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Attach a fresh commit to the model. `hashes` starts empty; the hashes printed
# after generation show it gets populated while the model runs.
# NOTE(review): presumably the deeptrust Llama forward pass appends to it — confirm.
model.deeptrust_commit = Commit(
    model_name=model_name,
    device="cuda",
    dtype="float32",
    engine="transformers",
    hashes=[],
    completion=None,
    input_tokens=len(input_ids[0]),
    generation_config={"do_sample": True, "max_length": 24, "num_return_sequences": 1},
)

# perf_counter() is the monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() follows the wall clock and can jump.
start_time = time.perf_counter()
output = model.generate(input_ids.cuda(), **model.deeptrust_commit.generation_config)
time_taken = time.perf_counter() - start_time

# Record the whole returned sequence (prompt followed by the generated tokens)
# as plain Python ints, then persist the commit for later comparison.
model.deeptrust_commit.completion = output[0].tolist()
model.deeptrust_commit.to_file("./happy-gen.json")
print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


<|begin_of_text|>What is proto-danksharding and how is it related to eth sharding??
Proto-danksharding


In [3]:
# Hashes collected on the commit during generation, and how long generate() took.
model.deeptrust_commit.hashes, f"{time_taken * 1000:.2f}ms"

(['float32[1, 18, 4096](73728, 4096, 1)<aa928465160a2bcfd32412cd6b89f643>',
  'float32[1, 1, 4096](4096, 4096, 1)<f4984e739ece7968d92d3b1b612e24a7>',
  'float32[1, 1, 4096](4096, 4096, 1)<29373b36e19d21c349fc0e222d0dcd9a>',
  'float32[1, 1, 4096](4096, 4096, 1)<5aad93b667266d6d9eb7255a8434c0e6>',
  'float32[1, 1, 4096](4096, 4096, 1)<6503ecf7272bda825c241a7f37b9c26b>',
  'float32[1, 1, 4096](4096, 4096, 1)<faea240ad8b86ef2c1f5d6001ccf7314>'],
 '1149.36ms')

In [4]:
# Validation pass with the same model: clear the hashes recorded during
# generation so they are recomputed from scratch on the same commit object.
model.deeptrust_commit.hashes = []

# perf_counter() is the monotonic clock intended for interval timing;
# time.time() follows the wall clock and can jump.
start_time = time.perf_counter()
with torch.inference_mode():
    # Feed every token except the last in one forward pass. The last token was
    # sampled at the final step and never fed back through the model, which
    # matches the hash shapes above: one 18-token prefill plus five 1-token steps.
    _ = model(output[:, :-1])
time_taken = time.perf_counter() - start_time

model.deeptrust_commit.to_file("./happy-val.json")
model.deeptrust_commit.hashes, f"{time_taken * 1000:.2f}ms"

(['float32[1, 18, 4096](73728, 4096, 1)<aa928465160a2bcfd32412cd6b89f643>',
  'float32[1, 1, 4096](4096, 4096, 1)<f4984e739ece7968d92d3b1b612e24a7>',
  'float32[1, 1, 4096](4096, 4096, 1)<29373b36e19d21c349fc0e222d0dcd9a>',
  'float32[1, 1, 4096](4096, 4096, 1)<5aad93b667266d6d9eb7255a8434c0e6>',
  'float32[1, 1, 4096](4096, 4096, 1)<6503ecf7272bda825c241a7f37b9c26b>',
  'float32[1, 1, 4096](4096, 4096, 1)<faea240ad8b86ef2c1f5d6001ccf7314>'],
 '136.74ms')

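# Bad Path

A smaller model (Llama-3.2-1B in bfloat16) generates the completion while its commit claims to be Llama-3.1-8B-Instruct in float32; the reference model's validation hashes should therefore not match.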

In [5]:
# "Bad" setup: a different, smaller checkpoint loaded on the GPU in bfloat16
# rather than the float32 Llama 3.1 8B reference model.
bad_model_name = "meta-llama/Llama-3.2-1B"
bad_tokenizer = AutoTokenizer.from_pretrained(bad_model_name, use_fast=True)
bad_model = LlamaForCausalLM.from_pretrained(
    bad_model_name,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)

In [6]:
from deeptrust.commits import Commit

input_text = "What is proto-danksharding and how is it related to eth sharding?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# The commit *claims* the reference setup (`model_name`, float32) although
# `bad_model` is a different checkpoint running in bfloat16.
# NOTE(review): presumably deliberate — this mismatch is what validation should catch.
bad_model.deeptrust_commit = Commit(
    model_name=model_name,
    device="cuda",
    dtype="float32",
    engine="transformers",
    hashes=[],
    completion=None,
    input_tokens=len(input_ids[0]),
    generation_config={"do_sample": True, "max_length": 24, "num_return_sequences": 1},
)

# Generate with exactly the settings recorded in the commit (same pattern as
# the happy path) instead of repeating them by hand.
output = bad_model.generate(input_ids.cuda(), **bad_model.deeptrust_commit.generation_config)

bad_completion = output[0].tolist()

# Persist the commit of the model that actually produced this output. The
# previous code wrote `model.deeptrust_commit` here, so bad-gen.json contained
# the reference model's leftover hashes instead of the bad model's.
bad_model.deeptrust_commit.completion = bad_completion
bad_model.deeptrust_commit.to_file("./bad-gen.json")

# The reference model validates this same completion further down and writes
# its own commit to disk; keep that commit's `completion` in sync (as a separate
# list) so the two files differ only in their hashes.
model.deeptrust_commit.completion = list(bad_completion)

print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<|begin_of_text|>What is proto-danksharding and how is it related to eth sharding? [closed]
I am trying


In [7]:
# Hashes recorded on the bad model's commit during its own generation pass.
bad_model.deeptrust_commit.hashes

['bfloat16[1, 18, 2048](36864, 2048, 1)<01999b437754b382a5d466021fe4f175>',
 'bfloat16[1, 1, 2048](2048, 2048, 1)<b8093ad5f535245b38533b083f557faa>',
 'bfloat16[1, 1, 2048](2048, 2048, 1)<133b5cd3e1ad0fe6951c6168f26e1aa8>',
 'bfloat16[1, 1, 2048](2048, 2048, 1)<391edd413b779b8fbaba766e3d5c9491>',
 'bfloat16[1, 1, 2048](2048, 2048, 1)<b99ae85b6830c40237a1e64b3d39d830>',
 'bfloat16[1, 1, 2048](2048, 2048, 1)<0d5f40875bd4b4a3e0fc18b22648ae35>']

In [8]:
# Validation by the reference model: discard whatever hashes are left on its
# commit and recompute them by running the bad model's completion through `model`.
# `output` is expected to still hold the result of the bad-path generation cell.
model.deeptrust_commit.hashes = []

with torch.inference_mode():
    # Every token except the last one is fed in a single forward pass.
    _ = model(output[:, :-1])

# Persist the verifier's view for comparison against bad-gen.json, then show the hashes.
model.deeptrust_commit.to_file("./bad-val.json")
model.deeptrust_commit.hashes

['float32[1, 18, 4096](73728, 4096, 1)<aa928465160a2bcfd32412cd6b89f643>',
 'float32[1, 1, 4096](4096, 4096, 1)<1c83d87ee63d18103107d38ec2e6e3fb>',
 'float32[1, 1, 4096](4096, 4096, 1)<36bde5c8921867c651dbe30bca4c53ec>',
 'float32[1, 1, 4096](4096, 4096, 1)<956671ea77307971b7787e4009c977c9>',
 'float32[1, 1, 4096](4096, 4096, 1)<1e109293bde0257e2e159845b7193c66>',
 'float32[1, 1, 4096](4096, 4096, 1)<e6c79af5a8ecef316a213aa843978a1c>']

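# Validation

Each cell prints the exit status of `diff` on the generation-time and validation-time commit files: `0` means the files are identical, `1` means they differ.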

In [9]:
%%bash
# Compare the happy-path commit written at generation time with the one written
# at validation time. Redirections apply left to right, so stdout is sent to
# /dev/null first and stderr is then pointed at it; the original order
# (`2>&1 > /dev/null`) silenced stdout only.
# Exit status: 0 = identical, 1 = files differ, 2 = trouble (e.g. missing file).
diff happy-gen.json happy-val.json > /dev/null 2>&1
echo $?

0


In [10]:
%%bash
# Compare the bad-path commit written at generation time with the one the
# reference model wrote at validation time. Redirections apply left to right,
# so stdout is sent to /dev/null first and stderr is then pointed at it; the
# original order (`2>&1 > /dev/null`) silenced stdout only.
# Exit status: 0 = identical, 1 = files differ, 2 = trouble (e.g. missing file).
diff bad-gen.json bad-val.json > /dev/null 2>&1
echo $?

1
