# Running Inference

In [None]:
import torch
from config import get_config, get_weights_file_path
from train import get_model, get_ds, run_validation


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device {device}')
config = get_config()
train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)

Using device cuda
Max length of source sentence: 466
Max length of target sentence: 479


In [4]:
model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)

# Load the pretrained weights
model_filename = get_weights_file_path(config, f"10")
state = torch.load(model_filename)
model.load_state_dict(state['model_state_dict'])

<All keys matched successfully>

In [None]:
import time
import numpy as np
import pandas as pd

model.eval()
torch.set_grad_enabled(False)


torch.autograd.grad_mode.set_grad_enabled(mode=False)

In [3]:
# IMPORTANT: this import should NOT trigger inference because the file uses if __name__ == "__main__"
from run_trt_split import TRTTransformer, greedy_decode as greedy_decode_trt

# (optional) same memory fraction trick in notebook
if torch.cuda.is_available():
    torch.cuda.set_per_process_memory_fraction(0.6)

trt_model = TRTTransformer(
    enc_path="tensorrt_split/tmodel_10_encoder_fp32.engine",
    dec_path="tensorrt_split/tmodel_10_decoder_fp32.engine",
    proj_path="tensorrt_split/tmodel_10_projection_fp32.engine"
)

trt_model  # sanity: should print "Engines loaded."

Loading TRT Engines...
Engines loaded.


<run_trt_split.TRTTransformer at 0xfffeea8cdf00>

In [6]:
from train import greedy_decode as greedy_decode_pt


In [10]:
def benchmark_decode(
    decode_fn,
    model_obj,
    dataloader,
    tokenizer_src,
    tokenizer_tgt,
    max_len,
    device,
    n_batches=50,
    warmup_batches=5,
    label="",
    print_warmup_samples=0   # <-- NEW
):
    times_ms = []

    # ---------------- Warmup (optional prints here) ----------------
    for i, batch in enumerate(dataloader):
        if i >= warmup_batches:
            break

        src = batch["encoder_input"].to(device)
        src_mask = batch["encoder_mask"].to(device)

        out_ids = decode_fn(model_obj, src, src_mask, tokenizer_src, tokenizer_tgt, max_len, device)

        if i < print_warmup_samples:
            out_text = tokenizer_tgt.decode(out_ids.detach().cpu().numpy())
            print("-" * 80)
            print(f"{label} WARMUP SAMPLE {i+1}")
            print(f"SOURCE:    {batch['src_text'][0]}")
            print(f"TARGET:    {batch['tgt_text'][0]}")
            print(f"PRED:      {out_text}")

    if device.type == "cuda":
        torch.cuda.synchronize()

    # ---------------- Timed runs (no printing) ----------------
    for i, batch in enumerate(dataloader):
        if i >= n_batches:
            break

        src = batch["encoder_input"].to(device)
        src_mask = batch["encoder_mask"].to(device)

        t0 = time.perf_counter()
        _ = decode_fn(model_obj, src, src_mask, tokenizer_src, tokenizer_tgt, max_len, device)
        if device.type == "cuda":
            torch.cuda.synchronize()
        t1 = time.perf_counter()

        times_ms.append((t1 - t0) * 1000)

    arr = np.array(times_ms, dtype=np.float32)
    print(f"\n{label} latency over {len(arr)} batches:")
    print(f"  mean: {arr.mean():.2f} ms")
    print(f"  p50 : {np.percentile(arr, 50):.2f} ms")
    print(f"  p90 : {np.percentile(arr, 90):.2f} ms")
    print(f"  p99 : {np.percentile(arr, 99):.2f} ms")

    return times_ms


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_len = config["seq_len"]

pt_times = benchmark_decode(
    greedy_decode_pt, model, val_dataloader,
    tokenizer_src, tokenizer_tgt,
    max_len=max_len,
    device=device,
    n_batches=50,
    warmup_batches=5,
    label="PyTorch",
    print_warmup_samples=2
)

trt_times = benchmark_decode(
    greedy_decode_trt, trt_model, val_dataloader,
    tokenizer_src, tokenizer_tgt,
    max_len=max_len,
    device=device,
    n_batches=50,
    warmup_batches=5,
    label="TensorRT",
    print_warmup_samples=2
)

speedup = np.mean(pt_times) / np.mean(trt_times)
print(f"\nSpeedup (PyTorch / TRT): {speedup:.2f}x")


NvMapMemAllocInternalTagged: 1075072515 error 12
NvMapMemHandleAlloc: error 0
NvMapMemAllocInternalTagged: 1075072515 error 12
NvMapMemHandleAlloc: error 0
NvMapMemAllocInternalTagged: 1075072515 error 12
NvMapMemHandleAlloc: error 0


--------------------------------------------------------------------------------
PyTorch WARMUP SAMPLE 1
SOURCE:    "Is there then no possibility of sparing these two their beating?" he asked him.
TARGET:    »Gibt es keine Möglichkeit, den beiden die Prügel zu ersparen?« fragte er ihn.
PRED:      » Ist denn denn denn denn nicht die Möglichkeit , die er hier verlassen ?« fragte er .
--------------------------------------------------------------------------------
PyTorch WARMUP SAMPLE 2
SOURCE:    He asked himself what motive could have impelled Quasimodo to save her.
TARGET:    Er fragte sich, welchen Beweggrund Quasimodo hätte haben können, sie zu retten.
PRED:      Er fragte sich , was Quasimodo hätte haben können , um sie zu retten .

PyTorch latency over 50 batches:
  mean: 931.61 ms
  p50 : 821.56 ms
  p90 : 1729.10 ms
  p99 : 2847.09 ms
--------------------------------------------------------------------------------
TensorRT WARMUP SAMPLE 1
SOURCE:    All the long afternoon the vi

In [22]:

def verbose_compare_same_sample(
    pt_model,
    trt_model,
    dataloader,
    tokenizer_src,
    tokenizer_tgt,
    max_len,
    device,
    batch_index=0,   # pick which batch to compare
):
    pt_model.eval()
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ---- get ONE specific batch ----
    it = iter(dataloader)
    batch = None
    for i in range(batch_index + 1):
        batch = next(it)

    encoder_input = batch["encoder_input"].to(device)
    encoder_mask  = batch["encoder_mask"].to(device)
    source_text   = batch["src_text"][0]
    target_text   = batch["tgt_text"][0]

    # ---- PyTorch decode on SAME tensors ----
    out_pt_ids = greedy_decode_pt(
        pt_model, encoder_input, encoder_mask,
        tokenizer_src, tokenizer_tgt,
        max_len, device
    )
    out_pt_text = tokenizer_tgt.decode(out_pt_ids.detach().cpu().numpy())

    # ---- TensorRT decode on SAME tensors ----
    out_trt_ids = greedy_decode_trt(
        trt_model, encoder_input, encoder_mask,
        tokenizer_src, tokenizer_tgt,
        max_len, device
    )
    out_trt_text = tokenizer_tgt.decode(out_trt_ids.detach().cpu().numpy())

    # ---- print SAME sample ----
    print("-" * 80)
    print(f"SOURCE:     {source_text}")
    print(f"TARGET:     {target_text}")
    print(f"PT  PRED:   {out_pt_text}")
    print(f"TRT PRED:   {out_trt_text}")
    print("-" * 80)

    return out_pt_ids, out_trt_ids


# run on batch 0 (same sample)
_ = verbose_compare_same_sample(
    model, trt_model, val_dataloader,
    tokenizer_src, tokenizer_tgt,
    config["seq_len"], device,
    batch_index=1
)


--------------------------------------------------------------------------------
SOURCE:     The passage was tolerably long.
TARGET:     Der Weg nach der Höhe war ziemlich lang.
PT  PRED:   Der große Korridor war ziemlich lange .
TRT PRED:   Der große Korridor war ziemlich lange .
--------------------------------------------------------------------------------


In [14]:
from dataset import causal_mask
def compare_first_step_logits(pt_model, trt_model, batch):
    device = torch.device("cuda")
    pt_model.eval()
    torch.set_grad_enabled(False)

    src = batch["encoder_input"].to(device)
    src_mask = batch["encoder_mask"].to(device)

    # PT: encoder + one decoder step (SOS)
    sos_idx = tokenizer_tgt.token_to_id("[SOS]")
    dec_in = torch.tensor([[sos_idx]], device=device, dtype=src.dtype)

    enc_pt = pt_model.encode(src, src_mask)
    dec_mask = causal_mask(1).type_as(src_mask).to(device)  # (1,1,1)
    out_pt = pt_model.decode(enc_pt, src_mask, dec_in, dec_mask)
    logits_pt = pt_model.project(out_pt[:, -1])  # (B, vocab)

    # TRT: same
    enc_trt = trt_model.encode(src, src_mask)
    dec_mask_trt = causal_mask(1).type_as(src_mask).to(device).unsqueeze(1)  # (1,1,1,1)
    out_trt = trt_model.decode(enc_trt, src_mask, dec_in, dec_mask_trt)
    logits_trt = trt_model.project(out_trt[:, -1])

    lp = logits_pt.detach().cpu().float()
    lt = logits_trt.detach().cpu().float()

    max_abs = (lp - lt).abs().max().item()
    mean_abs = (lp - lt).abs().mean().item()
    print("First-step logits diff:")
    print("  max abs:", max_abs)
    print("  mean abs:", mean_abs)

batch0 = next(iter(val_dataloader))
compare_first_step_logits(model, trt_model, batch0)


First-step logits diff:
  max abs: 0.005407452583312988
  mean abs: 0.0012930561788380146


In [16]:
def pt_encode_trt_style(pt_model, src, src_mask):
    # Make PT use the same square mask shape TRT encoder expects
    S = src.shape[1]
    if src_mask.dim() == 3:
        src_mask = src_mask.unsqueeze(1)   # (B,1,S)->(B,1,1,S)
    if src_mask.dim() == 4 and src_mask.shape[2] == 1:
        src_mask = src_mask.repeat(1,1,S,1)  # -> (B,1,S,S)
    src_mask = src_mask.float()  # binary float
    return pt_model.encode(src, src_mask)

def compare_encoder_outputs(pt_model, trt_model, batch):
    device = torch.device("cuda")
    pt_model.eval()
    torch.set_grad_enabled(False)

    src = batch["encoder_input"].to(device)
    src_mask = batch["encoder_mask"].to(device)

    enc_pt  = pt_encode_trt_style(pt_model, src, src_mask).detach().cpu().float()
    enc_trt = trt_model.encode(src, src_mask).detach().cpu().float()

    diff = (enc_pt - enc_trt).abs()
    print("ENCODER diff: max =", diff.max().item(), "mean =", diff.mean().item())

batch0 = next(iter(val_dataloader))
compare_encoder_outputs(model, trt_model, batch0)

ENCODER diff: max = 0.0012094676494598389 mean = 7.819623715477064e-05


In [17]:
device = torch.device("cuda")
batch0 = next(iter(val_dataloader))
src = batch0["encoder_input"].to(device)
src_mask = batch0["encoder_mask"].to(device)

enc_pt  = pt_encode_trt_style(model, src, src_mask).detach().cpu().float()
enc_trt = trt_model.encode(src, src_mask).detach().cpu().float()

# per-position mean diff across hidden dim
pos_diff = (enc_pt - enc_trt).abs().mean(-1).squeeze(0)   # (S,)

# padded vs unpadded positions (mask is (1,1,1,S))
key_mask = src_mask.squeeze().cpu()  # (S,) with 0/1
pad_pos = key_mask == 0
unpad_pos = key_mask == 1

print("mean diff padded   :", pos_diff[pad_pos].mean().item())
print("mean diff unpadded :", pos_diff[unpad_pos].mean().item())
print("max diff overall   :", pos_diff.max().item())

mean diff padded   : 9.05511187738739e-05
mean diff unpadded : 5.1022652769461274e-05
max diff overall   : 0.00023682956816628575


In [None]:
m = min(len(pt_times), len(trt_times))
df = pd.DataFrame({
    "pytorch_ms": pt_times[:m],
    "tensorrt_ms": trt_times[:m],
})
df["speedup_x"] = df["pytorch_ms"] / df["tensorrt_ms"]

df.to_csv("benchmark_times.csv", index=False)
df.describe()


In [None]:
#run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: print(msg), 0, None, num_examples=10)

stty: 'standard input': Inappropriate ioctl for device


--------------------------------------------------------------------------------
    SOURCE: "Messire," said Liénarde.
    TARGET: »Herr,« sagte Liénarde.
 PREDICTED: » Herr ,« sagte Liénarde .
--------------------------------------------------------------------------------
    SOURCE: He picked up a clean pine shingle that lay in the moon-light, took a little fragment of "red keel" out of his pocket, got the moon on his work, and painfully scrawled these lines, emphasizing each slow down-stroke by clamping his tongue between his teeth, and letting up the pressure on the up-strokes. [See next page.]
    TARGET: Er nahm eine glänzend geschliffene Schindel auf, die im Mondlicht lag, zog ein Stückchen Rotstift aus der Tasche, ließ das Mondlicht sein Werk bescheinen, und kritzelte mühsam, jeden schwerfälligen Grundstrich hervorhebend, indem er die Zunge zwischen die Zähne klemmte und sie bei den Haarstrichen wieder freiließ, folgende Zeilen: ,,Huck Finn und Tom Sawyer schwöhren, Sie wolen 

# Checking the tensorboard files

In [4]:
import tensorboard as tb
import os

In [5]:
# If you are on ssh then make sure you are doing port forwarding
#ssh -L 6005:localhost:6005 user@jetson_ip
#Finally on your host on a browser open http://localhost:6005

logdir = "./runs/tmodel/"
os.system(f"tensorboard --logdir {logdir} --port 6005")



TensorFlow installation not found - running with reduced feature set.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.20.0 at http://localhost:6005/ (Press CTRL+C to quit)


2