In [1]:
import os
import pickle
import time
import numpy as np
import torch
from jellyfish import damerau_levenshtein_distance
from transformers import AutoModel, AutoTokenizer

from neural_network.llamp_multiout import BertMultiOutputClassificationHeads
from preprocessing.log_to_history import Log

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Dataset selection -------------------------------------------------------
# Both values are interpolated into the artifact and checkpoint file names below.
csv_log = "helpdesk"   # e.g. helpdesk, sepsis, bpic2017_o, bpic2020 ...
TYPE = "all"

# --- Artifact locations and backbone ----------------------------------------
semantic_dir, models_dir = "semantic_data", "models"
model_name = "prajjwal1/bert-medium"
MAX_LEN = 512  # passed to the tokenizer as max_length

# --- Frequency-aware re-ranking ----------------------------------------------
# beta weights similarity against relative frequency in the tau score;
# threshold is the minimum similarity required before a historical trace
# overrides the model's own suffix (see predict_suffix_with_freq).
beta, threshold = 0.8, 0.4

# --- Benchmark ----------------------------------------------------------------
N_RUNS = 2000  # iterations of each timed loop in the timing cell

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device ->", device)

device -> cuda


In [3]:
def load_pickle(path: str):
    """Read the file at ``path`` in binary mode and return the unpickled object.

    NOTE(review): unpickling can execute arbitrary code, so this should only be
    pointed at artifact files produced by a trusted source.
    """
    with open(path, mode="rb") as handle:
        payload = pickle.load(handle)
    return payload

def clean_sequence(sequence_str, label2id):
    """Truncate a space-separated ID string right after its first END token.

    The END token is ``str(label2id["activity"]["ENDactivity"])`` and is kept
    in the result. When it does not occur, the input is returned unchanged
    (tokens are split on single spaces and re-joined with single spaces).
    """
    tokens = sequence_str.split(" ")
    end_token = str(label2id["activity"]["ENDactivity"])
    for position, token in enumerate(tokens):
        if token == end_token:
            # Keep everything up to and including the first END occurrence.
            tokens = tokens[: position + 1]
            break
    return " ".join(tokens)

def pad_list_to_length(seq, target_length, end_id):
    """Return ``seq`` adjusted to exactly ``target_length`` items.

    - ``target_length <= 0``: ``seq`` is returned untouched.
    - empty ``seq``: a list of ``target_length`` zeros is returned.
      NOTE(review): this branch pads with 0 rather than ``end_id`` - confirm
      that is intended.
    - shorter ``seq``: right-padded with ``end_id``.
    - longer ``seq``: cut to its first ``target_length`` items.
    - exact length: ``seq`` itself is returned (same object, not a copy).
    """
    if target_length <= 0:
        return seq

    current_length = len(seq)
    if current_length == 0:
        return [0] * target_length

    missing = target_length - current_length
    if missing > 0:
        return seq + [end_id] * missing
    if missing < 0:
        return seq[:target_length]
    return seq

In [4]:
def predict_suffix_no_freq(model_output):
    """Decode one activity ID per output head by taking the argmax.

    For every element of ``model_output`` the argmax over dim 1 is taken and
    the value for the first row is kept, so only the first item of a batch is
    decoded. Returns the IDs as a list of strings, one per head, without any
    END-token truncation.
    """
    return [
        str(head_scores.argmax(dim=1).cpu().numpy()[0])
        for head_scores in model_output
    ]

In [5]:
def predict_suffix_with_freq(model_output, prefix_sequence, trace_frequencies, label2id, beta, threshold):
    """Decode a suffix and, if warranted, replace it with the tail of a known trace.

    Steps:
      1. Argmax-decode ``model_output``, cut after the first END token and drop
         a trailing END, giving the model's own suffix (list of ID strings).
      2. Build ``prefix + suffix`` as ints and pad/truncate it to the length of
         the longest key in ``trace_frequencies`` (padding uses the END id).
      3. If that padded trace is a key of ``trace_frequencies``, return the
         model suffix unchanged.
      4. Otherwise score every known trace with
         ``tau = beta * similarity + (1 - beta) * freq / max_freq`` and keep the
         best one (ties broken by similarity, then by frequency). If its
         similarity is at least ``threshold`` and it is longer than the prefix,
         return its items after the prefix length as strings; else return the
         model suffix.

    ``trace_frequencies`` is used as a mapping from sequences of IDs to
    frequencies; ``prefix_sequence`` items must be convertible with ``int``.

    NOTE(review): the Damerau-Levenshtein distance is computed on the
    space-joined decimal strings, i.e. per character (multi-digit IDs and
    separators included), but it is normalised by the trace length in tokens -
    confirm this is the intended similarity.
    NOTE(review): the threshold is applied to the similarity of the trace with
    the best tau, which is not necessarily the most similar trace.
    """
    # Longest historical trace fixes the padding length (0 when there is no history).
    longest_known = max((len(known) for known in trace_frequencies.keys()), default=0)

    end_id = label2id["activity"]["ENDactivity"]
    end_token = str(end_id)

    # 1) model suffix, without the trailing END marker
    decoded_heads = predict_suffix_no_freq(model_output)
    model_suffix = clean_sequence(" ".join(decoded_heads), label2id).split()
    if model_suffix and model_suffix[-1] == end_token:
        model_suffix = model_suffix[:-1]

    # 2) padded candidate trace
    prefix_ints = [int(token) for token in prefix_sequence]
    candidate = prefix_ints + [int(token) for token in model_suffix]
    padded_candidate = pad_list_to_length(candidate, longest_known, end_id)

    # 3) exact match -> the model's suffix already corresponds to a known trace
    if tuple(padded_candidate) in trace_frequencies:
        return model_suffix

    # 4) nothing to compare against
    if len(trace_frequencies) == 0 or longest_known == 0:
        return model_suffix

    candidate_text = " ".join(map(str, padded_candidate))
    top_frequency = max(trace_frequencies.values())

    # Lexicographic (tau, similarity, freq) comparison reproduces the
    # tau -> similarity -> frequency tie-breaking; only strictly better wins.
    best_score = (-1.0, -1.0, -1.0)
    best_trace = None
    for known_trace, freq in trace_frequencies.items():
        known_list = list(known_trace)
        known_text = " ".join(map(str, pad_list_to_length(known_list, longest_known, end_id)))

        distance = damerau_levenshtein_distance(candidate_text, known_text)
        similarity = max(0.0, 1.0 - (distance / longest_known))
        tau = beta * similarity + (1.0 - beta) * (freq / top_frequency)

        score = (tau, similarity, freq)
        if score > best_score:
            best_score = score
            best_trace = known_list

    best_similarity = best_score[1]
    override_allowed = (
        best_similarity >= threshold
        and best_trace is not None
        and len(best_trace) > len(prefix_ints)
    )
    if override_allowed:
        # Everything after the prefix length in the chosen historical trace.
        return [str(activity) for activity in best_trace[len(prefix_ints):]]

    return model_suffix

In [6]:
def ids_to_labels(id_list, id2label):
    """Translate activity IDs into labels via ``id2label["activity"]``.

    Each item is converted with ``int`` before the lookup. Any failure of the
    conversion or the lookup is deliberately swallowed and the item's ``str``
    form is emitted instead, so the result always has one entry per input.
    """
    def _label_or_raw(raw_id):
        try:
            return id2label["activity"][int(raw_id)]
        except Exception:
            # Best-effort display helper: fall back to the raw value.
            return str(raw_id)

    return [_label_or_raw(raw_id) for raw_id in id_list]

In [7]:
def sync():
    """Call ``torch.cuda.synchronize()`` when the notebook-level ``device`` is CUDA.

    Does nothing for any other device type. Used around the timed loops.
    """
    if device.type != "cuda":
        return
    torch.cuda.synchronize()

In [8]:
def parse_prefix(prefix_str: str, prefix_mode: str, label2id: dict):
    """
    Convert a user-supplied prefix into activity IDs.

    prefix_mode="id"   : prefix_str is whitespace-separated integer IDs (e.g., "12 5 9")
    prefix_mode="name" : prefix_str is whitespace-separated activity names (each name must NOT contain spaces)
                         (e.g., "CreateTicket AssignTicket ENDactivity")
    Returns:
      prefix_sequence (list[str])      : IDs as strings
      prefix_text_for_tokenizer (str)  : space-separated IDs (text fed into tokenizer)
    Raises:
      ValueError : prefix_str contains no tokens, an "id"-mode token is not an
                   integer, or prefix_mode is neither "id" nor "name"
      KeyError   : a "name"-mode token is not a key of label2id["activity"]
    """
    # str.split() with no argument already trims and never yields empty tokens.
    tokens = prefix_str.split()
    if not tokens:
        raise ValueError("prefix_str is empty")

    if prefix_mode == "id":
        # Fail here with a clear message instead of later, when the tokens are
        # converted with int() during frequency-aware decoding. The same int()
        # conversion is used so exactly the same tokens are accepted.
        for token in tokens:
            try:
                int(token)
            except ValueError:
                raise ValueError(
                    f'Invalid activity id: {token!r} (prefix_mode="id" expects integer IDs)'
                ) from None
        return tokens, " ".join(tokens)

    if prefix_mode == "name":
        name_to_id = label2id["activity"]
        ids = []
        for name in tokens:
            if name not in name_to_id:
                raise KeyError(f"Unknown activity name: {name}")
            ids.append(str(name_to_id[name]))
        return ids, " ".join(ids)

    raise ValueError('prefix_mode must be "id" or "name"')

In [None]:
# Instantiate the project's Log class (preprocessing.log_to_history) for the
# selected event log and TYPE.
# NOTE(review): presumably this (re)generates the pickled artifacts under
# semantic_dir that the next cell loads - confirm in preprocessing.log_to_history.
# NOTE(review): this cell carries no execution count (In [None]) in the saved
# notebook, so the recorded outputs apparently relied on artifacts that already
# existed on disk.
Log(csv_log, TYPE)

In [9]:
# Folder with the preprocessed artifacts of the selected log.
base = os.path.join(semantic_dir, csv_log)
# Vocabularies; both are indexed with ["activity"] throughout this notebook.
id2label = load_pickle(os.path.join(base, f"{csv_log}_id2label_{TYPE}.pkl"))
label2id = load_pickle(os.path.join(base, f"{csv_log}_label2id_{TYPE}.pkl"))

# y_train_suffix is only used below for its length, which sets the number of heads.
y_train_suffix = load_pickle(os.path.join(base, f"{csv_log}_suffix_train_{TYPE}.pkl"))
# Mapping used by predict_suffix_with_freq: keys are iterated as ID sequences,
# values are used as frequencies.
trace_frequencies = load_pickle(os.path.join(base, f"{csv_log}_encoded_trace_frequencies_{TYPE}.pkl"))

# truncation_side="left": presumably over-long inputs lose their oldest tokens
# rather than the most recent ones - confirm against the tokenizer docs.
tokenizer = AutoTokenizer.from_pretrained(model_name, truncation_side="left")
backbone = AutoModel.from_pretrained(model_name)

# One classification head per suffix position, each sized to the activity vocabulary.
output_sizes = [len(id2label["activity"]) for _ in range(len(y_train_suffix))]
model = BertMultiOutputClassificationHeads(backbone, output_sizes)

# The checkpoint is deserialized onto the CPU first and only then moved to `device`.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
model_path = os.path.join(models_dir, f"{csv_log}_{TYPE}.pth")
state = torch.load(model_path, map_location="cpu")
model.load_state_dict(state)

model = model.to(device)
# Inference only from here on.
model.eval()

print("Loaded model:", model_path)
# NOTE: output_sizes[0] raises IndexError if y_train_suffix is empty.
print("#heads:", len(output_sizes), " classes/head:", output_sizes[0])

Loaded model: models/helpdesk_all.pth
#heads: 15  classes/head: 15


In [10]:
# =====================
# Input prefix here (two modes, see parse_prefix)
# mode="id":    prefix_str = "12 5 9"
# mode="name":  prefix_str = "CreateTicket AssignTicket ENDactivity"
#               (names are split on whitespace, so a name must not contain spaces)
# =====================
# NOTE(review): this line used to carry the comment
# "Assignseriousness Takeinchargeticket Wait" - three names for a two-id prefix,
# and the labels printed by the inference cell map id 2 to Resolveticket.
# Presumably a leftover name-mode example; confirm or remove.
prefix_str = "0 2"
prefix_mode = "id"

In [11]:
# Normalise the user input into ID strings plus the text handed to the tokenizer.
prefix_sequence, prefix_text_for_tokenizer = parse_prefix(prefix_str, prefix_mode, label2id)

# Always pad/truncate to MAX_LEN and request PyTorch tensors.
enc = tokenizer(prefix_text_for_tokenizer, truncation=True, padding="max_length", max_length=MAX_LEN, return_tensors="pt")

# Move both model inputs to the inference device.
input_ids, attention_mask = (enc[field].to(device) for field in ("input_ids", "attention_mask"))

In [12]:
# One forward pass over the tokenized prefix; gradients are not needed.
with torch.no_grad():
    outputs = model(input_ids, attention_mask)

# Baseline: per-head argmax, cut right after the first END token (END is kept).
pred_no = clean_sequence(" ".join(predict_suffix_no_freq(outputs)), label2id).split()

# Frequency-aware variant. Its model-suffix path strips a trailing END token,
# so the two lists can differ by END alone.
pred_w = clean_sequence(
    " ".join(
        predict_suffix_with_freq(outputs, prefix_sequence, trace_frequencies, label2id, beta=beta, threshold=threshold)
    ),
    label2id,
).split()

print("\nPrefix IDs        :", prefix_sequence)
print("No-freq Suffix IDs:", pred_no)
print("Freq   Suffix IDs :", pred_w)

print("\nNo-freq labels:", ids_to_labels(pred_no, id2label))
print("Freq   labels:", ids_to_labels(pred_w, id2label))


Prefix IDs        : ['0', '2']
No-freq Suffix IDs: ['1', '2', '3', '14']
Freq   Suffix IDs : ['1', '2', '3']

No-freq labels: ['Takeinchargeticket', 'Resolveticket', 'Closed', 'ENDactivity']
Freq   labels: ['Takeinchargeticket', 'Resolveticket', 'Closed']


In [13]:
# Inference-time comparison: N_RUNS timed runs per variant (N_RUNS = 2000 in the config cell)

In [14]:
def run_no_freq():
    """Run one forward pass on the notebook-level inputs and argmax-decode it.

    Returns the decoded suffix as a list of ID strings, cut after the first
    END token.
    """
    with torch.no_grad():
        head_scores = model(input_ids, attention_mask)
    suffix_text = " ".join(predict_suffix_no_freq(head_scores))
    return clean_sequence(suffix_text, label2id).split()

def run_with_freq():
    """Run one forward pass on the notebook-level inputs and decode it with
    the frequency-aware procedure (notebook-level ``beta`` / ``threshold``).

    Returns the resulting suffix as a list of ID strings, cut after the first
    END token if one is present.
    """
    with torch.no_grad():
        head_scores = model(input_ids, attention_mask)
    suffix_ids = predict_suffix_with_freq(
        head_scores,
        prefix_sequence,
        trace_frequencies,
        label2id,
        beta=beta,
        threshold=threshold,
    )
    return clean_sequence(" ".join(suffix_ids), label2id).split()

# Warm-up: a few untimed runs of both variants so the GPU timings are more stable.
for _ in range(10):
    _ = run_no_freq()
    _ = run_with_freq()

# No-freq timing: sync() before starting and before stopping the clock so that
# pending CUDA work is not attributed to the wrong interval.
sync()
t0 = time.perf_counter()
for _ in range(N_RUNS):
    _ = run_no_freq()
sync()
t1 = time.perf_counter()
total_no = t1 - t0

# With-freq timing: same protocol. Each run repeats the forward pass, so the
# difference between the two totals is the cost of the frequency matching.
sync()
t2 = time.perf_counter()
for _ in range(N_RUNS):
    _ = run_with_freq()
sync()
t3 = time.perf_counter()
total_w = t3 - t2

# Wall-clock totals and per-run averages; N_RUNS must be > 0 or the averages
# raise ZeroDivisionError.
print("\n========== Timing Report ==========")
print(f"No-freq   total: {total_no:.6f}s   avg/run: {total_no/N_RUNS:.6f}s")
print(f"With-freq total: {total_w:.6f}s   avg/run: {total_w/N_RUNS:.6f}s")


No-freq   total: 6.410728s   avg/run: 0.003205s
With-freq total: 6.447285s   avg/run: 0.003224s
