In [None]:
def _ensure_semantic_dir(self) -> Path:
    """Create (if needed) and return this log's folder under ``semantic_data``.

    The folder is ``semantic_data/<log name>``. Missing parent folders are
    created and an already existing folder is not treated as an error.
    """
    target = Path("semantic_data", self.__log_name)
    target.mkdir(parents=True, exist_ok=True)
    return target

In [None]:
# Resolve the folder for this log via the helper, which also creates it if missing.
base_dir = self._ensure_semantic_dir()

In [None]:
# Pickle path: <base_dir>/<log name>_<obj_type>_<setting>.pkl
filename = base_dir / f"{self.__log_name}_{obj_type}_{self.__setting}.pkl"
# NOTE(review): if base_dir comes from _ensure_semantic_dir(), this folder
# already exists and the call below is redundant — confirm before removing.
os.makedirs(f"semantic_data/{self.__log_name}", exist_ok=True)

In [None]:
import argparse
import os
import time
import random
import pickle

import numpy as np
import torch
from torch.utils.data import DataLoader

from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split

from neural_network.HistoryDataset import CustomDataset
from neural_network.llamp_multiout import BertMultiOutputClassificationHeads
from preprocessing.log_to_history import Log


def set_seed(seed: int):
    """Seed every random number generator used by this script.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (including all CUDA devices) with the same value.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def load_pickle(path: str):
    """Deserialize and return the object stored in the pickle file at ``path``.

    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(path, "rb") as handle:
        payload = pickle.load(handle)
    return payload


def train_fn(model, train_loader, optimizer, device, criterion, label_keys):
    """Run one training epoch and return the mean loss per batch.

    Every batch must provide ``input_ids``, ``attention_mask`` and ``labels``
    (indexable by the entries of ``label_keys``). The model is called as
    ``model(input_ids, attention_mask)``; its i-th output is scored against
    ``labels[label_keys[i]]`` with ``criterion[label_keys[i]]`` and the per-head
    losses are added up before back-propagation.

    Returns:
        The sum of the batch losses divided by the number of batches
        (``0.0`` when the loader is empty).
    """
    model.train()
    running_loss = 0.0

    for batch in train_loader:
        token_ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        targets = batch["labels"]  # expected dict-like: targets[key] -> tensor

        optimizer.zero_grad()
        head_outputs = model(token_ids, mask)  # list/tuple aligned with label_keys

        # Start from 0.0 so the first addition yields the first head's loss tensor.
        batch_loss = sum(
            (
                criterion[key](head_outputs[idx].to(device), targets[key].to(device))
                for idx, key in enumerate(label_keys)
            ),
            0.0,
        )

        batch_loss.backward()
        optimizer.step()

        running_loss += float(batch_loss.item())

    # Guard against division by zero for an empty loader.
    return running_loss / max(len(train_loader), 1)


def evaluate_fn(model, data_loader, criterion, device, label_keys):
    """Compute the mean loss per batch over ``data_loader`` without training.

    The model is switched to evaluation mode and gradients are disabled.
    The i-th model output is scored against ``labels[label_keys[i]]`` with
    ``criterion[label_keys[i]]`` and the per-head losses are added up.

    Returns:
        The sum of the batch losses divided by the number of batches
        (``0.0`` when the loader is empty).
    """
    model.eval()
    running_loss = 0.0

    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"]

            head_outputs = model(token_ids, mask)

            # Start from 0.0 so the first addition yields the first head's loss tensor.
            batch_loss = sum(
                (
                    criterion[key](head_outputs[idx].to(device), targets[key].to(device))
                    for idx, key in enumerate(label_keys)
                ),
                0.0,
            )
            running_loss += float(batch_loss.item())

    # Guard against division by zero for an empty loader.
    return running_loss / max(len(data_loader), 1)


def train_llm(model, train_loader, val_loader, optimizer, epochs, criterion, device, label_keys, patience=10):
    """Train with early stopping and return the model restored to its best epoch.

    After every epoch the validation loss is computed. Whenever it improves, a
    CPU copy of the model's ``state_dict`` is stored. Training stops after
    ``epochs`` epochs or as soon as the validation loss has not improved for
    ``patience`` consecutive epochs. Before returning, the stored snapshot (if
    any) is loaded back into ``model``.

    Args:
        model: Network to train; updated in place.
        train_loader: Batches passed to ``train_fn``.
        val_loader: Batches passed to ``evaluate_fn``.
        optimizer: Optimizer bound to ``model``'s parameters.
        epochs: Maximum number of epochs.
        criterion: Mapping from label key to loss function.
        device: Device the batches are moved to.
        label_keys: Ordered label keys, aligned with the model outputs.
        patience: Number of non-improving epochs tolerated before stopping.

    Returns:
        The same ``model`` object, carrying the weights of the best epoch.
    """
    best_valid_loss = float("inf")
    best_state = None
    early_stop_counter = 0

    for epoch in range(epochs):
        train_loss = train_fn(model, train_loader, optimizer, device, criterion, label_keys)
        valid_loss = evaluate_fn(model, val_loader, criterion, device, label_keys)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            # copy=True is essential when the model already lives on the CPU:
            # a plain .cpu() would return tensors sharing storage with the live
            # parameters, so later optimizer steps would silently overwrite the
            # "best" snapshot and the final restore would be a no-op.
            best_state = {
                name: tensor.detach().to("cpu", copy=True)
                for name, tensor in model.state_dict().items()
            }
            early_stop_counter = 0
        else:
            early_stop_counter += 1

        print(f"Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss:.4f} - Val Loss: {valid_loss:.4f}")

        if early_stop_counter >= patience:
            print(f"Validation loss hasn't improved for {patience} epochs. Early stopping...")
            break

    # best_state stays None when no epoch ran or the loss never improved (e.g. NaN).
    if best_state is not None:
        model.load_state_dict(best_state)

    return model


def parse_args():
    """Parse the training script's command-line options from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with dataset/path options, model and training
        hyper-parameters, runtime switches and output locations.
    """
    parser = argparse.ArgumentParser()

    option_specs = (
        # dataset / paths
        ("--csv_log", {"type": str, "default": "helpdesk", "help": "dataset/log name (e.g., helpdesk)"}),
        ("--type", {"type": str, "default": "all", "help": "TYPE (e.g., all)"}),
        ("--semantic_dir", {"type": str, "default": "semantic_data", "help": "base folder of semantic_data"}),
        # model / training
        ("--model_name", {"type": str, "default": "prajjwal1/bert-medium"}),
        ("--max_len", {"type": int, "default": 512}),
        ("--batch_size", {"type": int, "default": 8}),
        ("--lr", {"type": float, "default": 1e-5}),
        ("--epochs", {"type": int, "default": 100}),
        ("--seed", {"type": int, "default": 42}),
        # runtime
        ("--num_workers", {"type": int, "default": 0}),
        (
            "--multi_gpu",
            {"action": "store_true", "help": "use DataParallel when multiple GPUs are available"},
        ),
        # outputs
        ("--models_dir", {"type": str, "default": "models"}),
        ("--output_dir", {"type": str, "default": "output"}),
        ("--early_stop_patience", {"type": int, "default": 10}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    return parser.parse_args()


def main():
    """Train the multi-head model from command-line settings and save the result.

    Seeds the RNGs, instantiates ``Log`` for the chosen dataset, loads the
    pickled artefacts from ``<semantic_dir>/<csv_log>``, holds out 20% of the
    samples for validation, trains with early stopping, then writes the model
    weights to ``<models_dir>/<csv_log>_<type>.pth`` and the elapsed training
    time in seconds to ``<output_dir>/<csv_log>_<type>.txt``.
    """
    args = parse_args()
    set_seed(args.seed)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device-->", device)

    # Instantiated only for its side effects; the object itself is discarded.
    # NOTE(review): presumably this (re)generates the pickles loaded below —
    # confirm in preprocessing.log_to_history.
    Log(args.csv_log, args.type)

    base = os.path.join(args.semantic_dir, args.csv_log)

    id2label_path = os.path.join(base, f"{args.csv_log}_id2label_{args.type}.pkl")
    label2id_path = os.path.join(base, f"{args.csv_log}_label2id_{args.type}.pkl")
    train_path = os.path.join(base, f"{args.csv_log}_train_{args.type}.pkl")
    y_train_path = os.path.join(base, f"{args.csv_log}_label_train_{args.type}.pkl")
    y_train_suffix_path = os.path.join(base, f"{args.csv_log}_suffix_train_{args.type}.pkl")

    id2label = load_pickle(id2label_path)
    _label2id = load_pickle(label2id_path)  # loaded but never read in this function
    train = load_pickle(train_path)
    _y_train = load_pickle(y_train_path)  # loaded but never read in this function
    y_train_suffix = load_pickle(y_train_suffix_path)

    # 80/20 train/validation split of the inputs.
    # NOTE(review): random_state is fixed at 42 and does not follow --seed.
    train_input, val_input = train_test_split(train, test_size=0.2, random_state=42)

    # Each label list is split in its own call. Inputs and labels stay aligned
    # only because every call uses the same test_size and random_state —
    # assumes all lists have the same length as `train` (TODO confirm).
    train_label = {}
    val_label = {}
    for key in y_train_suffix.keys():
        train_label[key], val_label[key] = train_test_split(
            y_train_suffix[key], test_size=0.2, random_state=42
        )

    # Tokenizer truncates from the left, i.e. keeps the end of over-long inputs.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, truncation_side="left")
    backbone = AutoModel.from_pretrained(args.model_name)

    # Fix the head order once; model outputs are matched to labels by position.
    label_keys = list(train_label.keys())

    # Every head gets the same number of outputs: the size of the activity
    # vocabulary in id2label.
    output_sizes = [len(id2label["activity"]) for _ in label_keys]

    train_dataset = CustomDataset(train_input, train_label, tokenizer, args.max_len)
    val_dataset = CustomDataset(val_input, val_label, tokenizer, args.max_len)

    # NOTE(review): shuffle=False also for training, so batches are visited in
    # the same order every epoch — confirm this is intended.
    train_loader = DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers
    )
    val_loader = DataLoader(
        val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers
    )

    model = BertMultiOutputClassificationHeads(backbone, output_sizes).to(device)

    # Optional multi-GPU execution via DataParallel (only with --multi_gpu and
    # more than one visible CUDA device).
    if args.multi_gpu and torch.cuda.is_available() and torch.cuda.device_count() > 1:
        print(f"Using DataParallel on {torch.cuda.device_count()} GPUs")
        model = torch.nn.DataParallel(model)

    # One independent cross-entropy loss per head.
    criterion = {k: torch.nn.CrossEntropyLoss() for k in label_keys}

    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)

    # Wall-clock time of the training loop only (data loading above is excluded).
    start_time = time.time()
    model = train_llm(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        epochs=args.epochs,
        criterion=criterion,
        device=device,
        label_keys=label_keys,
        patience=args.early_stop_patience,
    )
    exec_time = time.time() - start_time

    os.makedirs(args.models_dir, exist_ok=True)
    save_path = os.path.join(args.models_dir, f"{args.csv_log}_{args.type}.pth")

    # Unwrap DataParallel so the saved keys carry no "module." prefix.
    state_dict = model.module.state_dict() if isinstance(model, torch.nn.DataParallel) else model.state_dict()
    torch.save(state_dict, save_path)

    # Persist the training time as plain text.
    os.makedirs(args.output_dir, exist_ok=True)
    with open(os.path.join(args.output_dir, f"{args.csv_log}_{args.type}.txt"), "w") as f:
        f.write(str(exec_time))

    print(f"Saved model to: {save_path}")
    print(f"Execution time (s): {exec_time:.2f}")


# Run the training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()

In [None]:
import argparse
import os
import sys
import pickle

import numpy as np
import torch
from torch.utils.data import DataLoader
from jellyfish import damerau_levenshtein_distance
from transformers import AutoModel, AutoTokenizer

from neural_network.HistoryDataset import CustomDataset
from neural_network.llamp_multiout import BertMultiOutputClassificationHeads
from preprocessing.log_to_history import Log


def clean_sequence(sequence_str, label2id):
    """Cut a space-separated id sequence right after its first END activity.

    The END id is read from ``label2id["activity"]["ENDactivity"]``. The first
    END token is kept and everything after it is dropped. A sequence without
    an END token is returned unchanged.
    """
    tokens = sequence_str.split(" ")
    end_marker = str(label2id["activity"]["ENDactivity"])

    try:
        keep = tokens.index(end_marker) + 1
    except ValueError:
        # No END token present: keep the whole sequence.
        keep = len(tokens)

    return " ".join(tokens[:keep])


def remove_word(sentence, word):
    """Return ``sentence`` with every whitespace-separated token equal to ``word`` removed.

    The remaining tokens are re-joined with single spaces.
    """
    kept = (token for token in sentence.split() if token != word)
    return " ".join(kept)


def pad_list_to_length(seq, target_length, end_id):
    """Pad or truncate ``seq`` so that it has exactly ``target_length`` items.

    - A non-positive ``target_length`` returns ``seq`` untouched.
    - An empty ``seq`` becomes ``target_length`` zeros (not ``end_id``).
    - A shorter ``seq`` is right-padded with ``end_id``.
    - A longer ``seq`` is cut to its first ``target_length`` items.
    - A ``seq`` that already has the right length is returned as the same object.
    """
    if target_length <= 0:
        return seq

    current = len(seq)
    if current == 0:
        return [0] * target_length

    missing = target_length - current
    if missing > 0:
        return seq + [end_id] * missing
    if missing < 0:
        return seq[:target_length]
    return seq


def extract_prefix(full_trace, suffix_sequence):
    """Return the part of ``full_trace`` that precedes the given suffix.

    The last element of ``suffix_sequence`` is taken to be the padding/end
    token. Both sequences are cut at the first occurrence of that token and
    the prefix length is the difference between the two effective lengths.

    NOTE(review): this relies on the suffix being padded. If its last element
    is a real activity, that activity is treated as the pad token — verify
    against the code that builds the suffix.

    Args:
        full_trace: The complete (possibly padded) trace as a list.
        suffix_sequence: The (possibly padded) suffix as a list.

    Returns:
        A new list holding the leading elements of ``full_trace``. With an
        empty ``suffix_sequence`` the whole trace is the prefix. The result is
        empty when the effective suffix is longer than the effective trace.
    """
    if len(suffix_sequence) == 0:
        # No suffix means no pad token can be inferred; everything is prefix.
        return full_trace[:]

    pad_token = suffix_sequence[-1]

    # pad_token is the suffix's own last element, so index() always succeeds.
    effective_suffix_len = suffix_sequence.index(pad_token)

    try:
        effective_full_len = full_trace.index(pad_token)
    except ValueError:
        # Trace carries no padding: its full length is effective.
        effective_full_len = len(full_trace)

    # Clamp at zero: a negative length would slice from the END of the trace
    # and return unrelated elements instead of an empty prefix.
    prefix_length = max(0, effective_full_len - effective_suffix_len)
    return full_trace[:prefix_length]


def predict_suffix_no_freq(model_output):
    """Greedy decoding: return the arg-max class of every output head as a string.

    Only the first row of each head's output is used, so with more than one
    sample per batch the remaining rows are ignored.
    """
    return [
        str(head_logits.argmax(dim=1).cpu().numpy()[0])
        for head_logits in model_output
    ]


def predict_suffix_with_freq(
    model_output,
    prefix_sequence,     # list[str]
    trace_frequencies,   # dict{tuple(int): freq}
    label2id,
    beta: float,
    threshold: float,
):
    """Predict a suffix and optionally replace it with one from a known trace.

    Steps:
      1. Decode the model suffix greedily (arg-max per head), stopping at and
         excluding the first END activity.
      2. Build ``prefix + suffix``, pad/truncate it to the longest key in
         ``trace_frequencies`` and return the model suffix on an exact match.
      3. Otherwise score every historical trace with
         ``tau = beta * similarity + (1 - beta) * freq / max_freq`` and pick the
         highest ``tau`` (ties broken by similarity, then frequency, then first
         seen).
      4. If the winner's similarity reaches ``threshold`` and the winner is
         longer than the prefix, return its tail after the prefix; otherwise
         return the model suffix.

    Args:
        model_output: Iterable of per-step logits; only row 0 of each is read.
        prefix_sequence: Observed prefix as strings convertible to ``int``.
        trace_frequencies: Historical traces (tuples of ids) mapped to counts.
        label2id: Mapping holding ``["activity"]["ENDactivity"]``.
        beta: Weight of similarity against relative frequency.
        threshold: Minimum similarity required to override the model suffix.

    Returns:
        The predicted suffix as a list of id strings.
    """
    end_id = label2id["activity"]["ENDactivity"]
    end_token = str(end_id)

    # 1) model suffix. Arg-max is taken on the raw scores: softmax is monotonic,
    #    so normalising first cannot change the winner.
    model_suffix = []
    for step_logits in model_output:
        next_act = str(step_logits[0].argmax().item())
        if next_act == end_token:
            break
        model_suffix.append(next_act)

    # Nothing to match against: keep the model's own prediction.
    if len(trace_frequencies) == 0:
        return model_suffix
    max_len_in_db = max(len(trace) for trace in trace_frequencies)
    if max_len_in_db == 0:
        return model_suffix

    prefix_ints = [int(x) for x in prefix_sequence]
    candidate_trace = prefix_ints + [int(x) for x in model_suffix]

    # 2) pad candidate to the database length, then 3) try an exact match
    padded_candidate = pad_list_to_length(candidate_trace, max_len_in_db, end_id)
    if tuple(padded_candidate) in trace_frequencies:
        return model_suffix

    # 4) best match by DL similarity + frequency
    candidate_str = " ".join(map(str, padded_candidate))
    f_max = max(trace_frequencies.values())

    best_score = (-1.0, -1.0, -1.0)  # (tau, similarity, freq)
    best_trace = None

    for hist_trace, freq in trace_frequencies.items():
        hist_list = list(hist_trace)
        padded_hist = pad_list_to_length(hist_list, max_len_in_db, end_id)
        hist_str = " ".join(map(str, padded_hist))

        # NOTE(review): the distance is computed over the CHARACTERS of the
        # space-joined id strings but normalised by the number of TOKENS, so
        # multi-digit ids and separators inflate the distance. Left unchanged
        # because beta/threshold were presumably tuned against this measure —
        # confirm before switching to a token-level distance.
        dl_dist = damerau_levenshtein_distance(candidate_str, hist_str)
        similarity = max(0.0, 1.0 - (dl_dist / max_len_in_db))
        tau = beta * similarity + (1.0 - beta) * (freq / f_max)

        # Lexicographic comparison: tau first, then similarity, then freq.
        score = (tau, similarity, freq)
        if score > best_score:
            best_score = score
            best_trace = hist_list

    best_similarity = best_score[1]
    if best_trace is not None and best_similarity >= threshold and len(best_trace) > len(prefix_ints):
        # The unpadded winner's tail after the prefix replaces the model suffix.
        return list(map(str, best_trace[len(prefix_ints):]))

    return model_suffix


def load_pickle(path: str):
    """Deserialize and return the object stored in the pickle file at ``path``.

    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(path, "rb") as handle:
        payload = pickle.load(handle)
    return payload


def _strip_end_tokens(tokens, label2id, end_token):
    """Join tokens with spaces, cut after the first END id and drop END ids."""
    return remove_word(clean_sequence(" ".join(map(str, tokens)), label2id), end_token)


def _dl_similarity(predicted, truth):
    """Return 1 - DL distance / longer length for two non-empty strings."""
    return 1 - (
        damerau_levenshtein_distance(predicted, truth)
        / max(len(predicted), len(truth))
    )


def predict_next_activities(
    model,
    test_loader,
    label2id,
    trace_frequencies,
    out_path,
    beta,
    threshold,
    device,
):
    """Evaluate suffix prediction on ``test_loader`` and write a CSV report.

    For every batch the suffix is predicted twice: greedily from the model
    alone and with the frequency-based override of ``predict_suffix_with_freq``.
    Both are compared with the true suffix via a normalised Damerau-Levenshtein
    similarity. One row per batch is written to ``out_path`` with the columns
    ``Prefix, Predicted_NoFreq, Predicted_WithFreq, Truth, DL_Score_NoFreq,
    DL_Score_WithFreq, avg_noFreq, avg_withfreq`` (the last two are running
    means). The final means are printed.

    An empty predicted or true sequence is written and scored as the literal
    ``"end"``, so two empty sequences count as a perfect match.

    NOTE(review): one true sequence and one row are produced per batch, so this
    appears to be written for ``batch_size == 1`` — confirm before raising it.
    NOTE(review): the similarity is computed over the characters of the
    space-joined id strings, so multi-digit ids weigh more than single-digit
    ones — confirm this is the intended metric.

    Args:
        model: Network called as ``model(input_ids, attention_mask)``.
        test_loader: Batches with ``input_ids``, ``attention_mask``, ``labels``
            and ``activities``.
        label2id: Mapping holding ``["activity"]["ENDactivity"]``.
        trace_frequencies: Historical traces mapped to their counts.
        out_path: Destination of the CSV report (overwritten).
        beta: Similarity/frequency weight forwarded to the override step.
        threshold: Minimum similarity forwarded to the override step.
        device: Device the inputs are moved to.
    """
    model.eval()
    end_token = str(label2id["activity"]["ENDactivity"])

    # Running totals replace re-averaging the whole score history on every row.
    total_no_freq = 0.0
    total_with_freq = 0.0
    n_scored = 0

    # dirname is "" for a bare file name, and os.makedirs("") raises.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_path, "w") as file_dl:
        file_dl.write(
            "Prefix,Predicted_NoFreq,Predicted_WithFreq,Truth,"
            "DL_Score_NoFreq,DL_Score_WithFreq,avg_noFreq,avg_withfreq\n"
        )

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                output = model(input_ids, attention_mask)

                # Data format: labels / activities are indexable sequences (one entry per step).
                n_steps = len(batch["labels"])
                true_sequence = [str(batch["labels"][i].item()) for i in range(n_steps)]
                full_trace = [str(batch["activities"][i].item()) for i in range(n_steps)]

                prefix_sequence = extract_prefix(full_trace, true_sequence)

                # 1) no freq
                predicted_no_freq = predict_suffix_no_freq(output)

                # 2) with freq
                predicted_with_freq = predict_suffix_with_freq(
                    output,
                    prefix_sequence,
                    trace_frequencies,
                    label2id,
                    beta=beta,
                    threshold=threshold,
                )

                seq_pred_no_freq = _strip_end_tokens(predicted_no_freq, label2id, end_token)
                seq_pred_with_freq = _strip_end_tokens(predicted_with_freq, label2id, end_token)
                seq_true = _strip_end_tokens(true_sequence, label2id, end_token)
                seq_prefix = _strip_end_tokens(prefix_sequence, label2id, end_token)

                # Map every empty sequence to the same placeholder. Applying it
                # to the truth only when the no-frequency prediction was also
                # empty scored an empty with-frequency prediction against an
                # empty truth as 0 instead of 1.
                seq_pred_no_freq = seq_pred_no_freq or "end"
                seq_pred_with_freq = seq_pred_with_freq or "end"
                seq_true = seq_true or "end"

                dl_no = _dl_similarity(seq_pred_no_freq, seq_true)
                dl_w = _dl_similarity(seq_pred_with_freq, seq_true)

                total_no_freq += dl_no
                total_with_freq += dl_w
                n_scored += 1

                file_dl.write(
                    f"{seq_prefix},{seq_pred_no_freq},{seq_pred_with_freq},{seq_true},"
                    f"{dl_no:.3f},{dl_w:.3f},"
                    f"{total_no_freq / n_scored:.3f},{total_with_freq / n_scored:.3f}\n"
                )

    # An empty loader yields NaN averages rather than a division error.
    avg_no_freq = total_no_freq / n_scored if n_scored else float("nan")
    avg_with_freq = total_with_freq / n_scored if n_scored else float("nan")
    print(f"Avg DL Similarity (No Frequency): {avg_no_freq:.3f}")
    print(f"Avg DL Similarity (With Frequency): {avg_with_freq:.3f}")


def parse_args():
    """Parse the evaluation script's command-line options from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with dataset and folder options, the backbone
        name and maximum length, the frequency-matching parameters ``beta`` and
        ``threshold``, and runtime switches.
    """
    parser = argparse.ArgumentParser()

    option_specs = (
        # dataset
        ("--csv_log", {"type": str, "default": "helpdesk"}),
        ("--type", {"type": str, "default": "all"}),
        # folders
        ("--semantic_dir", {"type": str, "default": "semantic_data"}),
        ("--models_dir", {"type": str, "default": "models"}),
        ("--output_dir", {"type": str, "default": "output"}),
        # backbone
        ("--model_name", {"type": str, "default": "prajjwal1/bert-medium"}),
        ("--max_len", {"type": int, "default": 512}),
        # frequency-matching parameters
        ("--beta", {"type": float, "default": 0.98}),
        ("--threshold", {"type": float, "default": 0.4}),
        # runtime
        ("--batch_size", {"type": int, "default": 1}),
        ("--num_workers", {"type": int, "default": 0}),
        ("--multi_gpu", {"action": "store_true"}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    return parser.parse_args()


def main():
    """Load a trained model and evaluate suffix prediction on the test split.

    Instantiates ``Log`` for the chosen dataset, loads the pickled test
    artefacts and trace frequencies from ``<semantic_dir>/<csv_log>``, restores
    the weights saved at ``<models_dir>/<csv_log>_<type>.pth`` and writes the
    per-sample report to ``<output_dir>/<csv_log>_<type>.txt``.
    """
    args = parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device-->", device)

    # Instantiated only for its side effects; the object itself is discarded.
    # NOTE(review): presumably this (re)generates the pickles loaded below —
    # confirm in preprocessing.log_to_history.
    Log(args.csv_log, args.type)

    base = os.path.join(args.semantic_dir, args.csv_log)

    test = load_pickle(os.path.join(base, f"{args.csv_log}_test_{args.type}.pkl"))
    _y_test = load_pickle(os.path.join(base, f"{args.csv_log}_label_test_{args.type}.pkl"))  # loaded but never read in this function
    id2label = load_pickle(os.path.join(base, f"{args.csv_log}_id2label_{args.type}.pkl"))
    label2id = load_pickle(os.path.join(base, f"{args.csv_log}_label2id_{args.type}.pkl"))
    # The TRAINING suffix labels are loaded only to learn how many heads the
    # saved model has (see output_sizes below).
    y_train_suffix = load_pickle(os.path.join(base, f"{args.csv_log}_suffix_train_{args.type}.pkl"))
    y_test_prefix = load_pickle(os.path.join(base, f"{args.csv_log}_prefixes_test_{args.type}.pkl"))
    y_test_suffix = load_pickle(os.path.join(base, f"{args.csv_log}_suffix_test_{args.type}.pkl"))
    y_test_activities = load_pickle(os.path.join(base, f"{args.csv_log}_activities_test_{args.type}.pkl"))
    trace_frequencies = load_pickle(os.path.join(base, f"{args.csv_log}_encoded_trace_frequencies_{args.type}.pkl"))

    # Tokenizer truncates from the left, i.e. keeps the end of over-long inputs.
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, truncation_side="left")
    base_model = AutoModel.from_pretrained(args.model_name)

    # NOTE(review): CustomDataset receives six positional arguments here but
    # four in the training script — confirm both call shapes are supported.
    test_dataset = CustomDataset(
        test, y_test_suffix, y_test_prefix, y_test_activities, tokenizer, args.max_len
    )
    test_loader = DataLoader(
        test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers
    )

    # One head per training suffix key, each sized to the activity vocabulary,
    # so the architecture matches the saved state dict.
    output_sizes = [len(id2label["activity"]) for _ in range(len(y_train_suffix))]
    model = BertMultiOutputClassificationHeads(base_model, output_sizes)

    # Load the weights on CPU first, before moving the model to the target
    # device and before any DataParallel wrapping.
    model_path = os.path.join(args.models_dir, f"{args.csv_log}_{args.type}.pth")
    state = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state)

    model = model.to(device)
    model.eval()

    # Optional multi-GPU execution via DataParallel (only with --multi_gpu and
    # more than one visible CUDA device).
    if args.multi_gpu and torch.cuda.is_available() and torch.cuda.device_count() > 1:
        print(f"Using DataParallel on {torch.cuda.device_count()} GPUs")
        model = torch.nn.DataParallel(model)

    # NOTE(review): with default folders this is the same path the training
    # script uses for its execution-time file, which would be overwritten —
    # confirm that is acceptable.
    out_path = os.path.join(args.output_dir, f"{args.csv_log}_{args.type}.txt")
    predict_next_activities(
        model=model,
        test_loader=test_loader,
        label2id=label2id,
        trace_frequencies=trace_frequencies,
        out_path=out_path,
        beta=args.beta,
        threshold=args.threshold,
        device=device,
    )


# Run the evaluation pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
