In [1]:
import importlib
import sys
import os
import torch
import numpy as np
from tqdm.notebook import tqdm
import torch

# Prepend the project directories to the import search path in one step.
# Resulting order at the front of sys.path: "../src" (src package), "..", "../..".
sys.path[:0] = ["../src", "..", "../.."]

In [2]:
# from activity_pertubator import get_train_val_test
# get_train_val_test(
#     csv_path= "../data/helpdesk.csv",
#     train_size= 0.7,
#     val_size = 0.15,
#     test_size= 0.15,
#     output_dir="../data/")

# Learn and Match Realistic Loops

This section demonstrates the new approach:
1. Learn realistic loops from a dataset (e.g., training data)
2. Greedily match and insert learned loops into a new dataset (e.g., test data)
3. Automatically split the result into prefix/suffix pairs in which each inserted loop lies entirely within the prefix

In [None]:
from activity_pertubator import (
    build_readable_event_log,
    learn_realistic_loops,
    match_loops_greedy,
)

# Step 1: load the training log together with the event-log properties.
# `props` is indexed below with "concept_name" (the activity column name).
# NOTE(review): the training paths use "../../" while the test paths further
# down use "../" - confirm that both locations are intended.
df_train, props = build_readable_event_log(
    csv_path="../../data/helpdesk_train.csv",  # Use training data to learn loops
    properties_path="../../encoded_data/data_encoder/helpdesk_event_log_properties.pkl",
)

print(f"Learning loops from training data with {len(df_train)} rows...")

# Step 2: learn realistic loops from the training data.
learned_loops = learn_realistic_loops(
    df_train,
    properties=props,
    activity_column=props["concept_name"],
    eos_value="EOS",
)

print(f"Found {len(learned_loops)} loops to use for augmentation")
print("Sample loops (first 3):")
# Each learned loop is a (starting_activity, loop_df) pair; show the activity
# sequence of the first few for a quick sanity check.
for loop_number, (starting_activity, loop_df) in enumerate(learned_loops[:3], start=1):
    activities = loop_df[props["concept_name"]].tolist()
    print(f"  Loop {loop_number}: Starting activity '{starting_activity}', sequence: {activities}")

# Step 3: load the test log that the learned loops will be inserted into.
# The properties returned here are discarded; `props` from the training load is reused.
df_test, _ = build_readable_event_log(
    csv_path="../data/helpdesk_test.csv",  # Use test data to insert loops
    properties_path="../encoded_data/data_encoder/helpdesk_event_log_properties.pkl",
)

print(f"\nApplying learned loops to test data with {len(df_test)} rows...")

# Step 4: match loops greedily and split into prefixes/suffixes.
# min_suffix_size was previously `props.get(2)`, which looks up the integer
# key 2 in the properties dict (yielding None unless such a key exists).
# The literal 2 matches the value passed to split_prefix_suffix_readable in
# the "Redo Last Activity" section of this notebook.
data = match_loops_greedy(
    data_new=df_test,
    learned_loops=learned_loops,
    properties=props,
    min_suffix_size=2,
    activity_column=props["concept_name"],
    eos_value="EOS",
)

print(f"Generated {len(data)} prefix/suffix pairs with loops in prefixes")

  self.df = self.df.groupby(self.case_name).apply(min_timestamp_before).reset_index(drop=True)
  self.df = self.df.groupby(self.case_name).apply(min_timestamp_before).reset_index(drop=True)
  self.df = self.df.groupby(self.case_name, group_keys=False).apply(


Learning loops from training data with 30919 rows...
Found 933 loops to use for augmentation
Sample loops (first 3):
  Loop 1: Starting activity 'Take in charge ticket', sequence: ['Take in charge ticket', 'Wait', 'Take in charge ticket']
  Loop 2: Starting activity 'Take in charge ticket', sequence: ['Take in charge ticket', 'Wait', 'Take in charge ticket']
  Loop 3: Starting activity 'Take in charge ticket', sequence: ['Take in charge ticket', 'Wait', 'Take in charge ticket']


  self.df = self.df.groupby(self.case_name).apply(min_timestamp_before).reset_index(drop=True)
  self.df = self.df.groupby(self.case_name, group_keys=False).apply(



Applying learned loops to test data with 11323 rows...


KeyboardInterrupt: 

# Redo Last Activity

In [3]:
from activity_pertubator import (
    build_readable_event_log,
    split_prefix_suffix_readable,
    redo_last_activity_of_prefix,
)

# Toggle for the "redo last activity" augmentation. It defaults to False, which
# reproduces the previous behaviour of this cell (the augmentation loop was
# commented out, yet the final message claimed it had been applied).
APPLY_REDO_LAST_ACTIVITY = False

# Load the test log and its event-log properties.
df, props = build_readable_event_log(
    csv_path="../../data/helpdesk_test.csv",
    properties_path="../../encoded_data/data_encoder/helpdesk_event_log_properties.pkl",
)

print(len(df))

# Split the log into prefix/suffix pairs. The result is a dict whose values
# are (prefix_df, suffix_df) pairs.
data = split_prefix_suffix_readable(
    df,
    case_column=props["case_name"],
    activity_column=props["concept_name"],
    min_suffix_size=2,
)

if APPLY_REDO_LAST_ACTIVITY:
    # Apply "redo last activity" augmentation to each prefix/suffix pair.
    # NOTE(review): call signature taken from the previously commented-out
    # code - confirm it still matches activity_pertubator.
    augmented_data = {}
    for key, (prefix_df, suffix_df) in data.items():
        new_prefix, new_suffix = redo_last_activity_of_prefix(
            prefix_df,
            suffix_df,
            properties=props,
            time_increment_seconds=60.0,
        )
        augmented_data[key] = (new_prefix, new_suffix)
    data = augmented_data
    print(f"Applied redo_last_activity_of_prefix to {len(data)} pairs")
else:
    # Report truthfully that the pairs were only split, not augmented.
    print(f"Split into {len(data)} prefix/suffix pairs (redo_last_activity_of_prefix not applied)")


  self.df = self.df.groupby(self.case_name).apply(min_timestamp_before).reset_index(drop=True)
  self.df = self.df.groupby(self.case_name, group_keys=False).apply(


11323
Applied redo_last_activity_of_prefix to 3091 pairs


In [52]:
# Pull out one prefix/suffix pair for manual inspection.
prefix, suffix = data[("Case 3788", 2)]

In [53]:
# Show the suffix selected in the previous cell (bare last expression so the
# notebook renders it with its rich display).
suffix

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,Variant.1,seriousness,customer,product,responsible_section,seriousness_2,service_level,service_type,support_section,workgroup,case_elapsed_time,event_elapsed_time,day_in_week,seconds_in_day
2,Case 3788,Take in charge ticket,Value 2,2010-10-07 08:45:20,Variant 4,4.0,Variant 4,Value 1,Value 227,Value 3,Value 1,Value 1,Value 2,Value 1,Value 1,Value 1,4129.0,8.0,3.0,31520.0
3,Case 3788,Resolve ticket,Value 7,2010-10-07 10:15:03,Variant 4,4.0,Variant 4,Value 1,Value 227,Value 3,Value 1,Value 1,Value 2,Value 1,Value 1,Value 1,9512.0,5383.0,3.0,36903.0
4,Case 3788,Closed,Value 5,2010-11-24 08:40:24,Variant 4,4.0,Variant 4,Value 1,Value 227,Value 3,Value 1,Value 1,Value 2,Value 1,Value 1,Value 1,4151033.0,4141521.0,2.0,31224.0
5,Case 3788,EOS,EOS,NaT,EOS,,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,,,,
6,Case 3788,EOS,EOS,NaT,EOS,,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,,,,
7,Case 3788,EOS,EOS,NaT,EOS,,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,,,,
8,Case 3788,EOS,EOS,NaT,EOS,,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,,,,
9,Case 3788,EOS,EOS,NaT,EOS,,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,EOS,,,,


In [21]:
from activity_pertubator import encode_single_dataframe

# Restore the previously trained encoder/decoder object from disk.
# weights_only=False makes torch.load unpickle arbitrary Python objects, so
# this file must come from a trusted source.
encoder_decoder = torch.load(
    "../encoded_data/data_encoder/helpdesk_encoder_decoder.pkl",
    weights_only=False,
)

# Encode every prefix/suffix pair, keeping the same (case_id, prefix_len) keys.
# Each value becomes an (encoded_prefix, encoded_suffix) tuple.
encoded_data = {}
for (case_id, prefix_len), (prefix_df, suffix_df) in tqdm(data.items(), desc="Encoding data"):
    encoded_data[(case_id, prefix_len)] = tuple(
        encode_single_dataframe(frame, encoder_decoder, props["case_name"], case_id)
        for frame in (prefix_df, suffix_df)
    )

print(f"Encoded {len(encoded_data)} prefix/suffix pairs")

# Persist the encoded pairs for later use.
torch.save(encoded_data, "../encoded_data/helpdesk_redo_activity.pkl")

Encoding data:   0%|          | 0/3091 [00:00<?, ?it/s]

Encoded 3091 prefix/suffix pairs


In [3]:
# # Helper: decode encoded tensors back to readable events using Evaluation.case_to_readable
# from model.dropout_uncertainty_enc_dec_LSTM.dropout_uncertainty_model import DropoutUncertaintyEncoderDecoderLSTM
# from evaluation.evaluation import Evaluation

# # Load trained model (needed only for its enc_feat definitions)
# model_path = "../src/notebooks/training_variational_dropout/Helpdesk/Helpdesk_full_no_grad_norm_new_2.pkl"
# model = DropoutUncertaintyEncoderDecoderLSTM.load(model_path, dropout=0.1)

# # Load original dataset (provides encoders/decoders + metadata)
# original_dataset_path = "../encoded_data/helpdesk_all_5_test.pkl"
# original_dataset = torch.load(original_dataset_path, weights_only=False)

# eval_helper = Evaluation(
#     model=model,
#     dataset=original_dataset,
#     concept_name=props["concept_name"],
#     eos_value="EOS",
#     growing_num_values=[col for col in encoder_decoder.continuous_columns if "elapsed" in col]
# )

# def preview_encoded_pair(case_id: str, prefix_len: int, max_events: int = 5):
#     encoded_prefix, encoded_suffix = encoded_data[(case_id, prefix_len)]
#     readable_prefix = eval_helper.case_to_readable(encoded_prefix, prune_eos=False)
#     readable_suffix = eval_helper.case_to_readable(encoded_suffix, prune_eos=False)
#     print(f"Decoded prefix for (case={case_id}, prefix_len={prefix_len}):")
#     for event in readable_prefix[:max_events]:
#         print(event)
#     print("\nDecoded suffix (first events):")
#     for event in readable_suffix[:max_events]:
#         print(event)

# # Example preview
# demo_case, demo_prefix_len = next(iter(encoded_data.keys()))
# preview_encoded_pair(demo_case, demo_prefix_len)



In [55]:
# Persist the current `encoded_data` under the loop-augmentation file name.
# NOTE(review): `encoded_data` is whatever the encoding cell last produced from
# `data`. On a top-to-bottom run, `data` is reassigned in the "Redo Last
# Activity" section, so this would not hold the loop-augmented pairs - confirm
# the intended execution order before relying on this file.
torch.save(encoded_data, "../encoded_data/helpdesk_augmented_loops.pkl")