In [1]:
import concurrent.futures
import importlib
import os
import pickle
import re
import sys
import traceback
from functools import partial

import torch
from tqdm.notebook import tqdm

sys.path.insert(0, '..')
sys.path.insert(0, '../..')
sys.path.insert(0, '../../..')
sys.path.insert(0, '../../../..')
sys.path.insert(0, '../../../../..')

from model.dropout_uncertainty_enc_dec_LSTM.dropout_uncertainty_model import DropoutUncertaintyEncoderDecoderLSTM
from evaluation.probabilistic_evaluation import ProbabilisticEvaluation


In [2]:
# Load the trained model and define where this notebook writes its results.
# `output_dir` is the directory the evaluation loop creates and saves result chunks into.
file_path_model = '../../../training_variational_dropout/Helpdesk/Helpdesk_setting_2.pkl'
output_dir = '../../../../../evaluation_results/robustness/Helpdesk/random_event_attack/'
# NOTE(review): `dropout=0.1` is forwarded to the project's `load`; presumably the dropout
# rate used for sampling at inference time — confirm against the model class.
model = DropoutUncertaintyEncoderDecoderLSTM.load(file_path_model, dropout=0.1)

# Load datasets.
# `original` / `perturbed` are handed to ProbabilisticEvaluation as the dataset argument;
# the two `redo_activity*` objects are handed to it as `dataset_predefined_prefixes`.
# NOTE(review): despite the `redo_activity` names, these paths point at `helpdesk/val.pkl`
# and `helpdesk/random_event_attack.pkl` — the names look like a leftover; confirm before renaming.
file_path_original = '../../../../../encoded_data/helpdesk_all_5_test.pkl'
file_path_perturbed = '../../../../../encoded_data/helpdesk_small_perturbations_test.pkl'
file_path_redo_activity = '../../../../../encoded_data/helpdesk/val.pkl'
file_path_redo_activity_pert = '../../../../../encoded_data/helpdesk/random_event_attack.pkl'


# SECURITY: `weights_only=False` makes torch.load perform full unpickling, which can execute
# arbitrary code embedded in the file. Only load files produced by this project's own pipeline.
original_dataset = torch.load(file_path_original, weights_only=False)
perturbed_dataset = torch.load(file_path_perturbed, weights_only=False)
redo_activity_dataset = torch.load(file_path_redo_activity, weights_only=False)
redo_activity_pert_dataset = torch.load(file_path_redo_activity_pert, weights_only=False)


# Only the two full datasets are reported here; the predefined-prefix datasets are counted
# later, when the prefix/suffix pairs are collected.
print(f"Original dataset loaded: {len(original_dataset)} cases")
print(f"Perturbed dataset loaded: {len(perturbed_dataset)} cases")


Data set categories:  ([('Activity', 16, {'Assign seriousness': 1, 'Closed': 2, 'Create SW anomaly': 3, 'DUPLICATE': 4, 'EOS': 5, 'INVALID': 6, 'Insert ticket': 7, 'RESOLVED': 8, 'Require upgrade': 9, 'Resolve SW anomaly': 10, 'Resolve ticket': 11, 'Schedule intervention': 12, 'Take in charge ticket': 13, 'VERIFIED': 14, 'Wait': 15}), ('Resource', 24, {'EOS': 1, 'Value 1': 2, 'Value 10': 3, 'Value 11': 4, 'Value 12': 5, 'Value 13': 6, 'Value 14': 7, 'Value 15': 8, 'Value 16': 9, 'Value 17': 10, 'Value 18': 11, 'Value 19': 12, 'Value 2': 13, 'Value 20': 14, 'Value 21': 15, 'Value 22': 16, 'Value 3': 17, 'Value 4': 18, 'Value 5': 19, 'Value 6': 20, 'Value 7': 21, 'Value 8': 22, 'Value 9': 23}), ('Variant index', 166, {'1.0': 1, '10.0': 2, '100.0': 3, '101.0': 4, '102.0': 5, '103.0': 6, '104.0': 7, '105.0': 8, '106.0': 9, '107.0': 10, '108.0': 11, '109.0': 12, '11.0': 13, '110.0': 14, '111.0': 15, '112.0': 16, '113.0': 17, '114.0': 18, '12.0': 19, '13.0': 20, '14.0': 21, '15.0': 22, '16.0

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Original dataset loaded: 5192 cases
Perturbed dataset loaded: 5192 cases


In [3]:
# Create evaluation instances (NON-RANDOM ORDER)
def _make_probabilistic_evaluation(dataset, predefined_prefixes):
    """Build a ProbabilisticEvaluation for `dataset` with the settings shared by both runs.

    Only the dataset and the predefined prefix/suffix collection differ between the
    original and the perturbed evaluator; every other argument is identical. The list
    arguments are created anew on each call, so the two evaluators never share them.
    """
    return ProbabilisticEvaluation(
        model, dataset,
        concept_name='Activity',
        num_processes=1,  # 16
        growing_num_values=['case_elapsed_time'],
        samples_per_case=100,
        sample_argmax=False,
        use_variance_cat=True,
        use_variance_num=True,
        all_cat=['Activity', 'Resource'],
        all_num=['case_elapsed_time', 'event_elapsed_time'],
        dataset_predefined_prefixes=predefined_prefixes,
    )


# Evaluator for the unmodified data, then the one for the attacked data.
eval_original = _make_probabilistic_evaluation(original_dataset, redo_activity_dataset)
eval_perturbed = _make_probabilistic_evaluation(perturbed_dataset, redo_activity_pert_dataset)

print("ProbabilisticEvaluation instances created")


ProbabilisticEvaluation instances created


In [4]:
# Import the robustness metrics module. `save_chunk` is called by the evaluation loop
# further down with (results, index, output_dir) — presumably to write a partial result
# file into `output_dir`; confirm against the module.
import robustness.evaluator.robustness_metrics
# While iterating on the module, re-import it without restarting the kernel via:
# importlib.reload(robustness.evaluator.robustness_metrics)
from robustness.evaluator.robustness_metrics import save_chunk

print("Robustness metrics module imported")

# Helper functions for filtering predictions and calculating remaining time
def filter_prediction_events(prediction_list, concept_name='Activity'):
    """Strip predicted events down to the activity label and the elapsed case time.

    Args:
        prediction_list: Iterable of predicted events, or None. Entries that are
            not dicts are dropped from the output.
        concept_name: Key under which the activity label is stored in each event.

    Returns:
        None when `prediction_list` is None. Otherwise a new list with one dict per
        dict-typed input event, holding only the `concept_name` and
        'case_elapsed_time' entries that were present (possibly an empty dict).
    """
    if prediction_list is None:
        return None

    # Order matters only for the key order of the resulting dicts.
    wanted_keys = (concept_name, 'case_elapsed_time')
    return [
        {key: event[key] for key in wanted_keys if key in event}
        for event in prediction_list
        if isinstance(event, dict)
    ]

def calculate_remaining_time(prefix, prediction, concept_name='Activity'):
    """Return the predicted remaining time of a case.

    The value is the 'case_elapsed_time' of the last predicted event minus the
    'case_elapsed_time' of the last prefix event.

    Args:
        prefix: Sequence of observed events (dicts).
        prediction: Sequence of predicted events (dicts).
        concept_name: Accepted for signature parity with the other helpers; not used.

    Returns:
        The time difference, or None when either sequence is empty/None or when the
        last event of either one is not a dict carrying 'case_elapsed_time'.
    """
    def _has_elapsed_time(event):
        return isinstance(event, dict) and 'case_elapsed_time' in event

    if not (prefix and prediction):
        return None

    # The prefix is validated before the prediction is indexed at all.
    last_observed = prefix[-1]
    if not _has_elapsed_time(last_observed):
        return None

    last_predicted = prediction[-1]
    if not _has_elapsed_time(last_predicted):
        return None

    elapsed_so_far = last_observed['case_elapsed_time']
    elapsed_at_end = last_predicted['case_elapsed_time']
    return elapsed_at_end - elapsed_so_far

def calculate_sampled_remaining_times(prefix, predicted_suffixes, concept_name='Activity'):
    """Return the predicted remaining time for every sampled suffix.

    Args:
        prefix: Sequence of observed events (dicts).
        predicted_suffixes: Sequence of sampled suffixes, each a sequence of events.
        concept_name: Accepted for signature parity with the other helpers; not used.

    Returns:
        None when `prefix` or `predicted_suffixes` is empty/None, or when the last
        prefix event is not a dict carrying 'case_elapsed_time'. Otherwise a list with
        one entry per sample: the sample's final 'case_elapsed_time' minus the prefix's,
        or None for samples that are empty or whose last event lacks that field.
    """
    if not (prefix and predicted_suffixes):
        return None

    last_observed = prefix[-1]
    if not (isinstance(last_observed, dict) and 'case_elapsed_time' in last_observed):
        return None
    elapsed_so_far = last_observed['case_elapsed_time']

    def _remaining_for(sample):
        # A usable sample is non-empty and ends in a dict that carries the elapsed time.
        if sample and isinstance(sample[-1], dict) and 'case_elapsed_time' in sample[-1]:
            return sample[-1]['case_elapsed_time'] - elapsed_so_far
        return None

    return [_remaining_for(sample) for sample in predicted_suffixes]


Robustness metrics module imported


In [5]:
# Main evaluation loop with parallelization
os.makedirs(output_dir, exist_ok=True)

# Run configuration
save_every = 50            # flush accumulated results to disk after this many pairs
num_workers = 8            # Number of parallel workers
concept_name = 'Activity'  # Match the concept_name used in ProbabilisticEvaluation
results = {}               # results gathered since the last flush

# Collect prefix/suffix pairs from predefined datasets (fast - just reading data)
print("Collecting prefix/suffix pairs from predefined datasets...")
all_prefix_suffix_pairs_orig = list(redo_activity_dataset.items())
all_prefix_suffix_pairs_pert = list(redo_activity_pert_dataset.items())

# Both collections must have the same size ...
assert len(all_prefix_suffix_pairs_orig) == len(all_prefix_suffix_pairs_pert), \
    f"Mismatch in number of pairs: {len(all_prefix_suffix_pairs_orig)} vs {len(all_prefix_suffix_pairs_pert)}"

# ... and list the same keys in the same positions.
for pair_orig, pair_pert in zip(all_prefix_suffix_pairs_orig, all_prefix_suffix_pairs_pert):
    key_orig, key_pert = pair_orig[0], pair_pert[0]
    assert key_orig == key_pert, f"Key mismatch: {key_orig} != {key_pert}"

print(f"Collected {len(all_prefix_suffix_pairs_orig)} prefix/suffix pairs")

# Helper function to evaluate and process a single pair (runs in worker process)
def evaluate_and_process_pair(
    eval_orig_instance, eval_pert_instance,
    case_name, prefix_len, prefix_orig, suffix_orig, prefix_pert, suffix_pert,
    filter_prediction_events, calculate_remaining_time,
    calculate_sampled_remaining_times, concept_name
):
    """Evaluate one original/perturbed prefix pair and package both outcomes.

    Intended to run inside a worker process. Each evaluator's `_evaluate_single` is
    called once and must return a 6-tuple
    (case name, prefix length, readable prefix, sampled suffixes, readable suffix,
    mean prediction).

    Args:
        eval_orig_instance: Evaluator used for the original input.
        eval_pert_instance: Evaluator used for the perturbed input.
        case_name, prefix_len: Identify the pair; forwarded to both evaluators.
        prefix_orig, suffix_orig: Original prefix/suffix, forwarded unchanged.
        prefix_pert, suffix_pert: Perturbed prefix/suffix, forwarded unchanged.
        filter_prediction_events: Callable(events, concept_name=...) reducing events.
        calculate_remaining_time: Callable(prefix, prediction, concept_name=...).
        calculate_sampled_remaining_times: Callable(prefix, samples, concept_name=...).
        concept_name: Activity key handed to the three helper callables.

    Returns:
        ((case name, prefix length), {'original': entry, 'perturbed': entry}) where each
        entry is the tuple (readable prefix, readable suffix, filtered mean prediction,
        filtered sampled suffixes or None, mean remaining time, sampled remaining times).

    Raises:
        AssertionError: If the two evaluators report different case names or prefix lengths.
    """
    def _filter_samples(samples):
        # A falsy sample collection (None or empty) is reported as None.
        if not samples:
            return None
        return [filter_prediction_events(sample, concept_name=concept_name) for sample in samples]

    # Model inference is the expensive part: original first, then perturbed.
    # NOTE(review): the meaning of the trailing positional `False` is defined by
    # `_evaluate_single` and is not visible here.
    orig_result = eval_orig_instance._evaluate_single(case_name, prefix_len, prefix_orig, suffix_orig, False)
    pert_result = eval_pert_instance._evaluate_single(case_name, prefix_len, prefix_pert, suffix_pert, False)

    case_name_orig, prefix_len_orig, readable_prefix_o, samples_o, readable_suffix_o, mean_o = orig_result
    case_name_pert, prefix_len_pert, readable_prefix_p, samples_p, readable_suffix_p, mean_p = pert_result

    # Both evaluators must have answered for the same case and prefix length.
    assert case_name_orig == case_name_pert, f"Case mismatch: {case_name_orig} != {case_name_pert}"
    assert prefix_len_orig == prefix_len_pert, f"Prefix length mismatch: {prefix_len_orig} != {prefix_len_pert}"

    # Reduced views of the predictions (what gets stored).
    filtered_mean_o = filter_prediction_events(mean_o, concept_name=concept_name)
    filtered_samples_o = _filter_samples(samples_o)
    filtered_mean_p = filter_prediction_events(mean_p, concept_name=concept_name)
    filtered_samples_p = _filter_samples(samples_p)

    # Remaining times are derived from the unfiltered predictions.
    remaining_mean_o = calculate_remaining_time(readable_prefix_o, mean_o, concept_name=concept_name)
    remaining_samples_o = calculate_sampled_remaining_times(readable_prefix_o, samples_o, concept_name=concept_name)
    remaining_mean_p = calculate_remaining_time(readable_prefix_p, mean_p, concept_name=concept_name)
    remaining_samples_p = calculate_sampled_remaining_times(readable_prefix_p, samples_p, concept_name=concept_name)

    original_entry = (
        readable_prefix_o,
        readable_suffix_o,
        filtered_mean_o,
        filtered_samples_o,
        remaining_mean_o,
        remaining_samples_o,
    )
    perturbed_entry = (
        readable_prefix_p,
        readable_suffix_p,
        filtered_mean_p,
        filtered_samples_p,
        remaining_mean_p,
        remaining_samples_p,
    )
    return (case_name_orig, prefix_len_orig), {'original': original_entry, 'perturbed': perturbed_entry}

# Process in parallel
#
# The evaluators (model + datasets) are handed to the pool ONCE per worker through
# `initializer`/`initargs` instead of being passed to every `executor.submit` call.
# Arguments of `submit` are pickled for each task, so shipping the evaluators that way
# serialises the model once per prefix/suffix pair and stalls the pool. With the 'fork'
# start method the initializer arguments are inherited by the child; with other start
# methods they are pickled once per worker.
processed = 0
successful = 0
failed = []

print(f"Starting parallel evaluation with {num_workers} workers...")
print("Note: Model will be pickled to each worker (one-time overhead per worker)")

# Create dictionary for O(1) lookup of perturbed pairs
perturbed_dict = dict(all_prefix_suffix_pairs_pert)

# Filled by _init_robustness_worker inside each worker process; stays empty in the parent.
_WORKER_EVALUATORS = {}


def _init_robustness_worker(eval_orig_instance, eval_pert_instance):
    """Pool initializer: remember both evaluators for the lifetime of the worker."""
    _WORKER_EVALUATORS['original'] = eval_orig_instance
    _WORKER_EVALUATORS['perturbed'] = eval_pert_instance


def _evaluate_pair_in_worker(case_name, prefix_len, prefix_orig, suffix_orig,
                             prefix_pert, suffix_pert, concept_name):
    """Task entry point: evaluate one pair with the evaluators stored in this worker."""
    return evaluate_and_process_pair(
        _WORKER_EVALUATORS['original'], _WORKER_EVALUATORS['perturbed'],
        case_name, prefix_len,
        prefix_orig, suffix_orig,
        prefix_pert, suffix_pert,
        filter_prediction_events, calculate_remaining_time,
        calculate_sampled_remaining_times, concept_name,
    )


def _clone_nested_tensors(sequence):
    """Clone every tensor of a list-of-lists; return any other input unchanged.

    Cloning mirrors what the original loop did before submitting a task ("to avoid
    memory issues"). An empty sequence is returned as-is instead of failing on `[0]`.
    """
    if len(sequence) > 0 and isinstance(sequence[0], list):
        return [[tensor.clone() for tensor in inner] for inner in sequence]
    return sequence


with concurrent.futures.ProcessPoolExecutor(
    max_workers=num_workers,
    initializer=_init_robustness_worker,
    initargs=(eval_original, eval_perturbed),
) as executor:
    # Submit all tasks - each task carries only its own prefix/suffix tensors.
    futures = []
    future_to_key = {}  # lets a failure be attributed to its (case_name, prefix_len)
    for (case_name, prefix_len), (prefix_orig, suffix_orig) in all_prefix_suffix_pairs_orig:
        # Get corresponding perturbed pair using dictionary lookup (O(1) instead of O(n))
        (prefix_pert, suffix_pert) = perturbed_dict[(case_name, prefix_len)]

        future = executor.submit(
            _evaluate_pair_in_worker,
            case_name, prefix_len,
            _clone_nested_tensors(prefix_orig), _clone_nested_tensors(suffix_orig),
            _clone_nested_tensors(prefix_pert), _clone_nested_tensors(suffix_pert),
            concept_name,
        )
        futures.append(future)
        future_to_key[future] = (case_name, prefix_len)

    # Process results as they complete
    for future in tqdm(concurrent.futures.as_completed(futures),
                       total=len(futures), desc="Evaluating robustness"):
        pair_key = future_to_key[future]
        try:
            key, result = future.result()
            results[key] = result
            processed += 1
            successful += 1

            # Save periodically, then start a fresh in-memory batch.
            if processed % save_every == 0:
                save_chunk(results, processed - 1, output_dir)
                results = {}

        except Exception as e:
            # Best effort: one failing pair must not abort the whole run, but record
            # which pair it was so it can be re-run or inspected.
            print(f"Error processing pair {pair_key}: {e}")
            failed.append(f"{pair_key}: {e}")
            traceback.print_exc()

# Save remaining results
if len(results):
    save_chunk(results, processed - 1, output_dir)

print(f"\nRobustness evaluation completed!")
print(f"  Processed: {successful}/{len(all_prefix_suffix_pairs_orig)} pairs successfully")
if failed:
    print(f"  Failed: {len(failed)} pairs")


Collecting prefix/suffix pairs from predefined datasets...
Collected 1898 prefix/suffix pairs
Starting parallel evaluation with 8 workers...
Note: Model will be pickled to each worker (one-time overhead per worker)


Evaluating robustness:   0%|          | 0/1898 [00:00<?, ?it/s]

Process ForkProcess-6:
Process ForkProcess-8:
Process ForkProcess-7:
Process ForkProcess-2:
Process ForkProcess-3:
Process ForkProcess-5:


BrokenPipeError: [Errno 32] Broken pipe

Process ForkProcess-4:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib64/python3.13/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
    ~~~~~~~~^^
  File "/usr/lib64/python3.13/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
    ~~~~~~~~^^
  File "/usr/lib64/python3.13/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
    ~~~~~~~~^^
  File "/usr/lib64/python3.13/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
    ~~~~~~~~^^
  File "/usr/lib64/python3.13/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
    ~~~~~~~~^^
  File "/usr/lib64/python3.13/multiprocessing/process.py", line 313, in _bootstrap
    self.run()
    ~~~~~~~~^^
  File "/usr/lib64/python3.13/multiprocessing/process.

KeyboardInterrupt: 

In [None]:
# Load all saved chunks and combine them
def _chunk_sort_key(filename):
    """Natural sort key: digit runs compare as integers, everything else as text.

    A plain lexicographic sort would order an unpadded '..._149' before '..._49';
    with this key the chunks are visited in numeric order whether or not the
    numbers in the file names are zero-padded.
    """
    # re.split with a capturing group keeps the digit runs, so pieces alternate
    # text, digits, text, ... and keys of different files stay comparable.
    return [int(piece) if piece.isdigit() else piece for piece in re.split(r'(\d+)', filename)]


all_results = {}
# Get all chunk files and sort them in numeric chunk order. The combined output file
# written at the end of this cell does not match this prefix, so re-running is safe.
chunk_files = sorted(
    (f for f in os.listdir(output_dir) if f.startswith('robustness_results_part_')),
    key=_chunk_sort_key,
)

print(f"Found {len(chunk_files)} chunk files")

for chunk_file in chunk_files:
    chunk_path = os.path.join(output_dir, chunk_file)
    print(f"Loading {chunk_file}...")
    with open(chunk_path, 'rb') as f:
        # SECURITY: pickle.load executes code embedded in the file; only chunk files
        # written by this notebook's own evaluation run should live in output_dir.
        chunk_results = pickle.load(f)
        # Later chunks win if a key appears twice.
        all_results.update(chunk_results)
        print(f"  Added {len(chunk_results)} results from {chunk_file}")

# Also add the final results if any (e.g. from a still-running evaluation loop)
if 'results' in locals() and len(results) > 0:
    print(f"Adding final {len(results)} results...")
    all_results.update(results)

print(f"\nTotal results loaded: {len(all_results)}")

# Save combined results into a single pickle file
combined_results_path = os.path.join(output_dir, 'robustness_results.pkl')
with open(combined_results_path, 'wb') as f:
    pickle.dump(all_results, f)

print(f"Combined results saved to {combined_results_path}")

NameError: name 'os' is not defined