# Robustness Comparison: Camargo LSTM vs Normal Model

This notebook compares robustness metrics between the Camargo LSTM model and the Normal (DropoutUncertainty) model — labelled "U-ED-LSTM" in the code and plots below — on the Helpdesk dataset.


In [None]:
import sys
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.insert(0, '..')
sys.path.insert(0, '../..')

from robustness_metrics import (
    load_results, prepare_robustness_results, calculate_aggregate_metrics
)

# Notebook-wide plotting defaults: the 'seaborn-v0_8' matplotlib style sheet,
# the "husl" seaborn palette, and a larger default figure and font size.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 11,
})


In [None]:
# Camargo LSTM: load the pickled robustness results, pass them through
# prepare_robustness_results (which receives the same file as save_path),
# then aggregate them for the comparison cells below.
# Alternative input (sepsis), kept for quick switching:
#camargo_results_path = '../../evaluation_results/robustness/sepsis/last_event_attack_2/robustness_results.pkl'
camargo_results_path = '../../evaluation_results/robustness/helpdesk/last_event_attack/robustness_results.pkl'
camargo_results = prepare_robustness_results(
    load_results(camargo_results_path),
    save_path=camargo_results_path,
)
camargo_data = calculate_aggregate_metrics(camargo_results)

print(f"Camargo-LSTM: Loaded {len(camargo_results)} results")


In [None]:
# Load U-ED-LSTM results: read the pickle, run prepare_robustness_results
# (handed the same file as save_path), then aggregate into `normal_data`.
# Alternative inputs, kept for quick switching:
#U_ED_LSTM_results_path = '../../evaluation_results/robustness/sepsis/last_event_attack_2/robustness_results.pkl'
#U_ED_LSTM_results_path = '../evaluation_results/robustness/Helpdesk/large_perturbations/robustness_results.pkl'
U_ED_LSTM_results_path = '../../evaluation_results/robustness/helpdesk/last_event_attack/robustness_results.pkl'

# NOTE(review): this path is currently identical to `camargo_results_path`, so
# both "models" would be read from the same file. The correct U-ED-LSTM path is
# not known here, so the situation is reported loudly instead of guessed at.
# globals().get() keeps this cell runnable even if the Camargo cell was skipped.
if U_ED_LSTM_results_path == globals().get('camargo_results_path'):
    print("WARNING: U-ED-LSTM results path is identical to the Camargo LSTM results path; "
          "both models are being loaded from the same file, so the comparison is not meaningful "
          "until one of the paths is changed.")

U_ED_LSTM_results = load_results(U_ED_LSTM_results_path)
U_ED_LSTM_results = prepare_robustness_results(U_ED_LSTM_results, save_path=U_ED_LSTM_results_path)
normal_data = calculate_aggregate_metrics(U_ED_LSTM_results)

print(f"U-ED-LSTM: Loaded {len(U_ED_LSTM_results)} results")

In [None]:
# Compare the mean prediction (mean suffix) on clean vs. perturbed input for the
# first 100 U-ED-LSTM results, print one row per entry and a match-rate summary.
print("="*80)
print("COMPARING MEAN PREDICTIONS: CLEAN vs PERTURBED (First 100)")
print("="*80)

# Get first 100 entries
first_100_items = list(U_ED_LSTM_results.items())[:100]

# Keys under which an event dict may store its activity label, in priority order
possible_names = ['concept:name', 'Activity', 'activity']


def _preview(activities, limit=5):
    """Join the first `limit` activities with arrows; append "..." when truncated.

    Labels are passed through str() so non-string activities cannot break the join.
    """
    text = " → ".join(str(activity) for activity in activities[:limit])
    return (text + "...") if len(activities) > limit else text


# Extract mean predictions
comparisons = []
for idx, ((case_name, prefix_len), entry) in enumerate(first_100_items, 1):
    try:
        # The mean prediction sits at index 2 in both tuple layouts:
        #   old: (prefix, suffix, mean_pred, sampled)
        #   new: (prefix, suffix, mean_pred, sampled, mean_rt, sampled_rt)
        # so no branching on the tuple length is needed.
        mean_pred_orig = entry['original'][2]
        mean_pred_pert = entry['perturbed'][2]

        # Pick the activity key from the first clean event; fall back to the default
        concept_name = 'concept:name'
        if mean_pred_orig and isinstance(mean_pred_orig[0], dict):
            concept_name = next(
                (name for name in possible_names if name in mean_pred_orig[0]),
                concept_name,
            )

        orig_activities = [event.get(concept_name, 'N/A') for event in mean_pred_orig] if mean_pred_orig else []
        pert_activities = [event.get(concept_name, 'N/A') for event in mean_pred_pert] if mean_pred_pert else []

        comparisons.append({
            'idx': idx,
            'case': case_name,
            'prefix_len': prefix_len,
            'orig_activities': orig_activities,
            'pert_activities': pert_activities,
            'orig_len': len(orig_activities),
            'pert_len': len(pert_activities),
            # Equality is checked on the raw labels, before any str() for display
            'are_equal': orig_activities == pert_activities,
        })

    # AttributeError covers events that are not dicts (no .get); previously that
    # escaped the handler and aborted the whole cell.
    except (KeyError, IndexError, TypeError, AttributeError) as e:
        print(f"Error processing entry {idx} ({case_name}, {prefix_len}): {e}")
        continue

# Print detailed comparison
print(f"\nTotal entries compared: {len(comparisons)}\n")
print(f"{'#':<5} {'Case':<20} {'Prefix':<8} {'Orig Len':<10} {'Pert Len':<10} {'Match':<8} {'Orig Activities':<40} {'Pert Activities':<40}")
print("-"*150)

for comp in comparisons:  # at most 100 rows, since only 100 entries were read
    match_str = "✓" if comp['are_equal'] else "✗"
    orig_str = _preview(comp['orig_activities'])
    pert_str = _preview(comp['pert_activities'])

    # str() keeps the width formatting valid for case ids / prefix lengths of any type
    print(f"{comp['idx']:<5} {str(comp['case']):<20} {str(comp['prefix_len']):<8} {comp['orig_len']:<10} {comp['pert_len']:<10} {match_str:<8} {orig_str:<40} {pert_str:<40}")

# Summary statistics
matches = sum(1 for c in comparisons if c['are_equal'])
total = len(comparisons)
match_rate = (matches / total * 100) if total > 0 else 0

print("\n" + "="*80)
print("SUMMARY:")
print(f"  Total compared: {total}")
print(f"  Exact matches: {matches}")
print(f"  Mismatches: {total - matches}")
print(f"  Match rate: {match_rate:.2f}%")
print("="*80)


In [None]:
# Summary table: overall match rates for both models, side by side, with their difference
import numpy as np

print("="*80)
print("SUMMARY COMPARISON")
print("="*80)
print(f"\n{'Metric':<30} {'Camargo LSTM':<20} {'Normal Model':<20} {'Difference':<15}")
print("-"*80)


def _split_metrics(results):
    """Return (mean-prediction metrics, non-empty probabilistic metrics).

    Only entries that carry a 'robustness_metrics' key are considered.
    """
    mean_part, prob_part = [], []
    for entry in results.values():
        if 'robustness_metrics' not in entry:
            continue
        per_entry = entry['robustness_metrics']
        mean_part.append(per_entry['mean_prediction'])
        prob = per_entry.get('probabilistic_prediction', {})
        if prob:
            prob_part.append(prob)
    return mean_part, prob_part


def _rate(metric_dicts, key):
    """Average `key` over a list of metric dicts."""
    return np.mean([m[key] for m in metric_dicts])


# Per-entry metrics pulled out of the raw results
camargo_mean_metrics, camargo_prob_metrics = _split_metrics(camargo_results)
normal_mean_metrics, normal_prob_metrics = _split_metrics(U_ED_LSTM_results)

camargo_activity_match = _rate(camargo_mean_metrics, 'activity_sequence_match')
normal_activity_match = _rate(normal_mean_metrics, 'activity_sequence_match')
camargo_length_match = _rate(camargo_mean_metrics, 'length_match')
normal_length_match = _rate(normal_mean_metrics, 'length_match')

metrics_comparison = [
    ('Activity Match Rate', camargo_activity_match, normal_activity_match),
    ('Length Match Rate', camargo_length_match, normal_length_match),
]

# The top-k row is only shown when both models have probabilistic metrics
if camargo_prob_metrics and normal_prob_metrics:
    camargo_topk = np.mean([m.get('top_k_activity_match_rate', 0.0) for m in camargo_prob_metrics])
    normal_topk = np.mean([m.get('top_k_activity_match_rate', 0.0) for m in normal_prob_metrics])
    metrics_comparison.append(('Top k Activity Match Rate', camargo_topk, normal_topk))

for metric_name, camargo_val, normal_val in metrics_comparison:
    diff = camargo_val - normal_val
    # Signed difference, unless it is not above the 1e-4 threshold
    if abs(diff) > 0.0001:
        diff_str = f"{diff:+.4f}"
    else:
        diff_str = "≈ 0.0000"
    print(f"{metric_name:<30} {camargo_val:<20.4f} {normal_val:<20.4f} {diff_str:<15}")

print("\nTotal Evaluations:")
print(f"  Camargo LSTM: {len(camargo_results)}")
print(f"  Normal Model: {len(U_ED_LSTM_results)}")


In [None]:
# Activity sequence match rate per prefix length, both models on one chart
import matplotlib as mpl

# Discard earlier rc customisation, then apply compact styling (similar to plot_res)
mpl.rcdefaults()
compact_rc = {
    'font.size': 8,           # base font
    'axes.titlesize': 10,     # title
    'axes.labelsize': 9,      # axis labels
    'xtick.labelsize': 8,     # x tick labels
    'ytick.labelsize': 8,     # y tick labels
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'legend.fontsize': 8,     # legend
    'lines.linewidth': 1.2,   # line width
    'lines.markersize': 5,    # marker size
}
plt.rcParams.update(compact_rc)

fig, ax1 = plt.subplots(figsize=(6, 4), dpi=100)

# Only prefix lengths reported for both models are plotted
common_prefix_lengths = sorted(
    set(camargo_data['prefix_lengths']).intersection(normal_data['prefix_lengths'])
)

# prefix length -> value lookups
camargo_dict = {p: v for p, v in zip(camargo_data['prefix_lengths'], camargo_data['activity_match_rates'])}
normal_dict = {p: v for p, v in zip(normal_data['prefix_lengths'], normal_data['activity_match_rates'])}
camargo_counts_dict = {p: c for p, c in zip(camargo_data['prefix_lengths'], camargo_data['sample_counts'])}
normal_counts_dict = {p: c for p, c in zip(normal_data['prefix_lengths'], normal_data['sample_counts'])}

# Series aligned with common_prefix_lengths
camargo_values = [camargo_dict[plen] for plen in common_prefix_lengths]
normal_values = [normal_dict[plen] for plen in common_prefix_lengths]
camargo_counts = [camargo_counts_dict[plen] for plen in common_prefix_lengths]
normal_counts = [normal_counts_dict[plen] for plen in common_prefix_lengths]

# One line per model
shared_line_style = dict(linewidth=1.2, markersize=5, alpha=0.8)
line1 = ax1.plot(common_prefix_lengths, camargo_values, marker='o',
                 label='Camargo LSTM', color='blue', **shared_line_style)
line2 = ax1.plot(common_prefix_lengths, normal_values, marker='s',
                 label='U-ED-LSTM', color='orange', **shared_line_style)

# Twin axis carrying the summed sample counts as faint background bars
ax2 = ax1.twinx()
total_counts = [c + n for c, n in zip(camargo_counts, normal_counts)]
ax2.bar(common_prefix_lengths, total_counts, alpha=0.15, color='gray',
        width=0.6, label='Total instances')

# Axis labels
ax1.set_xlabel('prefix len', labelpad=0.5)
ax1.set_ylabel('Activity Sequence Match Rate', labelpad=0.5)
ax2.set_ylabel('instances', labelpad=0.5)

# Axis limits
ax1.set_ylim(0, 1.05)
ax1.set_xlim(left=min(common_prefix_lengths) - 0.5, right=max(common_prefix_lengths) + 0.5)
ax2.set_ylim(bottom=0)

# No spines on either axis
for axis in (ax1, ax2):
    for spine in axis.spines.values():
        spine.set_visible(False)

# Horizontal grid plus a dashed reference line at 1.0
ax1.grid(True, alpha=0.3, axis='y', linestyle='--', linewidth=0.5)
ax1.axhline(y=1.0, color='gray', linestyle='--', alpha=0.5, linewidth=0.7)

# One legend for the artists of both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend([*lines1, *lines2], [*labels1, *labels2], loc="upper right",
           frameon=True, fontsize=8)

# The count axis shows no ticks
ax2.set_yticks([])

# Draw the lines above the bars; hide ax1's background patch so the bars stay visible
ax1.set_zorder(2)
ax2.set_zorder(1)
ax1.patch.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Remaining-time prediction shift per prefix length
import matplotlib as mpl

# Discard earlier rc customisation, then apply the compact styling
mpl.rcdefaults()
compact_rc = {
    'font.size': 8,
    'axes.titlesize': 10,
    'axes.labelsize': 9,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'legend.fontsize': 8,
    'lines.linewidth': 1.2,
    'lines.markersize': 5,
}
plt.rcParams.update(compact_rc)

fig, ax1 = plt.subplots(figsize=(6, 4), dpi=100)

# Only prefix lengths reported for both models are plotted
common_prefix_lengths = sorted(
    set(camargo_data['prefix_lengths']).intersection(normal_data['prefix_lengths'])
)

# prefix length -> value lookups
camargo_rt_shift_dict = {p: v for p, v in zip(camargo_data['prefix_lengths'], camargo_data['remaining_time_prediction_shift'])}
normal_rt_shift_dict = {p: v for p, v in zip(normal_data['prefix_lengths'], normal_data['remaining_time_prediction_shift'])}
camargo_counts_dict = {p: c for p, c in zip(camargo_data['prefix_lengths'], camargo_data['sample_counts'])}
normal_counts_dict = {p: c for p, c in zip(normal_data['prefix_lengths'], normal_data['sample_counts'])}

# Series aligned with common_prefix_lengths (missing keys fall back to 0)
camargo_rt_shift = [camargo_rt_shift_dict.get(plen, 0.0) for plen in common_prefix_lengths]
normal_rt_shift = [normal_rt_shift_dict.get(plen, 0.0) for plen in common_prefix_lengths]
camargo_counts = [camargo_counts_dict.get(plen, 0) for plen in common_prefix_lengths]
normal_counts = [normal_counts_dict.get(plen, 0) for plen in common_prefix_lengths]

# Only the U-ED-LSTM series is drawn; the Camargo line is currently disabled:
# ax1.plot(common_prefix_lengths, camargo_rt_shift, marker='o',
#          linewidth=1.2, markersize=5, label='Camargo LSTM',
#          color='blue', alpha=0.9)
shared_line_style = dict(linewidth=1.2, markersize=5, alpha=0.9)
ax1.plot(common_prefix_lengths, normal_rt_shift, marker='s',
         label='U-ED-LSTM', color='orange', **shared_line_style)

# Twin axis carrying the summed sample counts (both models) as background bars
ax2 = ax1.twinx()
total_counts = [c + n for c, n in zip(camargo_counts, normal_counts)]
ax2.bar(common_prefix_lengths, total_counts, alpha=0.15, color='gray',
        width=0.6, label='Total instances')

# Axis labels
ax1.set_xlabel('prefix len', labelpad=0.5)
ax1.set_ylabel('Remaining Time Prediction Shift', labelpad=0.5)
ax2.set_ylabel('instances', labelpad=0.5)

# Axis limits (the y-range of ax1 is left to matplotlib)
ax1.set_xlim(left=min(common_prefix_lengths) - 0.5, right=max(common_prefix_lengths) + 0.5)
ax2.set_ylim(bottom=0)

# No spines on either axis
for axis in (ax1, ax2):
    for spine in axis.spines.values():
        spine.set_visible(False)

ax1.grid(True, alpha=0.3, axis='y', linestyle='--', linewidth=0.5)

# One legend for the artists of both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend([*lines1, *lines2], [*labels1, *labels2], loc="upper right",
           frameon=True, fontsize=8)

# The count axis shows no ticks
ax2.set_yticks([])

# Draw the line above the bars; hide ax1's background patch so the bars stay visible
ax1.set_zorder(2)
ax2.set_zorder(1)
ax1.patch.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Length match rate per prefix length, both models on one chart
import matplotlib as mpl

# Discard earlier rc customisation, then apply compact styling (similar to plot_res)
mpl.rcdefaults()
compact_rc = {
    'font.size': 8,           # base font
    'axes.titlesize': 10,     # title
    'axes.labelsize': 9,      # axis labels
    'xtick.labelsize': 8,     # x tick labels
    'ytick.labelsize': 8,     # y tick labels
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'legend.fontsize': 8,     # legend
    'lines.linewidth': 1.2,   # line width
    'lines.markersize': 5,    # marker size
}
plt.rcParams.update(compact_rc)

fig, ax1 = plt.subplots(figsize=(6, 4), dpi=100)

# Only prefix lengths reported for both models are plotted
common_prefix_lengths = sorted(
    set(camargo_data['prefix_lengths']).intersection(normal_data['prefix_lengths'])
)

# prefix length -> value lookups (length_match_rates)
camargo_length_dict = {p: v for p, v in zip(camargo_data['prefix_lengths'], camargo_data['length_match_rates'])}
normal_length_dict = {p: v for p, v in zip(normal_data['prefix_lengths'], normal_data['length_match_rates'])}
camargo_counts_dict = {p: c for p, c in zip(camargo_data['prefix_lengths'], camargo_data['sample_counts'])}
normal_counts_dict = {p: c for p, c in zip(normal_data['prefix_lengths'], normal_data['sample_counts'])}

# Series aligned with common_prefix_lengths
camargo_length_values = [camargo_length_dict[plen] for plen in common_prefix_lengths]
normal_length_values = [normal_length_dict[plen] for plen in common_prefix_lengths]
camargo_counts = [camargo_counts_dict[plen] for plen in common_prefix_lengths]
normal_counts = [normal_counts_dict[plen] for plen in common_prefix_lengths]

# One line per model
shared_line_style = dict(linewidth=1.2, markersize=5, alpha=0.8)
line1 = ax1.plot(common_prefix_lengths, camargo_length_values, marker='o',
                 label='Camargo LSTM', color='blue', **shared_line_style)
line2 = ax1.plot(common_prefix_lengths, normal_length_values, marker='s',
                 label='U-ED-LSTM', color='orange', **shared_line_style)

# Twin axis carrying the summed sample counts as faint background bars
ax2 = ax1.twinx()
total_counts = [c + n for c, n in zip(camargo_counts, normal_counts)]
ax2.bar(common_prefix_lengths, total_counts, alpha=0.15, color='gray',
        width=0.6, label='Total instances')

# Axis labels
ax1.set_xlabel('prefix len', labelpad=0.5)
ax1.set_ylabel('Length Match Rate', labelpad=0.5)
ax2.set_ylabel('instances', labelpad=0.5)

# Axis limits
ax1.set_ylim(0, 1.05)
ax1.set_xlim(left=min(common_prefix_lengths) - 0.5, right=max(common_prefix_lengths) + 0.5)
ax2.set_ylim(bottom=0)

# No spines on either axis
for axis in (ax1, ax2):
    for spine in axis.spines.values():
        spine.set_visible(False)

# Horizontal grid plus a dashed reference line at 1.0
ax1.grid(True, alpha=0.3, axis='y', linestyle='--', linewidth=0.5)
ax1.axhline(y=1.0, color='gray', linestyle='--', alpha=0.5, linewidth=0.7)

# One legend for the artists of both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend([*lines1, *lines2], [*labels1, *labels2], loc="upper right",
           frameon=True, fontsize=8)

# The count axis shows no ticks
ax2.set_yticks([])

# Draw the lines above the bars; hide ax1's background patch so the bars stay visible
ax1.set_zorder(2)
ax2.set_zorder(1)
ax1.patch.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Clean DLS per prefix length, both models on one chart
import matplotlib as mpl

# Discard earlier rc customisation, then apply the compact styling
mpl.rcdefaults()
compact_rc = {
    'font.size': 8,
    'axes.titlesize': 10,
    'axes.labelsize': 9,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'legend.fontsize': 8,
    'lines.linewidth': 1.2,
    'lines.markersize': 5,
}
plt.rcParams.update(compact_rc)

fig, ax1 = plt.subplots(figsize=(6, 4), dpi=100)

# Only prefix lengths reported for both models are plotted
common_prefix_lengths = sorted(
    set(camargo_data['prefix_lengths']).intersection(normal_data['prefix_lengths'])
)

# prefix length -> value lookups
camargo_clean_dict = {p: v for p, v in zip(camargo_data['prefix_lengths'], camargo_data['clean_dls'])}
normal_clean_dict = {p: v for p, v in zip(normal_data['prefix_lengths'], normal_data['clean_dls'])}
camargo_counts_dict = {p: c for p, c in zip(camargo_data['prefix_lengths'], camargo_data['sample_counts'])}
normal_counts_dict = {p: c for p, c in zip(normal_data['prefix_lengths'], normal_data['sample_counts'])}

# Series aligned with common_prefix_lengths
camargo_clean_values = [camargo_clean_dict[plen] for plen in common_prefix_lengths]
normal_clean_values = [normal_clean_dict[plen] for plen in common_prefix_lengths]
camargo_counts = [camargo_counts_dict[plen] for plen in common_prefix_lengths]
normal_counts = [normal_counts_dict[plen] for plen in common_prefix_lengths]

# One line per model
shared_line_style = dict(linewidth=1.2, markersize=5, alpha=0.8)
ax1.plot(common_prefix_lengths, camargo_clean_values, marker='o',
         label='Camargo LSTM', color='blue', **shared_line_style)
ax1.plot(common_prefix_lengths, normal_clean_values, marker='s',
         label='U-ED-LSTM', color='orange', **shared_line_style)

# Twin axis carrying the summed sample counts as faint background bars
ax2 = ax1.twinx()
total_counts = [c + n for c, n in zip(camargo_counts, normal_counts)]
ax2.bar(common_prefix_lengths, total_counts, alpha=0.15, color='gray',
        width=0.6, label='Total instances')

# Axis labels
ax1.set_xlabel('prefix len', labelpad=0.5)
ax1.set_ylabel('Clean DLS', labelpad=0.5)
ax2.set_ylabel('instances', labelpad=0.5)

# Axis limits
ax1.set_ylim(0, 1.05)
ax1.set_xlim(left=min(common_prefix_lengths) - 0.5, right=max(common_prefix_lengths) + 0.5)
ax2.set_ylim(bottom=0)

# No spines on either axis
for axis in (ax1, ax2):
    for spine in axis.spines.values():
        spine.set_visible(False)

# Horizontal grid plus a dashed reference line at 1.0
ax1.grid(True, alpha=0.3, axis='y', linestyle='--', linewidth=0.5)
ax1.axhline(y=1.0, color='gray', linestyle='--', alpha=0.5, linewidth=0.7)

# One legend for the artists of both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend([*lines1, *lines2], [*labels1, *labels2], loc="upper right",
           frameon=True, fontsize=8)

# The count axis shows no ticks
ax2.set_yticks([])

# Draw the lines above the bars; hide ax1's background patch so the bars stay visible
ax1.set_zorder(2)
ax2.set_zorder(1)
ax1.patch.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# DLS drop under perturbation per prefix length, both models on one chart
import matplotlib as mpl

# Discard earlier rc customisation, then apply the compact styling
mpl.rcdefaults()
compact_rc = {
    'font.size': 8,
    'axes.titlesize': 10,
    'axes.labelsize': 9,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'legend.fontsize': 8,
    'lines.linewidth': 1.2,
    'lines.markersize': 5,
}
plt.rcParams.update(compact_rc)

fig, ax1 = plt.subplots(figsize=(6, 4), dpi=100)

# Only prefix lengths reported for both models are plotted
common_prefix_lengths = sorted(
    set(camargo_data['prefix_lengths']).intersection(normal_data['prefix_lengths'])
)

# prefix length -> value lookups.
# The *_clean_* names match those used in the clean-DLS cell, but here they
# hold 'relative_dls_drop' values.
camargo_clean_dict = {p: v for p, v in zip(camargo_data['prefix_lengths'], camargo_data['relative_dls_drop'])}
normal_clean_dict = {p: v for p, v in zip(normal_data['prefix_lengths'], normal_data['relative_dls_drop'])}
camargo_counts_dict = {p: c for p, c in zip(camargo_data['prefix_lengths'], camargo_data['sample_counts'])}
normal_counts_dict = {p: c for p, c in zip(normal_data['prefix_lengths'], normal_data['sample_counts'])}

# Series aligned with common_prefix_lengths
camargo_clean_values = [camargo_clean_dict[plen] for plen in common_prefix_lengths]
normal_clean_values = [normal_clean_dict[plen] for plen in common_prefix_lengths]
camargo_counts = [camargo_counts_dict[plen] for plen in common_prefix_lengths]
normal_counts = [normal_counts_dict[plen] for plen in common_prefix_lengths]

# One line per model
shared_line_style = dict(linewidth=1.2, markersize=5, alpha=0.8)
ax1.plot(common_prefix_lengths, camargo_clean_values, marker='o',
         label='Camargo LSTM', color='blue', **shared_line_style)
ax1.plot(common_prefix_lengths, normal_clean_values, marker='s',
         label='U-ED-LSTM', color='orange', **shared_line_style)

# Twin axis carrying the summed sample counts as faint background bars
ax2 = ax1.twinx()
total_counts = [c + n for c, n in zip(camargo_counts, normal_counts)]
ax2.bar(common_prefix_lengths, total_counts, alpha=0.15, color='gray',
        width=0.6, label='Total instances')

# Axis labels
ax1.set_xlabel('prefix len', labelpad=0.5)
ax1.set_ylabel('DLS drop under Perturbation', labelpad=0.5)
ax2.set_ylabel('instances', labelpad=0.5)

# Axis limits
# NOTE(review): the y-range is fixed to [0, 1.05]; any drop value outside it
# would be cut off — confirm relative_dls_drop always stays in that range.
ax1.set_ylim(0, 1.05)
ax1.set_xlim(left=min(common_prefix_lengths) - 0.5, right=max(common_prefix_lengths) + 0.5)
ax2.set_ylim(bottom=0)

# No spines on either axis
for axis in (ax1, ax2):
    for spine in axis.spines.values():
        spine.set_visible(False)

# Horizontal grid plus a dashed reference line at 1.0
ax1.grid(True, alpha=0.3, axis='y', linestyle='--', linewidth=0.5)
ax1.axhline(y=1.0, color='gray', linestyle='--', alpha=0.5, linewidth=0.7)

# One legend for the artists of both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend([*lines1, *lines2], [*labels1, *labels2], loc="upper right",
           frameon=True, fontsize=8)

# The count axis shows no ticks
ax2.set_yticks([])

# Draw the lines above the bars; hide ax1's background patch so the bars stay visible
ax1.set_zorder(2)
ax2.set_zorder(1)
ax1.patch.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Modal DLS on clean data per prefix length, with the q25–q75 band for each model
import matplotlib as mpl

# Discard earlier rc customisation, then apply the compact styling
mpl.rcdefaults()
compact_rc = {
    'font.size': 8,
    'axes.titlesize': 10,
    'axes.labelsize': 9,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'legend.fontsize': 8,
    'lines.linewidth': 1.2,
    'lines.markersize': 5,
}
plt.rcParams.update(compact_rc)

fig, ax1 = plt.subplots(figsize=(6, 4), dpi=100)

# Only prefix lengths reported for both models are plotted
common_prefix_lengths = sorted(
    set(camargo_data['prefix_lengths']).intersection(normal_data['prefix_lengths'])
)

# prefix length -> value lookups
camargo_modal_dict = {p: v for p, v in zip(camargo_data['prefix_lengths'], camargo_data['modal_clean_dls'])}
normal_modal_dict = {p: v for p, v in zip(normal_data['prefix_lengths'], normal_data['modal_clean_dls'])}
camargo_q25_dict = {p: v for p, v in zip(camargo_data['prefix_lengths'], camargo_data['clean_dls_q25'])}
camargo_q75_dict = {p: v for p, v in zip(camargo_data['prefix_lengths'], camargo_data['clean_dls_q75'])}
normal_q25_dict = {p: v for p, v in zip(normal_data['prefix_lengths'], normal_data['clean_dls_q25'])}
normal_q75_dict = {p: v for p, v in zip(normal_data['prefix_lengths'], normal_data['clean_dls_q75'])}
camargo_counts_dict = {p: c for p, c in zip(camargo_data['prefix_lengths'], camargo_data['sample_counts'])}
normal_counts_dict = {p: c for p, c in zip(normal_data['prefix_lengths'], normal_data['sample_counts'])}

# Series aligned with common_prefix_lengths
camargo_modal_values = [camargo_modal_dict[plen] for plen in common_prefix_lengths]
normal_modal_values = [normal_modal_dict[plen] for plen in common_prefix_lengths]
camargo_q25_values = [camargo_q25_dict[plen] for plen in common_prefix_lengths]
camargo_q75_values = [camargo_q75_dict[plen] for plen in common_prefix_lengths]
normal_q25_values = [normal_q25_dict[plen] for plen in common_prefix_lengths]
normal_q75_values = [normal_q75_dict[plen] for plen in common_prefix_lengths]
camargo_counts = [camargo_counts_dict[plen] for plen in common_prefix_lengths]
normal_counts = [normal_counts_dict[plen] for plen in common_prefix_lengths]

# Shaded q25–q75 bands go first so the lines are drawn on top of them
ax1.fill_between(common_prefix_lengths, camargo_q25_values, camargo_q75_values,
                 color='blue', alpha=0.15, label='Camargo LSTM IQR')
ax1.fill_between(common_prefix_lengths, normal_q25_values, normal_q75_values,
                 color='orange', alpha=0.15, label='U-ED-LSTM IQR')

# Modal DLS lines, one per model
shared_line_style = dict(linewidth=1.2, markersize=5, alpha=0.8)
ax1.plot(common_prefix_lengths, camargo_modal_values, marker='o',
         label='Camargo LSTM', color='blue', **shared_line_style)
ax1.plot(common_prefix_lengths, normal_modal_values, marker='s',
         label='U-ED-LSTM', color='orange', **shared_line_style)

# Twin axis carrying the summed sample counts as faint background bars
ax2 = ax1.twinx()
total_counts = [c + n for c, n in zip(camargo_counts, normal_counts)]
ax2.bar(common_prefix_lengths, total_counts, alpha=0.15, color='gray',
        width=0.6, label='Total instances')

# Axis labels
ax1.set_xlabel('prefix len', labelpad=0.5)
ax1.set_ylabel('Modal DLS on Clean Data', labelpad=0.5)
ax2.set_ylabel('instances', labelpad=0.5)

# Axis limits
ax1.set_ylim(0, 1.05)
ax1.set_xlim(left=min(common_prefix_lengths) - 0.5, right=max(common_prefix_lengths) + 0.5)
ax2.set_ylim(bottom=0)

# No spines on either axis
for axis in (ax1, ax2):
    for spine in axis.spines.values():
        spine.set_visible(False)

# Horizontal grid plus a dashed reference line at 1.0
ax1.grid(True, alpha=0.3, axis='y', linestyle='--', linewidth=0.5)
ax1.axhline(y=1.0, color='gray', linestyle='--', alpha=0.5, linewidth=0.7)

# One legend for the artists of both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend([*lines1, *lines2], [*labels1, *labels2], loc="upper right",
           frameon=True, fontsize=8)

# The count axis shows no ticks
ax2.set_yticks([])

# Draw the lines above the bars; hide ax1's background patch so the bars stay visible
ax1.set_zorder(2)
ax2.set_zorder(1)
ax1.patch.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Modal DLS on perturbed data per prefix length, with the q25–q75 band for each model
import matplotlib as mpl

# Discard earlier rc customisation, then apply the compact styling
mpl.rcdefaults()
compact_rc = {
    'font.size': 8,
    'axes.titlesize': 10,
    'axes.labelsize': 9,
    'xtick.labelsize': 8,
    'ytick.labelsize': 8,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
    'legend.fontsize': 8,
    'lines.linewidth': 1.2,
    'lines.markersize': 5,
}
plt.rcParams.update(compact_rc)

fig, ax1 = plt.subplots(figsize=(6, 4), dpi=100)

# Only prefix lengths reported for both models are plotted
common_prefix_lengths = sorted(
    set(camargo_data['prefix_lengths']).intersection(normal_data['prefix_lengths'])
)

# prefix length -> value lookups
camargo_modal_dict = {p: v for p, v in zip(camargo_data['prefix_lengths'], camargo_data['modal_perturbed_dls'])}
normal_modal_dict = {p: v for p, v in zip(normal_data['prefix_lengths'], normal_data['modal_perturbed_dls'])}
camargo_q25_dict = {p: v for p, v in zip(camargo_data['prefix_lengths'], camargo_data['perturbed_dls_q25'])}
camargo_q75_dict = {p: v for p, v in zip(camargo_data['prefix_lengths'], camargo_data['perturbed_dls_q75'])}
normal_q25_dict = {p: v for p, v in zip(normal_data['prefix_lengths'], normal_data['perturbed_dls_q25'])}
normal_q75_dict = {p: v for p, v in zip(normal_data['prefix_lengths'], normal_data['perturbed_dls_q75'])}
camargo_counts_dict = {p: c for p, c in zip(camargo_data['prefix_lengths'], camargo_data['sample_counts'])}
normal_counts_dict = {p: c for p, c in zip(normal_data['prefix_lengths'], normal_data['sample_counts'])}

# Series aligned with common_prefix_lengths
camargo_modal_values = [camargo_modal_dict[plen] for plen in common_prefix_lengths]
normal_modal_values = [normal_modal_dict[plen] for plen in common_prefix_lengths]
camargo_q25_values = [camargo_q25_dict[plen] for plen in common_prefix_lengths]
camargo_q75_values = [camargo_q75_dict[plen] for plen in common_prefix_lengths]
normal_q25_values = [normal_q25_dict[plen] for plen in common_prefix_lengths]
normal_q75_values = [normal_q75_dict[plen] for plen in common_prefix_lengths]
camargo_counts = [camargo_counts_dict[plen] for plen in common_prefix_lengths]
normal_counts = [normal_counts_dict[plen] for plen in common_prefix_lengths]

# Shaded q25–q75 bands go first so the lines are drawn on top of them
ax1.fill_between(common_prefix_lengths, camargo_q25_values, camargo_q75_values,
                 color='blue', alpha=0.15, label='Camargo LSTM IQR')
ax1.fill_between(common_prefix_lengths, normal_q25_values, normal_q75_values,
                 color='orange', alpha=0.15, label='U-ED-LSTM IQR')

# Modal DLS lines, one per model
shared_line_style = dict(linewidth=1.2, markersize=5, alpha=0.8)
ax1.plot(common_prefix_lengths, camargo_modal_values, marker='o',
         label='Camargo LSTM', color='blue', **shared_line_style)
ax1.plot(common_prefix_lengths, normal_modal_values, marker='s',
         label='U-ED-LSTM', color='orange', **shared_line_style)

# Twin axis carrying the summed sample counts as faint background bars
ax2 = ax1.twinx()
total_counts = [c + n for c, n in zip(camargo_counts, normal_counts)]
ax2.bar(common_prefix_lengths, total_counts, alpha=0.15, color='gray',
        width=0.6, label='Total instances')

# Axis labels
ax1.set_xlabel('prefix len', labelpad=0.5)
ax1.set_ylabel('Modal DLS on Perturbed Data', labelpad=0.5)
ax2.set_ylabel('instances', labelpad=0.5)

# Axis limits
ax1.set_ylim(0, 1.05)
ax1.set_xlim(left=min(common_prefix_lengths) - 0.5, right=max(common_prefix_lengths) + 0.5)
ax2.set_ylim(bottom=0)

# No spines on either axis
for axis in (ax1, ax2):
    for spine in axis.spines.values():
        spine.set_visible(False)

# Horizontal grid plus a dashed reference line at 1.0
ax1.grid(True, alpha=0.3, axis='y', linestyle='--', linewidth=0.5)
ax1.axhline(y=1.0, color='gray', linestyle='--', alpha=0.5, linewidth=0.7)

# One legend for the artists of both axes
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend([*lines1, *lines2], [*labels1, *labels2], loc="upper right",
           frameon=True, fontsize=8)

# The count axis shows no ticks
ax2.set_yticks([])

# Draw the lines above the bars; hide ax1's background patch so the bars stay visible
ax1.set_zorder(2)
ax2.set_zorder(1)
ax1.patch.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Support of Correct Prediction by Prefix Length
# For each prefix length shared by both result sets, draws the support of the
# correct prediction for Camargo LSTM and U-ED-LSTM on clean (solid) and
# perturbed (dashed) inputs, with the summed sample counts as background bars.
# Relies on `camargo_data` / `normal_data` from the loading cells above.
import matplotlib as mpl

# Restore matplotlib's defaults, then apply the compact settings for this figure.
mpl.rcdefaults()
plt.rcParams.update({
    'font.size': 8, 'legend.fontsize': 8,
    'axes.titlesize': 10, 'axes.labelsize': 9,
    'xtick.labelsize': 8, 'ytick.labelsize': 8,
    'xtick.major.width': 0.5, 'ytick.major.width': 0.5,
    'lines.linewidth': 1.2, 'lines.markersize': 5,
})

fig, ax1 = plt.subplots(figsize=(6, 4), dpi=100)

# Only prefix lengths present in both result sets are plotted.
camargo_prefixes = camargo_data['prefix_lengths']
normal_prefixes = normal_data['prefix_lengths']
common_prefix_lengths = sorted(set(camargo_prefixes) & set(normal_prefixes))

# Lookups keyed by prefix length, one per model and series.
camargo_support_clean_dict = {p: v for p, v in zip(camargo_prefixes, camargo_data['support_clean'])}
camargo_support_pert_dict = {p: v for p, v in zip(camargo_prefixes, camargo_data['support_perturbed'])}
normal_support_clean_dict = {p: v for p, v in zip(normal_prefixes, normal_data['support_clean'])}
normal_support_pert_dict = {p: v for p, v in zip(normal_prefixes, normal_data['support_perturbed'])}
camargo_counts_dict = {p: v for p, v in zip(camargo_prefixes, camargo_data['sample_counts'])}
normal_counts_dict = {p: v for p, v in zip(normal_prefixes, normal_data['sample_counts'])}

# Series re-ordered to follow the sorted common prefix lengths.
camargo_support_clean = [camargo_support_clean_dict[length] for length in common_prefix_lengths]
camargo_support_pert = [camargo_support_pert_dict[length] for length in common_prefix_lengths]
normal_support_clean = [normal_support_clean_dict[length] for length in common_prefix_lengths]
normal_support_pert = [normal_support_pert_dict[length] for length in common_prefix_lengths]
camargo_counts = [camargo_counts_dict[length] for length in common_prefix_lengths]
normal_counts = [normal_counts_dict[length] for length in common_prefix_lengths]

# (values, legend label, per-line style); drawing order also fixes the legend order.
support_series = (
    (camargo_support_clean, 'Camargo LSTM (clean)',
     dict(marker='o', color='blue', alpha=0.9)),
    (camargo_support_pert, 'Camargo LSTM (perturbed)',
     dict(marker='o', linestyle='--', color='blue', alpha=0.6)),
    (normal_support_clean, 'U-ED-LSTM (clean)',
     dict(marker='s', color='orange', alpha=0.9)),
    (normal_support_pert, 'U-ED-LSTM (perturbed)',
     dict(marker='s', linestyle='--', color='orange', alpha=0.6)),
)
for series_values, series_label, series_style in support_series:
    ax1.plot(common_prefix_lengths, series_values, linewidth=1.2, markersize=5,
             label=series_label, **series_style)

# Secondary y-axis: sample counts of both models added together, as faint bars.
ax2 = ax1.twinx()
total_counts = [c + n for c, n in zip(camargo_counts, normal_counts)]
ax2.bar(common_prefix_lengths, total_counts, width=0.6, color='gray', alpha=0.15,
        label='Total instances')

ax1.set_xlabel('prefix len', labelpad=0.5)
ax1.set_ylabel('Support of Correct Prediction', labelpad=0.5)
ax2.set_ylabel('instances', labelpad=0.5)

# Fixed metric range with a little headroom above 1.0; half a unit of x padding.
ax1.set_ylim(bottom=0, top=1.05)
ax1.set_xlim(left=min(common_prefix_lengths) - 0.5,
             right=max(common_prefix_lengths) + 0.5)
ax2.set_ylim(bottom=0)

# Hide the frame on both axes.
for framed_axes in (ax1, ax2):
    for spine in framed_axes.spines.values():
        spine.set_visible(False)

ax1.grid(True, axis='y', linestyle='--', linewidth=0.5, alpha=0.3)
# Dashed reference line at y = 1.0.
ax1.axhline(y=1.0, color='gray', linestyle='--', linewidth=0.7, alpha=0.5)

# Single legend holding the entries of both axes.
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2,
           loc="upper right", frameon=True, fontsize=8)

# No tick values on the count axis.
ax2.set_yticks([])

# Draw ax1 above ax2; ax1's background patch is hidden so the bars stay visible.
ax2.set_zorder(1)
ax1.set_zorder(2)
ax1.patch.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# ROUGE-L Score by Prefix Length
# For each prefix length shared by both result sets, draws the ROUGE-L score of
# Camargo LSTM and U-ED-LSTM on clean (solid) and perturbed (dashed) inputs,
# with the summed sample counts as background bars.
# Relies on `camargo_data` / `normal_data` from the loading cells above.
import matplotlib as mpl

# Restore matplotlib's defaults, then apply the compact settings for this figure.
mpl.rcdefaults()
plt.rcParams.update({
    'font.size': 8, 'legend.fontsize': 8,
    'axes.titlesize': 10, 'axes.labelsize': 9,
    'xtick.labelsize': 8, 'ytick.labelsize': 8,
    'xtick.major.width': 0.5, 'ytick.major.width': 0.5,
    'lines.linewidth': 1.2, 'lines.markersize': 5,
})

fig, ax1 = plt.subplots(figsize=(6, 4), dpi=100)

# Only prefix lengths present in both result sets are plotted.
camargo_prefixes = camargo_data['prefix_lengths']
normal_prefixes = normal_data['prefix_lengths']
common_prefix_lengths = sorted(set(camargo_prefixes) & set(normal_prefixes))

# Lookups keyed by prefix length, one per model and series.
camargo_rouge_clean_dict = {p: v for p, v in zip(camargo_prefixes, camargo_data['rouge_l_clean'])}
camargo_rouge_pert_dict = {p: v for p, v in zip(camargo_prefixes, camargo_data['rouge_l_perturbed'])}
normal_rouge_clean_dict = {p: v for p, v in zip(normal_prefixes, normal_data['rouge_l_clean'])}
normal_rouge_pert_dict = {p: v for p, v in zip(normal_prefixes, normal_data['rouge_l_perturbed'])}
camargo_counts_dict = {p: v for p, v in zip(camargo_prefixes, camargo_data['sample_counts'])}
normal_counts_dict = {p: v for p, v in zip(normal_prefixes, normal_data['sample_counts'])}

# Series re-ordered to follow the sorted common prefix lengths.
camargo_rouge_clean = [camargo_rouge_clean_dict[length] for length in common_prefix_lengths]
camargo_rouge_pert = [camargo_rouge_pert_dict[length] for length in common_prefix_lengths]
normal_rouge_clean = [normal_rouge_clean_dict[length] for length in common_prefix_lengths]
normal_rouge_pert = [normal_rouge_pert_dict[length] for length in common_prefix_lengths]
camargo_counts = [camargo_counts_dict[length] for length in common_prefix_lengths]
normal_counts = [normal_counts_dict[length] for length in common_prefix_lengths]

# (values, legend label, per-line style); drawing order also fixes the legend order.
rouge_series = (
    (camargo_rouge_clean, 'Camargo LSTM (clean)',
     dict(marker='o', color='blue', alpha=0.9)),
    (camargo_rouge_pert, 'Camargo LSTM (perturbed)',
     dict(marker='o', linestyle='--', color='blue', alpha=0.6)),
    (normal_rouge_clean, 'U-ED-LSTM (clean)',
     dict(marker='s', color='orange', alpha=0.9)),
    (normal_rouge_pert, 'U-ED-LSTM (perturbed)',
     dict(marker='s', linestyle='--', color='orange', alpha=0.6)),
)
for series_values, series_label, series_style in rouge_series:
    ax1.plot(common_prefix_lengths, series_values, linewidth=1.2, markersize=5,
             label=series_label, **series_style)

# Secondary y-axis: sample counts of both models added together, as faint bars.
ax2 = ax1.twinx()
total_counts = [c + n for c, n in zip(camargo_counts, normal_counts)]
ax2.bar(common_prefix_lengths, total_counts, width=0.6, color='gray', alpha=0.15,
        label='Total instances')

ax1.set_xlabel('prefix len', labelpad=0.5)
ax1.set_ylabel('ROUGE-L Score', labelpad=0.5)
ax2.set_ylabel('instances', labelpad=0.5)

# Fixed metric range with a little headroom above 1.0; half a unit of x padding.
ax1.set_ylim(bottom=0, top=1.05)
ax1.set_xlim(left=min(common_prefix_lengths) - 0.5,
             right=max(common_prefix_lengths) + 0.5)
ax2.set_ylim(bottom=0)

# Hide the frame on both axes.
for framed_axes in (ax1, ax2):
    for spine in framed_axes.spines.values():
        spine.set_visible(False)

ax1.grid(True, axis='y', linestyle='--', linewidth=0.5, alpha=0.3)
# Dashed reference line at y = 1.0.
ax1.axhline(y=1.0, color='gray', linestyle='--', linewidth=0.7, alpha=0.5)

# Single legend holding the entries of both axes.
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2,
           loc="upper right", frameon=True, fontsize=8)

# No tick values on the count axis.
ax2.set_yticks([])

# Draw ax1 above ax2; ax1's background patch is hidden so the bars stay visible.
ax2.set_zorder(1)
ax1.set_zorder(2)
ax1.patch.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# chrF Score by Prefix Length
# For each prefix length shared by both result sets, draws the chrF score of
# Camargo LSTM and U-ED-LSTM on clean (solid) and perturbed (dashed) inputs,
# with the summed sample counts as background bars.
# Relies on `camargo_data` / `normal_data` from the loading cells above.
import matplotlib as mpl

# Restore matplotlib's defaults, then apply the compact settings for this figure.
mpl.rcdefaults()
plt.rcParams.update({
    'font.size': 8, 'legend.fontsize': 8,
    'axes.titlesize': 10, 'axes.labelsize': 9,
    'xtick.labelsize': 8, 'ytick.labelsize': 8,
    'xtick.major.width': 0.5, 'ytick.major.width': 0.5,
    'lines.linewidth': 1.2, 'lines.markersize': 5,
})

fig, ax1 = plt.subplots(figsize=(6, 4), dpi=100)

# Only prefix lengths present in both result sets are plotted.
camargo_prefixes = camargo_data['prefix_lengths']
normal_prefixes = normal_data['prefix_lengths']
common_prefix_lengths = sorted(set(camargo_prefixes) & set(normal_prefixes))

# Lookups keyed by prefix length, one per model and series.
camargo_chrf_clean_dict = {p: v for p, v in zip(camargo_prefixes, camargo_data['chrf_clean'])}
camargo_chrf_pert_dict = {p: v for p, v in zip(camargo_prefixes, camargo_data['chrf_perturbed'])}
normal_chrf_clean_dict = {p: v for p, v in zip(normal_prefixes, normal_data['chrf_clean'])}
normal_chrf_pert_dict = {p: v for p, v in zip(normal_prefixes, normal_data['chrf_perturbed'])}
camargo_counts_dict = {p: v for p, v in zip(camargo_prefixes, camargo_data['sample_counts'])}
normal_counts_dict = {p: v for p, v in zip(normal_prefixes, normal_data['sample_counts'])}

# Series re-ordered to follow the sorted common prefix lengths.
camargo_chrf_clean = [camargo_chrf_clean_dict[length] for length in common_prefix_lengths]
camargo_chrf_pert = [camargo_chrf_pert_dict[length] for length in common_prefix_lengths]
normal_chrf_clean = [normal_chrf_clean_dict[length] for length in common_prefix_lengths]
normal_chrf_pert = [normal_chrf_pert_dict[length] for length in common_prefix_lengths]
camargo_counts = [camargo_counts_dict[length] for length in common_prefix_lengths]
normal_counts = [normal_counts_dict[length] for length in common_prefix_lengths]

# (values, legend label, per-line style); drawing order also fixes the legend order.
chrf_series = (
    (camargo_chrf_clean, 'Camargo LSTM (clean)',
     dict(marker='o', color='blue', alpha=0.9)),
    (camargo_chrf_pert, 'Camargo LSTM (perturbed)',
     dict(marker='o', linestyle='--', color='blue', alpha=0.6)),
    (normal_chrf_clean, 'U-ED-LSTM (clean)',
     dict(marker='s', color='orange', alpha=0.9)),
    (normal_chrf_pert, 'U-ED-LSTM (perturbed)',
     dict(marker='s', linestyle='--', color='orange', alpha=0.6)),
)
for series_values, series_label, series_style in chrf_series:
    ax1.plot(common_prefix_lengths, series_values, linewidth=1.2, markersize=5,
             label=series_label, **series_style)

# Secondary y-axis: sample counts of both models added together, as faint bars.
ax2 = ax1.twinx()
total_counts = [c + n for c, n in zip(camargo_counts, normal_counts)]
ax2.bar(common_prefix_lengths, total_counts, width=0.6, color='gray', alpha=0.15,
        label='Total instances')

ax1.set_xlabel('prefix len', labelpad=0.5)
ax1.set_ylabel('chrF Score', labelpad=0.5)
ax2.set_ylabel('instances', labelpad=0.5)

# Fixed metric range with a little headroom above 1.0; half a unit of x padding.
ax1.set_ylim(bottom=0, top=1.05)
ax1.set_xlim(left=min(common_prefix_lengths) - 0.5,
             right=max(common_prefix_lengths) + 0.5)
ax2.set_ylim(bottom=0)

# Hide the frame on both axes.
for framed_axes in (ax1, ax2):
    for spine in framed_axes.spines.values():
        spine.set_visible(False)

ax1.grid(True, axis='y', linestyle='--', linewidth=0.5, alpha=0.3)
# Dashed reference line at y = 1.0.
ax1.axhline(y=1.0, color='gray', linestyle='--', linewidth=0.7, alpha=0.5)

# Single legend holding the entries of both axes.
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2,
           loc="upper right", frameon=True, fontsize=8)

# No tick values on the count axis.
ax2.set_yticks([])

# Draw ax1 above ax2; ax1's background patch is hidden so the bars stay visible.
ax2.set_zorder(1)
ax1.set_zorder(2)
ax1.patch.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Negative Log Likelihood (NLL) by Prefix Length
# For each prefix length shared by both result sets, draws the NLL of
# Camargo LSTM and U-ED-LSTM on clean (solid) and perturbed (dashed) inputs,
# with the summed sample counts as background bars.
# Relies on `camargo_data` / `normal_data` from the loading cells above.
import matplotlib as mpl

# Restore matplotlib's defaults, then apply the compact settings for this figure.
mpl.rcdefaults()
plt.rcParams.update({
    'font.size': 8, 'legend.fontsize': 8,
    'axes.titlesize': 10, 'axes.labelsize': 9,
    'xtick.labelsize': 8, 'ytick.labelsize': 8,
    'xtick.major.width': 0.5, 'ytick.major.width': 0.5,
    'lines.linewidth': 1.2, 'lines.markersize': 5,
})

fig, ax1 = plt.subplots(figsize=(6, 4), dpi=100)

# Only prefix lengths present in both result sets are plotted.
camargo_prefixes = camargo_data['prefix_lengths']
normal_prefixes = normal_data['prefix_lengths']
common_prefix_lengths = sorted(set(camargo_prefixes) & set(normal_prefixes))

# Lookups keyed by prefix length, one per model and series.
camargo_nll_clean_dict = {p: v for p, v in zip(camargo_prefixes, camargo_data['nll_clean'])}
camargo_nll_pert_dict = {p: v for p, v in zip(camargo_prefixes, camargo_data['nll_perturbed'])}
normal_nll_clean_dict = {p: v for p, v in zip(normal_prefixes, normal_data['nll_clean'])}
normal_nll_pert_dict = {p: v for p, v in zip(normal_prefixes, normal_data['nll_perturbed'])}
camargo_counts_dict = {p: v for p, v in zip(camargo_prefixes, camargo_data['sample_counts'])}
normal_counts_dict = {p: v for p, v in zip(normal_prefixes, normal_data['sample_counts'])}

# Series re-ordered to follow the sorted common prefix lengths.
camargo_nll_clean = [camargo_nll_clean_dict[length] for length in common_prefix_lengths]
camargo_nll_pert = [camargo_nll_pert_dict[length] for length in common_prefix_lengths]
normal_nll_clean = [normal_nll_clean_dict[length] for length in common_prefix_lengths]
normal_nll_pert = [normal_nll_pert_dict[length] for length in common_prefix_lengths]
camargo_counts = [camargo_counts_dict[length] for length in common_prefix_lengths]
normal_counts = [normal_counts_dict[length] for length in common_prefix_lengths]

# (values, legend label, per-line style); drawing order also fixes the legend order.
nll_series = (
    (camargo_nll_clean, 'Camargo LSTM (clean)',
     dict(marker='o', color='blue', alpha=0.9)),
    (camargo_nll_pert, 'Camargo LSTM (perturbed)',
     dict(marker='o', linestyle='--', color='blue', alpha=0.6)),
    (normal_nll_clean, 'U-ED-LSTM (clean)',
     dict(marker='s', color='orange', alpha=0.9)),
    (normal_nll_pert, 'U-ED-LSTM (perturbed)',
     dict(marker='s', linestyle='--', color='orange', alpha=0.6)),
)
for series_values, series_label, series_style in nll_series:
    ax1.plot(common_prefix_lengths, series_values, linewidth=1.2, markersize=5,
             label=series_label, **series_style)

# Secondary y-axis: sample counts of both models added together, as faint bars.
ax2 = ax1.twinx()
total_counts = [c + n for c, n in zip(camargo_counts, normal_counts)]
ax2.bar(common_prefix_lengths, total_counts, width=0.6, color='gray', alpha=0.15,
        label='Total instances')

ax1.set_xlabel('prefix len', labelpad=0.5)
ax1.set_ylabel('Negative Log Likelihood', labelpad=0.5)
ax2.set_ylabel('instances', labelpad=0.5)

# NLL is a lower-is-better quantity without a fixed upper bound, so the y-range
# of ax1 is left to matplotlib's autoscaling; only the x-range is set explicitly.
ax1.set_xlim(left=min(common_prefix_lengths) - 0.5,
             right=max(common_prefix_lengths) + 0.5)
ax2.set_ylim(bottom=0)

# Hide the frame on both axes.
for framed_axes in (ax1, ax2):
    for spine in framed_axes.spines.values():
        spine.set_visible(False)

ax1.grid(True, axis='y', linestyle='--', linewidth=0.5, alpha=0.3)

# Single legend holding the entries of both axes.
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2,
           loc="upper right", frameon=True, fontsize=8)

# No tick values on the count axis.
ax2.set_yticks([])

# Draw ax1 above ax2; ax1's background patch is hidden so the bars stay visible.
ax2.set_zorder(1)
ax1.set_zorder(2)
ax1.patch.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Wasserstein Distance by Prefix Length - Camargo LSTM
# Draws the Wasserstein distance stored in `camargo_data` for every prefix
# length of the Camargo LSTM results, with that model's sample counts as
# background bars. Relies on `camargo_data` from the loading cell above.
import matplotlib as mpl

# Restore matplotlib's defaults, then apply the compact settings for this figure.
mpl.rcdefaults()
plt.rcParams.update({
    'font.size': 8, 'legend.fontsize': 8,
    'axes.titlesize': 10, 'axes.labelsize': 9,
    'xtick.labelsize': 8, 'ytick.labelsize': 8,
    'xtick.major.width': 0.5, 'ytick.major.width': 0.5,
    'lines.linewidth': 1.2, 'lines.markersize': 5,
})

fig, ax1 = plt.subplots(figsize=(6, 4), dpi=100)

# Lookups keyed by prefix length.
camargo_wasserstein_dict = {
    p: v for p, v in zip(camargo_data['prefix_lengths'], camargo_data['wasserstein_distance'])
}
camargo_counts_dict = {
    p: v for p, v in zip(camargo_data['prefix_lengths'], camargo_data['sample_counts'])
}

# All prefix lengths of this model, in ascending order, and the aligned series.
prefix_lengths = sorted(camargo_data['prefix_lengths'])
camargo_wasserstein = [camargo_wasserstein_dict[length] for length in prefix_lengths]
camargo_counts = [camargo_counts_dict[length] for length in prefix_lengths]

ax1.plot(prefix_lengths, camargo_wasserstein, label='Camargo LSTM',
         marker='o', color='blue', alpha=0.9, linewidth=1.2, markersize=5)

# Secondary y-axis: sample counts as faint bars.
ax2 = ax1.twinx()
ax2.bar(prefix_lengths, camargo_counts, width=0.6, color='gray', alpha=0.15,
        label='Instances')

ax1.set_xlabel('prefix len', labelpad=0.5)
ax1.set_ylabel('Wasserstein Distance', labelpad=0.5)
ax2.set_ylabel('instances', labelpad=0.5)

# Half a unit of x padding; the y-range of ax1 is left to autoscaling.
ax1.set_xlim(left=min(prefix_lengths) - 0.5,
             right=max(prefix_lengths) + 0.5)
ax2.set_ylim(bottom=0)

# Hide the frame on both axes.
for framed_axes in (ax1, ax2):
    for spine in framed_axes.spines.values():
        spine.set_visible(False)

ax1.grid(True, axis='y', linestyle='--', linewidth=0.5, alpha=0.3)

# Single legend holding the entries of both axes.
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2,
           loc="upper right", frameon=True, fontsize=8)

# No tick values on the count axis.
ax2.set_yticks([])

# Draw ax1 above ax2; ax1's background patch is hidden so the bars stay visible.
ax2.set_zorder(1)
ax1.set_zorder(2)
ax1.patch.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Wasserstein Distance by Prefix Length - U-ED-LSTM
# Draws the Wasserstein distance stored in `normal_data` for every prefix
# length of the U-ED-LSTM results, with that model's sample counts as
# background bars. Relies on `normal_data` from the loading cell above.
import matplotlib as mpl

# Restore matplotlib's defaults, then apply the compact settings for this figure.
mpl.rcdefaults()
plt.rcParams.update({
    'font.size': 8, 'legend.fontsize': 8,
    'axes.titlesize': 10, 'axes.labelsize': 9,
    'xtick.labelsize': 8, 'ytick.labelsize': 8,
    'xtick.major.width': 0.5, 'ytick.major.width': 0.5,
    'lines.linewidth': 1.2, 'lines.markersize': 5,
})

fig, ax1 = plt.subplots(figsize=(6, 4), dpi=100)

# Lookups keyed by prefix length.
normal_wasserstein_dict = {
    p: v for p, v in zip(normal_data['prefix_lengths'], normal_data['wasserstein_distance'])
}
normal_counts_dict = {
    p: v for p, v in zip(normal_data['prefix_lengths'], normal_data['sample_counts'])
}

# All prefix lengths of this model, in ascending order, and the aligned series.
prefix_lengths = sorted(normal_data['prefix_lengths'])
normal_wasserstein = [normal_wasserstein_dict[length] for length in prefix_lengths]
normal_counts = [normal_counts_dict[length] for length in prefix_lengths]

ax1.plot(prefix_lengths, normal_wasserstein, label='U-ED-LSTM',
         marker='s', color='orange', alpha=0.9, linewidth=1.2, markersize=5)

# Secondary y-axis: sample counts as faint bars.
ax2 = ax1.twinx()
ax2.bar(prefix_lengths, normal_counts, width=0.6, color='gray', alpha=0.15,
        label='Instances')

ax1.set_xlabel('prefix len', labelpad=0.5)
ax1.set_ylabel('Wasserstein Distance', labelpad=0.5)
ax2.set_ylabel('instances', labelpad=0.5)

# Half a unit of x padding; the y-range of ax1 is left to autoscaling.
ax1.set_xlim(left=min(prefix_lengths) - 0.5,
             right=max(prefix_lengths) + 0.5)
ax2.set_ylim(bottom=0)

# Hide the frame on both axes.
for framed_axes in (ax1, ax2):
    for spine in framed_axes.spines.values():
        spine.set_visible(False)

ax1.grid(True, axis='y', linestyle='--', linewidth=0.5, alpha=0.3)

# Single legend holding the entries of both axes.
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2,
           loc="upper right", frameon=True, fontsize=8)

# No tick values on the count axis.
ax2.set_yticks([])

# Draw ax1 above ax2; ax1's background patch is hidden so the bars stay visible.
ax2.set_zorder(1)
ax1.set_zorder(2)
ax1.patch.set_visible(False)

plt.tight_layout()
plt.show()


## Adversarial Sample Analysis

Identify the most effective adversarial samples - those that caused the biggest shift in model predictions.

In [None]:
from adversarial_sample_selector import get_best_adversarial_prefixes

# Section banner for the U-ED-LSTM adversarial-sample analysis.
banner_rule = "=" * 80
print(banner_rule, "ANALYZING U-ED-LSTM MODEL", banner_rule, sep="\n")

# Ask the selector for the 10 most adversarial samples among the U-ED-LSTM results.
best_adversarial_samples = get_best_adversarial_prefixes(
    U_ED_LSTM_results,
    concept_name='Activity',
    top_n=10,
)

# According to the notebook's original note, the result is a list of tuples:
#   (case_name, prefix_len, dls, prefix_orig, prefix_pert, mean_orig, mean_pert, perturbations)
# NOTE(review): layout not visible from this cell; confirm against adversarial_sample_selector.

In [None]:
# Show the U-ED-LSTM results object (post-processed in the loading cell above) as this cell's output.
# NOTE(review): this renders the whole object; if it is large, consider a bounded
# preview instead (its container type is not visible from this cell - confirm first).
U_ED_LSTM_results