### Load Data

In [9]:
import pandas as pd
import numpy as np

episodes_operated_tags_df = pd.read_excel('/home/h604827/ControlActions/RESULTS/episode_all_operator_action_plots/episodes_all_with_actions_and_deviations.xlsx')
ssd_df = pd.read_excel('/home/h604827/ControlActions/DATA/SSD_1071_SSD_output_1071_7Jan2026.xlsx')
events_df = pd.read_csv('/home/h604827/ControlActions/DATA/trip_filtered_events.csv')
pv_op_data_df = pd.read_parquet('/home/h604827/ControlActions/DATA/03LIC_1071_JAN_2026_filtered.parquet')
operating_limits_df = pd.read_csv('/home/h604827/ControlActions/DATA/operating_limits.csv')

episodes_operated_tags_df.head()

Unnamed: 0,EpisodeID,AlarmStart,AlarmEnd,AlarmDurationMinutes,TotalWindowMinutes,OperatedTags,OperatedTagsCount,DeviatedTags,DeviatedTagsCount,HasOperatorActions,HasOnlyTargetTags,Has1071Action,Has1016Action,Has1013Action
0,1,2022-01-05 08:53:00,2022-01-05 09:33:00,40,186,"03FIC_1085, 03FIC_3435, 03HIC_1141, 03HIC_1151...",15,"02FI_1000.PV, 03FIC_1085.PV, 03FIC_3415.PV, 03...",22,True,False,True,True,True
1,2,2022-01-07 09:55:00,2022-01-07 10:00:00,5,151,"03FIC_3435, 03GHS_0121A, 03GHS_0121AA, 03GHS_0...",8,"03FIC_1085.PV, 03FIC_3415.PV, 03FI_1141A.PV, 0...",23,True,False,False,False,False
2,3,2022-01-07 13:33:00,2022-01-07 13:36:00,3,149,"03FIC_3435, 03LIC_1034",2,"03FIC_1085.PV, 03FIC_3415.PV, 03FI_1141A.PV, 0...",19,True,False,False,False,False
3,4,2022-01-07 14:17:00,2022-01-07 14:19:00,2,148,"03FIC_3435, 03LIC_1016, 03LIC_1034",3,"03FIC_1085.PV, 03FI_1141A.PV, 03LIC_1016.PV, 0...",22,True,False,False,True,False
4,5,2022-01-07 14:54:00,2022-01-07 14:58:00,4,150,"03FIC_3435, 03LIC_1016",2,"02FI_1000.PV, 03FIC_1085.PV, 03FI_1141A.PV, 03...",19,True,False,False,True,False


## Similarity Approach: Building Context for Operator Actions

### Approach Overview
1. Use **all available episodes**
2. For each operator action on the target tags (**03LIC_1071**, **03LIC_1016**, **03PIC_1013**), create a "context" capturing process state
3. Context window: From deviation start time to operator action timestamp

### Context Features (for each of 28 PV tags):
1. **PV value at deviation start**
2. **PV value at operator action time**
3. **Rate of change (%)** = (PV_action - PV_start) / PV_start * 100
4. **Direction** = 1 if ROC positive, 0 if negative

### Splitting Train Test Episodes

In [10]:
# Step 1: Use all available episodes
all_episodes_df = episodes_operated_tags_df.copy()
print(f"Total episodes available: {len(all_episodes_df)}")

# Select 50 random episodes from 2025 for testing (exclude from training lookup table)
episodes_2025 = all_episodes_df[
    (all_episodes_df['AlarmStart'] >= pd.Timestamp('2025-01-01')) &
    (all_episodes_df['AlarmStart'] < pd.Timestamp('2026-01-01'))
].copy()

print(f"Episodes in 2025: {len(episodes_2025)}")

sample_n = 50
np.random.seed(42)
if len(episodes_2025) >= sample_n:
    test_episode_ids = episodes_2025.sample(n=sample_n, random_state=42)['EpisodeID'].tolist()
else:
    test_episode_ids = episodes_2025['EpisodeID'].tolist()
    print(f"Warning: Only {len(test_episode_ids)} episodes found in 2025. Using all available.")

train_episodes_df = all_episodes_df[~all_episodes_df['EpisodeID'].isin(test_episode_ids)].copy()
print(f"Test episodes (2025): {len(test_episode_ids)}")
print(f"Training episodes: {len(train_episodes_df)}")

Total episodes available: 609
Episodes in 2025: 92
Test episodes (2025): 50
Training episodes: 559


### Defining the PV columns to be used for context creation

In [11]:
# Step 3: Define the PV columns for context (28 tags)
context_pv_tags = [col for col in pv_op_data_df.columns if col.endswith('.PV')]

# Step 4: Prepare events data for timestamp parsing
events_df['VT_Start'] = pd.to_datetime(events_df['VT_Start'])

# Prepare SSD data for timestamp parsing
ssd_df['AlarmStart_rounded_minutes'] = pd.to_datetime(ssd_df['AlarmStart_rounded_minutes'])
ssd_df['AlarmEnd_rounded_minutes'] = pd.to_datetime(ssd_df['AlarmEnd_rounded_minutes'])
ssd_df['Tag_First_Transition_Start_minutes'] = pd.to_datetime(ssd_df['Tag_First_Transition_Start_minutes'])

# Prepare episodes data
episodes_operated_tags_df['AlarmStart'] = pd.to_datetime(episodes_operated_tags_df['AlarmStart'])
episodes_operated_tags_df['AlarmEnd'] = pd.to_datetime(episodes_operated_tags_df['AlarmEnd'])

print("Timestamp columns converted to datetime")

Timestamp columns converted to datetime


In [12]:
# Step 5: Helper function to get deviation start time for an episode
# Deviation start is the earliest Tag_First_Transition_Start_minutes for the target tag (03LIC_1071)
# in that alarm episode

def get_deviation_start_for_episode(episode_id, alarm_start, alarm_end):
    """
    Get the deviation start time for an episode.
    This is the Tag_First_Transition_Start_minutes for 03LIC_1071 (or earliest among related tags)
    """
    # Find SSD records for this alarm episode (matching by alarm start time)
    episode_ssd = ssd_df[
        (ssd_df['AlarmStart_rounded_minutes'] == alarm_start)
    ]
    
    if len(episode_ssd) == 0:
        # Try with a small time tolerance (within 1 minute)
        print(f"No exact SSD data found for EpisodeID {episode_id} with AlarmStart {alarm_start}. Trying with time tolerance.")
        episode_ssd = ssd_df[
            (abs((ssd_df['AlarmStart_rounded_minutes'] - alarm_start).dt.total_seconds()) <= 60)
        ]
    
    if len(episode_ssd) == 0:
        # If no SSD data found, use alarm start minus 30 minutes as default deviation start
        print(f"No SSD data found for EpisodeID {episode_id} with AlarmStart {alarm_start}. Using default deviation start.")
        return alarm_start - pd.Timedelta(minutes=30)
    
    # Get deviation start for target tag 03LIC_1071 if available
    target_ssd = episode_ssd[episode_ssd['TagName'] == '03LIC_1071']
    if len(target_ssd) > 0:
        return target_ssd['Tag_First_Transition_Start_minutes'].iloc[0]
    
    # If not found, use earliest transition start among all tags
    return episode_ssd['Tag_First_Transition_Start_minutes'].min()

# Test with first available episode
test_ep = all_episodes_df.iloc[0]
test_dev_start = get_deviation_start_for_episode(
    test_ep['EpisodeID'], 
    test_ep['AlarmStart'], 
    test_ep['AlarmEnd']
 )
print(f"Episode {test_ep['EpisodeID']}:")
print(f"  Alarm Start: {test_ep['AlarmStart']}")
print(f"  Deviation Start: {test_dev_start}")
print(f"  Alarm End: {test_ep['AlarmEnd']}")

Episode 1:
  Alarm Start: 2022-01-05 08:53:00
  Deviation Start: 2022-01-05 07:27:00
  Alarm End: 2022-01-05 09:33:00


In [13]:
# Step 6: Helper function to get operator actions for an episode
def get_operator_actions_for_episode(alarm_start, alarm_end, target_sources=['03LIC_1071', '03LIC_1016', '03PIC_1013']):
    """
    Get all CHANGE events (operator actions) for target sources during an episode.
    Episode window: deviation_start to alarm_end
    Only keep SP/OP actions (exclude MODE).
    """
    # Get deviation start
    deviation_start = get_deviation_start_for_episode(None, alarm_start, alarm_end)
    
    # Filter CHANGE events within the episode window for target sources
    actions = events_df[
        (events_df['ConditionName'] == 'CHANGE') &
        (events_df['Source'].isin(target_sources)) &
        (events_df['VT_Start'] >= deviation_start) &
        (events_df['VT_Start'] <= alarm_end)
    ].copy()
    
    # Exclude MODE actions if Description is available
    if 'Description' in actions.columns:
        actions = actions[actions['Description'].isin(['SP', 'OP'])].copy()
    
    return actions, deviation_start

# Test with first selected episode
test_actions, test_dev_start = get_operator_actions_for_episode(
    test_ep['AlarmStart'], 
    test_ep['AlarmEnd']
 )
print(f"Episode {test_ep['EpisodeID']} - Found {len(test_actions)} operator actions")
if len(test_actions) > 0:
    print(test_actions[['Source', 'VT_Start', 'Value', 'PrevValue']].head())

Episode 1 - Found 66 operator actions
            Source                   VT_Start    Value PrevValue
147288  03LIC_1016 2022-01-05 08:42:17.358300  20.0000   37.0000
147289  03LIC_1016 2022-01-05 08:42:17.358300  20.0000       NaN
147296  03PIC_1013 2022-01-05 08:42:23.715200  78.3480       NaN
147297  03PIC_1013 2022-01-05 08:42:23.715200  78.3480   80.3480
147299  03PIC_1013 2022-01-05 08:42:24.781600  76.3480   78.3480


In [14]:
# Step 7: Helper function to get PV value at a specific timestamp (with nearest lookup)
def get_pv_at_timestamp(timestamp, pv_tag):
    """
    Get PV value at or nearest to the given timestamp.
    Uses forward fill to get the most recent value if exact time not found.
    """
    try:
        # Make timestamp timezone naive if needed
        if timestamp.tzinfo is not None:
            timestamp = timestamp.tz_localize(None)
        
        # Try exact lookup first
        if timestamp in pv_op_data_df.index:
            return pv_op_data_df.loc[timestamp, pv_tag]
        
        # Use asof for nearest lookup (gets value at or before timestamp)
        idx = pv_op_data_df.index.get_indexer([timestamp], method='ffill')[0]
        if idx >= 0 and idx < len(pv_op_data_df):
            return pv_op_data_df.iloc[idx][pv_tag]
        
        # If no value found before, get nearest after
        idx = pv_op_data_df.index.get_indexer([timestamp], method='bfill')[0]
        if idx >= 0 and idx < len(pv_op_data_df):
            return pv_op_data_df.iloc[idx][pv_tag]
        
        return np.nan
    except Exception as e:
        print(f"Error getting PV value for {pv_tag} at {timestamp}: {e}")
        return np.nan

# Test
test_ts = test_dev_start
test_tag = '03LIC_1071.PV'
test_val = get_pv_at_timestamp(test_ts, test_tag)
print(f"PV value for {test_tag} at {test_ts}: {test_val}")

PV value for 03LIC_1071.PV at 2022-01-05 07:27:00: 34.856712


In [15]:
# Step 8: Build context for a single operator action
def build_context_for_action(deviation_start, action_timestamp, pv_tags):
    """
    Build context features for an operator action.
    
    Returns a dict with the following for each PV tag:
    - {tag}_pv_at_deviation_start: PV value at deviation start
    - {tag}_pv_at_action: PV value at action time
    - {tag}_roc_percent: Rate of change in % = (pv_action - pv_start) / pv_start * 100
    - {tag}_roc_direction: 1 if positive, 0 if negative
    """
    context = {}
    
    for pv_tag in pv_tags:
        # Remove .PV suffix for cleaner column names
        tag_name = pv_tag.replace('.PV', '')
        
        # Get PV values
        pv_at_deviation = get_pv_at_timestamp(deviation_start, pv_tag)
        pv_at_action = get_pv_at_timestamp(action_timestamp, pv_tag)
        
        # Calculate rate of change (%)
        if pd.notna(pv_at_deviation) and pd.notna(pv_at_action) and pv_at_deviation != 0:
            roc_percent = ((pv_at_action - pv_at_deviation) / pv_at_deviation) * 100
        else:
            roc_percent = np.nan
        
        # Determine direction
        if pd.notna(roc_percent):
            roc_direction = 1 if roc_percent >= 0 else 0
        else:
            roc_direction = np.nan
        
        # Store in context
        context[f'{tag_name}_pv_at_deviation_start'] = pv_at_deviation
        context[f'{tag_name}_pv_at_action'] = pv_at_action
        context[f'{tag_name}_roc_percent'] = roc_percent
        context[f'{tag_name}_roc_direction'] = roc_direction
    
    return context

# Test with first action of first episode
if len(test_actions) > 0:
    first_action = test_actions.iloc[0]
    test_context = build_context_for_action(test_dev_start, first_action['VT_Start'], context_pv_tags)
    print(f"Context for action at {first_action['VT_Start']} on {first_action['Source']}:")
    print(f"\n03LIC_1071 (target tag):")
    print(f"  PV at deviation start: {test_context['03LIC_1071_pv_at_deviation_start']:.4f}")
    print(f"  PV at action: {test_context['03LIC_1071_pv_at_action']:.4f}")
    print(f"  ROC %: {test_context['03LIC_1071_roc_percent']:.4f}")
    print(f"  ROC direction: {test_context['03LIC_1071_roc_direction']}")

Context for action at 2022-01-05 08:42:17.358300 on 03LIC_1016:

03LIC_1071 (target tag):
  PV at deviation start: 34.8567
  PV at action: 55.6153
  ROC %: 59.5540
  ROC direction: 1


### Building context with training episodes

In [64]:
# Step 9: Process training episodes and build context for each operator action
from tqdm import tqdm

context_records = []
target_sources = ['03LIC_1071', '03LIC_1016', '03PIC_1013']

print("Processing training episodes...")
for idx, (_, episode) in enumerate(tqdm(train_episodes_df.iterrows(), total=len(train_episodes_df))):
    episode_id = episode['EpisodeID']
    alarm_start = episode['AlarmStart']
    alarm_end = episode['AlarmEnd']
    
    # Get deviation start
    deviation_start = get_deviation_start_for_episode(episode_id, alarm_start, alarm_end)
    
    # Get operator actions for this episode (only target tags)
    actions, _ = get_operator_actions_for_episode(alarm_start, alarm_end, target_sources)
    
    if len(actions) == 0:
        continue
    
    # Process each operator action
    for _, action in actions.iterrows():
        action_timestamp = action['VT_Start']
        
        # Build context
        context = build_context_for_action(deviation_start, action_timestamp, context_pv_tags)
        
        # Add episode and action metadata
        context['episode_id'] = episode_id
        context['alarm_start'] = alarm_start
        context['alarm_end'] = alarm_end
        context['deviation_start'] = deviation_start
        context['action_timestamp'] = action_timestamp
        context['action_source'] = action['Source']
        context['action_value'] = action['Value']
        context['action_prev_value'] = action['PrevValue']
        
        # Calculate action direction and magnitude
        try:
            action_val = float(action['Value'])
            prev_val = float(action['PrevValue'])
            context['action_magnitude'] = action_val - prev_val
            context['action_direction'] = 1 if action_val > prev_val else 0  # 1 = increase, 0 = decrease
        except (ValueError, TypeError):
            context['action_magnitude'] = np.nan
            context['action_direction'] = np.nan
        
        context_records.append(context)

print(f"\nTotal context records created: {len(context_records)}")

Processing training episodes...


100%|██████████| 559/559 [07:51<00:00,  1.19it/s]


Total context records created: 6914





In [65]:
# Step 10: Convert to DataFrame and examine structure
context_df = pd.DataFrame(context_records)

print(f"Context DataFrame shape: {context_df.shape}")
print(f"Total columns: {len(context_df.columns)}")
print(f"\nColumn breakdown:")
print(f"  - Metadata columns: 10 (episode_id, alarm_start, alarm_end, deviation_start, action_timestamp, action_source, action_value, action_prev_value, action_magnitude, action_direction)")
print(f"  - PV context columns: {len(context_pv_tags) * 4} (4 features x {len(context_pv_tags)} tags)")

# Show first few rows with key columns
key_cols = ['episode_id', 'action_timestamp', 'action_source', 'action_direction', 'action_magnitude',
            '03LIC_1071_pv_at_deviation_start', '03LIC_1071_pv_at_action', 
            '03LIC_1071_roc_percent', '03LIC_1071_roc_direction']
context_df[key_cols].head(10)

Context DataFrame shape: (6914, 122)
Total columns: 122

Column breakdown:
  - Metadata columns: 10 (episode_id, alarm_start, alarm_end, deviation_start, action_timestamp, action_source, action_value, action_prev_value, action_magnitude, action_direction)
  - PV context columns: 112 (4 features x 28 tags)


Unnamed: 0,episode_id,action_timestamp,action_source,action_direction,action_magnitude,03LIC_1071_pv_at_deviation_start,03LIC_1071_pv_at_action,03LIC_1071_roc_percent,03LIC_1071_roc_direction
0,1,2022-01-05 08:42:17.358300,03LIC_1016,0.0,-17.0,34.856712,55.615295,59.554048,1.0
1,1,2022-01-05 08:42:17.358300,03LIC_1016,0.0,,34.856712,55.615295,59.554048,1.0
2,1,2022-01-05 08:42:23.715200,03PIC_1013,0.0,,34.856712,55.615295,59.554048,1.0
3,1,2022-01-05 08:42:23.715200,03PIC_1013,0.0,-2.0,34.856712,55.615295,59.554048,1.0
4,1,2022-01-05 08:42:24.781600,03PIC_1013,0.0,-2.0,34.856712,55.615295,59.554048,1.0
5,1,2022-01-05 08:42:24.781600,03PIC_1013,0.0,,34.856712,55.615295,59.554048,1.0
6,1,2022-01-05 08:42:28.203100,03LIC_1071,0.0,-15.5772,34.856712,55.615295,59.554048,1.0
7,1,2022-01-05 08:42:28.203100,03LIC_1071,0.0,,34.856712,55.615295,59.554048,1.0
8,1,2022-01-05 08:44:00.361000,03PIC_1013,0.0,,34.856712,65.26216,87.229822,1.0
9,1,2022-01-05 08:44:00.361000,03PIC_1013,0.0,-2.0,34.856712,65.26216,87.229822,1.0


In [66]:
# Step 11: Data quality check and summary statistics
print("=== Context DataFrame Summary ===\n")

# Count actions per episode
actions_per_episode = context_df.groupby('episode_id').size()
print(f"Actions per episode:")
print(f"  Min: {actions_per_episode.min()}, Max: {actions_per_episode.max()}, Mean: {actions_per_episode.mean():.1f}")

# Actions by source
print(f"\nActions by target tag:")
print(context_df['action_source'].value_counts())

# Action direction distribution
print(f"\nAction direction distribution:")
print(context_df['action_direction'].value_counts())

# Missing values check
missing_cols = context_df.isnull().sum()
cols_with_missing = missing_cols[missing_cols > 0]
if len(cols_with_missing) > 0:
    print(f"\nColumns with missing values: {len(cols_with_missing)}")
else:
    print(f"\nNo missing values in context features!")

=== Context DataFrame Summary ===

Actions per episode:
  Min: 2, Max: 494, Mean: 40.7

Actions by target tag:
action_source
03PIC_1013    3872
03LIC_1071    1984
03LIC_1016    1058
Name: count, dtype: int64

Action direction distribution:
action_direction
0.0    4951
1.0    1690
Name: count, dtype: int64

Columns with missing values: 24


In [67]:
# Step 12: Clean duplicate records (same action appears multiple times with/without PrevValue)
# Keep only records with valid action_magnitude (not NaN)
print(f"Total records before cleaning: {len(context_df)}")

# Filter out records where action_magnitude is NaN (these are duplicates without PrevValue)
context_df_clean = context_df[context_df['action_magnitude'].notna()].copy()

print(f"Total records after removing actions without valid magnitude: {len(context_df_clean)}")

# Also remove exact duplicates (same timestamp, source, value)
context_df_clean = context_df_clean.drop_duplicates(
    subset=['episode_id', 'action_timestamp', 'action_source', 'action_value']
)

print(f"Total records after removing duplicates: {len(context_df_clean)}")

# Summary after cleaning
print(f"\nActions per episode after cleaning:")
actions_per_episode_clean = context_df_clean.groupby('episode_id').size()
print(f"  Min: {actions_per_episode_clean.min()}, Max: {actions_per_episode_clean.max()}, Mean: {actions_per_episode_clean.mean():.1f}")

print(f"\nActions by target tag after cleaning:")
print(context_df_clean['action_source'].value_counts())

Total records before cleaning: 6914
Total records after removing actions without valid magnitude: 3299
Total records after removing duplicates: 3299

Actions per episode after cleaning:
  Min: 1, Max: 247, Mean: 19.4

Actions by target tag after cleaning:
action_source
03PIC_1013    1936
03LIC_1071     955
03LIC_1016     408
Name: count, dtype: int64


In [68]:
# Step 13: Save the context DataFrame for later use
output_path = '/home/h604827/ControlActions/RESULTS/similarity_test_results/similarity_context_training_all_episodes.csv'
context_df_clean.to_csv(output_path, index=False)
print(f"Context data saved to: {output_path}")

# Save the list of episodes used to build context
episodes_info = {
    'context_episodes': all_episodes_df['EpisodeID'].tolist()
}
import json
with open('/home/h604827/ControlActions/RESULTS/similarity_test_results/similarity_approach_context_episodes.json', 'w') as f:
    json.dump(episodes_info, f, indent=2)
print("Episode list saved to: RESULTS/similarity_test_results/similarity_approach_context_episodes.json")

Context data saved to: /home/h604827/ControlActions/RESULTS/similarity_test_results/similarity_context_training_all_episodes.csv
Episode list saved to: RESULTS/similarity_test_results/similarity_approach_context_episodes.json


In [69]:
# Step 14: Display sample of the final context dataframe
print("=== Final Context DataFrame ===")
print(f"Shape: {context_df_clean.shape}")
print(f"Columns: {context_df_clean.columns.tolist()[:20]}... (and {len(context_df_clean.columns)-20} more)")

# Show sample with all ROC and direction columns for context overview
roc_cols = [c for c in context_df_clean.columns if '_roc_percent' in c]
dir_cols = [c for c in context_df_clean.columns if '_roc_direction' in c]

print(f"\nROC columns ({len(roc_cols)}): {roc_cols[:5]}...")
print(f"Direction columns ({len(dir_cols)}): {dir_cols[:5]}...")

# Show statistics for target tag 1071
print(f"\n=== 03LIC_1071 ROC Statistics ===")
print(context_df_clean['03LIC_1071_roc_percent'].describe())

=== Final Context DataFrame ===
Shape: (3299, 122)
Columns: ['03LIC_1071_pv_at_deviation_start', '03LIC_1071_pv_at_action', '03LIC_1071_roc_percent', '03LIC_1071_roc_direction', '02FI_1000_pv_at_deviation_start', '02FI_1000_pv_at_action', '02FI_1000_roc_percent', '02FI_1000_roc_direction', '03FIC_1085_pv_at_deviation_start', '03FIC_1085_pv_at_action', '03FIC_1085_roc_percent', '03FIC_1085_roc_direction', '03FIC_3415_pv_at_deviation_start', '03FIC_3415_pv_at_action', '03FIC_3415_roc_percent', '03FIC_3415_roc_direction', '03FIC_3435_pv_at_deviation_start', '03FIC_3435_pv_at_action', '03FIC_3435_roc_percent', '03FIC_3435_roc_direction']... (and 102 more)

ROC columns (28): ['03LIC_1071_roc_percent', '02FI_1000_roc_percent', '03FIC_1085_roc_percent', '03FIC_3415_roc_percent', '03FIC_3435_roc_percent']...
Direction columns (28): ['03LIC_1071_roc_direction', '02FI_1000_roc_direction', '03FIC_1085_roc_direction', '03FIC_3415_roc_direction', '03FIC_3435_roc_direction']...

=== 03LIC_1071 ROC S

---
## Weighted Multi-Component Similarity Approach

### Testing on Reserved Episodes

For each test episode:
1. Start from deviation start time
2. For each target-tag SP/OP action timestamp:
   - Calculate context at that timestamp
   - Compare with all training contexts using weighted similarity
   - Return the most similar historical action(s)

In [70]:
# Step 15: Define the Weighted Multi-Component Similarity Function
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Get column names for ROC and direction features
roc_cols = [c for c in context_df_clean.columns if '_roc_percent' in c]
dir_cols = [c for c in context_df_clean.columns if '_roc_direction' in c]
pv_action_cols = [c for c in context_df_clean.columns if '_pv_at_action' in c]

# Define tag weights (higher weight for target and controllable tags)
tag_weights = {}
for col in roc_cols:
    tag_name = col.replace('_roc_percent', '')
    if '1071' in tag_name:
        tag_weights[col] = 3.0  # Target tag - highest weight
    elif '1016' in tag_name or '1013' in tag_name:
        tag_weights[col] = 2.0  # Other controllable targets
    else:
        tag_weights[col] = 1.0  # Related tags

print(f"ROC columns: {len(roc_cols)}")
print(f"Direction columns: {len(dir_cols)}")
print(f"\nTag weights for controllable tags:")
for k, v in tag_weights.items():
    if v > 1:
        print(f"  {k}: {v}")

ROC columns: 28
Direction columns: 28

Tag weights for controllable tags:
  03LIC_1071_roc_percent: 3.0
  03LIC_1016_roc_percent: 2.0
  03PIC_1013_roc_percent: 2.0


In [71]:
# Step 16: Implement the weighted similarity calculation function (with PV similarity)

def calculate_weighted_similarity(runtime_context, historical_df, 
                                   roc_cols, dir_cols, pv_action_cols, tag_weights,
                                   w_roc=0.5, w_dir=0.35, w_pv=0.15):
    """
    Calculate weighted multi-component similarity between runtime context and all historical contexts.
    
    Components:
    1. ROC Pattern Similarity (Cosine) - weighted by tag importance
    2. Direction Match Score (Jaccard-like)
    3. PV State Similarity (Cosine similarity on normalized PV values)
    
    Returns: DataFrame with similarity scores and corresponding action details
    """
    similarities = []
    
    # Create weight array for ROC columns
    weight_array = np.array([tag_weights.get(col, 1.0) for col in roc_cols])

    # Runtime ROC values (weighted) - handle NaN by replacing with 0
    runtime_roc_raw = np.array([runtime_context.get(col, 0) for col in roc_cols])
    runtime_roc_raw = np.nan_to_num(runtime_roc_raw, nan=0.0)
    runtime_roc = runtime_roc_raw * weight_array
    
    runtime_dir = np.array([runtime_context.get(col.replace('_roc_percent', '_roc_direction'), 0) for col in roc_cols])
    runtime_dir = np.nan_to_num(runtime_dir, nan=0.0)
    
    # Runtime PV values at action time (for PV state similarity)
    runtime_pv = np.array([runtime_context.get(col, 0) for col in pv_action_cols])
    runtime_pv = np.nan_to_num(runtime_pv, nan=0.0)
    
    # Get PV weights (same mapping as ROC weights)
    pv_weight_array = np.array([tag_weights.get(col.replace('_pv_at_action', '_roc_percent'), 1.0) 
                                for col in pv_action_cols])
    
    for idx, hist_row in historical_df.iterrows():
        # 1. Cosine Similarity on weighted ROC pattern
        hist_roc_raw = np.array([hist_row[col] if pd.notna(hist_row[col]) else 0 for col in roc_cols])
        hist_roc = hist_roc_raw * weight_array
        
        # Handle zero vectors
        runtime_norm = np.linalg.norm(runtime_roc)
        hist_norm = np.linalg.norm(hist_roc)
        
        if runtime_norm == 0 or hist_norm == 0:
            roc_sim = 0.5  # Default to neutral similarity
        else:
            # Manual cosine similarity to avoid sklearn NaN issues
            dot_product = np.dot(runtime_roc, hist_roc)
            roc_sim = dot_product / (runtime_norm * hist_norm)
            # Normalize from [-1, 1] to [0, 1]
            roc_sim = (roc_sim + 1) / 2
        
        # 2. Direction Match Score
        hist_dir = np.array([hist_row[col.replace('_roc_percent', '_roc_direction')] 
                           if pd.notna(hist_row[col.replace('_roc_percent', '_roc_direction')]) else 0 
                           for col in roc_cols])
        dir_match = np.mean(runtime_dir == hist_dir)
        
        # 3. PV State Similarity using Cosine Similarity (scale-invariant)
        hist_pv = np.array([hist_row[col] if pd.notna(hist_row[col]) else 0 for col in pv_action_cols])
        
        # Apply tag weights to PV values
        weighted_runtime_pv = runtime_pv * pv_weight_array
        weighted_hist_pv = hist_pv * pv_weight_array
        
        # Cosine similarity for PV values (handles different scales naturally)
        runtime_pv_norm = np.linalg.norm(weighted_runtime_pv)
        hist_pv_norm = np.linalg.norm(weighted_hist_pv)
        
        if runtime_pv_norm == 0 or hist_pv_norm == 0:
            pv_sim = 0.5  # Default to neutral similarity
        else:
            pv_dot = np.dot(weighted_runtime_pv, weighted_hist_pv)
            pv_sim = pv_dot / (runtime_pv_norm * hist_pv_norm)
            # Normalize from [-1, 1] to [0, 1]
            pv_sim = (pv_sim + 1) / 2
        
        # 4. Combined weighted similarity
        total_similarity = w_roc * roc_sim + w_dir * dir_match + w_pv * pv_sim
        
        similarities.append({
            'hist_index': idx,
            'total_similarity': total_similarity,
            'roc_similarity': roc_sim,
            'direction_match': dir_match,
            'pv_similarity': pv_sim,
            'action_source': hist_row['action_source'],
            'action_direction': hist_row['action_direction'],
            'action_magnitude': hist_row['action_magnitude'],
            'episode_id': hist_row['episode_id']
        })
    
    return pd.DataFrame(similarities).sort_values('total_similarity', ascending=False)

# Get the pv_at_action columns for the similarity function
pv_action_cols = [col for col in context_df_clean.columns if col.endswith('_pv_at_action')]
print(f"PV action columns for similarity: {len(pv_action_cols)} tags")
print(f"Example columns: {pv_action_cols[:3]}")

print("\nSimilarity function defined successfully (with cosine-based PV similarity component)")

PV action columns for similarity: 28 tags
Example columns: ['03LIC_1071_pv_at_action', '02FI_1000_pv_at_action', '03FIC_1085_pv_at_action']

Similarity function defined successfully (with cosine-based PV similarity component)


In [72]:
# Step 17: Function to build context at a specific timestamp for runtime evaluation

def build_runtime_context(deviation_start, current_time, pv_tags):
    """
    Build context at runtime - from deviation_start to current_time.
    Same structure as training context but calculated at runtime.
    """
    context = {}
    
    for pv_tag in pv_tags:
        tag_base = pv_tag.replace('.PV', '')
        
        # Get PV at deviation start
        pv_at_start = get_pv_at_timestamp(deviation_start, pv_tag)
        
        # Get PV at current time
        pv_at_current = get_pv_at_timestamp(current_time, pv_tag)
        
        # Calculate ROC percent
        if pv_at_start is not None and pv_at_start != 0 and not np.isnan(pv_at_start):
            roc_percent = ((pv_at_current - pv_at_start) / abs(pv_at_start)) * 100
        else:
            roc_percent = 0.0
        
        # ROC direction (1 = positive/rising, 0 = negative/falling)
        roc_direction = 1 if roc_percent >= 0 else 0
        
        context[f'{tag_base}_pv_at_deviation_start'] = pv_at_start
        context[f'{tag_base}_pv_at_action'] = pv_at_current
        context[f'{tag_base}_roc_percent'] = roc_percent
        context[f'{tag_base}_roc_direction'] = roc_direction
    
    return context

print("Runtime context builder function defined")

Runtime context builder function defined


In [74]:
# Step 25: Run similarity approach only at target-tag action timestamps (2025 test episodes)
import os
from tqdm import tqdm
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

# Create output directory
output_dir = '/home/h604827/ControlActions/RESULTS/similarity_test_results'
os.makedirs(output_dir, exist_ok=True)

print(f"Selected {len(test_episode_ids)} test episodes from 2025")

# Action type lookup (SP/OP only)
events_change_df = events_df[events_df['ConditionName'] == 'CHANGE'].copy()
if events_change_df['VT_Start'].dtype == object:
    events_change_df['VT_Start'] = pd.to_datetime(events_change_df['VT_Start'])

desc_lookup = (
    events_change_df.dropna(subset=['Description'])
    .drop_duplicates(subset=['Source', 'VT_Start'])
    .set_index(['Source', 'VT_Start'])['Description']
 )

def normalize_action_type(val):
    if pd.isna(val):
        return None
    return str(val).strip()

def get_action_type(source, ts):
    key = (source, ts)
    try:
        return normalize_action_type(desc_lookup.loc[key])
    except KeyError:
        return None

def get_recommended_action_type(rec):
    rec_source = rec['recommended_action_source']
    rec_dir = int(rec['recommended_action_direction'])
    rec_mag = rec['recommended_action_magnitude']
    if pd.isna(rec_mag):
        return None
    hist_match = context_df_clean[
        (context_df_clean['episode_id'] == rec['matched_episode_id']) &
        (context_df_clean['action_source'] == rec_source) &
        (context_df_clean['action_direction'] == rec_dir) &
        (np.isclose(context_df_clean['action_magnitude'], rec_mag, atol=1e-6))
    ]
    if len(hist_match) == 0:
        return None
    hist_ts = pd.to_datetime(hist_match.iloc[0]['action_timestamp'])
    return get_action_type(rec_source, hist_ts)

magnitude_tolerance = 1  # abs(diff) <= this value counts as match

# Store summary and excel rows
all_episodes_summary = []
excel_rows_by_episode = {}

overall_counts = {
    'total_actual': 0,
    'tag_matches': 0,
    'dir_matches': 0,
    'mag_matches': 0,
    'type_matches': 0,
    'all_matches': 0
}

print(f"Processing {len(test_episode_ids)} test episodes...")
print(f"Output directory: {output_dir}\n")

for test_ep_id in test_episode_ids:
    print(f"\n{'='*60}")
    print(f"Processing Episode {test_ep_id}...")
    print(f"{'='*60}")
    
    # Get episode details
    ep_data = episodes_operated_tags_df[episodes_operated_tags_df['EpisodeID'] == test_ep_id].iloc[0]
    ep_alarm_start = ep_data['AlarmStart']
    ep_alarm_end = ep_data['AlarmEnd']
    ep_operated_tags = ep_data['OperatedTags']
    
    # Get deviation start
    ep_deviation_start = get_deviation_start_for_episode(test_ep_id, ep_alarm_start, ep_alarm_end)
    
    print(f"  Alarm: {ep_alarm_start} to {ep_alarm_end}")
    print(f"  Deviation Start: {ep_deviation_start}")
    print(f"  Actual Operated Tags: {ep_operated_tags}")
    
    # Get actual actions for this episode (target tags only)
    ep_actual_actions, _ = get_operator_actions_for_episode(
        ep_alarm_start, ep_alarm_end,
        target_sources=['03LIC_1071', '03LIC_1016', '03PIC_1013']
    )
    
    if len(ep_actual_actions) > 0:
        ep_actual_clean = ep_actual_actions.dropna(subset=['PrevValue'])
        ep_actual_clean = ep_actual_clean.drop_duplicates(subset=['VT_Start', 'Source', 'Value'])
        ep_actual_clean = ep_actual_clean.copy()
        
        # Handle non-numeric values safely
        try:
            ep_actual_clean['action_magnitude'] = pd.to_numeric(ep_actual_clean['Value'], errors='coerce') - pd.to_numeric(ep_actual_clean['PrevValue'], errors='coerce')
            ep_actual_clean['action_direction'] = (ep_actual_clean['action_magnitude'] > 0).astype(int)
            ep_actual_clean = ep_actual_clean.dropna(subset=['action_magnitude'])
        except Exception as e:
            print(f"  Warning: Could not compute action magnitude: {e}")
            ep_actual_clean = pd.DataFrame()
    else:
        ep_actual_clean = pd.DataFrame()
    
    if len(ep_actual_clean) == 0:
        print("  No actual actions with valid magnitude - skipping similarity evaluation")
        continue
    
    print(f"  Target-tag actions for evaluation: {len(ep_actual_clean)}")
    
    # Run similarity matching only at action timestamps
    ep_action_results = []
    for _, act in ep_actual_clean.iterrows():
        current_time = pd.to_datetime(act['VT_Start'])
        
        runtime_ctx = build_runtime_context(ep_deviation_start, current_time, context_pv_tags)
        sim_results = calculate_weighted_similarity(
            runtime_ctx, context_df_clean, roc_cols, dir_cols, pv_action_cols, tag_weights
        )
        
        for rank, (_, match) in enumerate(sim_results.head(3).iterrows(), 1):
            ep_action_results.append({
                'episode_id': test_ep_id,
                'action_time': current_time,
                'rank': rank,
                'similarity': match['total_similarity'],
                'roc_similarity': match['roc_similarity'],
                'direction_match': match['direction_match'],
                'pv_similarity': match['pv_similarity'],
                'recommended_action_source': match['action_source'],
                'recommended_action_direction': match['action_direction'],
                'recommended_action_magnitude': match['action_magnitude'],
                'matched_episode_id': match['episode_id'],
                'actual_action_source': act['Source'],
                'actual_action_direction': int(act['action_direction']),
                'actual_action_magnitude': float(act['action_magnitude']),
                '1071_roc_percent': runtime_ctx['03LIC_1071_roc_percent'],
                '1071_roc_direction': runtime_ctx['03LIC_1071_roc_direction']
            })
    
    ep_results_df = pd.DataFrame(ep_action_results)
    ep_top_recs = ep_results_df[ep_results_df['rank'] == 1].copy()
    
    # Create and save visualization
    fig = make_subplots(
        rows=3, cols=1,
        subplot_titles=[
            f'Episode {test_ep_id}: 03LIC_1071.PV Trend',
            'Recommended Action Source at Actual Action Times',
            'Similarity Score & 1071 ROC% at Action Times'
        ],
        vertical_spacing=0.1,
        row_heights=[0.35, 0.3, 0.35]
    )
    
    # Get PV data window
    pv_start = ep_deviation_start - pd.Timedelta(minutes=10)
    pv_end = ep_alarm_end + pd.Timedelta(minutes=70)
    pv_window = pv_op_data_df.loc[pv_start:pv_end, '03LIC_1071.PV']
    
    # Row 1: PV Trend
    fig.add_trace(
        go.Scatter(x=pv_window.index, y=pv_window.values, mode='lines', name='03LIC_1071.PV', line=dict(color='blue')),
        row=1, col=1
    )
    fig.add_hline(y=28.75, line_dash="dash", line_color="red", annotation_text="Alarm Threshold", row=1, col=1)
    fig.add_vrect(x0=ep_deviation_start, x1=ep_alarm_start, fillcolor="orange", opacity=0.1, line_width=0, row=1, col=1)
    fig.add_vrect(x0=ep_alarm_start, x1=ep_alarm_end, fillcolor="red", opacity=0.2, line_width=0, row=1, col=1)
    
    # Add actual action markers
    action_times = pd.to_datetime(ep_actual_clean['VT_Start'])
    action_pv_vals = [get_pv_at_timestamp(t, '03LIC_1071.PV') for t in action_times]
    fig.add_trace(
        go.Scatter(x=action_times, y=action_pv_vals, mode='markers', name=f'Actual: {ep_operated_tags}',
                  marker=dict(symbol='triangle-up', size=12, color='red')),
        row=1, col=1
    )
    
    # Row 2: Recommended action source at action times
    source_map = {'03LIC_1071': 0, '03LIC_1016': 1, '03PIC_1013': 2}
    ep_top_recs['source_numeric'] = ep_top_recs['recommended_action_source'].map(source_map)
    colors = ['green' if d == 1 else 'red' for d in ep_top_recs['recommended_action_direction']]
    fig.add_trace(
        go.Scatter(x=ep_top_recs['action_time'], y=ep_top_recs['source_numeric'],
                  mode='markers', marker=dict(size=8, color=colors),
                  name='Recommended (green=up, red=down)',
                  text=[f"Tag: {s}<br>Dir: {'up' if d==1 else 'down'}<br>Sim: {sim:.3f}" 
                        for s, d, sim in zip(ep_top_recs['recommended_action_source'],
                                             ep_top_recs['recommended_action_direction'],
                                             ep_top_recs['similarity'])],
                  hoverinfo='text'),
        row=2, col=1
    )
    fig.add_vrect(x0=ep_alarm_start, x1=ep_alarm_end, fillcolor="red", opacity=0.2, line_width=0, row=2, col=1)
    
    # Row 3: Similarity and ROC at action times
    fig.add_trace(
        go.Scatter(x=ep_top_recs['action_time'], y=ep_top_recs['similarity'],
                  mode='markers+lines', name='Similarity', line=dict(color='purple')),
        row=3, col=1
    )
    fig.add_trace(
        go.Scatter(x=ep_top_recs['action_time'], y=ep_top_recs['1071_roc_percent'],
                  mode='markers+lines', name='1071 ROC%', line=dict(color='orange')),
        row=3, col=1
    )
    fig.add_vrect(x0=ep_alarm_start, x1=ep_alarm_end, fillcolor="red", opacity=0.2, line_width=0, row=3, col=1)
    
    fig.update_layout(
        height=900,
        title_text=f'Episode {test_ep_id} - Similarity-Based Recommendations (Action Times)<br>'
                   f'<sub>Actual: {ep_operated_tags} | Alarm: {ep_alarm_start} to {ep_alarm_end}</sub>',
        showlegend=True
    )
    fig.update_yaxes(title_text="Level", row=1, col=1)
    fig.update_yaxes(title_text="Tag", ticktext=['1071', '1016', '1013'], tickvals=[0, 1, 2], row=2, col=1)
    fig.update_yaxes(title_text="Score / ROC%", row=3, col=1)
    
    # Save visualization
    html_path = f'{output_dir}/episode_{test_ep_id}_visualization.html'
    fig.write_html(html_path)
    
    # Build per-episode rows for Excel and compute top-1 match metrics
    rows = []
    tag_matches = 0
    dir_matches = 0
    mag_matches = 0
    type_matches = 0
    all_matches = 0
    total_actual = len(ep_actual_clean)
    
    for i, (_, act) in enumerate(ep_actual_clean.iterrows(), start=1):
        act_time = pd.to_datetime(act['VT_Start'])
        act_source = act['Source']
        act_dir = int(act['action_direction'])
        act_mag = float(act['action_magnitude'])
        act_val = act['Value']
        act_prev = act['PrevValue']
        act_type = normalize_action_type(act.get('Description', None))
        
        rows.append({
            'episode_id': test_ep_id,
            'action_group': i,
            'row_type': 'actual',
            'actual_action_time': act_time,
            'action_time': act_time,
            'source': act_source,
            'direction': act_dir,
            'magnitude': act_mag,
            'value': act_val,
            'prev_value': act_prev,
            'action_type': act_type,
            'similarity': np.nan,
            'roc_similarity': np.nan,
            'direction_match': np.nan,
            'pv_similarity': np.nan,
            'matched_episode_id': np.nan,
            'time_delta_minutes': 0.0,
            'actual_source': act_source,
            'actual_direction': act_dir,
            'actual_action_type': act_type,
            'tag_match_to_actual': np.nan,
            'direction_match_to_actual': np.nan,
            'magnitude_match_to_actual': np.nan,
            'action_type_match_to_actual': np.nan
        })
        
        recs = ep_results_df[ep_results_df['action_time'] == act_time].sort_values('rank')
        for _, rec in recs.iterrows():
            rec_source = rec['recommended_action_source']
            rec_dir = int(rec['recommended_action_direction'])
            rec_mag = rec['recommended_action_magnitude']
            rec_type = get_recommended_action_type(rec)
            rows.append({
                'episode_id': test_ep_id,
                'action_group': i,
                'row_type': 'recommended',
                'actual_action_time': act_time,
                'action_time': act_time,
                'source': rec_source,
                'direction': rec_dir,
                'magnitude': rec_mag,
                'value': np.nan,
                'prev_value': np.nan,
                'action_type': rec_type,
                'similarity': rec['similarity'],
                'roc_similarity': rec['roc_similarity'],
                'direction_match': rec['direction_match'],
                'pv_similarity': rec['pv_similarity'],
                'matched_episode_id': rec['matched_episode_id'],
                'time_delta_minutes': 0.0,
                'actual_source': act_source,
                'actual_direction': act_dir,
                'actual_action_type': act_type,
                'tag_match_to_actual': 1 if rec_source == act_source else 0,
                'direction_match_to_actual': 1 if rec_dir == act_dir else 0,
                'magnitude_match_to_actual': 1 if (pd.notna(rec_mag) and abs(rec_mag - act_mag) <= magnitude_tolerance) else 0,
                'action_type_match_to_actual': 1 if (rec_type is not None and act_type is not None and rec_type == act_type) else 0
            })
        
        # Top-1 match metrics (rank 1 only)
        top1 = ep_top_recs[ep_top_recs['action_time'] == act_time]
        if len(top1) > 0:
            top1 = top1.iloc[0]
            top1_type = get_recommended_action_type(top1)
            tag_ok = top1['recommended_action_source'] == act_source
            dir_ok = int(top1['recommended_action_direction']) == act_dir
            mag_ok = pd.notna(top1['recommended_action_magnitude']) and abs(top1['recommended_action_magnitude'] - act_mag) <= magnitude_tolerance
            type_ok = top1_type is not None and act_type is not None and top1_type == act_type
            if tag_ok:
                tag_matches += 1
            if dir_ok:
                dir_matches += 1
            if mag_ok:
                mag_matches += 1
            if type_ok:
                type_matches += 1
            if tag_ok and dir_ok and mag_ok and type_ok:
                all_matches += 1
    
    excel_rows_by_episode[test_ep_id] = pd.DataFrame(rows)
    
    summary = {
        'episode_id': test_ep_id,
        'alarm_start': str(ep_alarm_start),
        'alarm_end': str(ep_alarm_end),
        'deviation_start': str(ep_deviation_start),
        'actual_operated_tags': ep_operated_tags,
        'actual_actions_count': total_actual,
        'tag_match_accuracy': tag_matches / total_actual if total_actual > 0 else None,
        'direction_match_accuracy': dir_matches / total_actual if total_actual > 0 else None,
        'magnitude_match_accuracy': mag_matches / total_actual if total_actual > 0 else None,
        'action_type_match_accuracy': type_matches / total_actual if total_actual > 0 else None,
        'all_match_accuracy': all_matches / total_actual if total_actual > 0 else None
    }
    all_episodes_summary.append(summary)
    overall_counts['total_actual'] += total_actual
    overall_counts['tag_matches'] += tag_matches
    overall_counts['dir_matches'] += dir_matches
    overall_counts['mag_matches'] += mag_matches
    overall_counts['type_matches'] += type_matches
    overall_counts['all_matches'] += all_matches
    
    print(f"  Saved: {html_path}")
    print(f"  Actual actions: {total_actual}")
    print(f"  Tag match accuracy: {tag_matches}/{total_actual} = {tag_matches/total_actual*100:.1f}%")
    print(f"  Direction match accuracy: {dir_matches}/{total_actual} = {dir_matches/total_actual*100:.1f}%")
    print(f"  Magnitude match accuracy: {mag_matches}/{total_actual} = {mag_matches/total_actual*100:.1f}%")
    print(f"  Action type match accuracy: {type_matches}/{total_actual} = {type_matches/total_actual*100:.1f}%")
    print(f"  All-match accuracy: {all_matches}/{total_actual} = {all_matches/total_actual*100:.1f}%")

# Save Excel report with top-3 recommendations per actual action
excel_path = f"{output_dir}/operator_action_recommendations_by_episode.xlsx"
with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
    for ep_id in test_episode_ids:
        if ep_id not in excel_rows_by_episode:
            continue
        sheet_df = excel_rows_by_episode[ep_id]
        sheet_name = f"episode_{ep_id}"
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)

overall_total = overall_counts['total_actual']
if overall_total > 0:
    print(f"\nOverall tag match accuracy: {overall_counts['tag_matches']}/{overall_total} = {overall_counts['tag_matches']/overall_total*100:.1f}%")
    print(f"Overall direction match accuracy: {overall_counts['dir_matches']}/{overall_total} = {overall_counts['dir_matches']/overall_total*100:.1f}%")
    print(f"Overall magnitude match accuracy: {overall_counts['mag_matches']}/{overall_total} = {overall_counts['mag_matches']/overall_total*100:.1f}%")
    print(f"Overall action type match accuracy: {overall_counts['type_matches']}/{overall_total} = {overall_counts['type_matches']/overall_total*100:.1f}%")
    print(f"Overall all-match accuracy: {overall_counts['all_matches']}/{overall_total} = {overall_counts['all_matches']/overall_total*100:.1f}%")
else:
    print("\nOverall accuracy: N/A (no actual actions)")

print(f"\n{'='*60}")
print(f"ALL EPISODES PROCESSED SUCCESSFULLY!")
print(f"{'='*60}")
print(f"\nResults saved to: {output_dir}/")
print(f"  - {len(excel_rows_by_episode)} episode HTML visualizations")
print(f"  - operator_action_recommendations_by_episode.xlsx")

Selected 50 test episodes from 2025
Processing 50 test episodes...
Output directory: /home/h604827/ControlActions/RESULTS/similarity_test_results


Processing Episode 558...
  Alarm: 2025-05-05 18:52:00 to 2025-05-05 22:57:00
  Deviation Start: 2025-05-05 17:26:00
  Actual Operated Tags: 03FIC_1085, 03FIC_3435, 03HIC_1141, 03HIC_3100, 03LIC_1034, 03LIC_1071, 03LIC_1085, 03LIC_1094, 03LIC_1097, 03PIC_1013, 03SDV_1167, 03TIC_1009
  Target-tag actions for evaluation: 21
  Saved: /home/h604827/ControlActions/RESULTS/similarity_test_results/episode_558_visualization.html
  Actual actions: 21
  Tag match accuracy: 4/21 = 19.0%
  Direction match accuracy: 14/21 = 66.7%
  Magnitude match accuracy: 13/21 = 61.9%
  Action type match accuracy: 17/21 = 81.0%
  All-match accuracy: 0/21 = 0.0%

Processing Episode 540...
  Alarm: 2025-01-12 05:27:00 to 2025-01-12 05:58:00
  Deviation Start: 2025-01-12 04:01:00
  Actual Operated Tags: 03GHS_0121AA, 03LIC_1034
  No actual actions with valid magnitude -