### Load Data

In [1]:
import pandas as pd

episodes_operated_tags_df = pd.read_excel('/home/h604827/ControlActions/RESULTS/episode_all_operator_action_plots/episodes_all_with_actions_and_deviations.xlsx')
ssd_df = pd.read_excel('/home/h604827/ControlActions/DATA/SSD_1071_SSD_output_1071_7Jan2026.xlsx')
events_df = pd.read_csv('/home/h604827/ControlActions/DATA/trip_filtered_events.csv')
pv_op_data_df = pd.read_parquet('/home/h604827/ControlActions/DATA/03LIC_1071_JAN_2026_filtered.parquet')
operating_limits_df = pd.read_csv('/home/h604827/ControlActions/DATA/operating_limits.csv')

episodes_operated_tags_df.head()

Unnamed: 0,EpisodeID,AlarmStart,AlarmEnd,AlarmDurationMinutes,TotalWindowMinutes,OperatedTags,OperatedTagsCount,DeviatedTags,DeviatedTagsCount,HasOperatorActions,HasOnlyTargetTags,Has1071Action,Has1016Action,Has1013Action
0,1,2022-01-05 08:53:00,2022-01-05 09:33:00,40,186,"03FIC_1085, 03FIC_3435, 03HIC_1141, 03HIC_1151...",15,"02FI_1000.PV, 03FIC_1085.PV, 03FIC_3415.PV, 03...",22,True,False,True,True,True
1,2,2022-01-07 09:55:00,2022-01-07 10:00:00,5,151,"03FIC_3435, 03GHS_0121A, 03GHS_0121AA, 03GHS_0...",8,"03FIC_1085.PV, 03FIC_3415.PV, 03FI_1141A.PV, 0...",23,True,False,False,False,False
2,3,2022-01-07 13:33:00,2022-01-07 13:36:00,3,149,"03FIC_3435, 03LIC_1034",2,"03FIC_1085.PV, 03FIC_3415.PV, 03FI_1141A.PV, 0...",19,True,False,False,False,False
3,4,2022-01-07 14:17:00,2022-01-07 14:19:00,2,148,"03FIC_3435, 03LIC_1016, 03LIC_1034",3,"03FIC_1085.PV, 03FI_1141A.PV, 03LIC_1016.PV, 0...",22,True,False,False,True,False
4,5,2022-01-07 14:54:00,2022-01-07 14:58:00,4,150,"03FIC_3435, 03LIC_1016",2,"02FI_1000.PV, 03FIC_1085.PV, 03FI_1141A.PV, 03...",19,True,False,False,True,False


### Checking operator actions for target tags only

In [2]:
# Check how operator actions (CHANGE events) look for target tags
# Filter for CHANGE events on target tags (1071, 1016, 1013)
target_sources = ['03LIC_1071', '03LIC_1016', '03PIC_1013']
change_events = events_df[(events_df['ConditionName'] == 'CHANGE') & 
                          (events_df['Source'].isin(target_sources))]
print(f"Total CHANGE events for target tags: {len(change_events)}")
print("\nSample CHANGE events:")
change_events[['Source', 'VT_Start', 'Value', 'PrevValue', 'ConditionName']].head(10)

Total CHANGE events for target tags: 20504

Sample CHANGE events:


Unnamed: 0,Source,VT_Start,Value,PrevValue,ConditionName
2512,03PIC_1013,2021-10-05 15:16:02.478400,MAN,CAS,CHANGE
2513,03PIC_1013,2021-10-05 15:16:02.478400,MAN,,CHANGE
2514,03PIC_1013,2021-10-05 15:16:05.255600,87.9999,,CHANGE
2516,03PIC_1013,2021-10-05 15:16:05.255600,87.9999,89.9999,CHANGE
2518,03PIC_1013,2021-10-05 15:16:07.755600,85.9998,87.9999,CHANGE
2519,03PIC_1013,2021-10-05 15:16:07.755600,85.9998,,CHANGE
2520,03PIC_1013,2021-10-05 15:16:21.711300,83.9999,,CHANGE
2522,03PIC_1013,2021-10-05 15:16:21.711300,83.9999,85.9998,CHANGE
2524,03PIC_1013,2021-10-05 15:16:44.655600,81.9999,83.9999,CHANGE
2525,03PIC_1013,2021-10-05 15:16:44.655600,81.9999,,CHANGE


## Similarity Approach: Building Context for Operator Actions

### Approach Overview
1. Select 20 random episodes from 27 episodes where only target tags (1071, 1016, 1013) were operated
2. For each operator action in these episodes, create a "context" capturing process state
3. Context window: From deviation start time to operator action timestamp

### Context Features (for each of 28 PV tags):
1. **PV value at deviation start**
2. **PV value at operator action time**
3. **Rate of change (%)** = (PV_action - PV_start) / PV_start * 100
4. **Direction** = 1 if ROC positive, 0 if negative

In [3]:
# Step 1: Filter episodes where only target tags were operated
only_target_episodes_df = episodes_operated_tags_df[episodes_operated_tags_df['HasOnlyTargetTags'] == True].copy()
print(f"Total episodes with only target tags: {len(only_target_episodes_df)}")

# Step 2: Randomly select 20 episodes (set seed for reproducibility)
import numpy as np
np.random.seed(42)
sample_n = 20
selected_episodes_df = only_target_episodes_df.sample(n=sample_n, random_state=42)
print(f"Selected {sample_n} episodes for training context")
print(f"\nSelected Episode IDs: {selected_episodes_df['EpisodeID'].tolist()}")

# Keep remaining 7 for later use (testing)
remaining_episodes_df = only_target_episodes_df[~only_target_episodes_df['EpisodeID'].isin(selected_episodes_df['EpisodeID'])]
print(f"\nRemaining {len(remaining_episodes_df)} episodes for testing: {remaining_episodes_df['EpisodeID'].tolist()}")

Total episodes with only target tags: 27
Selected 20 episodes for training context

Selected Episode IDs: [397, 468, 399, 526, 295, 407, 473, 475, 413, 529, 359, 371, 377, 360, 470, 527, 363, 560, 528, 483]

Remaining 7 episodes for testing: [386, 388, 400, 469, 521, 522, 604]


In [4]:
selected_episodes_df['OperatedTags'].value_counts()

OperatedTags
03LIC_1071                11
03PIC_1013                 6
03LIC_1016, 03LIC_1071     3
Name: count, dtype: int64

In [7]:
# Step 3: Define the PV columns for context (28 tags)

context_pv_tags = [col for col in pv_op_data_df.columns if col.endswith('.PV')]

# Step 4: Prepare events data for timestamp parsing
events_df['VT_Start'] = pd.to_datetime(events_df['VT_Start'])

# Prepare SSD data for timestamp parsing
ssd_df['AlarmStart_rounded_minutes'] = pd.to_datetime(ssd_df['AlarmStart_rounded_minutes'])
ssd_df['AlarmEnd_rounded_minutes'] = pd.to_datetime(ssd_df['AlarmEnd_rounded_minutes'])
ssd_df['Tag_First_Transition_Start_minutes'] = pd.to_datetime(ssd_df['Tag_First_Transition_Start_minutes'])

# Prepare episodes data
episodes_operated_tags_df['AlarmStart'] = pd.to_datetime(episodes_operated_tags_df['AlarmStart'])
episodes_operated_tags_df['AlarmEnd'] = pd.to_datetime(episodes_operated_tags_df['AlarmEnd'])

print("Timestamp columns converted to datetime")

Timestamp columns converted to datetime


In [None]:
# Step 5: Helper function to get deviation start time for an episode
# Deviation start is the earliest Tag_First_Transition_Start_minutes for the target tag (03LIC_1071)
# in that alarm episode

def get_deviation_start_for_episode(episode_id, alarm_start, alarm_end):
    """
    Get the deviation start time for an episode.
    This is the Tag_First_Transition_Start_minutes for 03LIC_1071 (or earliest among related tags)
    """
    # Find SSD records for this alarm episode (matching by alarm start time)
    episode_ssd = ssd_df[
        (ssd_df['AlarmStart_rounded_minutes'] == alarm_start)
    ]
    
    if len(episode_ssd) == 0:
        # Try with a small time tolerance (within 1 minute)
        print(f"No exact SSD data found for EpisodeID {episode_id} with AlarmStart {alarm_start}. Trying with time tolerance.")
        episode_ssd = ssd_df[
            (abs((ssd_df['AlarmStart_rounded_minutes'] - alarm_start).dt.total_seconds()) <= 60)
        ]
    
    if len(episode_ssd) == 0:
        # If no SSD data found, use alarm start minus 30 minutes as default deviation start
        print(f"No SSD data found for EpisodeID {episode_id} with AlarmStart {alarm_start}. Using default deviation start.")
        return alarm_start - pd.Timedelta(minutes=30)
    
    # Get deviation start for target tag 03LIC_1071 if available
    target_ssd = episode_ssd[episode_ssd['TagName'] == '03LIC_1071']
    if len(target_ssd) > 0:
        return target_ssd['Tag_First_Transition_Start_minutes'].iloc[0]
    
    # If not found, use earliest transition start among all tags
    return episode_ssd['Tag_First_Transition_Start_minutes'].min()

# Test with first selected episode
test_ep = selected_episodes_df.iloc[0]
test_dev_start = get_deviation_start_for_episode(
    test_ep['EpisodeID'], 
    test_ep['AlarmStart'], 
    test_ep['AlarmEnd']
)
print(f"Episode {test_ep['EpisodeID']}:")
print(f"  Alarm Start: {test_ep['AlarmStart']}")
print(f"  Deviation Start: {test_dev_start}")
print(f"  Alarm End: {test_ep['AlarmEnd']}")

Episode 397:
  Alarm Start: 2024-06-16 10:29:00
  Deviation Start: 2024-06-16 09:03:00
  Alarm End: 2024-06-16 10:50:00


In [76]:
# Step 6: Helper function to get operator actions for an episode
def get_operator_actions_for_episode(alarm_start, alarm_end, target_sources=['03LIC_1071', '03LIC_1016', '03PIC_1013']):
    """
    Get all CHANGE events (operator actions) for target sources during an episode.
    Episode window: deviation_start to alarm_end
    """
    # Get deviation start
    deviation_start = get_deviation_start_for_episode(None, alarm_start, alarm_end)
    
    # Filter CHANGE events within the episode window for target sources
    actions = events_df[
        (events_df['ConditionName'] == 'CHANGE') &
        (events_df['Source'].isin(target_sources)) &
        (events_df['VT_Start'] >= deviation_start) &
        (events_df['VT_Start'] <= alarm_end)
    ].copy()
    
    return actions, deviation_start

# Test with first selected episode
test_actions, test_dev_start = get_operator_actions_for_episode(
    test_ep['AlarmStart'], 
    test_ep['AlarmEnd']
)
print(f"Episode {test_ep['EpisodeID']} - Found {len(test_actions)} operator actions")
if len(test_actions) > 0:
    print(test_actions[['Source', 'VT_Start', 'Value', 'PrevValue']].head())

Episode 397 - Found 24 operator actions
            Source                   VT_Start    Value PrevValue
991844  03PIC_1013 2024-06-16 09:18:57.165100  82.9997   84.9997
991845  03PIC_1013 2024-06-16 09:18:57.165100  82.9997       NaN
991846  03PIC_1013 2024-06-16 09:18:58.805800  80.9997       NaN
991847  03PIC_1013 2024-06-16 09:18:58.805800  80.9997   82.9997
991849  03PIC_1013 2024-06-16 09:19:04.305600  78.9997   80.9997


In [77]:
# Step 7: Helper function to get PV value at a specific timestamp (with nearest lookup)
def get_pv_at_timestamp(timestamp, pv_tag):
    """
    Get PV value at or nearest to the given timestamp.
    Uses forward fill to get the most recent value if exact time not found.
    """
    try:
        # Make timestamp timezone naive if needed
        if timestamp.tzinfo is not None:
            timestamp = timestamp.tz_localize(None)
        
        # Try exact lookup first
        if timestamp in pv_op_data_df.index:
            return pv_op_data_df.loc[timestamp, pv_tag]
        
        # Use asof for nearest lookup (gets value at or before timestamp)
        idx = pv_op_data_df.index.get_indexer([timestamp], method='ffill')[0]
        if idx >= 0 and idx < len(pv_op_data_df):
            return pv_op_data_df.iloc[idx][pv_tag]
        
        # If no value found before, get nearest after
        idx = pv_op_data_df.index.get_indexer([timestamp], method='bfill')[0]
        if idx >= 0 and idx < len(pv_op_data_df):
            return pv_op_data_df.iloc[idx][pv_tag]
        
        return np.nan
    except Exception as e:
        return np.nan

# Test
test_ts = test_dev_start
test_tag = '03LIC_1071.PV'
test_val = get_pv_at_timestamp(test_ts, test_tag)
print(f"PV value for {test_tag} at {test_ts}: {test_val}")

PV value for 03LIC_1071.PV at 2024-06-16 09:03:00: 40.202705


In [78]:
# Step 8: Build context for a single operator action
def build_context_for_action(deviation_start, action_timestamp, pv_tags):
    """
    Build context features for an operator action.
    
    Returns a dict with the following for each PV tag:
    - {tag}_pv_at_deviation_start: PV value at deviation start
    - {tag}_pv_at_action: PV value at action time
    - {tag}_roc_percent: Rate of change in % = (pv_action - pv_start) / pv_start * 100
    - {tag}_roc_direction: 1 if positive, 0 if negative
    """
    context = {}
    
    for pv_tag in pv_tags:
        # Remove .PV suffix for cleaner column names
        tag_name = pv_tag.replace('.PV', '')
        
        # Get PV values
        pv_at_deviation = get_pv_at_timestamp(deviation_start, pv_tag)
        pv_at_action = get_pv_at_timestamp(action_timestamp, pv_tag)
        
        # Calculate rate of change (%)
        if pd.notna(pv_at_deviation) and pd.notna(pv_at_action) and pv_at_deviation != 0:
            roc_percent = ((pv_at_action - pv_at_deviation) / pv_at_deviation) * 100
        else:
            roc_percent = np.nan
        
        # Determine direction
        if pd.notna(roc_percent):
            roc_direction = 1 if roc_percent >= 0 else 0
        else:
            roc_direction = np.nan
        
        # Store in context
        context[f'{tag_name}_pv_at_deviation_start'] = pv_at_deviation
        context[f'{tag_name}_pv_at_action'] = pv_at_action
        context[f'{tag_name}_roc_percent'] = roc_percent
        context[f'{tag_name}_roc_direction'] = roc_direction
    
    return context

# Test with first action of first episode
if len(test_actions) > 0:
    first_action = test_actions.iloc[0]
    test_context = build_context_for_action(test_dev_start, first_action['VT_Start'], context_pv_tags)
    print(f"Context for action at {first_action['VT_Start']} on {first_action['Source']}:")
    print(f"\n03LIC_1071 (target tag):")
    print(f"  PV at deviation start: {test_context['03LIC_1071_pv_at_deviation_start']:.4f}")
    print(f"  PV at action: {test_context['03LIC_1071_pv_at_action']:.4f}")
    print(f"  ROC %: {test_context['03LIC_1071_roc_percent']:.4f}")
    print(f"  ROC direction: {test_context['03LIC_1071_roc_direction']}")

Context for action at 2024-06-16 09:18:57.165100 on 03PIC_1013:

03LIC_1071 (target tag):
  PV at deviation start: 40.2027
  PV at action: 40.1849
  ROC %: -0.0442
  ROC direction: 0


In [79]:
# Step 9: Process all 20 selected episodes and build context for each operator action
from tqdm import tqdm

context_records = []
target_sources = ['03LIC_1071', '03LIC_1016', '03PIC_1013']

print("Processing 20 selected episodes...")
for idx, (_, episode) in enumerate(tqdm(selected_episodes_df.iterrows(), total=len(selected_episodes_df))):
    episode_id = episode['EpisodeID']
    alarm_start = episode['AlarmStart']
    alarm_end = episode['AlarmEnd']
    
    # Get deviation start
    deviation_start = get_deviation_start_for_episode(episode_id, alarm_start, alarm_end)
    
    # Get operator actions for this episode
    actions, _ = get_operator_actions_for_episode(alarm_start, alarm_end, target_sources)
    
    if len(actions) == 0:
        continue
    
    # Process each operator action
    for _, action in actions.iterrows():
        action_timestamp = action['VT_Start']
        
        # Build context
        context = build_context_for_action(deviation_start, action_timestamp, context_pv_tags)
        
        # Add episode and action metadata
        context['episode_id'] = episode_id
        context['alarm_start'] = alarm_start
        context['alarm_end'] = alarm_end
        context['deviation_start'] = deviation_start
        context['action_timestamp'] = action_timestamp
        context['action_source'] = action['Source']
        context['action_value'] = action['Value']
        context['action_prev_value'] = action['PrevValue']
        
        # Calculate action direction and magnitude
        try:
            action_val = float(action['Value'])
            prev_val = float(action['PrevValue'])
            context['action_magnitude'] = action_val - prev_val
            context['action_direction'] = 1 if action_val > prev_val else 0  # 1 = increase, 0 = decrease
        except (ValueError, TypeError):
            context['action_magnitude'] = np.nan
            context['action_direction'] = np.nan
        
        context_records.append(context)

print(f"\nTotal context records created: {len(context_records)}")

Processing 20 selected episodes...


100%|██████████| 20/20 [00:30<00:00,  1.53s/it]


Total context records created: 468





In [80]:
# Step 10: Convert to DataFrame and examine structure
context_df = pd.DataFrame(context_records)

print(f"Context DataFrame shape: {context_df.shape}")
print(f"Total columns: {len(context_df.columns)}")
print(f"\nColumn breakdown:")
print(f"  - Metadata columns: 10 (episode_id, alarm_start, alarm_end, deviation_start, action_timestamp, action_source, action_value, action_prev_value, action_magnitude, action_direction)")
print(f"  - PV context columns: {len(context_pv_tags) * 4} (4 features x {len(context_pv_tags)} tags)")

# Show first few rows with key columns
key_cols = ['episode_id', 'action_timestamp', 'action_source', 'action_direction', 'action_magnitude',
            '03LIC_1071_pv_at_deviation_start', '03LIC_1071_pv_at_action', 
            '03LIC_1071_roc_percent', '03LIC_1071_roc_direction']
context_df[key_cols].head(10)

Context DataFrame shape: (468, 122)
Total columns: 122

Column breakdown:
  - Metadata columns: 10 (episode_id, alarm_start, alarm_end, deviation_start, action_timestamp, action_source, action_value, action_prev_value, action_magnitude, action_direction)
  - PV context columns: 112 (4 features x 28 tags)


Unnamed: 0,episode_id,action_timestamp,action_source,action_direction,action_magnitude,03LIC_1071_pv_at_deviation_start,03LIC_1071_pv_at_action,03LIC_1071_roc_percent,03LIC_1071_roc_direction
0,397,2024-06-16 09:18:57.165100,03PIC_1013,0.0,-2.0,40.202705,40.184917,-0.044246,0
1,397,2024-06-16 09:18:57.165100,03PIC_1013,0.0,,40.202705,40.184917,-0.044246,0
2,397,2024-06-16 09:18:58.805800,03PIC_1013,0.0,,40.202705,40.184917,-0.044246,0
3,397,2024-06-16 09:18:58.805800,03PIC_1013,0.0,-2.0,40.202705,40.184917,-0.044246,0
4,397,2024-06-16 09:19:04.305600,03PIC_1013,0.0,-2.0,40.202705,38.263474,-4.823633,0
5,397,2024-06-16 09:19:04.305600,03PIC_1013,0.0,,40.202705,38.263474,-4.823633,0
6,397,2024-06-16 09:54:47.056800,03PIC_1013,0.0,-2.0,40.202705,38.953682,-3.106813,0
7,397,2024-06-16 09:54:47.056800,03PIC_1013,0.0,,40.202705,38.953682,-3.106813,0
8,397,2024-06-16 09:54:48.356000,03PIC_1013,0.0,,40.202705,38.953682,-3.106813,0
9,397,2024-06-16 09:54:48.356000,03PIC_1013,0.0,-2.0,40.202705,38.953682,-3.106813,0


In [81]:
# Step 11: Data quality check and summary statistics
print("=== Context DataFrame Summary ===\n")

# Count actions per episode
actions_per_episode = context_df.groupby('episode_id').size()
print(f"Actions per episode:")
print(f"  Min: {actions_per_episode.min()}, Max: {actions_per_episode.max()}, Mean: {actions_per_episode.mean():.1f}")

# Actions by source
print(f"\nActions by target tag:")
print(context_df['action_source'].value_counts())

# Action direction distribution
print(f"\nAction direction distribution:")
print(context_df['action_direction'].value_counts())

# Missing values check
missing_cols = context_df.isnull().sum()
cols_with_missing = missing_cols[missing_cols > 0]
if len(cols_with_missing) > 0:
    print(f"\nColumns with missing values: {len(cols_with_missing)}")
else:
    print(f"\nNo missing values in context features!")

=== Context DataFrame Summary ===

Actions per episode:
  Min: 6, Max: 136, Mean: 33.4

Actions by target tag:
action_source
03LIC_1071    400
03PIC_1013     54
03LIC_1016     14
Name: count, dtype: int64

Action direction distribution:
action_direction
0.0    336
1.0     96
Name: count, dtype: int64

Columns with missing values: 6


In [82]:
# Step 12: Clean duplicate records (same action appears multiple times with/without PrevValue)
# Keep only records with valid action_magnitude (not NaN)
print(f"Total records before cleaning: {len(context_df)}")

# Filter out records where action_magnitude is NaN (these are duplicates without PrevValue)
context_df_clean = context_df[context_df['action_magnitude'].notna()].copy()

print(f"Total records after removing actions without valid magnitude: {len(context_df_clean)}")

# Also remove exact duplicates (same timestamp, source, value)
context_df_clean = context_df_clean.drop_duplicates(
    subset=['episode_id', 'action_timestamp', 'action_source', 'action_value']
)

print(f"Total records after removing duplicates: {len(context_df_clean)}")

# Summary after cleaning
print(f"\nActions per episode after cleaning:")
actions_per_episode_clean = context_df_clean.groupby('episode_id').size()
print(f"  Min: {actions_per_episode_clean.min()}, Max: {actions_per_episode_clean.max()}, Mean: {actions_per_episode_clean.mean():.1f}")

print(f"\nActions by target tag after cleaning:")
print(context_df_clean['action_source'].value_counts())

Total records before cleaning: 468
Total records after removing actions without valid magnitude: 216
Total records after removing duplicates: 216

Actions per episode after cleaning:
  Min: 3, Max: 67, Mean: 15.4

Actions by target tag after cleaning:
action_source
03LIC_1071    185
03PIC_1013     26
03LIC_1016      5
Name: count, dtype: int64


In [83]:
# Step 13: Save the context DataFrame for later use
output_path = '/home/h604827/ControlActions/RESULTS/similarity_context_training_20episodes.csv'
context_df_clean.to_csv(output_path, index=False)
print(f"Context data saved to: {output_path}")

# Also save the list of training and testing episode IDs
training_testing_info = {
    'training_episodes': selected_episodes_df['EpisodeID'].tolist(),
    'testing_episodes': remaining_episodes_df['EpisodeID'].tolist()
}
import json
with open('/home/h604827/ControlActions/RESULTS/similarity_approach_episodes_split.json', 'w') as f:
    json.dump(training_testing_info, f, indent=2)
print("Episode split info saved to: RESULTS/similarity_approach_episodes_split.json")

Context data saved to: /home/h604827/ControlActions/RESULTS/similarity_context_training_20episodes.csv
Episode split info saved to: RESULTS/similarity_approach_episodes_split.json


In [84]:
# Step 14: Display sample of the final context dataframe
print("=== Final Context DataFrame ===")
print(f"Shape: {context_df_clean.shape}")
print(f"Columns: {context_df_clean.columns.tolist()[:20]}... (and {len(context_df_clean.columns)-20} more)")

# Show sample with all ROC and direction columns for context overview
roc_cols = [c for c in context_df_clean.columns if '_roc_percent' in c]
dir_cols = [c for c in context_df_clean.columns if '_roc_direction' in c]

print(f"\nROC columns ({len(roc_cols)}): {roc_cols[:5]}...")
print(f"Direction columns ({len(dir_cols)}): {dir_cols[:5]}...")

# Show statistics for target tag 1071
print(f"\n=== 03LIC_1071 ROC Statistics ===")
print(context_df_clean['03LIC_1071_roc_percent'].describe())

=== Final Context DataFrame ===
Shape: (216, 122)
Columns: ['03LIC_1071_pv_at_deviation_start', '03LIC_1071_pv_at_action', '03LIC_1071_roc_percent', '03LIC_1071_roc_direction', '02FI_1000_pv_at_deviation_start', '02FI_1000_pv_at_action', '02FI_1000_roc_percent', '02FI_1000_roc_direction', '03FIC_1085_pv_at_deviation_start', '03FIC_1085_pv_at_action', '03FIC_1085_roc_percent', '03FIC_1085_roc_direction', '03FIC_3415_pv_at_deviation_start', '03FIC_3415_pv_at_action', '03FIC_3415_roc_percent', '03FIC_3415_roc_direction', '03FIC_3435_pv_at_deviation_start', '03FIC_3435_pv_at_action', '03FIC_3435_roc_percent', '03FIC_3435_roc_direction']... (and 102 more)

ROC columns (28): ['03LIC_1071_roc_percent', '02FI_1000_roc_percent', '03FIC_1085_roc_percent', '03FIC_3415_roc_percent', '03FIC_3435_roc_percent']...
Direction columns (28): ['03LIC_1071_roc_direction', '02FI_1000_roc_direction', '03FIC_1085_roc_direction', '03FIC_3415_roc_direction', '03FIC_3435_roc_direction']...

=== 03LIC_1071 ROC St

## Summary: Context Data Created

### Training Data
- **20 episodes** randomly selected from 27 episodes where only target tags (1071, 1016, 1013) were operated
- **216 operator actions** with valid context after cleaning duplicates
- **122 columns** in the context dataframe:
  - 10 metadata columns (episode_id, timestamps, action details)
  - 112 context features (4 features × 28 PV tags)

### Context Features per PV Tag:
1. `{tag}_pv_at_deviation_start` - PV value when deviation began
2. `{tag}_pv_at_action` - PV value at the time of operator action
3. `{tag}_roc_percent` - Rate of change as percentage: `((pv_action - pv_start) / pv_start) * 100`
4. `{tag}_roc_direction` - Direction: 1 if positive (rising), 0 if negative (falling)

### Target Variable (What the operator did):
- `action_source` - Which tag was operated (03LIC_1071, 03LIC_1016, or 03PIC_1013)
- `action_direction` - 1 if increased, 0 if decreased
- `action_magnitude` - How much the OP value was changed

### Testing Data Reserved
- **7 episodes** held out for testing: [386, 388, 400, 469, 521, 522, 604]

### Files Saved:
- `RESULTS/similarity_context_training_20episodes.csv` - Context data
- `RESULTS/similarity_approach_episodes_split.json` - Episode split info

---
## Part 2: Weighted Multi-Component Similarity Approach

### Testing on Reserved Episodes

For each test episode:
1. Start from deviation start time
2. For each minute until alarm_end + 60 minutes:
   - Calculate context at that minute
   - Compare with all training contexts using weighted similarity
   - Return the most similar historical action(s)

In [85]:
# Step 15: Define the Weighted Multi-Component Similarity Function
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Get column names for ROC and direction features
roc_cols = [c for c in context_df_clean.columns if '_roc_percent' in c]
dir_cols = [c for c in context_df_clean.columns if '_roc_direction' in c]
pv_action_cols = [c for c in context_df_clean.columns if '_pv_at_action' in c]

# Define tag weights (higher weight for target and controllable tags)
tag_weights = {}
for col in roc_cols:
    tag_name = col.replace('_roc_percent', '')
    if '1071' in tag_name:
        tag_weights[col] = 3.0  # Target tag - highest weight
    elif '1016' in tag_name or '1013' in tag_name:
        tag_weights[col] = 2.0  # Other controllable targets
    else:
        tag_weights[col] = 1.0  # Related tags

print(f"ROC columns: {len(roc_cols)}")
print(f"Direction columns: {len(dir_cols)}")
print(f"\nTag weights for controllable tags:")
for k, v in tag_weights.items():
    if v > 1:
        print(f"  {k}: {v}")

ROC columns: 28
Direction columns: 28

Tag weights for controllable tags:
  03LIC_1071_roc_percent: 3.0
  03LIC_1016_roc_percent: 2.0
  03PIC_1013_roc_percent: 2.0


In [86]:
# Step 16: Implement the weighted similarity calculation function (with PV similarity)

def calculate_weighted_similarity(runtime_context, historical_df, 
                                   roc_cols, dir_cols, pv_action_cols, tag_weights,
                                   w_roc=0.5, w_dir=0.35, w_pv=0.15):
    """
    Calculate weighted multi-component similarity between runtime context and all historical contexts.
    
    Components:
    1. ROC Pattern Similarity (Cosine) - weighted by tag importance
    2. Direction Match Score (Jaccard-like)
    3. PV State Similarity (Cosine similarity on normalized PV values)
    
    Returns: DataFrame with similarity scores and corresponding action details
    """
    similarities = []
    
    # Create weight array for ROC columns
    weight_array = np.array([tag_weights.get(col, 1.0) for col in roc_cols])
    
    # Runtime ROC values (weighted) - handle NaN by replacing with 0
    runtime_roc_raw = np.array([runtime_context.get(col, 0) for col in roc_cols])
    runtime_roc_raw = np.nan_to_num(runtime_roc_raw, nan=0.0)
    runtime_roc = runtime_roc_raw * weight_array
    
    runtime_dir = np.array([runtime_context.get(col.replace('_roc_percent', '_roc_direction'), 0) for col in roc_cols])
    runtime_dir = np.nan_to_num(runtime_dir, nan=0.0)
    
    # Runtime PV values at action time (for PV state similarity)
    runtime_pv = np.array([runtime_context.get(col, 0) for col in pv_action_cols])
    runtime_pv = np.nan_to_num(runtime_pv, nan=0.0)
    
    # Get PV weights (same mapping as ROC weights)
    pv_weight_array = np.array([tag_weights.get(col.replace('_pv_at_action', '_roc_percent'), 1.0) 
                                for col in pv_action_cols])
    
    for idx, hist_row in historical_df.iterrows():
        # 1. Cosine Similarity on weighted ROC pattern
        hist_roc_raw = np.array([hist_row[col] if pd.notna(hist_row[col]) else 0 for col in roc_cols])
        hist_roc = hist_roc_raw * weight_array
        
        # Handle zero vectors
        runtime_norm = np.linalg.norm(runtime_roc)
        hist_norm = np.linalg.norm(hist_roc)
        
        if runtime_norm == 0 or hist_norm == 0:
            roc_sim = 0.5  # Default to neutral similarity
        else:
            # Manual cosine similarity to avoid sklearn NaN issues
            dot_product = np.dot(runtime_roc, hist_roc)
            roc_sim = dot_product / (runtime_norm * hist_norm)
            # Normalize from [-1, 1] to [0, 1]
            roc_sim = (roc_sim + 1) / 2
        
        # 2. Direction Match Score
        hist_dir = np.array([hist_row[col.replace('_roc_percent', '_roc_direction')] 
                           if pd.notna(hist_row[col.replace('_roc_percent', '_roc_direction')]) else 0 
                           for col in roc_cols])
        dir_match = np.mean(runtime_dir == hist_dir)
        
        # 3. PV State Similarity using Cosine Similarity (scale-invariant)
        hist_pv = np.array([hist_row[col] if pd.notna(hist_row[col]) else 0 for col in pv_action_cols])
        
        # Apply tag weights to PV values
        weighted_runtime_pv = runtime_pv * pv_weight_array
        weighted_hist_pv = hist_pv * pv_weight_array
        
        # Cosine similarity for PV values (handles different scales naturally)
        runtime_pv_norm = np.linalg.norm(weighted_runtime_pv)
        hist_pv_norm = np.linalg.norm(weighted_hist_pv)
        
        if runtime_pv_norm == 0 or hist_pv_norm == 0:
            pv_sim = 0.5  # Default to neutral similarity
        else:
            pv_dot = np.dot(weighted_runtime_pv, weighted_hist_pv)
            pv_sim = pv_dot / (runtime_pv_norm * hist_pv_norm)
            # Normalize from [-1, 1] to [0, 1]
            pv_sim = (pv_sim + 1) / 2
        
        # 4. Combined weighted similarity
        total_similarity = w_roc * roc_sim + w_dir * dir_match + w_pv * pv_sim
        
        similarities.append({
            'hist_index': idx,
            'total_similarity': total_similarity,
            'roc_similarity': roc_sim,
            'direction_match': dir_match,
            'pv_similarity': pv_sim,
            'action_source': hist_row['action_source'],
            'action_direction': hist_row['action_direction'],
            'action_magnitude': hist_row['action_magnitude'],
            'episode_id': hist_row['episode_id']
        })
    
    return pd.DataFrame(similarities).sort_values('total_similarity', ascending=False)

# Get the pv_at_action columns for the similarity function
pv_action_cols = [col for col in context_df_clean.columns if col.endswith('_pv_at_action')]
print(f"PV action columns for similarity: {len(pv_action_cols)} tags")
print(f"Example columns: {pv_action_cols[:3]}")

print("\nSimilarity function defined successfully (with cosine-based PV similarity component)")

PV action columns for similarity: 28 tags
Example columns: ['03LIC_1071_pv_at_action', '02FI_1000_pv_at_action', '03FIC_1085_pv_at_action']

Similarity function defined successfully (with cosine-based PV similarity component)


In [87]:
# Step 17: Function to build context at a specific timestamp for runtime evaluation

def build_runtime_context(deviation_start, current_time, pv_tags):
    """
    Build context at runtime - from deviation_start to current_time.
    Same structure as training context but calculated at runtime.
    """
    context = {}
    
    for pv_tag in pv_tags:
        tag_base = pv_tag.replace('.PV', '')
        
        # Get PV at deviation start
        pv_at_start = get_pv_at_timestamp(deviation_start, pv_tag)
        
        # Get PV at current time
        pv_at_current = get_pv_at_timestamp(current_time, pv_tag)
        
        # Calculate ROC percent
        if pv_at_start is not None and pv_at_start != 0 and not np.isnan(pv_at_start):
            roc_percent = ((pv_at_current - pv_at_start) / abs(pv_at_start)) * 100
        else:
            roc_percent = 0.0
        
        # ROC direction (1 = positive/rising, 0 = negative/falling)
        roc_direction = 1 if roc_percent >= 0 else 0
        
        context[f'{tag_base}_pv_at_deviation_start'] = pv_at_start
        context[f'{tag_base}_pv_at_action'] = pv_at_current
        context[f'{tag_base}_roc_percent'] = roc_percent
        context[f'{tag_base}_roc_direction'] = roc_direction
    
    return context

print("Runtime context builder function defined")

Runtime context builder function defined


In [88]:
# Step 18: Select one test episode to demonstrate the approach
# Test episodes: [386, 388, 400, 469, 521, 522, 604]

test_episode_id = 386  # First test episode
test_episode = episodes_operated_tags_df[episodes_operated_tags_df['EpisodeID'] == test_episode_id].iloc[0]

print(f"=== Test Episode {test_episode_id} ===")
print(f"Alarm Start: {test_episode['AlarmStart']}")
print(f"Alarm End: {test_episode['AlarmEnd']}")
print(f"Alarm Duration: {test_episode['AlarmDurationMinutes']} minutes")
print(f"Operated Tags: {test_episode['OperatedTags']}")

# Get deviation start for this episode
test_deviation_start = get_deviation_start_for_episode(
    test_episode_id, 
    test_episode['AlarmStart'], 
    test_episode['AlarmEnd']
)
print(f"\nDeviation Start: {test_deviation_start}")

# Calculate time range: deviation start to alarm end + 60 minutes
test_end_time = test_episode['AlarmEnd'] + pd.Timedelta(minutes=60)
print(f"Test End Time (Alarm End + 60 min): {test_end_time}")

# Calculate number of minutes to evaluate
total_minutes = int((test_end_time - test_deviation_start).total_seconds() / 60) + 1
print(f"\nTotal minutes to evaluate: {total_minutes}")

=== Test Episode 386 ===
Alarm Start: 2024-06-06 18:42:00
Alarm End: 2024-06-06 18:46:00
Alarm Duration: 4 minutes
Operated Tags: 03PIC_1013

Deviation Start: 2024-06-06 17:16:00
Test End Time (Alarm End + 60 min): 2024-06-06 19:46:00

Total minutes to evaluate: 151


In [89]:
# Step 19: Run the similarity approach for each minute in the test episode
from tqdm import tqdm

# Store results for each minute
minute_results = []

print(f"Running similarity matching for {total_minutes} minutes...")
print(f"Comparing against {len(context_df_clean)} historical operator actions\n")

for minute_offset in tqdm(range(total_minutes)):
    current_time = test_deviation_start + pd.Timedelta(minutes=minute_offset)
    
    # Build runtime context at this minute
    runtime_context = build_runtime_context(test_deviation_start, current_time, context_pv_tags)
    
    # Calculate similarity with all historical contexts (including PV similarity)
    similarity_results = calculate_weighted_similarity(
        runtime_context, 
        context_df_clean, 
        roc_cols, 
        dir_cols, 
        pv_action_cols,  # Added PV action columns for PV state similarity
        tag_weights
    )
    
    # Get top 3 matches
    top_matches = similarity_results.head(3)
    
    # Store results
    for rank, (_, match) in enumerate(top_matches.iterrows(), 1):
        minute_results.append({
            'minute_offset': minute_offset,
            'current_time': current_time,
            'rank': rank,
            'similarity': match['total_similarity'],
            'roc_similarity': match['roc_similarity'],
            'direction_match': match['direction_match'],
            'pv_similarity': match['pv_similarity'],  # Added PV similarity
            'recommended_action_source': match['action_source'],
            'recommended_action_direction': match['action_direction'],
            'recommended_action_magnitude': match['action_magnitude'],
            'matched_episode_id': match['episode_id'],
            # Add runtime context for target tag
            '1071_roc_percent': runtime_context['03LIC_1071_roc_percent'],
            '1071_roc_direction': runtime_context['03LIC_1071_roc_direction']
        })

minute_results_df = pd.DataFrame(minute_results)
print(f"\nTotal result records: {len(minute_results_df)}")

Running similarity matching for 151 minutes...
Comparing against 216 historical operator actions



  0%|          | 0/151 [00:00<?, ?it/s]

100%|██████████| 151/151 [00:09<00:00, 15.58it/s]


Total result records: 453





In [90]:
# Verify all three similarity components are working properly
print("Sample results with all similarity components:")
print(minute_results_df[['minute_offset', 'similarity', 'roc_similarity', 'direction_match', 'pv_similarity', 
                          'recommended_action_source']].head(10))

print(f"\n=== Similarity Component Statistics ===")
print(f"\n1. ROC Similarity (Cosine on rate of change):")
print(f"   Mean: {minute_results_df['roc_similarity'].mean():.4f}, Std: {minute_results_df['roc_similarity'].std():.4f}")
print(f"   Range: [{minute_results_df['roc_similarity'].min():.4f}, {minute_results_df['roc_similarity'].max():.4f}]")

print(f"\n2. Direction Match (Jaccard-like):")
print(f"   Mean: {minute_results_df['direction_match'].mean():.4f}, Std: {minute_results_df['direction_match'].std():.4f}")
print(f"   Range: [{minute_results_df['direction_match'].min():.4f}, {minute_results_df['direction_match'].max():.4f}]")

print(f"\n3. PV State Similarity (Cosine on PV values):")
print(f"   Mean: {minute_results_df['pv_similarity'].mean():.4f}, Std: {minute_results_df['pv_similarity'].std():.4f}")
print(f"   Range: [{minute_results_df['pv_similarity'].min():.4f}, {minute_results_df['pv_similarity'].max():.4f}]")

print(f"\n4. Total Weighted Similarity (w_roc=0.5, w_dir=0.35, w_pv=0.15):")
print(f"   Mean: {minute_results_df['similarity'].mean():.4f}, Std: {minute_results_df['similarity'].std():.4f}")
print(f"   Range: [{minute_results_df['similarity'].min():.4f}, {minute_results_df['similarity'].max():.4f}]")

Sample results with all similarity components:
   minute_offset  similarity  roc_similarity  direction_match  pv_similarity  \
0              0    0.749623        0.500000         1.000000       0.997485   
1              0    0.662471        0.500000         0.750000       0.999808   
2              0    0.662462        0.500000         0.750000       0.999747   
3              1    0.879460        0.934291         0.750000       0.998764   
4              1    0.865613        0.906543         0.750000       0.998943   
5              1    0.865613        0.906543         0.750000       0.998943   
6              2    0.864192        0.978545         0.642857       0.999464   
7              2    0.864192        0.978545         0.642857       0.999464   
8              2    0.864192        0.978545         0.642857       0.999464   
9              3    0.880212        0.960710         0.714286       0.999047   

  recommended_action_source  
0                03LIC_1071  
1           

In [91]:
# Step 20: Analyze the results - show top recommended action at each minute (rank=1)
top_recommendations = minute_results_df[minute_results_df['rank'] == 1].copy()

print(f"=== Test Episode {test_episode_id} - Similarity Matching Results ===\n")
print(f"Actual action taken in this episode: {test_episode['OperatedTags']}")
print(f"Deviation Start: {test_deviation_start}")
print(f"Alarm Start: {test_episode['AlarmStart']}")
print(f"Alarm End: {test_episode['AlarmEnd']}")

print(f"\n--- Recommended Actions Summary ---")
print(f"\nRecommended action source distribution:")
print(top_recommendations['recommended_action_source'].value_counts())

print(f"\nRecommended action direction distribution (1=increase, 0=decrease):")
print(top_recommendations['recommended_action_direction'].value_counts())

print(f"\n--- Sample of recommendations at key timestamps ---")
# Show recommendations at deviation start, alarm start, alarm end, and intervals
key_times = [0, 10, 30, 60, 86, 90, 100, 120, 150]  # minute offsets
key_recs = top_recommendations[top_recommendations['minute_offset'].isin(key_times)][
    ['minute_offset', 'current_time', 'similarity', 'recommended_action_source', 
     'recommended_action_direction', 'recommended_action_magnitude', '1071_roc_percent']
]
key_recs

=== Test Episode 386 - Similarity Matching Results ===

Actual action taken in this episode: 03PIC_1013
Deviation Start: 2024-06-06 17:16:00
Alarm Start: 2024-06-06 18:42:00
Alarm End: 2024-06-06 18:46:00

--- Recommended Actions Summary ---

Recommended action source distribution:
recommended_action_source
03LIC_1071    118
03PIC_1013     33
Name: count, dtype: int64

Recommended action direction distribution (1=increase, 0=decrease):
recommended_action_direction
0.0    79
1.0    72
Name: count, dtype: int64

--- Sample of recommendations at key timestamps ---


Unnamed: 0,minute_offset,current_time,similarity,recommended_action_source,recommended_action_direction,recommended_action_magnitude,1071_roc_percent
0,0,2024-06-06 17:16:00,0.749623,03LIC_1071,1.0,5.7193,0.0
30,10,2024-06-06 17:26:00,0.826029,03LIC_1071,0.0,-3.0,-1.974979
90,30,2024-06-06 17:46:00,0.830077,03LIC_1071,0.0,-3.0,-4.990111
180,60,2024-06-06 18:16:00,0.82179,03PIC_1013,1.0,2.0,16.777361
258,86,2024-06-06 18:42:00,0.844303,03PIC_1013,0.0,-2.0,-25.827367
270,90,2024-06-06 18:46:00,0.821196,03PIC_1013,0.0,-2.0,-15.198827
300,100,2024-06-06 18:56:00,0.785147,03LIC_1071,0.0,-2.0,1.95862
360,120,2024-06-06 19:16:00,0.818644,03LIC_1071,1.0,2.0,-0.970422
450,150,2024-06-06 19:46:00,0.769921,03PIC_1013,0.0,-2.0,8.812689


In [92]:
# Step 21: Get actual operator actions taken during this test episode for comparison
actual_actions, _ = get_operator_actions_for_episode(
    test_episode['AlarmStart'], 
    test_episode['AlarmEnd'],
    target_sources=['03LIC_1071', '03LIC_1016', '03PIC_1013']
)

# Clean actual actions (remove duplicates)
if len(actual_actions) > 0:
    actual_actions_clean = actual_actions.dropna(subset=['PrevValue'])
    actual_actions_clean = actual_actions_clean.drop_duplicates(
        subset=['VT_Start', 'Source', 'Value']
    )
    
    print(f"=== Actual Operator Actions in Episode {test_episode_id} ===")
    print(f"\nTotal actions taken: {len(actual_actions_clean)}")
    print(actual_actions_clean[['Source', 'VT_Start', 'Value', 'PrevValue']].head(20))
    
    # Calculate action details
    actual_actions_clean['action_magnitude'] = actual_actions_clean['Value'].astype(float) - actual_actions_clean['PrevValue'].astype(float)
    actual_actions_clean['action_direction'] = (actual_actions_clean['action_magnitude'] > 0).astype(int)
    
    print(f"\n--- Action Summary ---")
    print(f"Tags operated: {actual_actions_clean['Source'].unique()}")
    print(f"Action directions: {actual_actions_clean['action_direction'].value_counts().to_dict()}")
else:
    print("No actual actions found in this episode")

=== Actual Operator Actions in Episode 386 ===

Total actions taken: 6
            Source                   VT_Start    Value PrevValue
979739  03PIC_1013 2024-06-06 18:29:47.905600  70.9999   68.9999
979744  03PIC_1013 2024-06-06 18:32:34.005600  72.9999   70.9999
979747  03PIC_1013 2024-06-06 18:33:38.455800  74.9999   72.9999
979748  03PIC_1013 2024-06-06 18:34:33.705900  72.9999   74.9999
979751  03PIC_1013 2024-06-06 18:35:26.555800  74.9999   72.9999
979755  03PIC_1013 2024-06-06 18:35:31.170600  76.9999   74.9999

--- Action Summary ---
Tags operated: ['03PIC_1013']
Action directions: {1: 5, 0: 1}


In [93]:
# Step 22: Visualize the results with Plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots(
    rows=3, cols=1,
    subplot_titles=[
        f'Episode {test_episode_id}: 03LIC_1071.PV Trend',
        'Recommended Action Source Over Time',
        'Similarity Score & 1071 ROC% Over Time'
    ],
    vertical_spacing=0.1,
    row_heights=[0.35, 0.3, 0.35]
)

# Get PV data for the time window
pv_start = test_deviation_start - pd.Timedelta(minutes=10)
pv_end = test_end_time + pd.Timedelta(minutes=10)
pv_window = pv_op_data_df.loc[pv_start:pv_end, '03LIC_1071.PV']

# Row 1: PV Trend
fig.add_trace(
    go.Scatter(x=pv_window.index, y=pv_window.values, mode='lines', name='03LIC_1071.PV', line=dict(color='blue')),
    row=1, col=1
)

# Add alarm threshold line
fig.add_hline(y=28.75, line_dash="dash", line_color="red", annotation_text="Alarm Threshold (28.75)", row=1, col=1)

# Add shaded regions for key periods using vrect
# Deviation to Alarm Start (orange)
fig.add_vrect(x0=test_deviation_start, x1=test_episode['AlarmStart'], 
              fillcolor="orange", opacity=0.1, line_width=0, row=1, col=1)
# Alarm period (red)
fig.add_vrect(x0=test_episode['AlarmStart'], x1=test_episode['AlarmEnd'], 
              fillcolor="red", opacity=0.2, line_width=0, row=1, col=1)

# Add actual action markers
if len(actual_actions_clean) > 0:
    action_times = pd.to_datetime(actual_actions_clean['VT_Start'])
    action_pv_values = [get_pv_at_timestamp(t, '03LIC_1071.PV') for t in action_times]
    fig.add_trace(
        go.Scatter(x=action_times, y=action_pv_values, mode='markers', name='Actual Actions (03PIC_1013)',
                  marker=dict(symbol='triangle-up', size=12, color='red')),
        row=1, col=1
    )

# Row 2: Recommended action source over time
# Map action source to numeric for plotting
source_map = {'03LIC_1071': 0, '03LIC_1016': 1, '03PIC_1013': 2}
top_recommendations['source_numeric'] = top_recommendations['recommended_action_source'].map(source_map)

# Color by direction: green=increase, red=decrease
colors = ['green' if d == 1 else 'red' for d in top_recommendations['recommended_action_direction']]

fig.add_trace(
    go.Scatter(x=top_recommendations['current_time'], 
              y=top_recommendations['source_numeric'],
              mode='markers',
              marker=dict(size=5, color=colors),
              name='Recommended Tag (green=↑, red=↓)',
              text=[f"Tag: {s}<br>Dir: {'↑ increase' if d==1 else '↓ decrease'}<br>Sim: {sim:.3f}" 
                    for s, d, sim in zip(top_recommendations['recommended_action_source'],
                                         top_recommendations['recommended_action_direction'],
                                         top_recommendations['similarity'])],
              hoverinfo='text'),
    row=2, col=1
)

# Add shaded regions for alarm period in row 2
fig.add_vrect(x0=test_episode['AlarmStart'], x1=test_episode['AlarmEnd'], 
              fillcolor="red", opacity=0.2, line_width=0, row=2, col=1)

# Add actual action times as vertical lines
if len(actual_actions_clean) > 0:
    for _, act in actual_actions_clean.iterrows():
        fig.add_shape(type="line", x0=act['VT_Start'], x1=act['VT_Start'], 
                     y0=0, y1=2, line=dict(color="red", dash="dash", width=1),
                     row=2, col=1)

# Row 3: Similarity score and ROC
fig.add_trace(
    go.Scatter(x=top_recommendations['current_time'], y=top_recommendations['similarity'],
              mode='lines', name='Similarity Score', line=dict(color='purple')),
    row=3, col=1
)
fig.add_trace(
    go.Scatter(x=top_recommendations['current_time'], y=top_recommendations['1071_roc_percent'],
              mode='lines', name='1071 ROC%', line=dict(color='orange')),
    row=3, col=1
)

# Add shaded regions for alarm period in row 3
fig.add_vrect(x0=test_episode['AlarmStart'], x1=test_episode['AlarmEnd'], 
              fillcolor="red", opacity=0.2, line_width=0, row=3, col=1)

# Update layout
fig.update_layout(
    height=900,
    title_text=f'Test Episode {test_episode_id} - Similarity-Based Action Recommendations<br>'
               f'<sub>Actual: 03PIC_1013 (increased) | Alarm: {test_episode["AlarmStart"]} to {test_episode["AlarmEnd"]}</sub>',
    showlegend=True
)

# Update y-axes
fig.update_yaxes(title_text="Level", row=1, col=1)
fig.update_yaxes(title_text="Tag", ticktext=['1071', '1016', '1013'], tickvals=[0, 1, 2], row=2, col=1)
fig.update_yaxes(title_text="Score / ROC%", row=3, col=1)

fig.show()

In [94]:
# Step 23: Evaluate recommendation accuracy around actual action times
print(f"=== Evaluation: How well did recommendations match actual actions? ===\n")
print(f"Actual actions taken: 03PIC_1013 (6 times, mostly INCREASE direction)")

# For each actual action, check what was recommended at that time
print(f"\n--- Recommendations at actual action times ---")
for _, act in actual_actions_clean.iterrows():
    act_time = pd.to_datetime(act['VT_Start'])
    act_dir = 'increase' if act['action_direction'] == 1 else 'decrease'
    
    # Find closest recommendation
    time_diff = abs((top_recommendations['current_time'] - act_time).dt.total_seconds())
    closest_idx = time_diff.idxmin()
    closest_rec = top_recommendations.loc[closest_idx]
    
    rec_dir = 'increase' if closest_rec['recommended_action_direction'] == 1 else 'decrease'
    
    print(f"\nActual: {act['Source']} {act_dir} at {act_time}")
    print(f"  Recommended: {closest_rec['recommended_action_source']} {rec_dir} "
          f"(similarity: {closest_rec['similarity']:.3f})")
    print(f"  Tag Match: {'✓' if act['Source'] == closest_rec['recommended_action_source'] else '✗'}")
    print(f"  Direction Match: {'✓' if act_dir == rec_dir else '✗'}")

# Calculate overall accuracy
print(f"\n=== Summary Statistics ===")

# Count how often 03PIC_1013 was in top recommendations during alarm period
alarm_start_offset = int((test_episode['AlarmStart'] - test_deviation_start).total_seconds() / 60)
alarm_end_offset = int((test_episode['AlarmEnd'] - test_deviation_start).total_seconds() / 60)

alarm_period_recs = top_recommendations[
    (top_recommendations['minute_offset'] >= alarm_start_offset) & 
    (top_recommendations['minute_offset'] <= alarm_end_offset)
]

# Pre-alarm period (30 min before alarm)
pre_alarm_recs = top_recommendations[
    (top_recommendations['minute_offset'] >= alarm_start_offset - 30) & 
    (top_recommendations['minute_offset'] < alarm_start_offset)
]

print(f"\nDuring alarm period ({len(alarm_period_recs)} minutes):")
if len(alarm_period_recs) > 0:
    print(f"  03PIC_1013 recommended: {(alarm_period_recs['recommended_action_source'] == '03PIC_1013').sum()} times "
          f"({(alarm_period_recs['recommended_action_source'] == '03PIC_1013').mean()*100:.1f}%)")

print(f"\n30 min before alarm ({len(pre_alarm_recs)} minutes):")
if len(pre_alarm_recs) > 0:
    print(f"  03PIC_1013 recommended: {(pre_alarm_recs['recommended_action_source'] == '03PIC_1013').sum()} times "
          f"({(pre_alarm_recs['recommended_action_source'] == '03PIC_1013').mean()*100:.1f}%)")

=== Evaluation: How well did recommendations match actual actions? ===

Actual actions taken: 03PIC_1013 (6 times, mostly INCREASE direction)

--- Recommendations at actual action times ---

Actual: 03PIC_1013 increase at 2024-06-06 18:29:47.905600
  Recommended: 03LIC_1071 decrease (similarity: 0.850)
  Tag Match: ✗
  Direction Match: ✗

Actual: 03PIC_1013 increase at 2024-06-06 18:32:34.005600
  Recommended: 03LIC_1071 decrease (similarity: 0.911)
  Tag Match: ✗
  Direction Match: ✗

Actual: 03PIC_1013 increase at 2024-06-06 18:33:38.455800
  Recommended: 03LIC_1071 decrease (similarity: 0.894)
  Tag Match: ✗
  Direction Match: ✗

Actual: 03PIC_1013 decrease at 2024-06-06 18:34:33.705900
  Recommended: 03PIC_1013 increase (similarity: 0.875)
  Tag Match: ✓
  Direction Match: ✗

Actual: 03PIC_1013 increase at 2024-06-06 18:35:26.555800
  Recommended: 03PIC_1013 increase (similarity: 0.875)
  Tag Match: ✓
  Direction Match: ✓

Actual: 03PIC_1013 increase at 2024-06-06 18:35:31.170600
 

In [95]:
# Step 24: Save the minute-by-minute results for this test episode
output_file = f'/home/h604827/ControlActions/RESULTS/similarity_test_episode_{test_episode_id}_results.csv'
minute_results_df.to_csv(output_file, index=False)
print(f"Results saved to: {output_file}")

# Also display the full results dataframe for reference
print(f"\n=== Full Minute-by-Minute Results (showing rank=1 only) ===")
display_cols = ['minute_offset', 'current_time', 'similarity', 'roc_similarity', 'direction_match',
                'recommended_action_source', 'recommended_action_direction', 'recommended_action_magnitude',
                '1071_roc_percent', '1071_roc_direction']
top_recommendations[display_cols].head(20)

Results saved to: /home/h604827/ControlActions/RESULTS/similarity_test_episode_386_results.csv

=== Full Minute-by-Minute Results (showing rank=1 only) ===


Unnamed: 0,minute_offset,current_time,similarity,roc_similarity,direction_match,recommended_action_source,recommended_action_direction,recommended_action_magnitude,1071_roc_percent,1071_roc_direction
0,0,2024-06-06 17:16:00,0.749623,0.5,1.0,03LIC_1071,1.0,5.7193,0.0,1
3,1,2024-06-06 17:17:00,0.87946,0.934291,0.75,03LIC_1071,0.0,-2.5185,6.469669,1
6,2,2024-06-06 17:18:00,0.864192,0.978545,0.642857,03LIC_1071,0.0,-2.0,13.076042,1
9,3,2024-06-06 17:19:00,0.880212,0.96071,0.714286,03LIC_1071,1.0,5.0,16.031303,1
12,4,2024-06-06 17:20:00,0.907873,0.965986,0.785714,03LIC_1071,1.0,5.0,20.625101,1
15,5,2024-06-06 17:21:00,0.884506,0.969134,0.714286,03LIC_1071,0.0,-2.0,20.972753,1
18,6,2024-06-06 17:22:00,0.88678,0.948847,0.75,03LIC_1071,1.0,5.0,15.733677,1
21,7,2024-06-06 17:23:00,0.858516,0.967066,0.642857,03PIC_1013,1.0,2.0,14.24785,1
24,8,2024-06-06 17:24:00,0.762939,0.901744,0.464286,03LIC_1071,0.0,-3.0,7.122185,1
27,9,2024-06-06 17:25:00,0.856828,0.889571,0.75,03LIC_1071,0.0,-3.0,-0.960622,0


## Summary: Test Episode 386 Results

### Actual Actions Taken
- **Tag**: `03PIC_1013` (Pressure controller)
- **Actions**: 6 times, mostly INCREASE direction
- **Timing**: Around 18:29 - 18:35 (just before and during alarm)

### Similarity-Based Recommendations
The weighted multi-component similarity approach recommended actions for each minute from deviation start to alarm end + 60 minutes.

**Key Findings:**
1. The algorithm recommends different tags at different times based on the current process state
2. `03PIC_1013` was recommended during certain periods, matching the actual operator action
3. Direction of action (increase/decrease) varies based on process context

### Next Steps
- Test on remaining 6 episodes to validate approach
- Fine-tune weights (w_roc, w_dir, w_pv) and tag weights
- Consider using top-k consensus voting instead of just top-1 recommendation
- Analyze when/why the algorithm recommends the correct vs incorrect tag

In [96]:
# Step 25: Run similarity approach on all remaining 6 test episodes and save results
import os
from tqdm import tqdm

# Create output directory
output_dir = '/home/h604827/ControlActions/RESULTS/similarity_test_results'
os.makedirs(output_dir, exist_ok=True)

# Test episodes (remaining 6, excluding 386 which we already tested)
test_episode_ids = [388, 400, 469, 521, 522, 604]

# Store summary for all episodes
all_episodes_summary = []

print(f"Processing {len(test_episode_ids)} test episodes...")
print(f"Output directory: {output_dir}\n")

for test_ep_id in test_episode_ids:
    print(f"\n{'='*60}")
    print(f"Processing Episode {test_ep_id}...")
    print(f"{'='*60}")
    
    # Get episode details
    ep_data = episodes_operated_tags_df[episodes_operated_tags_df['EpisodeID'] == test_ep_id].iloc[0]
    ep_alarm_start = ep_data['AlarmStart']
    ep_alarm_end = ep_data['AlarmEnd']
    ep_operated_tags = ep_data['OperatedTags']
    
    # Get deviation start
    ep_deviation_start = get_deviation_start_for_episode(test_ep_id, ep_alarm_start, ep_alarm_end)
    
    # Calculate time range
    ep_end_time = ep_alarm_end + pd.Timedelta(minutes=60)
    ep_total_minutes = int((ep_end_time - ep_deviation_start).total_seconds() / 60) + 1
    
    print(f"  Alarm: {ep_alarm_start} to {ep_alarm_end}")
    print(f"  Deviation Start: {ep_deviation_start}")
    print(f"  Actual Operated Tags: {ep_operated_tags}")
    print(f"  Total minutes to evaluate: {ep_total_minutes}")
    
    # Run similarity matching for each minute
    ep_minute_results = []
    
    for minute_offset in tqdm(range(ep_total_minutes), desc=f"Episode {test_ep_id}"):
        current_time = ep_deviation_start + pd.Timedelta(minutes=minute_offset)
        
        # Build runtime context
        runtime_ctx = build_runtime_context(ep_deviation_start, current_time, context_pv_tags)
        
        # Calculate similarity (including PV state similarity)
        sim_results = calculate_weighted_similarity(
            runtime_ctx, context_df_clean, roc_cols, dir_cols, pv_action_cols, tag_weights
        )
        
        # Get top 3 matches
        for rank, (_, match) in enumerate(sim_results.head(3).iterrows(), 1):
            ep_minute_results.append({
                'episode_id': test_ep_id,
                'minute_offset': minute_offset,
                'current_time': current_time,
                'rank': rank,
                'similarity': match['total_similarity'],
                'roc_similarity': match['roc_similarity'],
                'direction_match': match['direction_match'],
                'pv_similarity': match['pv_similarity'],  # Added PV similarity
                'recommended_action_source': match['action_source'],
                'recommended_action_direction': match['action_direction'],
                'recommended_action_magnitude': match['action_magnitude'],
                'matched_episode_id': match['episode_id'],
                '1071_roc_percent': runtime_ctx['03LIC_1071_roc_percent'],
                '1071_roc_direction': runtime_ctx['03LIC_1071_roc_direction']
            })
    
    ep_results_df = pd.DataFrame(ep_minute_results)
    ep_top_recs = ep_results_df[ep_results_df['rank'] == 1].copy()
    
    # Get actual actions for this episode
    ep_actual_actions, _ = get_operator_actions_for_episode(
        ep_alarm_start, ep_alarm_end,
        target_sources=['03LIC_1071', '03LIC_1016', '03PIC_1013']
    )
    
    if len(ep_actual_actions) > 0:
        ep_actual_clean = ep_actual_actions.dropna(subset=['PrevValue'])
        ep_actual_clean = ep_actual_clean.drop_duplicates(subset=['VT_Start', 'Source', 'Value'])
        ep_actual_clean = ep_actual_clean.copy()
        
        # Handle non-numeric values safely
        try:
            ep_actual_clean['action_magnitude'] = pd.to_numeric(ep_actual_clean['Value'], errors='coerce') - pd.to_numeric(ep_actual_clean['PrevValue'], errors='coerce')
            ep_actual_clean['action_direction'] = (ep_actual_clean['action_magnitude'] > 0).astype(int)
            # Remove rows where we couldn't compute magnitude
            ep_actual_clean = ep_actual_clean.dropna(subset=['action_magnitude'])
        except Exception as e:
            print(f"  Warning: Could not compute action magnitude: {e}")
            ep_actual_clean = pd.DataFrame()
    else:
        ep_actual_clean = pd.DataFrame()
    
    # Save minute-by-minute results CSV
    csv_path = f'{output_dir}/episode_{test_ep_id}_minute_results.csv'
    ep_results_df.to_csv(csv_path, index=False)
    
    # Create and save visualization
    fig = make_subplots(
        rows=3, cols=1,
        subplot_titles=[
            f'Episode {test_ep_id}: 03LIC_1071.PV Trend',
            'Recommended Action Source Over Time',
            'Similarity Score & 1071 ROC% Over Time'
        ],
        vertical_spacing=0.1,
        row_heights=[0.35, 0.3, 0.35]
    )
    
    # Get PV data
    pv_start = ep_deviation_start - pd.Timedelta(minutes=10)
    pv_end = ep_end_time + pd.Timedelta(minutes=10)
    pv_window = pv_op_data_df.loc[pv_start:pv_end, '03LIC_1071.PV']
    
    # Row 1: PV Trend
    fig.add_trace(
        go.Scatter(x=pv_window.index, y=pv_window.values, mode='lines', name='03LIC_1071.PV', line=dict(color='blue')),
        row=1, col=1
    )
    fig.add_hline(y=28.75, line_dash="dash", line_color="red", annotation_text="Alarm Threshold", row=1, col=1)
    fig.add_vrect(x0=ep_deviation_start, x1=ep_alarm_start, fillcolor="orange", opacity=0.1, line_width=0, row=1, col=1)
    fig.add_vrect(x0=ep_alarm_start, x1=ep_alarm_end, fillcolor="red", opacity=0.2, line_width=0, row=1, col=1)
    
    # Add actual action markers
    if len(ep_actual_clean) > 0:
        action_times = pd.to_datetime(ep_actual_clean['VT_Start'])
        action_pv_vals = [get_pv_at_timestamp(t, '03LIC_1071.PV') for t in action_times]
        fig.add_trace(
            go.Scatter(x=action_times, y=action_pv_vals, mode='markers', name=f'Actual: {ep_operated_tags}',
                      marker=dict(symbol='triangle-up', size=12, color='red')),
            row=1, col=1
        )
    
    # Row 2: Recommended action source
    source_map = {'03LIC_1071': 0, '03LIC_1016': 1, '03PIC_1013': 2}
    ep_top_recs['source_numeric'] = ep_top_recs['recommended_action_source'].map(source_map)
    colors = ['green' if d == 1 else 'red' for d in ep_top_recs['recommended_action_direction']]
    
    fig.add_trace(
        go.Scatter(x=ep_top_recs['current_time'], y=ep_top_recs['source_numeric'],
                  mode='markers', marker=dict(size=5, color=colors),
                  name='Recommended (green=↑, red=↓)',
                  text=[f"Tag: {s}<br>Dir: {'↑' if d==1 else '↓'}<br>Sim: {sim:.3f}" 
                        for s, d, sim in zip(ep_top_recs['recommended_action_source'],
                                             ep_top_recs['recommended_action_direction'],
                                             ep_top_recs['similarity'])],
                  hoverinfo='text'),
        row=2, col=1
    )
    fig.add_vrect(x0=ep_alarm_start, x1=ep_alarm_end, fillcolor="red", opacity=0.2, line_width=0, row=2, col=1)
    
    # Row 3: Similarity and ROC
    fig.add_trace(
        go.Scatter(x=ep_top_recs['current_time'], y=ep_top_recs['similarity'],
                  mode='lines', name='Similarity', line=dict(color='purple')),
        row=3, col=1
    )
    fig.add_trace(
        go.Scatter(x=ep_top_recs['current_time'], y=ep_top_recs['1071_roc_percent'],
                  mode='lines', name='1071 ROC%', line=dict(color='orange')),
        row=3, col=1
    )
    fig.add_vrect(x0=ep_alarm_start, x1=ep_alarm_end, fillcolor="red", opacity=0.2, line_width=0, row=3, col=1)
    
    fig.update_layout(
        height=900,
        title_text=f'Episode {test_ep_id} - Similarity-Based Recommendations<br>'
                   f'<sub>Actual: {ep_operated_tags} | Alarm: {ep_alarm_start} to {ep_alarm_end}</sub>',
        showlegend=True
    )
    fig.update_yaxes(title_text="Level", row=1, col=1)
    fig.update_yaxes(title_text="Tag", ticktext=['1071', '1016', '1013'], tickvals=[0, 1, 2], row=2, col=1)
    fig.update_yaxes(title_text="Score / ROC%", row=3, col=1)
    
    # Save visualization
    html_path = f'{output_dir}/episode_{test_ep_id}_visualization.html'
    fig.write_html(html_path)
    
    # Calculate summary statistics
    rec_source_dist = ep_top_recs['recommended_action_source'].value_counts().to_dict()
    rec_dir_dist = ep_top_recs['recommended_action_direction'].value_counts().to_dict()
    
    # Accuracy at actual action times
    tag_matches = 0
    dir_matches = 0
    total_actual = len(ep_actual_clean) if len(ep_actual_clean) > 0 else 0
    
    if total_actual > 0:
        for _, act in ep_actual_clean.iterrows():
            act_time = pd.to_datetime(act['VT_Start'])
            time_diff = abs((ep_top_recs['current_time'] - act_time).dt.total_seconds())
            closest_idx = time_diff.idxmin()
            closest_rec = ep_top_recs.loc[closest_idx]
            
            if act['Source'] == closest_rec['recommended_action_source']:
                tag_matches += 1
            act_dir_val = 1 if act['action_direction'] == 1 else 0
            if act_dir_val == closest_rec['recommended_action_direction']:
                dir_matches += 1
    
    summary = {
        'episode_id': test_ep_id,
        'alarm_start': str(ep_alarm_start),
        'alarm_end': str(ep_alarm_end),
        'deviation_start': str(ep_deviation_start),
        'actual_operated_tags': ep_operated_tags,
        'total_minutes_evaluated': ep_total_minutes,
        'actual_actions_count': total_actual,
        'tag_match_accuracy': tag_matches / total_actual if total_actual > 0 else None,
        'direction_match_accuracy': dir_matches / total_actual if total_actual > 0 else None,
        'recommended_source_distribution': rec_source_dist,
        'recommended_direction_distribution': rec_dir_dist
    }
    all_episodes_summary.append(summary)
    
    print(f"  Saved: {csv_path}")
    print(f"  Saved: {html_path}")
    print(f"  Actual actions: {total_actual}")
    if total_actual > 0:
        print(f"  Tag match accuracy: {tag_matches}/{total_actual} = {tag_matches/total_actual*100:.1f}%")
    else:
        print("  No actual actions with valid magnitude")

# Save overall summary
summary_df = pd.DataFrame(all_episodes_summary)
summary_path = f'{output_dir}/all_episodes_summary.csv'
summary_df.to_csv(summary_path, index=False)

# Also save as JSON for detailed info
import json
with open(f'{output_dir}/all_episodes_summary.json', 'w') as f:
    json.dump(all_episodes_summary, f, indent=2, default=str)

print(f"\n{'='*60}")
print(f"ALL EPISODES PROCESSED SUCCESSFULLY!")
print(f"{'='*60}")
print(f"\nResults saved to: {output_dir}/")
print(f"  - 6 episode CSV files (minute-by-minute results)")
print(f"  - 6 episode HTML visualizations")
print(f"  - all_episodes_summary.csv")
print(f"  - all_episodes_summary.json")

Processing 6 test episodes...
Output directory: /home/h604827/ControlActions/RESULTS/similarity_test_results


Processing Episode 388...
  Alarm: 2024-06-09 14:40:00 to 2024-06-09 14:44:00
  Deviation Start: 2024-06-09 13:14:00
  Actual Operated Tags: 03PIC_1013
  Total minutes to evaluate: 151


Episode 388:   0%|          | 0/151 [00:00<?, ?it/s]

Episode 388: 100%|██████████| 151/151 [00:09<00:00, 15.31it/s]


  Saved: /home/h604827/ControlActions/RESULTS/similarity_test_results/episode_388_minute_results.csv
  Saved: /home/h604827/ControlActions/RESULTS/similarity_test_results/episode_388_visualization.html
  Actual actions: 3
  Tag match accuracy: 0/3 = 0.0%

Processing Episode 400...
  Alarm: 2024-06-17 15:46:00 to 2024-06-17 16:28:00
  Deviation Start: 2024-06-17 14:20:00
  Actual Operated Tags: 03PIC_1013
  Total minutes to evaluate: 189


Episode 400: 100%|██████████| 189/189 [00:12<00:00, 15.51it/s]


  Saved: /home/h604827/ControlActions/RESULTS/similarity_test_results/episode_400_minute_results.csv
  Saved: /home/h604827/ControlActions/RESULTS/similarity_test_results/episode_400_visualization.html
  Actual actions: 21
  Tag match accuracy: 10/21 = 47.6%

Processing Episode 469...
  Alarm: 2024-09-21 08:07:00 to 2024-09-21 08:24:00
  Deviation Start: 2024-09-21 06:41:00
  Actual Operated Tags: 03LIC_1071
  Total minutes to evaluate: 164


Episode 469: 100%|██████████| 164/164 [00:10<00:00, 15.50it/s]


  Saved: /home/h604827/ControlActions/RESULTS/similarity_test_results/episode_469_minute_results.csv
  Saved: /home/h604827/ControlActions/RESULTS/similarity_test_results/episode_469_visualization.html
  Actual actions: 57
  Tag match accuracy: 50/57 = 87.7%

Processing Episode 521...
  Alarm: 2025-01-05 07:16:00 to 2025-01-05 08:06:00
  Deviation Start: 2025-01-05 05:51:00
  Actual Operated Tags: 03LIC_1071
  Total minutes to evaluate: 196


Episode 521: 100%|██████████| 196/196 [00:12<00:00, 15.50it/s]


  Saved: /home/h604827/ControlActions/RESULTS/similarity_test_results/episode_521_minute_results.csv
  Saved: /home/h604827/ControlActions/RESULTS/similarity_test_results/episode_521_visualization.html
  Actual actions: 1
  Tag match accuracy: 1/1 = 100.0%

Processing Episode 522...
  Alarm: 2025-01-05 08:24:00 to 2025-01-05 08:31:00
  Deviation Start: 2025-01-05 06:58:00
  Actual Operated Tags: 03LIC_1071
  Total minutes to evaluate: 154


Episode 522: 100%|██████████| 154/154 [00:09<00:00, 15.50it/s]


  Saved: /home/h604827/ControlActions/RESULTS/similarity_test_results/episode_522_minute_results.csv
  Saved: /home/h604827/ControlActions/RESULTS/similarity_test_results/episode_522_visualization.html
  Actual actions: 1
  Tag match accuracy: 1/1 = 100.0%

Processing Episode 604...
  Alarm: 2025-06-21 22:15:00 to 2025-06-21 22:16:00
  Deviation Start: 2025-06-21 20:49:00
  Actual Operated Tags: 03LIC_1071
  Total minutes to evaluate: 148


Episode 604: 100%|██████████| 148/148 [00:09<00:00, 15.09it/s]


  Saved: /home/h604827/ControlActions/RESULTS/similarity_test_results/episode_604_minute_results.csv
  Saved: /home/h604827/ControlActions/RESULTS/similarity_test_results/episode_604_visualization.html
  Actual actions: 1
  Tag match accuracy: 1/1 = 100.0%

ALL EPISODES PROCESSED SUCCESSFULLY!

Results saved to: /home/h604827/ControlActions/RESULTS/similarity_test_results/
  - 6 episode CSV files (minute-by-minute results)
  - 6 episode HTML visualizations
  - all_episodes_summary.csv
  - all_episodes_summary.json


In [97]:
# Step 26: Save episode 386 results and generate final summary

# Save episode 386 minute results to the results folder
minute_results_df.to_csv(f'{output_dir}/episode_386_minute_results.csv', index=False)

# Save episode 386 visualization
fig386 = make_subplots(
    rows=3, cols=1,
    subplot_titles=[
        f'Episode 386: 03LIC_1071.PV Trend',
        'Recommended Action Source Over Time',
        'Similarity Score & 1071 ROC% Over Time'
    ],
    vertical_spacing=0.1,
    row_heights=[0.35, 0.3, 0.35]
)

pv_start_386 = test_deviation_start - pd.Timedelta(minutes=10)
pv_end_386 = test_end_time + pd.Timedelta(minutes=10)
pv_window_386 = pv_op_data_df.loc[pv_start_386:pv_end_386, '03LIC_1071.PV']

fig386.add_trace(
    go.Scatter(x=pv_window_386.index, y=pv_window_386.values, mode='lines', name='03LIC_1071.PV', line=dict(color='blue')),
    row=1, col=1
)
fig386.add_hline(y=28.75, line_dash="dash", line_color="red", annotation_text="Alarm Threshold", row=1, col=1)
fig386.add_vrect(x0=test_deviation_start, x1=test_episode['AlarmStart'], fillcolor="orange", opacity=0.1, line_width=0, row=1, col=1)
fig386.add_vrect(x0=test_episode['AlarmStart'], x1=test_episode['AlarmEnd'], fillcolor="red", opacity=0.2, line_width=0, row=1, col=1)

if len(actual_actions_clean) > 0:
    action_times_386 = pd.to_datetime(actual_actions_clean['VT_Start'])
    action_pv_vals_386 = [get_pv_at_timestamp(t, '03LIC_1071.PV') for t in action_times_386]
    fig386.add_trace(
        go.Scatter(x=action_times_386, y=action_pv_vals_386, mode='markers', name='Actual: 03PIC_1013',
                  marker=dict(symbol='triangle-up', size=12, color='red')),
        row=1, col=1
    )

top_recommendations['source_numeric'] = top_recommendations['recommended_action_source'].map(source_map)
colors_386 = ['green' if d == 1 else 'red' for d in top_recommendations['recommended_action_direction']]

fig386.add_trace(
    go.Scatter(x=top_recommendations['current_time'], y=top_recommendations['source_numeric'],
              mode='markers', marker=dict(size=5, color=colors_386),
              name='Recommended (green=↑, red=↓)'),
    row=2, col=1
)
fig386.add_vrect(x0=test_episode['AlarmStart'], x1=test_episode['AlarmEnd'], fillcolor="red", opacity=0.2, line_width=0, row=2, col=1)

fig386.add_trace(
    go.Scatter(x=top_recommendations['current_time'], y=top_recommendations['similarity'],
              mode='lines', name='Similarity', line=dict(color='purple')),
    row=3, col=1
)
fig386.add_trace(
    go.Scatter(x=top_recommendations['current_time'], y=top_recommendations['1071_roc_percent'],
              mode='lines', name='1071 ROC%', line=dict(color='orange')),
    row=3, col=1
)
fig386.add_vrect(x0=test_episode['AlarmStart'], x1=test_episode['AlarmEnd'], fillcolor="red", opacity=0.2, line_width=0, row=3, col=1)

fig386.update_layout(
    height=900,
    title_text=f'Episode 386 - Similarity-Based Recommendations<br>'
               f'<sub>Actual: 03PIC_1013 | Alarm: {test_episode["AlarmStart"]} to {test_episode["AlarmEnd"]}</sub>',
    showlegend=True
)
fig386.update_yaxes(title_text="Level", row=1, col=1)
fig386.update_yaxes(title_text="Tag", ticktext=['1071', '1016', '1013'], tickvals=[0, 1, 2], row=2, col=1)
fig386.update_yaxes(title_text="Score / ROC%", row=3, col=1)

fig386.write_html(f'{output_dir}/episode_386_visualization.html')

# Create final summary for all 7 episodes
ep386_tag_matches = 0
ep386_total = len(actual_actions_clean)
for _, act in actual_actions_clean.iterrows():
    act_time = pd.to_datetime(act['VT_Start'])
    time_diff = abs((top_recommendations['current_time'] - act_time).dt.total_seconds())
    closest_idx = time_diff.idxmin()
    closest_rec = top_recommendations.loc[closest_idx]
    if act['Source'] == closest_rec['recommended_action_source']:
        ep386_tag_matches += 1

ep386_summary = {
    'episode_id': 386,
    'alarm_start': str(test_episode['AlarmStart']),
    'alarm_end': str(test_episode['AlarmEnd']),
    'deviation_start': str(test_deviation_start),
    'actual_operated_tags': test_episode['OperatedTags'],
    'total_minutes_evaluated': total_minutes,
    'actual_actions_count': ep386_total,
    'tag_match_accuracy': ep386_tag_matches / ep386_total if ep386_total > 0 else None,
    'direction_match_accuracy': None,
    'recommended_source_distribution': top_recommendations['recommended_action_source'].value_counts().to_dict(),
    'recommended_direction_distribution': top_recommendations['recommended_action_direction'].value_counts().to_dict()
}

# Combine all summaries
all_7_summaries = [ep386_summary] + all_episodes_summary
final_summary_df = pd.DataFrame(all_7_summaries)
final_summary_df.to_csv(f'{output_dir}/all_7_episodes_summary.csv', index=False)

# Save as JSON
with open(f'{output_dir}/all_7_episodes_summary.json', 'w') as f:
    json.dump(all_7_summaries, f, indent=2, default=str)

print(f"Episode 386 results saved to: {output_dir}/")
print(f"\n{'='*60}")
print("FINAL SUMMARY - ALL 7 TEST EPISODES")
print(f"{'='*60}")
print(f"\nResults for all 7 test episodes:")
print(final_summary_df[['episode_id', 'actual_operated_tags', 'actual_actions_count', 'tag_match_accuracy']].to_string(index=False))
print(f"\nAll results saved to: {output_dir}/")

Episode 386 results saved to: /home/h604827/ControlActions/RESULTS/similarity_test_results/

FINAL SUMMARY - ALL 7 TEST EPISODES

Results for all 7 test episodes:
 episode_id actual_operated_tags  actual_actions_count  tag_match_accuracy
        386           03PIC_1013                     6            0.333333
        388           03PIC_1013                     3            0.000000
        400           03PIC_1013                    21            0.476190
        469           03LIC_1071                    57            0.877193
        521           03LIC_1071                     1            1.000000
        522           03LIC_1071                     1            1.000000
        604           03LIC_1071                     1            1.000000

All results saved to: /home/h604827/ControlActions/RESULTS/similarity_test_results/
