# Generate New TEP Evaluation Dataset

This notebook generates a completely new, independent TEP dataset using `tep-sim` for evaluating trained models on unseen data.

**Purpose**: Create an independent evaluation dataset to test model generalization beyond the original test set.

**Dataset Specifications**:
- **Duration**: 48 hours per simulation (same as original test set)
- **Sampling interval**: 3 minutes (180 seconds) → 960 samples per simulation
- **Fault introduction**: At hour 8 (sample 161), same as original test set
- **Fault classes**: 18 total (0=normal, 1, 2, 4-8, 10-14, 16-20; excluding 3, 9, 15)

**Output Files**:
- `data/new_multiclass_eval.csv` - Multiclass evaluation (~2.16M samples from 150 runs/class)
- `data/new_binary_eval.csv` - Binary anomaly detection evaluation (~1.3M samples)

**Generation Time**: ~12 hours (full mode)

## Configuration

In [None]:
import os
import time
import numpy as np
import pandas as pd
from pathlib import Path
from tep import TEPSimulator

# =============================================================================
# QUICK MODE: Set to True for fast testing with minimal data
# Can be set via environment variable or directly here
# =============================================================================
QUICK_MODE = os.environ.get('QUICK_MODE', 'False').lower() in ('true', '1', 'yes')

# Paths
DATA_DIR = Path('../data')
DATA_DIR.mkdir(exist_ok=True)

# Simulation parameters
FAULT_ONSET_HOURS = 8.0         # Fault introduced at hour 8
FAULT_ONSET_SAMPLE = 161        # Sample 161 (0-indexed: 160) = 8 hours at 3-min sampling

# Fault classes (matching original dataset, excluding 3, 9, 15)
FAULT_CLASSES = [0, 1, 2, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20]
NUM_CLASSES = len(FAULT_CLASSES)

# Random seed offset (different from original which used 42)
SEED_OFFSET = 1000

if QUICK_MODE:
    # Quick mode: full 48h duration (comparable trajectories) but minimal runs
    DURATION_HOURS = 48.0           # 48 hours - same as full mode for comparable trajectories
    RECORD_INTERVAL = 180           # 3 minutes
    RUNS_PER_CLASS = 3              # 3 runs per class (54 total runs)
    NORMAL_RUNS_BINARY = 3          # 3 normal runs
    FAULT_RUNS_BINARY = 2           # 2 runs per fault
    FILE_SUFFIX = '_quick'
    print("="*60)
    print("QUICK MODE ENABLED - Full duration, minimal runs")
    print("="*60)
else:
    # Full mode: Large dataset for robust evaluation (~12 hours to generate)
    # 48h simulation = 960 samples, post-fault = 800 samples per run
    # Target: statistically robust evaluation with <2% impact from single-run failures
    DURATION_HOURS = 48.0           # 48 hours per simulation
    RECORD_INTERVAL = 180           # 3 minutes = 180 seconds
    RUNS_PER_CLASS = 150            # 150 runs per fault class for multiclass
    NORMAL_RUNS_BINARY = 300        # 300 normal runs for binary
    FAULT_RUNS_BINARY = 75          # 75 runs per fault for binary
    FILE_SUFFIX = ''
    print("="*60)
    print("TEP New Dataset Generation (FULL MODE - LARGE)")
    print("="*60)

# Calculate expected samples
samples_per_run = int((DURATION_HOURS * 3600 / RECORD_INTERVAL)) + 1
post_fault_samples = samples_per_run - FAULT_ONSET_SAMPLE + 1
expected_multiclass = RUNS_PER_CLASS * NUM_CLASSES * post_fault_samples
expected_binary_normal = NORMAL_RUNS_BINARY * samples_per_run
expected_binary_fault = FAULT_RUNS_BINARY * (NUM_CLASSES - 1) * post_fault_samples
expected_binary = expected_binary_normal + expected_binary_fault

# Estimate generation time (~26 sec per simulation)
total_multiclass_runs = RUNS_PER_CLASS * NUM_CLASSES
total_binary_runs = NORMAL_RUNS_BINARY + FAULT_RUNS_BINARY * (NUM_CLASSES - 1)
est_time_hours = (total_multiclass_runs + total_binary_runs) * 26 / 3600

print(f"Duration: {DURATION_HOURS} hours per simulation")
print(f"Sampling: {RECORD_INTERVAL} seconds ({RECORD_INTERVAL/60:.0f} minutes)")
print(f"Samples per run: {samples_per_run} total, {post_fault_samples} post-fault")
print(f"Fault onset: Hour {FAULT_ONSET_HOURS} (sample {FAULT_ONSET_SAMPLE})")
print(f"Fault classes: {NUM_CLASSES} ({FAULT_CLASSES})")
print(f"Runs per class (multiclass): {RUNS_PER_CLASS}")
print(f"Normal runs (binary): {NORMAL_RUNS_BINARY}")
print(f"Fault runs per class (binary): {FAULT_RUNS_BINARY}")
print(f"Total multiclass runs: {total_multiclass_runs}")
print(f"Total binary runs: {total_binary_runs}")
print(f"Expected multiclass samples: ~{expected_multiclass:,}")
print(f"Expected binary samples: ~{expected_binary:,}")
print(f"Estimated generation time: ~{est_time_hours:.1f} hours")
if QUICK_MODE:
    print(f"Output files will have '{FILE_SUFFIX}' suffix")
print("="*60)

QUICK MODE ENABLED - Full duration, minimal runs
Duration: 48.0 hours per simulation
Sampling: 180 seconds (3 minutes)
Samples per run: 961 total, 801 post-fault
Fault onset: Hour 8.0 (sample 161)
Fault classes: 18 ([0, 1, 2, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20])
Runs per class (multiclass): 3
Normal runs (binary): 3
Fault runs per class (binary): 2
Total multiclass runs: 54
Total binary runs: 37
Expected multiclass samples: ~43,254
Expected binary samples: ~30,117
Estimated generation time: ~0.7 hours
Output files will have '_quick' suffix


## Dataset Generation Functions

In [2]:
def run_simulation(fault_number, seed, duration_hours=DURATION_HOURS, 
                   fault_onset_hours=FAULT_ONSET_HOURS, record_interval=RECORD_INTERVAL):
    """
    Run a single TEP simulation with optional fault injection.
    
    Parameters:
    -----------
    fault_number : int
        Fault ID (0 = normal, 1-20 = faults)
    seed : int
        Random seed for reproducibility
    duration_hours : float
        Total simulation duration in hours
    fault_onset_hours : float
        Time at which to introduce fault (hours)
    record_interval : int
        Recording interval in seconds
        
    Returns:
    --------
    dict with keys: 'measurements', 'manipulated', 'time', 'shutdown'
    """
    np.random.seed(seed)

    sim = TEPSimulator(random_seed=seed)
    sim.initialize()
    
    # Set up disturbances
    if fault_number == 0:
        disturbances = None
    else:
        # Introduce fault at specified time
        disturbances = {fault_number: (fault_onset_hours, 1)}
    
    result = sim.simulate(
        duration_hours=duration_hours,
        disturbances=disturbances,
        record_interval=record_interval
    )
    
    return {
        'measurements': result.measurements,           # Shape: (samples, 41)
        'manipulated': result.manipulated_vars[:, :11], # Shape: (samples, 11) - use first 11
        'time': result.time,
        'shutdown': result.shutdown
    }


def simulation_to_dataframe(sim_result, fault_number, run_number, origin='new'):
    """
    Convert simulation result to DataFrame matching original data format.
    
    Columns: faultNumber, simulationRun, sample, xmeas_1...xmeas_41, xmv_1...xmv_11, origin, traj_key
    """
    n_samples = len(sim_result['time'])
    
    # Build data dictionary
    data = {
        'faultNumber': [fault_number] * n_samples,
        'simulationRun': [float(run_number)] * n_samples,
        'sample': list(range(1, n_samples + 1))  # 1-indexed like original
    }
    
    # Add XMEAS columns (41 measurements)
    for i in range(41):
        data[f'xmeas_{i+1}'] = sim_result['measurements'][:, i]
    
    # Add XMV columns (11 manipulated variables)
    for i in range(11):
        data[f'xmv_{i+1}'] = sim_result['manipulated'][:, i]
    
    # Add metadata
    data['origin'] = [origin] * n_samples
    data['traj_key'] = [f'{origin}_f{fault_number}_r{run_number}'] * n_samples
    
    return pd.DataFrame(data)


print("✓ Functions defined")

✓ Functions defined


## Test Single Simulation

In [3]:
print("Testing single simulation...")
test_start = time.time()

# Test normal operation
result_normal = run_simulation(fault_number=0, seed=SEED_OFFSET)
print(f"Normal simulation: {result_normal['measurements'].shape[0]} samples, "
      f"shutdown={result_normal['shutdown']}")

# Test fault 1
result_fault = run_simulation(fault_number=1, seed=SEED_OFFSET + 1)
print(f"Fault 1 simulation: {result_fault['measurements'].shape[0]} samples, "
      f"shutdown={result_fault['shutdown']}")

# Convert to DataFrame and verify format
df_test = simulation_to_dataframe(result_fault, fault_number=1, run_number=1)
print(f"\nDataFrame shape: {df_test.shape}")
print(f"Columns: {list(df_test.columns)[:8]}...{list(df_test.columns)[-3:]}")

test_time = time.time() - test_start
print(f"\n✓ Test complete in {test_time:.2f}s")
print(f"Estimated time for full dataset: {test_time * RUNS_PER_CLASS * NUM_CLASSES / 60:.1f} minutes")

Testing single simulation...
Normal simulation: 961 samples, shutdown=False
Fault 1 simulation: 961 samples, shutdown=False

DataFrame shape: (961, 57)
Columns: ['faultNumber', 'simulationRun', 'sample', 'xmeas_1', 'xmeas_2', 'xmeas_3', 'xmeas_4', 'xmeas_5']...['xmv_11', 'origin', 'traj_key']

✓ Test complete in 51.55s
Estimated time for full dataset: 46.4 minutes


## Generate Multiclass Evaluation Dataset

Generate desired runs per fault class (18 classes). Use only post-fault samples (161-960) for balanced classes.

In [4]:
print("="*60)
print("Generating Multiclass Evaluation Dataset")
print("="*60)

start_time = time.time()
all_dfs = []
total_runs = RUNS_PER_CLASS * NUM_CLASSES
completed = 0

for fault_idx, fault_number in enumerate(FAULT_CLASSES):
    fault_start = time.time()
    print(f"\nFault {fault_number} ({fault_idx+1}/{NUM_CLASSES})...")
    
    for run in range(1, RUNS_PER_CLASS + 1):
        # Unique seed for each simulation
        seed = SEED_OFFSET + fault_number * 1000 + run
        
        try:
            result = run_simulation(fault_number=fault_number, seed=seed)
            df = simulation_to_dataframe(result, fault_number, run, origin='new_eval')
            
            # For multiclass: use only post-fault samples (161 onwards)
            # This gives 800 samples per run (961 - 161 = 800)
            df_post_fault = df[df['sample'] >= FAULT_ONSET_SAMPLE].copy()
            all_dfs.append(df_post_fault)
            
        except Exception as e:
            print(f"  Warning: Fault {fault_number} run {run} failed: {e}")
        
        completed += 1
        if run % 50 == 0:
            elapsed = time.time() - start_time
            eta = elapsed / completed * (total_runs - completed)
            print(f"  Run {run}/{RUNS_PER_CLASS} - "
                  f"Progress: {completed}/{total_runs} ({100*completed/total_runs:.1f}%) - "
                  f"ETA: {eta/60:.1f} min")
    
    fault_time = time.time() - fault_start
    print(f"  Completed in {fault_time:.1f}s")

# Combine all DataFrames
print("\nCombining DataFrames...")
multiclass_df = pd.concat(all_dfs, ignore_index=True)

total_time = time.time() - start_time
print(f"\n✓ Multiclass dataset generated in {total_time/60:.1f} minutes")
print(f"  Shape: {multiclass_df.shape}")
print(f"  Samples per class: {multiclass_df.groupby('faultNumber').size().values}")

Generating Multiclass Evaluation Dataset

Fault 0 (1/18)...
  Completed in 77.5s

Fault 1 (2/18)...
  Completed in 77.7s

Fault 2 (3/18)...
  Completed in 77.6s

Fault 4 (4/18)...
  Completed in 77.8s

Fault 5 (5/18)...
  Completed in 77.7s

Fault 6 (6/18)...
  Completed in 22.5s

Fault 7 (7/18)...
  Completed in 77.6s

Fault 8 (8/18)...
  Completed in 77.6s

Fault 10 (9/18)...
  Completed in 77.2s

Fault 11 (10/18)...
  Completed in 77.1s

Fault 12 (11/18)...
  Completed in 61.4s

Fault 13 (12/18)...
  Completed in 77.5s

Fault 14 (13/18)...
  Completed in 77.4s

Fault 16 (14/18)...
  Completed in 77.4s

Fault 17 (15/18)...
  Completed in 77.2s

Fault 18 (16/18)...
  Completed in 38.4s

Fault 19 (17/18)...
  Completed in 77.4s

Fault 20 (18/18)...
  Completed in 77.4s

Combining DataFrames...

✓ Multiclass dataset generated in 21.4 minutes
  Shape: (39157, 57)
  Samples per class: [2403 2403 2403 2403 2403  356 2403 2403 2403 2403 1800 2403 2403 2403
 2403  956 2403 2403]


In [5]:
# Save multiclass dataset
output_file = DATA_DIR / f'new_multiclass_eval{FILE_SUFFIX}.csv'
print(f"Saving multiclass dataset to {output_file}...")
multiclass_df.to_csv(output_file, index=False)
file_size = output_file.stat().st_size
if file_size > 1e9:
    print(f"✓ Saved ({file_size / 1e9:.2f} GB)")
else:
    print(f"✓ Saved ({file_size / 1e6:.1f} MB)")

# Verify class balance
print("\nClass distribution:")
print(multiclass_df['faultNumber'].value_counts().sort_index())

Saving multiclass dataset to ../data/new_multiclass_eval_quick.csv...
✓ Saved (38.9 MB)

Class distribution:
faultNumber
0     2403
1     2403
2     2403
4     2403
5     2403
6      356
7     2403
8     2403
10    2403
11    2403
12    1800
13    2403
14    2403
16    2403
17    2403
18     956
19    2403
20    2403
Name: count, dtype: int64


## Generate Binary Evaluation Dataset

Generate:
- 300 normal runs (all samples)
- 75 runs per fault (17 faults, post-fault samples only)

In [6]:
print("="*60)
print("Generating Binary Evaluation Dataset")
print("="*60)

start_time = time.time()
binary_dfs = []

# Generate normal runs (all samples)
print("\nGenerating normal runs (120 runs, all samples)...")
for run in range(1, NORMAL_RUNS_BINARY + 1):
    seed = SEED_OFFSET + 50000 + run  # Different seed range from multiclass
    
    try:
        result = run_simulation(fault_number=0, seed=seed)
        df = simulation_to_dataframe(result, fault_number=0, run_number=run, origin='new_binary')
        binary_dfs.append(df)
    except Exception as e:
        print(f"  Warning: Normal run {run} failed: {e}")
    
    if run % 30 == 0:
        print(f"  Progress: {run}/{NORMAL_RUNS_BINARY}")

print(f"  ✓ Normal runs complete")

# Generate fault runs (post-fault samples only)
fault_classes_no_normal = [f for f in FAULT_CLASSES if f != 0]
print(f"\nGenerating fault runs ({len(fault_classes_no_normal)} faults × {FAULT_RUNS_BINARY} runs)...")

for fault_idx, fault_number in enumerate(fault_classes_no_normal):
    for run in range(1, FAULT_RUNS_BINARY + 1):
        seed = SEED_OFFSET + 60000 + fault_number * 100 + run
        
        try:
            result = run_simulation(fault_number=fault_number, seed=seed)
            df = simulation_to_dataframe(result, fault_number=fault_number, run_number=run, origin='new_binary')
            # Use only post-fault samples
            df_post_fault = df[df['sample'] >= FAULT_ONSET_SAMPLE].copy()
            binary_dfs.append(df_post_fault)
        except Exception as e:
            print(f"  Warning: Fault {fault_number} run {run} failed: {e}")
    
    if (fault_idx + 1) % 5 == 0:
        print(f"  Progress: {fault_idx+1}/{len(fault_classes_no_normal)} faults")

# Combine
print("\nCombining DataFrames...")
binary_df = pd.concat(binary_dfs, ignore_index=True)

total_time = time.time() - start_time
print(f"\n✓ Binary dataset generated in {total_time/60:.1f} minutes")
print(f"  Shape: {binary_df.shape}")

Generating Binary Evaluation Dataset

Generating normal runs (120 runs, all samples)...
  ✓ Normal runs complete

Generating fault runs (17 faults × 2 runs)...
  Progress: 5/17 faults
  Progress: 10/17 faults
  Progress: 15/17 faults

Combining DataFrames...

✓ Binary dataset generated in 15.0 minutes
  Shape: (27846, 57)


In [7]:
# Add binary label column
binary_df['label'] = (binary_df['faultNumber'] != 0).astype(int)

# Save binary dataset
output_file = DATA_DIR / f'new_binary_eval{FILE_SUFFIX}.csv'
print(f"Saving binary dataset to {output_file}...")
binary_df.to_csv(output_file, index=False)
print(f"✓ Saved ({output_file.stat().st_size / 1e6:.1f} MB)")

# Statistics
n_normal = (binary_df['label'] == 0).sum()
n_fault = (binary_df['label'] == 1).sum()
print(f"\nBinary distribution:")
print(f"  Normal: {n_normal:,} ({100*n_normal/len(binary_df):.1f}%)")
print(f"  Fault:  {n_fault:,} ({100*n_fault/len(binary_df):.1f}%)")

Saving binary dataset to ../data/new_binary_eval_quick.csv...
✓ Saved (27.9 MB)

Binary distribution:
  Normal: 2,883 (10.4%)
  Fault:  24,963 (89.6%)


## Verify Dataset Quality

In [8]:
print("="*60)
print("Dataset Verification")
print("="*60)

# Load and verify multiclass
print("\nMulticlass Dataset:")
mc = pd.read_csv(DATA_DIR / f'new_multiclass_eval{FILE_SUFFIX}.csv')
print(f"  Shape: {mc.shape}")
print(f"  Classes: {sorted(mc['faultNumber'].unique())}")
print(f"  Samples per class: {mc.groupby('faultNumber').size().min()} - {mc.groupby('faultNumber').size().max()}")
print(f"  Missing values: {mc.isnull().sum().sum()}")

# Load and verify binary
print("\nBinary Dataset:")
bn = pd.read_csv(DATA_DIR / f'new_binary_eval{FILE_SUFFIX}.csv')
print(f"  Shape: {bn.shape}")
print(f"  Normal samples: {(bn['label'] == 0).sum():,}")
print(f"  Fault samples: {(bn['label'] == 1).sum():,}")
print(f"  Missing values: {bn.isnull().sum().sum()}")

# Compare with original test sets (only in full mode)
if not QUICK_MODE:
    print("\nComparison with Original Test Sets:")
    orig_mc = pd.read_csv(DATA_DIR / 'multiclass_test.csv')
    orig_bn = pd.read_csv(DATA_DIR / 'binary_test.csv')
    print(f"  Original multiclass: {orig_mc.shape}")
    print(f"  New multiclass:      {mc.shape}")
    print(f"  Original binary:     {orig_bn.shape}")
    print(f"  New binary:          {bn.shape}")

print("\n" + "="*60)
if QUICK_MODE:
    print("✓ Quick Dataset Generation Complete!")
    print(f"  Files saved with '{FILE_SUFFIX}' suffix")
else:
    print("✓ Dataset Generation Complete!")
print("="*60)

Dataset Verification

Multiclass Dataset:
  Shape: (39157, 57)
  Classes: [np.int64(0), np.int64(1), np.int64(2), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20)]
  Samples per class: 356 - 2403
  Missing values: 0

Binary Dataset:
  Shape: (27846, 58)
  Normal samples: 2,883
  Fault samples: 24,963
  Missing values: 0

✓ Quick Dataset Generation Complete!
  Files saved with '_quick' suffix
