# Notebook 1: Train and Save Isolation Forest Model
## SWaT Dataset - SCADA Anomaly Detection

**Purpose:** Train the model ONCE and save it for reuse

**Run this notebook:**
- Once initially
- When you want to retrain with new data
- When SCADA operations change significantly

---

## Step 1: Mount Google Drive

In [None]:
# Colab-only step: attach Google Drive at the mount point used by the
# notebook's data and model paths.
from google.colab import drive

_DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(_DRIVE_MOUNT_POINT)
print("Google Drive mounted successfully!")

## Step 2: Configuration

In [None]:
# =============================================================
# CONFIGURATION
# =============================================================

# Single root for every input/output path below. Change it here when the
# project folder moves instead of editing each path separately.
PROJECT_DIR = '/content/drive/MyDrive/4th7thsemproject'

# Folder that receives the trained artifacts (model, scaler, feature list).
MODELS_DIR = f'{PROJECT_DIR}/models'

# Path to your SWaT training data (normal operations)
TRAIN_DATA_PATH = f'{PROJECT_DIR}/SWaT_Normal.csv'

# Path to save the trained model
MODEL_SAVE_PATH = f'{MODELS_DIR}/isolation_forest_model.pkl'

# Path to save the scaler (IMPORTANT - needed for predictions!)
SCALER_SAVE_PATH = f'{MODELS_DIR}/scaler.pkl'

# Path to save feature column names
FEATURES_SAVE_PATH = f'{MODELS_DIR}/feature_columns.pkl'

# =============================================================
# MODEL PARAMETERS
# =============================================================

# Contamination: Expected proportion of anomalies in training data
# For SWaT normal data, this should be very low (0.01 = 1%)
CONTAMINATION = 0.01

# Number of trees in the forest
N_ESTIMATORS = 100

# Random state for reproducibility
RANDOM_STATE = 42

# =============================================================
# COLUMN CONFIGURATION
# =============================================================

# Columns to EXCLUDE from training (not sensor data).
# Keep this list in sync with the label-column candidates checked in
# Step 4: lowercase 'attack' is recognised there as a label, so it must be
# excluded here too or a numeric label column would be used as a feature.
EXCLUDE_COLUMNS = [
    'Timestamp',
    ' Timestamp',
    'datetime',
    'Normal/Attack',
    ' Normal/Attack',
    'label',
    'Label',
    'Attack',
    'attack',
]

print("Configuration loaded!")
print(f"Training data: {TRAIN_DATA_PATH}")
print(f"Model will be saved to: {MODEL_SAVE_PATH}")

## Step 3: Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import joblib
import os
from datetime import datetime
import warnings
# NOTE(review): this installs a blanket 'ignore' filter, so every warning
# category is suppressed from here on, including warnings raised by pandas
# and scikit-learn. Consider narrowing it to specific categories or modules.
warnings.filterwarnings('ignore')

# Confirmation message once the import cell has run to completion.
print("All libraries imported successfully!")

## Step 4: Load Training Data

In [None]:
# Read the training CSV into a DataFrame and report its dimensions and
# in-memory footprint.
print(f"Loading training data from: {TRAIN_DATA_PATH}")
print("This may take a moment for large files...")

train_df = pd.read_csv(TRAIN_DATA_PATH)

row_count, column_count = train_df.shape
memory_mb = train_df.memory_usage(deep=True).sum() / 1024**2

print(f"\n‚úÖ Data loaded successfully!")
print(f"   Shape: {row_count:,} rows √ó {column_count} columns")
print(f"   Memory usage: {memory_mb:.2f} MB")

In [None]:
# Quick look at the loaded frame: leading rows, header names, and the
# dtypes pandas inferred for each column.
leading_rows = train_df.head(5)
print("First 5 rows:")
display(leading_rows)

header_names = train_df.columns.tolist()
print("\nColumn names:")
print(header_names)

inferred_dtypes = train_df.dtypes
print("\nData types:")
print(inferred_dtypes)

In [None]:
# Look for a label column among the known header spellings; the first
# candidate present in the frame wins, otherwise label_col stays None.
_label_candidates = ('Normal/Attack', ' Normal/Attack', 'label', 'Label', 'Attack', 'attack')
label_col = next(
    (candidate for candidate in _label_candidates if candidate in train_df.columns),
    None,
)

if label_col is not None:
    # Show the class distribution so unexpected attack rows are visible.
    print(f"Found label column: '{label_col}'")
    print(f"Value counts:")
    print(train_df[label_col].value_counts())
else:
    print("No label column found - assuming all data is normal (which is correct for training!)")

## Step 5: Prepare Features

In [None]:
# Identify feature columns (exclude non-sensor columns).
# Names are compared after stripping surrounding whitespace and lowercasing,
# so header variants such as ' Timestamp', 'LABEL' or 'attack' are excluded
# even when that exact spelling is not listed in EXCLUDE_COLUMNS.
_excluded_names = {str(name).strip().lower() for name in EXCLUDE_COLUMNS}
feature_columns = [
    col for col in train_df.columns
    if str(col).strip().lower() not in _excluded_names
]

# Also exclude any non-numeric columns
numeric_features = train_df[feature_columns].select_dtypes(include=[np.number]).columns.tolist()

# Fail here with a clear message rather than later inside the scaler.
if not numeric_features:
    raise ValueError(
        "No numeric feature columns left after applying EXCLUDE_COLUMNS; "
        "check the CSV header and the inferred column dtypes."
    )

print(f"Total columns: {len(train_df.columns)}")
print(f"Feature columns selected: {len(numeric_features)}")
print(f"\nFeatures to be used for training:")
for i, col in enumerate(numeric_features, 1):
    print(f"  {i:2d}. {col}")

In [None]:
# Extract feature matrix.
# Request an explicit float64 *copy*: with pandas Copy-on-Write the array
# returned by `.values` can be a read-only view of the frame (when all
# selected columns share one dtype), which would break the in-place NaN
# imputation in the next cell. A float array also guarantees NaN is
# representable.
X_train = train_df[numeric_features].to_numpy(dtype=np.float64, copy=True)

print(f"Feature matrix shape: {X_train.shape}")
print(f"  - {X_train.shape[0]:,} samples")
print(f"  - {X_train.shape[1]} features")

In [None]:
# Check for missing values and replace each NaN with the mean of its column.
missing_mask = np.isnan(X_train)
missing = int(missing_mask.sum())
if missing > 0:
    print(f"‚ö†Ô∏è Found {missing} missing values")
    print("Filling with column means...")
    col_means = np.nanmean(X_train, axis=0)

    # A column with no valid value has a NaN mean, so imputation would leave
    # its NaNs in place; report it by name instead of failing later.
    empty_columns = [numeric_features[idx] for idx in np.flatnonzero(np.isnan(col_means))]
    if empty_columns:
        raise ValueError(
            f"Cannot impute columns that contain only missing values: {empty_columns}"
        )

    # np.where builds a new array (col_means broadcasts across rows), so this
    # works even if X_train is a read-only view and never writes through to
    # the source DataFrame.
    X_train = np.where(missing_mask, col_means, X_train)
    print("‚úÖ Missing values handled")
else:
    print("‚úÖ No missing values found")

## Step 6: Scale Features

In [None]:
# Fit a StandardScaler on the training matrix and keep the scaled copy
# for model training.
print("Fitting StandardScaler...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

print("‚úÖ Scaler fitted!")
print(f"\nScaling statistics (first 5 features):")
# Show at most five features, pairing each name with its fitted statistics.
_preview = zip(numeric_features[:5], scaler.mean_[:5], scaler.scale_[:5])
for feature_name, feature_mean, feature_std in _preview:
    print(f"  {feature_name}: mean={feature_mean:.4f}, std={feature_std:.4f}")

## Step 7: Train Isolation Forest Model

In [None]:
# Build the Isolation Forest from the configured hyper-parameters, fit it on
# the scaled training matrix, and record how long the fit took.
forest_params = {
    "n_estimators": N_ESTIMATORS,
    "contamination": CONTAMINATION,
    "random_state": RANDOM_STATE,
}

print("Initializing Isolation Forest model...")
for param_name, param_value in forest_params.items():
    print(f"  - {param_name}: {param_value}")

# n_jobs=-1 and verbose=0 are fixed here rather than configurable.
clf = IsolationForest(n_jobs=-1, verbose=0, **forest_params)

print("\nüöÄ Training model... (this may take a few minutes)")
start_time = datetime.now()

clf.fit(X_train_scaled)

end_time = datetime.now()
elapsed = end_time - start_time
training_time = elapsed.total_seconds()

print(f"\n‚úÖ Model trained successfully!")
print(f"   Training time: {training_time:.2f} seconds")

In [None]:
# Sanity check: score the training set itself and compare the share of
# flagged rows with the configured contamination.
print("Running quick validation on training data...")
train_predictions = clf.predict(X_train_scaled)

# predict() labels are compared against +1 (normal) and -1 (anomaly).
n_normal = (train_predictions == 1).sum()
n_anomaly = (train_predictions == -1).sum()

total_predictions = len(train_predictions)
normal_pct = n_normal/total_predictions*100
anomaly_pct = n_anomaly/total_predictions*100

print(f"\nTraining data predictions:")
print(f"  Normal:  {n_normal:,} ({normal_pct:.2f}%)")
print(f"  Anomaly: {n_anomaly:,} ({anomaly_pct:.2f}%)")
print(f"\nExpected anomaly rate: {CONTAMINATION*100:.2f}%")
print(f"Actual anomaly rate:   {anomaly_pct:.2f}%")

## Step 8: Save Model, Scaler, and Feature Names

In [None]:
# Persist the trained model, the fitted scaler, and the ordered feature list.
# Each entry: (label used in "Saving ..." message, label used in "saved!"
# message, object to dump, destination path).
_artifacts = [
    ("model", "Model", clf, MODEL_SAVE_PATH),
    ("scaler", "Scaler", scaler, SCALER_SAVE_PATH),
    ("feature columns", "Feature columns", numeric_features, FEATURES_SAVE_PATH),
]

# Create the parent directory of EVERY artifact, not only the model's: the
# three paths are configured independently and may point to different folders.
model_dir = os.path.dirname(MODEL_SAVE_PATH)
_target_dirs = dict.fromkeys(os.path.dirname(path) for _, _, _, path in _artifacts)
for target_dir in _target_dirs:
    if target_dir and not os.path.isdir(target_dir):
        # exist_ok avoids a failure if the folder appears between check and create.
        os.makedirs(target_dir, exist_ok=True)
        print(f"Created directory: {target_dir}")

# Dump each artifact with joblib, reporting progress per file.
for saving_label, saved_label, artifact, path in _artifacts:
    print(f"\nSaving {saving_label} to: {path}")
    joblib.dump(artifact, path)
    print(f"‚úÖ {saved_label} saved!")

In [None]:
# Confirm that each artifact exists on disk and report its size in KB.
_banner = "=" * 60
print("\n" + _banner)
print("SAVED FILES VERIFICATION")
print(_banner)

files_to_check = [
    ("Model", MODEL_SAVE_PATH),
    ("Scaler", SCALER_SAVE_PATH),
    ("Features", FEATURES_SAVE_PATH),
]

for name, path in files_to_check:
    # Report a missing file and move on to the next artifact.
    if not os.path.exists(path):
        print(f"‚ùå {name}: FILE NOT FOUND - {path}")
        continue
    size = os.path.getsize(path) / 1024
    print(f"‚úÖ {name}: {path}")
    print(f"   Size: {size:.2f} KB")

## Step 9: Training Summary

In [None]:
# Final recap: dataset size, model settings, and where the artifacts were
# written. The lines are assembled first and emitted in a single print.
_rule = "=" * 60
_sample_count, _feature_count = X_train.shape[0], X_train.shape[1]

_summary_lines = [
    "\n" + _rule,
    "TRAINING COMPLETE - SUMMARY",
    _rule,
    "\nüìä Dataset:",
    f"   - Training samples: {_sample_count:,}",
    f"   - Features used: {_feature_count}",
    "\nü§ñ Model Configuration:",
    "   - Algorithm: Isolation Forest",
    f"   - Trees: {N_ESTIMATORS}",
    f"   - Contamination: {CONTAMINATION}",
    f"   - Training time: {training_time:.2f} seconds",
    "\nüíæ Saved Files:",
    f"   - Model: {MODEL_SAVE_PATH}",
    f"   - Scaler: {SCALER_SAVE_PATH}",
    f"   - Features: {FEATURES_SAVE_PATH}",
    "\n‚úÖ Next Step: Run '02_Inference_IsolationForest_SWaT.ipynb' to make predictions!",
    _rule,
]
print("\n".join(_summary_lines))