In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

print("=" * 60, "PART 1: LOADING DATA", "=" * 60, sep="\n")


def _print_window_summary(split_name, windows):
    """Print the shape of one window array, one line per axis.

    The axis labels follow the way the rest of the notebook indexes these
    arrays: axis 0 = windows, axis 1 = timesteps, axis 2 = features.
    """
    print(f"\n{split_name} windows: {windows.shape}")
    print(f"  - Number of windows: {windows.shape[0]:,}")
    print(f"  - Timesteps per window: {windows.shape[1]}")
    print(f"  - Features per timestep: {windows.shape[2]}")


# Read both pre-built window arrays from disk before reporting anything,
# so a missing file fails fast.
X_train = np.load('dataset/X_train.npy')
X_test = np.load('dataset/X_test.npy')

_print_window_summary("Training", X_train)
_print_window_summary("Test", X_test)

print("\n✓ Data loaded successfully!")

PART 1: LOADING DATA

Training windows: (410623, 60, 5)
  - Number of windows: 410,623
  - Timesteps per window: 60
  - Features per timestep: 5

Test windows: (105936, 60, 5)
  - Number of windows: 105,936
  - Timesteps per window: 60
  - Features per timestep: 5

✓ Data loaded successfully!


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print("=" * 60, "PART 2: BUILDING LSTM AUTOENCODER", "=" * 60, sep="\n")

# Input window geometry used by the model definition.
#   TIMESTEPS - length of one window (the notebook describes this as 60 seconds)
#   FEATURES  - values per timestep: HR, HR_change, HR_rolling_mean,
#               HR_rolling_std, HR_deviation
TIMESTEPS, FEATURES = 60, 5

def build_autoencoder(timesteps=None, features=None, encoder_units=(64, 32),
                      latent_dim=16, dropout_rate=0.2, activation='relu'):
    """
    Build an (uncompiled) LSTM autoencoder.

    Encoder: stacked LSTM layers compress a window into a small summary vector.
    Decoder: mirrored LSTM layers try to recreate the original window from it.

    Called with no arguments this builds exactly the original architecture:
    LSTM(64) -> LSTM(32) -> Dense(16) -> RepeatVector -> LSTM(32) -> LSTM(64)
    -> TimeDistributed(Dense(FEATURES)), with Dropout(0.2) after every LSTM.

    Args:
        timesteps: Window length. Defaults to the module-level TIMESTEPS,
            looked up when the function is called.
        features: Values per timestep. Defaults to the module-level FEATURES,
            looked up when the function is called.
        encoder_units: LSTM widths for the encoder, outermost first. The
            decoder uses the same widths in reverse order.
        latent_dim: Size of the bottleneck Dense layer.
        dropout_rate: Rate of the Dropout layer placed after every LSTM.
        activation: Activation of the LSTM layers (the bottleneck Dense layer
            always uses 'relu'). NOTE(review): the Keras LSTM documentation
            lists activation='tanh' as a requirement for the fast cuDNN
            kernel, so the default 'relu' likely trains on the slower generic
            implementation when a GPU is used - pass 'tanh' to try the fast path.

    Returns:
        A keras.Model mapping (timesteps, features) windows to reconstructions
        of the same shape. The caller is responsible for compiling it.

    Raises:
        ValueError: If encoder_units is empty.
    """
    # Resolve the defaults at call time so edits to the constants are honoured.
    if timesteps is None:
        timesteps = TIMESTEPS
    if features is None:
        features = FEATURES

    encoder_units = tuple(encoder_units)
    if not encoder_units:
        raise ValueError("encoder_units must contain at least one layer size")

    # INPUT: timesteps × features
    input_layer = layers.Input(shape=(timesteps, features))

    # ============ ENCODER (Compressor) ============
    encoded = input_layer
    last_depth = len(encoder_units) - 1
    for depth, units in enumerate(encoder_units):
        # Only the final encoder LSTM collapses the sequence to a single vector.
        encoded = layers.LSTM(
            units,
            activation=activation,
            return_sequences=depth < last_depth,
        )(encoded)
        encoded = layers.Dropout(dropout_rate)(encoded)  # Prevents overfitting

    # BOTTLENECK: small summary vector representing the whole window
    bottleneck = layers.Dense(latent_dim, activation='relu')(encoded)

    # ============ DECODER (Recreator) ============
    # Repeat the summary once per timestep so the decoder LSTMs get a sequence.
    decoded = layers.RepeatVector(timesteps)(bottleneck)
    for units in reversed(encoder_units):
        decoded = layers.LSTM(units, activation=activation, return_sequences=True)(decoded)
        decoded = layers.Dropout(dropout_rate)(decoded)

    # OUTPUT: one linear Dense applied per timestep -> timesteps × features
    output_layer = layers.TimeDistributed(layers.Dense(features))(decoded)

    # Build the model
    autoencoder = keras.Model(inputs=input_layer, outputs=output_layer)

    return autoencoder

# Seed Python, NumPy and TensorFlow BEFORE the layers are created, so weight
# initialisation, dropout masks and fit()'s batch shuffling start from the
# same state on every "Restart & Run All". Without this the trained weights,
# and the anomaly threshold derived from them, change from run to run.
# NOTE(review): some GPU kernels are still non-deterministic; the seed makes
# runs repeatable in setup, not necessarily bit-identical.
SEED = 42
keras.utils.set_random_seed(SEED)

# Create the model
model = build_autoencoder()

# Compile (prepare for training)
model.compile(
    optimizer='adam',  # Adam optimizer
    loss='mse'         # MSE = Mean Squared Error (measures reconstruction error)
)

# Show model structure
print("\nModel Architecture:")
model.summary()

print("\n✓ Model built successfully!")

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

print("=" * 60, "PART 3: TRAINING THE MODEL", "=" * 60, sep="\n")

# Order-preserving 90/10 hold-out (no shuffling): the leading 90% of the
# windows are fitted on, the trailing 10% are used only for validation.
split_index = int(0.9 * len(X_train))
X_train_split, X_val_split = X_train[:split_index], X_train[split_index:]

print(f"\nTraining samples: {len(X_train_split):,}")
print(f"Validation samples: {len(X_val_split):,}")

# Both callbacks watch the validation loss:
#   - early_stopping ends training after 5 epochs without improvement and
#     restores the best weights seen;
#   - model_checkpoint writes the best model so far to 'best_model.keras'.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)

print(
    "\nStarting training...",
    "This may take 10-30 minutes depending on your computer!",
    "-" * 60,
    sep="\n",
)

# Autoencoder training: the target is the input itself, for both the fitted
# data and the validation data.
history = model.fit(
    x=X_train_split,
    y=X_train_split,
    validation_data=(X_val_split, X_val_split),
    epochs=50,        # upper bound; early stopping usually ends sooner
    batch_size=32,    # windows per gradient step
    callbacks=[early_stopping, model_checkpoint],
    verbose=1,        # show per-epoch progress
)

print("\n✓ Training complete!", "✓ Best model saved as 'best_model.keras'", sep="\n")

In [None]:
print("=" * 60, "PART 4: VISUALIZING TRAINING", "=" * 60, sep="\n")

# Two side-by-side panels showing the same loss curves: linear y-axis on the
# left, logarithmic y-axis on the right.
fig, (ax_linear, ax_log) = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (ax_linear, 'Model Training Progress'),
    (ax_log, 'Model Training Progress (Log Scale)'),
)
for ax, panel_title in panels:
    ax.plot(history.history['loss'], label='Training Loss', linewidth=2)
    ax.plot(history.history['val_loss'], label='Validation Loss', linewidth=2)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss (MSE)')
    ax.set_title(panel_title)
    if ax is ax_log:
        # Only the right-hand panel switches scale; the left keeps the default.
        ax.set_yscale('log')
    ax.legend()
    ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('training_history.png', dpi=150)
plt.show()

print("\n✓ Training history plotted!", "✓ Saved as 'training_history.png'", sep="\n")

In [None]:
print("=" * 60)
print("PART 5: CALCULATING RECONSTRUCTION ERRORS")
print("=" * 60)

# Inference batch size. predict() defaults to 32, which means well over ten
# thousand tiny batches for the training set; a larger batch only reduces
# per-batch overhead (results agree up to floating-point rounding).
PREDICT_BATCH_SIZE = 256


def reconstruction_errors(autoencoder, windows, batch_size=PREDICT_BATCH_SIZE):
    """Reconstruct `windows` and score each window by its mean squared error.

    Args:
        autoencoder: Model whose predict() returns an array shaped like `windows`.
        windows: Array indexed (window, timestep, feature).
        batch_size: Batch size forwarded to predict().

    Returns:
        (reconstructions, errors) where errors[i] is the mean of
        (actual - predicted)^2 over all timesteps and features of window i.

    Raises:
        ValueError: If any error is NaN or infinite. A non-finite error would
            otherwise turn the percentile threshold into NaN, and every
            `error > threshold` comparison would then silently be False.
    """
    reconstructions = autoencoder.predict(windows, batch_size=batch_size, verbose=1)
    errors = np.mean(np.square(windows - reconstructions), axis=(1, 2))
    if not np.all(np.isfinite(errors)):
        raise ValueError(
            "Reconstruction errors contain NaN/inf values - the model or the "
            "input windows are numerically broken; retrain before thresholding."
        )
    return reconstructions, errors


def print_error_stats(label, errors):
    """Print mean / std / min / max of one set of per-window errors."""
    print(f"\n{label} error statistics:")
    print(f"  Mean error: {errors.mean():.4f}")
    print(f"  Std error: {errors.std():.4f}")
    print(f"  Min error: {errors.min():.4f}")
    print(f"  Max error: {errors.max():.4f}")


# Load best model (the checkpoint written during training)
model = keras.models.load_model('best_model.keras')

print("\nCalculating errors on TRAINING data...")
train_predictions, train_errors = reconstruction_errors(model, X_train)
print_error_stats("Training", train_errors)

print("\nCalculating errors on TEST data...")
test_predictions, test_errors = reconstruction_errors(model, X_test)
print_error_stats("Test", test_errors)

print("\n✓ Errors calculated!")

In [None]:
print("=" * 60)
print("PART 6: SETTING THRESHOLD")
print("=" * 60)


def _require_usable_errors(name, errors):
    """Fail loudly on empty or non-finite error arrays.

    An empty array would make the anomaly-rate division fail, and a NaN would
    make the percentile NaN so that no window is ever flagged.
    """
    if len(errors) == 0:
        raise ValueError(f"{name} is empty - nothing to threshold")
    if not np.all(np.isfinite(errors)):
        raise ValueError(f"{name} contains NaN/inf values - cannot set a threshold")


_require_usable_errors("train_errors", train_errors)
_require_usable_errors("test_errors", test_errors)

# Set threshold at the 95th percentile of training errors
# Meaning: 95% of training windows have an error below this
THRESHOLD_PERCENTILE = 95
threshold = np.percentile(train_errors, THRESHOLD_PERCENTILE)

print(f"\nThreshold ({THRESHOLD_PERCENTILE}th percentile): {threshold:.4f}")
print(f"Meaning: Any error above {threshold:.4f} is considered unusual")

# Count anomalies in test set
test_anomalies = test_errors > threshold
n_anomalies = test_anomalies.sum()

print(f"\nTest set results:")
print(f"  Total windows: {len(test_errors):,}")
print(f"  Anomalies detected: {n_anomalies:,}")
print(f"  Anomaly rate: {(n_anomalies/len(test_errors))*100:.2f}%")

# Visualize errors
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
# Use ONE set of bin edges for both histograms and normalise to density:
# the two sets differ in size, so independently-binned raw counts would not
# be comparable when overlaid.
shared_bins = np.histogram_bin_edges(np.concatenate([train_errors, test_errors]), bins=50)
plt.hist(train_errors, bins=shared_bins, density=True, alpha=0.7, label='Training', color='blue')
plt.hist(test_errors, bins=shared_bins, density=True, alpha=0.7, label='Test', color='green')
plt.axvline(threshold, color='red', linestyle='--', linewidth=2, label='Threshold')
plt.xlabel('Reconstruction Error')
plt.ylabel('Density')
plt.title('Distribution of Reconstruction Errors')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.plot(test_errors, alpha=0.7, label='Test Error', linewidth=0.5)
plt.axhline(threshold, color='red', linestyle='--', linewidth=2, label='Threshold')
# Mark every test window whose error exceeds the threshold.
plt.scatter(np.flatnonzero(test_anomalies), test_errors[test_anomalies],
            color='red', label='Anomalies', s=10, alpha=0.6)
plt.xlabel('Window Index')
plt.ylabel('Reconstruction Error')
plt.title('Test Set Reconstruction Errors')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('anomaly_detection_results.png', dpi=150)
plt.show()

print("\n✓ Threshold set!")
print("✓ Results saved as 'anomaly_detection_results.png'")

In [None]:
from pathlib import Path

print("=" * 60)
print("PART 7: SAVING FOR DEPLOYMENT")
print("=" * 60)

# Save threshold
np.save('threshold.npy', threshold)
print(f"✓ Saved threshold: {threshold:.4f}")

# The model and the scaler are NOT written by this cell (the model comes from
# the training checkpoint, the scaler from an earlier step), so confirm they
# are really on disk before reporting them as saved.
expected_artifacts = (
    ('Model', 'best_model.keras'),
    ('Scaler', 'scaler.pkl'),
)
missing_files = []
for artifact_label, artifact_path in expected_artifacts:
    if Path(artifact_path).is_file():
        print(f"✓ {artifact_label} saved as: {artifact_path}")
    else:
        print(f"✗ {artifact_label} NOT FOUND: {artifact_path}")
        missing_files.append(artifact_path)

print("\n" + "=" * 60)
if missing_files:
    # Do not claim readiness when a deployment file is absent.
    print("NOT READY: MISSING DEPLOYMENT FILES")
    print("=" * 60)
    print("\nMissing files (expected in the current working directory):")
    for artifact_path in missing_files:
        print(f"  - {artifact_path}")
else:
    print("ALL DONE! MODEL IS READY!")
    print("=" * 60)

    print("\nFiles you have:")
    print("  1. best_model.keras - The trained LSTM model")
    print("  2. scaler.pkl - For normalizing new data")
    print("  3. threshold.npy - The anomaly detection threshold")

    print("\nYou can now use these in a smartwatch app!")