# LSTM + Decision Tree Integration Demo

This notebook demonstrates a full pipeline for hard drive failure prediction using both LSTM (deep learning) and Decision Tree (CT) models.

You will:
- Train and test the LSTM model
- Train and test the Decision Tree model on raw data
- Generate LSTM predictions for all drives
- Test a random drive using the LSTM model
- Feed the LSTM results into the Decision Tree and analyze the outcome

**Requirements:**  
- All dependencies installed (see `requirements.txt`)
- Data available in `../../data/data_Q1_2025/`
- LSTM and CT code available in `notebooks/LSTM/smart.py` and `notebooks/CT/CT.py`


In [8]:
# Step 1: Setup and Imports

import os
import sys
from typing import Optional
import pandas as pd
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import joblib
import json
from datetime import datetime


# Add project root and submodules to sys.path for imports
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(project_root)
sys.path.append(os.path.join(project_root, "notebooks/LSTM"))
sys.path.append(os.path.join(project_root, "notebooks/CT"))

import notebooks.LSTM.smart as smart
import notebooks.CT.CT as CT

# Set device for torch: use GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## Step 2: Train and Test the LSTM Model

We will train the LSTM model.
If a trained model already exists, this step will load it instead of retraining.

In [18]:
# Define paths
lstm_model_path = os.path.join(project_root, "models/LSTM/lstm_model.joblib")
lstm_data_path = os.path.join(project_root, "data/data_Q1_2025")
lstm_output_dir = os.path.join(os.getcwd(), "lstm_demo_output")

# Train or load the LSTM model
if os.path.exists(lstm_model_path) or os.path.exists(lstm_model_path.replace('.joblib', '.pth')):
    print(f"✅ LSTM model already exists at {lstm_model_path}")
else:
    print("Training LSTM model...")
    # You may need to adjust parameters as needed
    if hasattr(smart, "train_lstm_model"):
        smart.train_lstm_model(
            data_path=lstm_data_path,
            model_path=lstm_model_path,
            output_dir=lstm_output_dir,
            epochs=5  # Adjust as needed for demo
        )
        print("✅ LSTM model training complete.")
    else:
        print("❌ train_lstm_model function not found in smart.py. Please train the model manually.")


✅ LSTM model already exists at f:\Github\hd-failure-prediction\models/LSTM/lstm_model.joblib


## Step 3: Evaluate LSTM Model on Test Set

Let's evaluate the LSTM model's performance on the test set and display some metrics.

In [21]:
# Try loading the full model with joblib first
try:
    model, model_metrics = smart.load_model(lstm_model_path, device, load_whole_model=True)
except UnicodeDecodeError as e:
    print(f"joblib load failed with UnicodeDecodeError: {e}")
    # Try loading as a PyTorch model if joblib fails
    model, model_metrics = smart.load_model(lstm_model_path.replace('.joblib', '.pth'), device, load_whole_model=True)

minimum_loss = np.inf
if model_metrics:
    print(f"Previous best validation loss: {model_metrics.get('val_loss', 'N/A')}")
    minimum_loss = model_metrics.get('val_loss', minimum_loss)
print("Trained model found! Loading and testing...")

# Test the loaded model
model.eval()
test_loss = 0
num_test_batches = 0

print("Testing loaded model...")
test_pbar = tqdm(test_loader, desc="Testing Model", leave=False)

with torch.no_grad():
    for test_data, test_labels in test_pbar:
        test_data, test_labels = test_data.to(device), test_labels.to(device)
        predictions = model(test_data)
        loss = loss_function(predictions, test_labels)
        
        test_loss += loss.item()
        num_test_batches += 1
        
        test_pbar.set_postfix({'Test Loss': f'{loss.item():.6f}'})

avg_test_loss = test_loss / num_test_batches if num_test_batches > 0 else 0
print(f"Model Test Results - Average Loss: {avg_test_loss:.6f}")

✅ Complete model loaded from f:\Github\hd-failure-prediction\models/LSTM/lstm_model.joblib
joblib load failed with UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
✅ Complete model loaded from f:\Github\hd-failure-prediction\models/LSTM/lstm_model.joblib
✅ Model metrics loaded from f:\Github\hd-failure-prediction\models/LSTM/lstm_model_metrics.json
Previous best validation loss: 59515.34847878398
Trained model found! Loading and testing...
Testing loaded model...


NameError: name 'test_loader' is not defined

In [None]:

# Test the loaded model
model.eval()
test_loss = 0
num_test_batches = 0

print("Testing loaded model...")
test_pbar = tqdm(test_loader, desc="Testing Model", leave=False)

with torch.no_grad():
    for test_data, test_labels in test_pbar:
        test_data, test_labels = test_data.to(device), test_labels.to(device)
        predictions = model(test_data)
        loss = loss_function(predictions, test_labels)
        
        test_loss += loss.item()
        num_test_batches += 1
        
        test_pbar.set_postfix({'Test Loss': f'{loss.item():.6f}'})

avg_test_loss = test_loss / num_test_batches if num_test_batches > 0 else 0
print(f"Model Test Results - Average Loss: {avg_test_loss:.6f}")


# Generate predictions on test set
model.eval()
test_predictions = []
test_targets = []

print("Generating final predictions...")
with torch.no_grad():
    for test_data, test_labels in tqdm(test_loader, desc="Generating predictions"):
        test_data, test_labels = test_data.to(device), test_labels.to(device)
        predictions = model(test_data)
        test_predictions.append(predictions.cpu().numpy())
        test_targets.append(test_labels.cpu().numpy())
        
# Concatenate all predictions and targets
test_predictions = np.concatenate(test_predictions, axis=0)
test_targets = np.concatenate(test_targets, axis=0)

# Plot predictions vs actual values for the first feature
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

# Plot first few samples for visualization
num_samples_to_plot = min(50, len(test_predictions))
x = np.arange(num_samples_to_plot)

ax1.plot(x, test_predictions[:num_samples_to_plot, 0, 0], label='Predicted', alpha=0.7)
ax1.plot(x, test_targets[:num_samples_to_plot, 0, 0], label='Actual', alpha=0.7)
ax1.set_xlabel("Sample")
ax1.set_ylabel("Feature 1 Value")
ax1.set_title("Predictions vs Actual Values (Feature 1)")
ax1.legend()

# Plot prediction error
error = test_predictions[:num_samples_to_plot, 0, 0] - test_targets[:num_samples_to_plot, 0, 0]
ax2.plot(x, error, label='Prediction Error', color='red', alpha=0.7)
ax2.set_xlabel("Sample")
ax2.set_ylabel("Error")
ax2.set_title("Prediction Error")
ax2.legend()
plt.tight_layout()

## Step 4: Train and Test the Decision Tree (CT) on Raw Data

We will now train and test the Decision Tree model using the raw SMART data, without LSTM features.

In [4]:
# Run standard CT analysis on raw data
ct_data_path = lstm_data_path  # Use the same data as LSTM

# The CT pipeline expects a features file. For raw SMART, set feature_selection_method='smart_only'
ct_results = CT.analyze_lstm_predictions_with_ct(
    lstm_features_path=None,  # None means use raw data
    ground_truth_path=ct_data_path,
    feature_selection_method='smart_only',  # Only use SMART features
    output_dir="ct_raw_analysis"
)
print("CT Results on Raw Data:")
print(ct_results)

🔬 Starting CT analysis of LSTM predictions...


TypeError: stat: path should be string, bytes, os.PathLike or integer, not NoneType

## Step 5: Generate LSTM Predictions for All Drives

We will use the trained LSTM model to generate predictions for all drives, and export features for CT analysis.

In [None]:
# Generate LSTM predictions and export features for CT
ct_features_output_dir = "ct_lstm_features"
if hasattr(smart, "export_for_ct_analysis"):
    predictions_df, ct_features_df = smart.export_for_ct_analysis(
        model_path=lstm_model_path,
        dataset_path=lstm_data_path,
        output_dir=ct_features_output_dir
    )
    print(f"Exported {len(ct_features_df)} drive-level features for CT analysis.")
else:
    print("❌ export_for_ct_analysis function not found in smart.py. Please generate LSTM features manually.")

## Step 6: Test a Random Drive with the LSTM Model

Let's select a random drive from the dataset, run it through the LSTM model, and display the prediction.

In [None]:
# Load the list of drives
all_serials = []
for f in os.listdir(lstm_data_path):
    if f.endswith('.csv'):
        try:
            df = pd.read_csv(os.path.join(lstm_data_path, f), usecols=['serial_number'])
            all_serials.extend(df['serial_number'].unique())
        except Exception as e:
            print(f"Error reading {f}: {e}")
all_drives = list(set(all_serials))
random_drive = random.choice(all_drives)
print(f"Randomly selected drive: {random_drive}")

# Run the LSTM model on this drive (assuming such a function exists)
if hasattr(smart, "predict_drive"):
    random_drive_pred = smart.predict_drive(
        model_path=lstm_model_path,
        drive_serial=random_drive,
        data_path=lstm_data_path
    )
    print(f"LSTM prediction for drive {random_drive}: {random_drive_pred}")
else:
    print("❌ predict_drive function not found in smart.py. Please test a drive manually.")

## Step 7: Feed LSTM Results into the Decision Tree (CT)

Now, we will use the LSTM-generated features as input to the Decision Tree model and analyze the results.

In [None]:
# Path to the exported LSTM features for CT
lstm_features_path = os.path.join(ct_features_output_dir, "ct_features.csv")

# Run CT analysis on LSTM features
ct_lstm_results = CT.analyze_lstm_predictions_with_ct(
    lstm_features_path=lstm_features_path,
    ground_truth_path=lstm_data_path,
    feature_selection_method='all',  # Use all features (LSTM + SMART)
    output_dir="ct_lstm_analysis"
)
print("CT Results on LSTM Features:")
print(ct_lstm_results)

## Step 8: Visualize and Interpret Results

Let's visualize the feature importance and summarize the pipeline's performance.

In [None]:
import matplotlib.pyplot as plt

# Load feature importance from CT results
importance_df = pd.DataFrame(ct_lstm_results['feature_importance'])
top_features = importance_df.sort_values('gini_importance', ascending=False).head(10)

plt.figure(figsize=(10, 6))
plt.barh(top_features['feature'], top_features['gini_importance'])
plt.xlabel('Gini Importance')
plt.title('Top 10 Most Important Features (LSTM + SMART)')
plt.gca().invert_yaxis()
plt.show()

print("Pipeline complete! You have now trained, tested, and integrated LSTM and Decision Tree models for hard drive failure prediction.")