In [1]:
# Imports
# We're adding 'RandomizedSearchCV' to automatically find the best model settings
import pandas as pd
import numpy as np
import sqlite3
import pickle
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import os

print("--- Phase 5: Model Tuning & Improvement ---")

--- Phase 5: Model Tuning & Improvement ---


In [3]:
#  Data Loading & Feature Engineering (Same as Notebook 04)
# This cell contains all our setup from the previous notebook.

# --- Load Data ---
db_path = '/workspaces/Engine_Predictive_System/data/turbofan.db'
table_name = 'train_data_FD001'
conn = sqlite3.connect(db_path)
df_full = pd.read_sql_query(f"SELECT * FROM {table_name}", conn)
conn.close()

# --- Feature Engineering ---
window_size = 5
sensors = ['sensor_2', 'sensor_3', 'sensor_4', 'sensor_7', 'sensor_8', 
           'sensor_11', 'sensor_12', 'sensor_13', 'sensor_15', 'sensor_17', 
           'sensor_20', 'sensor_21']

df_grouped = df_full.groupby('unit_number')
for sensor in sensors:
    df_full[f'{sensor}_avg'] = df_grouped[sensor].rolling(window=window_size).mean().reset_index(level=0, drop=True)
    df_full[f'{sensor}_std'] = df_grouped[sensor].rolling(window=window_size).std().reset_index(level=0, drop=True)

df_engineered = df_full.dropna()

# --- Prepare X and y ---
features = ['time_cycle'] + [f'{s}_avg' for s in sensors] + [f'{s}_std' for s in sensors]
target = 'RUL'
X = df_engineered[features]
y = df_engineered[target]

# --- Train/Test Split (Time-Series Aware) ---
all_units = df_engineered['unit_number'].unique()
total_units = len(all_units)
split_point = int(total_units * 0.8)
train_units = all_units[:split_point]
test_units = all_units[split_point:]

X_train = X[df_engineered['unit_number'].isin(train_units)]
y_train = y[df_engineered['unit_number'].isin(train_units)]
X_test = X[df_engineered['unit_number'].isin(test_units)]
y_test = y[df_engineered['unit_number'].isin(test_units)]

print("--- Setup Complete ---")
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")

--- Setup Complete ---
X_train shape: (15818, 25), X_test shape: (4413, 25)


In [4]:
#  *** NEW: Hyperparameter Tuning ***

print("--- Starting Model Tuning (This will take a few minutes!) ---")

# 1. Define the "Parameter Grid" - the menu of settings to try
param_grid = {
    'n_estimators': [100, 200, 300],         # Number of trees in the forest
    'max_depth': [10, 20, 30, None],         # Max depth of each tree
    'min_samples_split': [2, 5, 10],       # Min samples to split a node
    'min_samples_leaf': [1, 2, 4],         # Min samples at a leaf node
    'max_features': [1.0, 'sqrt', 'log2']  # Number of features to consider at each split
}

# 2. Create the base model
rf = RandomForestRegressor(random_state=42)

# 3. Set up the Randomized Search
# n_iter=50: Try 50 random combinations from the grid
# cv=3: Use 3-fold cross-validation for each combination
# verbose=2: Show me the progress
# n_jobs=-1: Use all available CPU cores
rf_random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=50,  # Increase this for better results, but it will take longer
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# 4. Run the search
rf_random_search.fit(X_train, y_train)

print("\n--- Tuning Complete ---")
print("Best parameters found:")
print(rf_random_search.best_params_)

--- Starting Model Tuning (This will take a few minutes!) ---
Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   4.9s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   5.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   5.0s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   6.6s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   7.1s
[CV] END max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   7.1s
[CV] END max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  17.3s
[CV] END max_depth=20, max_featur

In [5]:
#  Evaluate the NEW Best Model

print("\n--- Evaluating V2 (Tuned) Model ---")

# 1. Get the best model found by the search
best_model = rf_random_search.best_estimator_

# 2. Make predictions on the test set
y_pred = best_model.predict(X_test)

# 3. Calculate new metrics
new_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
new_r2 = r2_score(y_test, y_pred)

print("\n--- V1 (Baseline) Model ---")
print("R-squared (R²): 0.60")
print("RMSE: 48.62 cycles")

print("\n--- V2 (Tuned) Model ---")
print(f"R-squared (R²): {new_r2:.2f}")
print(f"RMSE: {new_rmse:.2f} cycles")


--- Evaluating V2 (Tuned) Model ---

--- V1 (Baseline) Model ---
R-squared (R²): 0.60
RMSE: 48.62 cycles

--- V2 (Tuned) Model ---
R-squared (R²): 0.61
RMSE: 47.78 cycles


In [6]:
#  Save the V2 Model (if it's better)

if new_r2 > 0.60:
    print("\nNew model is better! Saving V2 model...")
    
    # Define paths
    model_dir = '../models'
    model_path = os.path.join(model_dir, 'rul_model.pkl')
    features_path = os.path.join(model_dir, 'model_features.pkl')

    # Save the *new* best model
    with open(model_path, 'wb') as f:
        pickle.dump(best_model, f)
    print(f"V2 model saved to {model_path}")
    
    # Save the feature list (it's the same, but good practice)
    with open(features_path, 'wb') as f:
        pickle.dump(features, f)
    print(f"Feature list saved to {features_path}")
    
    print("\n*** IMPORTANT NEXT STEP ***")
    print(f"Your new RMSE is {new_rmse:.2f}.")
    print(f"Go to 'app.py' and update this line in the 'get_recommendation' function:")
    print(f"Change 'def get_recommendation(rul, rmse=49):' to 'def get_recommendation(rul, rmse={int(np.ceil(new_rmse))}):'")
    
else:
    print("\nNew model was not significantly better. Keeping V1 model.")


New model is better! Saving V2 model...
V2 model saved to ../models/rul_model.pkl
Feature list saved to ../models/model_features.pkl

*** IMPORTANT NEXT STEP ***
Your new RMSE is 47.78.
Go to 'app.py' and update this line in the 'get_recommendation' function:
Change 'def get_recommendation(rul, rmse=49):' to 'def get_recommendation(rul, rmse=48):'
