In [36]:
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

print("Cell 1: Imports successful.")

Cell 1: Imports successful.


In [38]:
try:
    df = pd.read_csv('..data/raw/data.csv')
except FileNotFoundError:
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("!!!!!!!!!!!!!!!!! CRITICAL ERROR !!!!!!!!!!!!!!!!!!!!!")
    print("Dataset not found at '../data/raw/data.csv'.")
    print("SOLUTION: Download the 'Steel Fatigue Strength Prediction' dataset,")
    print("place it in the 'data/raw' folder, and rename it to 'data.csv'.")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    raise

# Standardize column names to be lowercase and use underscores
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# --- SANITY CHECK BLOCK ---
# This block will stop the notebook if the wrong dataset is loaded.
EXPECTED_COL = 'fatigue'
if EXPECTED_COL not in df.columns:
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("!!!!!!!!!!!!!!!!! CRITICAL ERROR !!!!!!!!!!!!!!!!!!!!!")
    print(f"The loaded dataset in 'data/raw/data.csv' is the WRONG one.")
    print(f"The required column '{EXPECTED_COL}' was not found.")
    print(f"The detected columns are: {df.columns.tolist()}")
    print("SOLUTION: Go to 'data/raw/', delete the current 'data.csv',")
    print("and replace it with the correct 'Steel Fatigue Strength Prediction' dataset.")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    raise ValueError("Wrong dataset loaded. Please see the error message above.")
else:
    print("Cell 2: Sanity check passed. Correct dataset is loaded.")
# --- END SANITY CHECK ---

TARGET_COL = 'fatigue'
X = df.drop(TARGET_COL, axis=1)
y = df[TARGET_COL]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data prepared and split successfully.")


!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!!!!!!!!!!!!!!!!! CRITICAL ERROR !!!!!!!!!!!!!!!!!!!!!
Dataset not found at '../data/raw/data.csv'.
SOLUTION: Download the 'Steel Fatigue Strength Prediction' dataset,
place it in the 'data/raw' folder, and rename it to 'data.csv'.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


FileNotFoundError: [Errno 2] No such file or directory: '..data/raw/data.csv'

In [None]:
print("\n--- Training RandomForestRegressor ---")
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
score = r2_score(y_test, y_pred)
print(f"Initial R² score: {score:.4f}")


--- Training RandomForest ---
R²: 0.9694 | MAE: 22.5119 | RMSE: 35.5922

--- Training XGBoost ---
R²: 0.9878 | MAE: 16.0718 | RMSE: 22.4744

--- Training LightGBM ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000379 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 529
[LightGBM] [Info] Number of data points in the train set: 349, number of used features: 26
[LightGBM] [Info] Start training from score 552.538682
R²: 0.9708 | MAE: 18.8812 | RMSE: 34.7482

--- Model Comparison ---




Unnamed: 0,R²,MAE,RMSE
RandomForest,0.96936,22.511932,35.592166
XGBoost,0.987783,16.071806,22.474391
LightGBM,0.970796,18.88125,34.748247


In [None]:
print("\n--- Hyperparameter Tuning ---")
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_leaf': [1, 2]
}
kf = KFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=kf, scoring='r2', n_jobs=-1, verbose=1)
grid_search.fit(X_train_scaled, y_train)
best_model = grid_search.best_estimator_

# Evaluate the best model
y_pred_tuned = best_model.predict(X_test_scaled)
final_score = r2_score(y_test, y_pred_tuned)
print(f"Tuned Model R² score: {final_score:.4f}")
print(f"Best Parameters: {grid_search.best_params_}")

--- Hyperparameter Tuning for RandomForestRegressor ---
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Best Parameters found:
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Best R² score from GridSearch: 0.9647


In [None]:
print("\n--- Saving final model and artifacts ---")
# Create directories if they don't exist
os.makedirs('../models', exist_ok=True)
os.makedirs('../data/processed', exist_ok=True)

# Save the best model
model_path = '../models/model.pkl'
joblib.dump(best_model, model_path)
print(f"Tuned model saved to: {model_path}")

# Save the scaler
scaler_path = '../data/processed/scaler.pkl'
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to: {scaler_path}")

# Save the processed data columns for the app to use
feature_names = X.columns
pd.DataFrame(X_train_scaled, columns=feature_names).to_csv('../data/processed/X_train.csv', index=False)
print("Processed X_train.csv saved for app consumption.")

print("\n--- NOTEBOOK EXECUTION COMPLETE ---")

--- Evaluating the Tuned RandomForest Model on Test Set ---
Tuned Model R²: 0.9719
Tuned Model MAE: 21.7391
Tuned Model RMSE: 34.1122
