In [1]:
#  MACHINE FAILURE PREDICTION - MODEL RETRAINING PIPELINE
# ============================================================
import pandas as pd
import numpy as np
import joblib
import datetime
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
class PreprocessingPipeline(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.type_encoder = OrdinalEncoder(categories=[['L', 'M', 'H']])
        self.failure_encoder = LabelEncoder()
        self.scaler = MinMaxScaler()
        self.scale_cols = ['Rotational_speed_rpm', 'Torque_Nm', 'Tool_wear_min',
                           'Air_temperature_C', 'Process_temperature_C']

    def fit(self, X, y=None):
        self.type_encoder.fit(X[['Type']])
        self.failure_encoder.fit(X['Type_of_failure'])
        self.scale_cols = [col for col in self.scale_cols if col in X.columns]
        self.scaler.fit(X[self.scale_cols])
        return self

    def transform(self, X):
        X = X.copy()
        X['Type'] = self.type_encoder.transform(X[['Type']])
        if 'Type_of_failure' in X.columns:
            X['Type_of_failure'] = self.failure_encoder.transform(X['Type_of_failure'])
        X[self.scale_cols] = self.scaler.transform(X[self.scale_cols])
        return X

In [3]:
print("Starting Retraining Pipeline...")
print("Loading data and preprocessing pipeline...")

# Load processed data and pipeline
df = pd.read_csv('data_preprocessed.csv')
preprocessor = joblib.load('preprocessing_pipeline.pkl')

Starting Retraining Pipeline...
Loading data and preprocessing pipeline...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
# STEP 1: DATA PREPARATION
# ============================================================
print("Preparing data for retraining...")

# Binary classification preparation
binary_df = df.drop(['Type_of_failure'], axis=1)
X = binary_df.drop(columns=['Machine_failure'])
y = binary_df['Machine_failure']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Data split complete → Train: {X_train.shape}, Test: {X_test.shape}")

# SMOTE balancing
print("Applying SMOTE for balancing...")
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)
print(f"SMOTE applied → Balanced Train Shape: {X_train_bal.shape}")

Preparing data for retraining...
Data split complete → Train: (8000, 6), Test: (2000, 6)
Applying SMOTE for balancing...
SMOTE applied → Balanced Train Shape: (15444, 6)


In [5]:
# STEP 2: DEFINE MODELS
# ============================================================
print("\nTraining base models...")
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=5000, solver='saga', random_state=42),
    "Random Forest": RandomForestClassifier(max_depth=10, n_estimators=100, random_state=42, n_jobs=-1),
    "XGBoost": XGBClassifier(max_depth=4, learning_rate=0.1, n_estimators=200,
                             subsample=0.8, colsample_bytree=0.8,
                             random_state=42, use_label_encoder=False, eval_metric='logloss')
}

# Cross-validation results
for name, model in classifiers.items():
    scores = cross_val_score(model, X_train_bal, y_train_bal, cv=5, scoring='f1')
    print(f"{name}: Mean F1 = {np.mean(scores):.3f} ± {np.std(scores):.3f}")


Training base models...
Logistic Regression: Mean F1 = 0.829 ± 0.008
Random Forest: Mean F1 = 0.963 ± 0.003


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost: Mean F1 = 0.964 ± 0.004


In [6]:
# STEP 3: TRAIN & EVALUATE BASE MODELS
# ============================================================
print("\nEvaluating base models on test data...")
results = {}
for name, model in classifiers.items():
    model.fit(X_train_bal, y_train_bal)
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    results[name] = {
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds, zero_division=0),
        "Recall": recall_score(y_test, preds, zero_division=0),
        "F1-Score": f1_score(y_test, preds, zero_division=0),
        "ROC-AUC": roc_auc_score(y_test, probs) if probs is not None else None,
        "Model": model
    }

df_results = pd.DataFrame(results).T
print("\nBase Model Results:")
print(df_results.round(3))


Evaluating base models on test data...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Base Model Results:
                    Accuracy Precision    Recall  F1-Score   ROC-AUC  \
Logistic Regression   0.8245  0.130102  0.836066  0.225166    0.9018   
Random Forest          0.956  0.395349  0.836066  0.536842  0.954785   
XGBoost                0.954  0.377953  0.786885  0.510638  0.967331   

                                                                 Model  
Logistic Regression  LogisticRegression(max_iter=5000, random_state...  
Random Forest        (DecisionTreeClassifier(max_depth=10, max_feat...  
XGBoost              XGBClassifier(base_score=None, booster=None, c...  


In [7]:
# STEP 4: RANDOM FOREST HYPERPARAMETER TUNING
# ============================================================
print("\nPerforming Random Forest hyperparameter tuning...")
rf_params = {
    'n_estimators': [100, 200, 400],
    'max_depth': [None, 6, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

rf_grid = RandomizedSearchCV(RandomForestClassifier(random_state=42, n_jobs=-1), rf_params,
                             n_iter=10, cv=4, scoring='f1', random_state=42, n_jobs=-1)
rf_grid.fit(X_train_bal, y_train_bal)

print(f"Best RF Params: {rf_grid.best_params_}")
best_rf = rf_grid.best_estimator_


Performing Random Forest hyperparameter tuning...
Best RF Params: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}


In [8]:
# STEP 5: ADD TUNED RF AND PICK BEST MODEL
# ============================================================
print("\nSelecting the best performing model...")

# Add tuned model
results["Random Forest (Tuned)"] = {
    "Accuracy": accuracy_score(y_test, best_rf.predict(X_test)),
    "Precision": precision_score(y_test, best_rf.predict(X_test), zero_division=0),
    "Recall": recall_score(y_test, best_rf.predict(X_test), zero_division=0),
    "F1-Score": f1_score(y_test, best_rf.predict(X_test), zero_division=0),
    "ROC-AUC": roc_auc_score(y_test, best_rf.predict_proba(X_test)[:, 1]),
    "Model": best_rf
}

# Rebuild results table
df_final = pd.DataFrame(results).T
print(df_final.round(3))

# Select best based on F1-score
best_model_name = df_final['F1-Score'].idxmax()
best_model = df_final.loc[best_model_name, 'Model']
print(f"\nBest Model Selected: {best_model_name}")
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"best_model_binary_final_{timestamp}.pkl"

# Save the best model and pipeline
joblib.dump(best_model, filename)
joblib.dump(preprocessor, 'preprocessing_pipeline.pkl')

print(f"Model saved as '{filename}'")
print("Retraining completed at:", datetime.datetime.now())
print("Retraining pipeline executed successfully.")


Selecting the best performing model...
                      Accuracy Precision    Recall  F1-Score   ROC-AUC  \
Logistic Regression     0.8245  0.130102  0.836066  0.225166    0.9018   
Random Forest            0.956  0.395349  0.836066  0.536842  0.954785   
XGBoost                  0.954  0.377953  0.786885  0.510638  0.967331   
Random Forest (Tuned)   0.9655      0.46  0.754098  0.571429  0.964127   

                                                                   Model  
Logistic Regression    LogisticRegression(max_iter=5000, random_state...  
Random Forest          (DecisionTreeClassifier(max_depth=10, max_feat...  
XGBoost                XGBClassifier(base_score=None, booster=None, c...  
Random Forest (Tuned)  (DecisionTreeClassifier(max_features='sqrt', m...  

Best Model Selected: Random Forest (Tuned)
Model saved as 'best_model_binary_final_20251103_035243.pkl'
Retraining completed at: 2025-11-03 03:52:43.356255
Retraining pipeline executed successfully.
