In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, make_scorer, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import randint

In [2]:
# set up dagshub for mlflow tracking
import dagshub
dagshub.init(repo_owner='JonaKoenemann', repo_name='machine_failure_classification', mlflow=True)

# Data preparation

In [3]:
data_path = "../../../data/predictive_maintenance.csv"

In [4]:
df = pd.read_csv(data_path)

In [5]:
categorical_features = ['Type']
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_features.remove('UDI')
numerical_features.remove('Target')

# Feature Engineering: Polynomial Features
poly = PolynomialFeatures(degree=2, include_bias=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('poly', poly)
        ]), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [6]:
# Split for features (X) und labels (y)
X = df[numerical_features + categorical_features] # select Features 
y = df["Failure Type"]

In [7]:
# Split for test and train data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Test preprocessor pipeline
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

print(X_train_transformed.shape)
print(X_test_transformed.shape)

(8000, 23)
(2000, 23)


# Train and test

In [10]:
# Set the MLflow experiment
mlflow.set_experiment("logistic_regression_rs_feature")

mlflow.sklearn.autolog()

# Define the Logistic Regression model
model = LogisticRegression(solver='liblinear', random_state=42)

# Define the parameters for GridSearchCV
param_distributions = {
    'model__penalty': [None, 'l1', 'l2'],
    'model__C': [0.01, 0.1, 1, 10, 100], 
    'model__class_weight': [None, 'balanced'],
    'model__solver': ['saga']
}

# Define the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# weighted f1 score as scoring metric
scorer = make_scorer(f1_score, average = "weighted")

In [11]:
with mlflow.start_run():
    print("Training Logistic Regression Model...")
    randomized_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_distributions,
                                           n_iter=50, cv=5, verbose=2, random_state=42, n_jobs=-1, scoring=scorer)
    randomized_search.fit(X_train, y_train)
    
    best_params = randomized_search.best_params_
    mlflow.log_params(best_params)
    
    best_model = randomized_search.best_estimator_
    y_pred = best_model.predict(X_test)
    
    # evaluation
    weighted_f1 = f1_score(y_true = y_test, y_pred= y_pred, average="weighted")
    print("\nweighted f1 score:", weighted_f1)
    mlflow.log_metric('weighted_f1_score', weighted_f1)
    
    mlflow.end_run()




Training Logistic Regression Model...




Fitting 5 folds for each of 30 candidates, totalling 150 fits


 nan nan nan nan nan nan nan nan nan nan nan nan]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
 'Power Failure' 'Random Failures' 'Tool Wear Failure']
2024/06/24 12:30:16 INFO mlflow.sklearn.utils: Logging the 5 best runs, 25 runs will be omitted.



weighted f1 score: 0.9695695721258611


# Evaluation

In [None]:
error_names = [
 'Heat Failure',
 'No Failure',
 'Overstrain Failure',
 'Power Failure',
 'Random Failure',
 'Tool wear Failure'] # get names of errors in correct order for confusion matrix

## Classification report

In [None]:
print("Classification Report:")
report = classification_report(y_test, y_pred)
print(report)

##  Normalized Confusion matrix

In [None]:
conf_matrix_normalized = confusion_matrix(y_test, y_pred, normalize='true')

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_normalized, annot=True, cmap="Blues", cbar=False, xticklabels=error_names, yticklabels=error_names)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Normalized Confusion Matrix")
plt.show()