In [None]:
import sys

!{sys.executable} -m pip install setuptools
!{sys.executable} -m pip install python-dotenv
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install tensorflow
!{sys.executable} -m pip install mlflow
!{sys.executable} -m pip install dagshub
!{sys.executable} -m pip install keras-tuner
!{sys.executable} -m pip install scikeras

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score
from scikeras.wrappers import KerasClassifier
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import mlflow
import mlflow.tensorflow
import dagshub
from scipy.stats import randint

In [None]:
# Initialise DagsHub for this repository with its MLflow integration switched on
dagshub.init(
    repo_owner='JonaKoenemann',
    repo_name='machine_failure_classification',
    mlflow=True,
)

# Data preparation

In [None]:
# Relative path to the predictive-maintenance CSV that the next cell loads
data_path = "../../../data/predictive_maintenance.csv"

In [None]:
# Load the raw dataset into a DataFrame
df = pd.read_csv(data_path)

In [None]:
# Column roles: 'Type' is the only categorical input; every numeric column is a
# feature except 'UDI' and 'Target', which are removed from the list below.
categorical_features = ['Type']

numeric_columns = df.select_dtypes(include=[np.number]).columns
numerical_features = numeric_columns.tolist()
numerical_features.remove('UDI')
numerical_features.remove('Target')

# Feature engineering: degree-2 polynomial expansion without the constant column
poly = PolynomialFeatures(degree=2, include_bias=False)

# Numeric branch: standardise first, then expand polynomially
numeric_pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('poly', poly)])

# Numeric and categorical branches are applied side by side and concatenated
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_features),
        ('cat', OneHotEncoder(), categorical_features),
    ]
)

In [None]:
# Separate the model inputs (X) from the labels (y)
y = df["Failure Type"]
X = df[[*numerical_features, *categorical_features]]  # numeric columns first, then categorical

In [None]:
# Hold out 20 % of the rows as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Smoke-test the preprocessor: fit on the training split only, then apply it to
# the test split and compare the resulting shapes.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

print(X_train_transformed.shape, X_test_transformed.shape, sep="\n")

# Model creation

In [None]:
# Define model creation function
def create_model(input_shape=None, optimizer='adam', init='glorot_uniform', layers=(64, 32), **kwargs):
    """Build and compile a fully connected Keras classifier.

    Parameters
    ----------
    input_shape : tuple or None
        Shape of one input sample. When omitted, it is derived from
        ``meta['n_features_in_']``, which SciKeras hands to build functions
        that accept ``**kwargs``.
    optimizer : str or optimizer instance
        Forwarded unchanged to ``model.compile``.
    init : str
        Kernel initializer used for every hidden layer.
    layers : sequence of int
        Number of units of each hidden ``Dense`` layer, in order.
    **kwargs
        Extra keyword arguments. Only ``meta`` (a dict) is read here; anything
        else is accepted and ignored.

    Returns
    -------
    A compiled ``Sequential`` model. With more than two classes reported in
    ``meta['n_classes_']`` the head is a softmax over that many units, trained
    with sparse categorical cross-entropy (integer-encoded targets). Otherwise
    the original single sigmoid unit with binary cross-entropy is kept.

    Raises
    ------
    ValueError
        If neither ``input_shape`` nor ``meta['n_features_in_']`` is available.
    """
    meta = kwargs.get('meta') or {}

    if input_shape is None:
        if 'n_features_in_' not in meta:
            raise ValueError(
                "create_model needs either `input_shape` or a `meta` dict containing 'n_features_in_'"
            )
        input_shape = (meta['n_features_in_'],)

    n_classes = meta.get('n_classes_')

    model = Sequential()
    model.add(Input(shape=input_shape))  # Input layer with input_shape parameter
    for neurons in layers:
        model.add(Dense(neurons, activation='relu', kernel_initializer=init))

    if n_classes is not None and n_classes > 2:
        # Multi-class target: one output unit per class
        model.add(Dense(n_classes, activation='softmax'))
        loss = 'sparse_categorical_crossentropy'
    else:
        # Binary target, or class count unknown: keep the single-probability head
        model.add(Dense(1, activation='sigmoid'))
        loss = 'binary_crossentropy'

    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    return model


In [None]:
# Wrap the build function as a scikit-learn estimator. `model=` is the current
# SciKeras argument name; `build_fn=` is its deprecated alias.
model = KerasClassifier(model=create_model, verbose=0)

In [None]:
# Display the wrapper's repr to check its configuration
model

In [None]:
# Define hyperparameters distribution
# Key layout: the first `model__` addresses the pipeline step named 'model'
# (the KerasClassifier). `batch_size` and `epochs` are parameters of that
# wrapper. A second `model__` routes the value on to the build function
# (create_model), which is where `optimizer` and `layers` are consumed.
param_dist = {
    'model__batch_size': randint(16, 65),   # integers 16..64
    'model__epochs': randint(10, 31),       # integers 10..30
    'model__model__optimizer': ['SGD', 'Adam'],
    'model__model__layers': [[64, 32], [64, 32, 16]]  # List of layer configurations to test
}

In [None]:
# Chain preprocessing and the classifier into a single estimator
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # scaling, polynomial expansion, one-hot encoding
    ('model', model),                # Keras classifier wrapper
])

In [None]:
# Set up RandomizedSearchCV with F1 score as the metric
n_iter_search = 10  # number of sampled hyperparameter combinations
scorer = make_scorer(f1_score, average='weighted')
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    cv=3,
    n_jobs=-1,
    scoring=scorer,
    random_state=42,  # same seed as the train/test split, so the sampled candidates are repeatable
)

# Train and test

In [None]:
# Select the MLflow experiment that the following run is recorded under
mlflow.set_experiment("neural_net_rs_feature")

In [None]:
with mlflow.start_run() as run:
    # Fit random search on the raw DataFrame. The pipeline's first step is the
    # ColumnTransformer, which selects columns by name and therefore needs a
    # DataFrame; passing the already-transformed array would fail and would
    # also preprocess the data twice.
    random_search_result = random_search.fit(X_train, y_train)

    # Log the best parameters and metrics
    mlflow.log_params(random_search_result.best_params_)
    mlflow.log_metric("best_score", random_search_result.best_score_)
    
    # Evaluate the refitted best pipeline on the held-out test data
    best_model = random_search_result.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = best_model.score(X_test, y_test)
    mlflow.log_metric("test_accuracy", test_accuracy)
    
    # Calculate and log the F1 score
    test_f1_score = f1_score(y_test, y_pred, average='weighted')
    mlflow.log_metric("test_f1_score", test_f1_score)
    
    print(f"Best Parameters: {random_search_result.best_params_}")
    print(f"Best CV F1 Score: {random_search_result.best_score_}")
    print(f"Test Accuracy: {test_accuracy}")
    print(f"Test F1 Score: {test_f1_score}")

# Evaluation

In [None]:
# Tick labels for the confusion matrix, listed in the order its rows/columns appear
error_names = [
    'Heat Failure',
    'No Failure',
    'Overstrain Failure',
    'Power Failure',
    'Random Failure',
    'Tool wear Failure',
]

## Classification report

In [None]:
# Predict with the tuned, fitted pipeline. The bare `model` wrapper was never
# fitted on its own; RandomizedSearchCV only fits clones of the pipeline.
y_pred = best_model.predict(X_test)
# predict() already yields class labels, so no argmax over probabilities is needed
y_pred_classes = y_pred

print(classification_report(y_test, y_pred))

## Normalized confusion matrix

In [None]:
# Row-normalised confusion matrix: each row sums to 1 over the predicted labels
conf_matrix_normalized = confusion_matrix(y_test, y_pred, normalize='true')

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix_normalized,
    annot=True,
    cmap="Blues",
    cbar=False,
    xticklabels=error_names,
    yticklabels=error_names,
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Normalized Confusion Matrix")
plt.show()

## Train and validation loss

In [None]:
# `best_model` is the fitted Pipeline, so the training history has to be read
# from its 'model' step (the KerasClassifier), which stores it per metric in
# `history_`.
history = best_model.named_steps['model'].history_
plt.figure(figsize=(8, 6))
plt.plot(history['loss'], label='Training Loss')
# Validation loss is only recorded when the wrapper was fitted with validation
# data, so plot it only if it is present.
if 'val_loss' in history:
    plt.plot(history['val_loss'], label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()