In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input mount,
# then print the paths one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# About Dataset

- UDI, Product ID: Unique identifiers; not needed for modeling.
- Type: Product quality -> L (Low), M (Medium), H (High)
- Failure modes: tool wear failure (TWF), heat dissipation failure (HDF), power failure (PWF), overstrain failure (OSF), random failures (RNF)
- Machine failure: The output variable for the predictive maintenance model.
- Kudos to the dataset owner for sharing it. It is one of the few datasets with a detailed explanation of the data and proper references.

# Import Libraries

In [None]:
import joblib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBClassifier

# Data Analysis

In [None]:
# Read the raw CSV from the Kaggle input directory and preview the first rows
csv_path = '/kaggle/input/predictive-maintenance-dataset-ai4i-2020/ai4i2020.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

In [None]:
# Show the column labels of the loaded frame
dataset.columns

In [None]:
# Summary statistics of the loaded frame (pandas' default `describe` output)
dataset.describe()

In [None]:
# Number of missing values in each column
dataset.isna().sum()

In [None]:
# Class balance of the target: number of rows per 'Machine failure' value
failure_counts = dataset['Machine failure'].value_counts()
print(failure_counts)

The counts above show that the target is heavily imbalanced towards 'Machine failure' = 0, hence we need to resample the data. For now I am planning to use oversampling so as not to lose any information. If that doesn't work, we can have a look at other sampling methods.

In [None]:
# Boolean mask: True for every row that exactly repeats an earlier row
duplicates = dataset.duplicated()

# Counting the True entries of the mask gives the number of duplicate rows
num_duplicates = np.count_nonzero(duplicates)

print(f"Number of duplicate rows: {num_duplicates}")

In [None]:
# Continuous sensor readings to visualise
continuous_vars = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]

# One histogram (with KDE overlay) per variable on a 3x2 grid,
# coloured by the 'Machine failure' label
fig = plt.figure(figsize=(15, 10))
for position, column in enumerate(continuous_vars, start=1):
    ax = fig.add_subplot(3, 2, position)
    sns.histplot(data=dataset, x=column, hue='Machine failure', kde=True, bins=30, palette='Set2', ax=ax)
    ax.set_title(f'Distribution of {column} by Machine Failure')


In [None]:
# Undersample the majority class (no failure) down to the size of the minority
# class while keeping the distribution of each continuous variable similar to
# the original dataset.

# Separate majority and minority classes
df_majority = dataset[dataset['Machine failure'] == 0]
df_minority = dataset[dataset['Machine failure'] == 1]

# Number of majority rows to keep: one per minority row
target_size = len(df_minority)

continuous_vars = ['Air temperature [K]', 'Process temperature [K]',
                   'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']

# For every continuous variable, draw a decile-stratified sample of the
# majority class. The bins live in a separate Series instead of being written
# into `df_majority`, which is a slice of `dataset` (avoids
# SettingWithCopyWarning and the need to drop helper columns afterwards).
stratified_draws = []
for col in continuous_vars:
    decile_bins = pd.qcut(df_majority[col], q=10, duplicates='drop').rename(f'{col}_bin')
    for _, group in df_majority.groupby(decile_bins, observed=True):
        n_samples = int(len(group) / len(df_majority) * target_size)  # proportional share of the bin
        stratified_draws.append(resample(group, n_samples=n_samples, random_state=42, replace=False))

# Every variable contributes roughly `target_size` rows, so the pooled draws
# are several times too large and can contain the same row more than once.
# De-duplicate on the original row index, then cut the pool down to the target
# size so the two classes really end up balanced and no row is repeated.
candidate_pool = pd.concat(stratified_draws)
candidate_pool = candidate_pool[~candidate_pool.index.duplicated(keep='first')]
undersampled_majority = candidate_pool.sample(n=min(target_size, len(candidate_pool)), random_state=42)

# Combine the undersampled majority with the full minority class and shuffle
undersampled_dataset = (
    pd.concat([undersampled_majority, df_minority])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Verify class balance
print(undersampled_dataset['Machine failure'].value_counts())

# Sanity check: the majority-class distribution before and after undersampling
# (the minority class is left untouched, so only the majority is compared)
fig = plt.figure(figsize=(15, 10))
for i, col in enumerate(continuous_vars, 1):
    ax = fig.add_subplot(3, 2, i)
    sns.kdeplot(x=df_majority[col], fill=True, alpha=0.5, label='Original', ax=ax)
    sns.kdeplot(x=undersampled_majority[col], fill=True, alpha=0.3, linestyle='--', label='Undersampled', ax=ax)
    ax.set_title(f'KDE of {col}: Original vs. Undersampled')
    ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# Encode 'Type' (product quality). LabelEncoder numbers the classes in sorted
# order, i.e. H -> 0, L -> 1, M -> 2.
label_encoder = LabelEncoder()
dataset['Labled Type'] = label_encoder.fit_transform(dataset['Type'])

# Reuse the encoder fitted on the full dataset so both frames share exactly the
# same mapping; re-fitting on the undersampled frame could assign different
# codes if a category happened to be missing from it.
# (This feature turned out to be of little importance for the final model.)
undersampled_dataset['Labled Type'] = label_encoder.transform(undersampled_dataset['Type'])

# Last expression of the cell so the preview is actually displayed
dataset.head()

In [None]:
# Investigating failures: 'Failure' counts how many of the individual failure
# modes are flagged on each row.
failure_columns = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
dataset['Failure'] = dataset[failure_columns].sum(axis=1)
undersampled_dataset['Failure'] = undersampled_dataset[failure_columns].sum(axis=1)

# Distribution of the per-row failure-mode count on the full dataset
sns.histplot(data=dataset, x='Failure')
plt.title('Distribution of Failure')
plt.xlabel('Failure')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Columns entering the correlation analysis: encoded product type, sensor
# readings, the target, the individual failure flags and their per-row sum
features = [
    'Labled Type',
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
    'Machine failure',
    'TWF',
    'HDF',
    'PWF',
    'OSF',
    'RNF',
    'Failure',
]

# Pairwise correlations on the full dataset, drawn as an annotated heatmap
correlation_matrix = dataset.loc[:, features].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')

In [None]:
# Same column selection as for the full dataset, this time correlated on the
# undersampled frame
features = [
    'Labled Type',
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
    'Machine failure',
    'TWF',
    'HDF',
    'PWF',
    'OSF',
    'RNF',
    'Failure',
]

correlation_matrix = undersampled_dataset.loc[:, features].corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')

Relation of each type of failure to the other variables:
- tool wear failure (TWF): Tool Wear
- heat dissipation failure (HDF): Torque, Rotational Speed, Air Temperature
- power failure (PWF): Rotational Speed, Torque, OSF
- overstrain failure (OSF): Tool Wear, Torque, Rotational Speed

These failure modes in turn affect the machine failure.\
From the above it is clear that these features lead to the individual failure modes, which in turn result in machine failure.\
RNF has only a minor influence on machine failure, TWF has some influence, whereas HDF, PWF and OSF have a strong influence on machine failure.

# Assess Feature Importance

Since I have both categorical and numerical data, I am choosing a Random Forest Classifier for assessing feature importance. It has one more benefit: it is not sensitive to feature scaling. One might wonder why not SVM or PCA: an SVM is better suited when the relationship between the target and the features is quite complex, and PCA is helpful for high-dimensional data. Hence I decided to stick with Random Forest.

In [None]:
# Feature matrix: everything except the identifiers, the raw 'Type' string and
# the target itself.
# NOTE: the failure-mode flags (TWF..RNF) and their sum 'Failure' are still in X
# here. They are near-direct indicators of the target, so the scores of this
# cell are only meant for ranking features, not as a realistic model estimate.
X = dataset.drop(columns=['Machine failure', 'UDI', 'Product ID', 'Type'])
y = dataset['Machine failure']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handle imbalance using SMOTE (training split only; the test split stays untouched)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_resampled, y_train_resampled)

# Get Feature Importance, most important first
importances = rf.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'Features': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

# Evaluate the model on the held-out test split
y_pred = rf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Calculate AUC-ROC
y_pred_prob = rf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"AUC-ROC: {roc_auc:.4f}")

# Cross-validation score to check whether the model is overfitting.
# SMOTE is applied inside the pipeline, i.e. separately on the training part of
# every fold. Cross-validating on the already resampled data would put
# synthetic neighbours of training rows into the validation folds and inflate
# the score.
cv_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
])
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Cross-validated AUC-ROC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


The cross-validation above suggests that the model is overfitted. Let's do feature selection first and then analyse the model again.

# Feature Selection

I have decided to drop ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']: individually they don't have much influence on the machine failure, but taken together they affect the model. I am also dropping 'Labled Type': even though it provides information about the product quality, it has only a minor influence on 'Machine failure'.

In [None]:
# Keep only the sensor readings: drop identifiers, the product type (raw and
# encoded), the individual failure flags and their sum.
X = dataset.drop(columns=['Machine failure', 'UDI', 'Product ID', 'Type',
                         'TWF', 'HDF', 'PWF', 'OSF','RNF', 'Labled Type', 'Failure'])
y = dataset['Machine failure']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handle imbalance using SMOTE (training split only)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_resampled, y_train_resampled)

# Report on the predictions of THIS model (previously the report was printed
# for `y_pred`, a leftover from the preceding cell)
y_predict = rf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_predict))

# Calculate AUC-ROC
y_pred_prob = rf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"AUC-ROC: {roc_auc:.4f}")

# Cross-validation score to check whether the model is overfitting.
# SMOTE runs inside the pipeline so each fold is resampled on its training part
# only; validating on pre-resampled data would leak synthetic samples.
cv_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
])
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Cross-validated AUC-ROC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Hyper Parameter Tuning

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV, StratifiedKFold
# from sklearn.metrics import classification_report, roc_auc_score, roc_curve
# from sklearn.preprocessing import StandardScaler
# import numpy as np


# X = dataset.drop(columns=['Machine failure', 'UDI', 'Product ID', 'Type',
#                          'TWF', 'HDF', 'PWF', 'OSF','RNF', 'Labled Type', 'Failure'])
# y = dataset['Machine failure']

# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# # Handle imbalance using SMOTE
# smote = SMOTE(random_state=42)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# # Define the Random Forest model with class weighting
# rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# # Set up the parameter grid for hyperparameter tuning
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [5, 10, 15, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 'log2', None]
# }

# # Stratified cross-validation
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# # Grid Search for hyperparameter tuning
# grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=cv, scoring='roc_auc', verbose=2, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Best parameters and model
# best_model = grid_search.best_estimator_
# print("Best parameters:", grid_search.best_params_)

# # Evaluate on test data
# y_pred = best_model.predict(X_test)
# y_prob = best_model.predict_proba(X_test)[:, 1]

# # Classification Report
# print("Classification Report:\n", classification_report(y_test, y_pred))

# # AUC-ROC
# auc_score = roc_auc_score(y_test, y_prob)
# print("AUC-ROC Score:", auc_score)

# # Plot ROC Curve
# import matplotlib.pyplot as plt
# fpr, tpr, _ = roc_curve(y_test, y_prob)
# plt.figure(figsize=(8, 6))
# plt.plot(fpr, tpr, label=f"AUC-ROC: {auc_score:.4f}")
# plt.plot([0, 1], [0, 1], 'k--', label="Random Guessing")
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve")
# plt.legend()
# plt.show()


In [None]:
# Splitting data into features (X) and target (y): sensor readings only
X = dataset.drop(columns=['Machine failure', 'UDI', 'Product ID', 'Type',
                          'TWF', 'HDF', 'PWF', 'OSF', 'RNF', 'Labled Type', 'Failure'])
y = dataset['Machine failure']

# Splitting into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Applying SMOTE for oversampling the minority class (training split only)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Fixed hyperparameters for the RandomForestClassifier (a single parameter set,
# not a search grid)
best_params = {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4,
               'min_samples_split': 2, 'n_estimators': 200}

# Training the RandomForest model
rf_best_model = RandomForestClassifier(random_state=42, class_weight='balanced', **best_params)
rf_best_model.fit(X_train_resampled, y_train_resampled)

# Predicted labels and positive-class probabilities for the test data
y_pred = rf_best_model.predict(X_test)
y_prob = rf_best_model.predict_proba(X_test)[:, 1]

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))

# AUC-ROC Score
auc_score = roc_auc_score(y_test, y_prob)
print("AUC-ROC Score:", auc_score)

# Cross-validation for AUC-ROC.
# SMOTE is part of the pipeline so every fold is oversampled on its training
# part only; cross-validating on the pre-resampled data would leak synthetic
# samples into the validation folds and overstate the score.
cv_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42, class_weight='balanced', **best_params)),
])
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Cross-validated AUC-ROC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


I am still not satisfied with the above model.

# Check Features Multicollinearity
A **variance inflation factor (VIF)** score is a regression diagnostic that measures the correlation between independent variables in a model. It's used to detect multicollinearity, which can affect the reliability of a model's coefficients.

**What does a VIF score indicate?**

- VIF = 1: No correlation between the variables
- VIF between 1 and 5: Moderate correlation between the variables
- VIF greater than 5: High correlation between the variables
- VIF greater than 10: Serious correlation between the variables

In [None]:
# Calculate VIF for each feature.
# `variance_inflation_factor` regresses column i on all other columns of the
# matrix it is given and does not add an intercept on its own. Without a
# constant column, features with a large mean (e.g. temperatures in Kelvin)
# get hugely inflated VIFs, so a column of ones is prepended and skipped when
# reporting.
exog = np.column_stack([np.ones(len(X)), X.to_numpy(dtype=float)])

vif_data = pd.DataFrame()
vif_data['Feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(exog, i) for i in range(1, exog.shape[1])]

print(vif_data)

Since air temperature and process temperature are highly correlated, I am dropping process temperature, based on the correlation matrix.

In [None]:
# Splitting data into features (X) and target (y): sensor readings only, with
# 'Process temperature [K]' removed because of its correlation with air temperature
X = dataset.drop(columns=['Machine failure', 'UDI', 'Product ID', 'Type',
                          'TWF', 'HDF', 'PWF', 'OSF', 'RNF', 'Labled Type', 'Failure', 'Process temperature [K]'])
y = dataset['Machine failure']

# Splitting into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Applying SMOTE for oversampling the minority class (training split only)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Fixed hyperparameters for the RandomForestClassifier (a single parameter set,
# not a search grid)
best_params = {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4,
               'min_samples_split': 2, 'n_estimators': 200}

# Training the RandomForest model
rf_best_model = RandomForestClassifier(random_state=42, class_weight='balanced', **best_params)
rf_best_model.fit(X_train_resampled, y_train_resampled)

# Predicted labels and positive-class probabilities for the test data
y_pred = rf_best_model.predict(X_test)
y_prob = rf_best_model.predict_proba(X_test)[:, 1]

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))

# AUC-ROC Score
auc_score = roc_auc_score(y_test, y_prob)
print("AUC-ROC Score:", auc_score)

# Cross-validation for AUC-ROC.
# SMOTE is part of the pipeline so every fold is oversampled on its training
# part only; cross-validating on the pre-resampled data would leak synthetic
# samples into the validation folds and overstate the score.
cv_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42, class_weight='balanced', **best_params)),
])
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Cross-validated AUC-ROC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


The above models don't seem to be overfitting anymore. But let's try feature engineering.

# Feature Engineering
If the two variables (Air temperature [K] and Process temperature [K]) are correlated because they measure similar phenomena, consider creating a combined feature:

In [None]:
# Engineered feature: how much hotter the process is than the surrounding air.
# The two raw temperature columns stay in `dataset` and are only excluded from
# the feature matrix below.
dataset['Temp_diff'] = dataset['Process temperature [K]'] - dataset['Air temperature [K]']

# Splitting data into features (X) and target (y)
X = dataset.drop(columns=['Machine failure', 'UDI', 'Product ID', 'Type',
                          'TWF', 'HDF', 'PWF', 'OSF', 'RNF', 'Labled Type', 'Failure',
                         'Air temperature [K]', 'Process temperature [K]'])
y = dataset['Machine failure']

# Splitting into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Applying SMOTE for oversampling the minority class (training split only)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Fixed hyperparameters for the RandomForestClassifier (a single parameter set,
# not a search grid)
best_params = {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4,
               'min_samples_split': 2, 'n_estimators': 200}

# Training the RandomForest model
rf_best_model = RandomForestClassifier(random_state=42, class_weight='balanced', **best_params)
rf_best_model.fit(X_train_resampled, y_train_resampled)

# Predicted labels and positive-class probabilities for the test data
y_pred = rf_best_model.predict(X_test)
y_prob = rf_best_model.predict_proba(X_test)[:, 1]

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))

# AUC-ROC Score
auc_score = roc_auc_score(y_test, y_prob)
print("AUC-ROC Score:", auc_score)

# Cross-validation for AUC-ROC.
# SMOTE is part of the pipeline so every fold is oversampled on its training
# part only; cross-validating on the pre-resampled data would leak synthetic
# samples into the validation folds and overstate the score.
cv_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42, class_weight='balanced', **best_params)),
])
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Cross-validated AUC-ROC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


# Trying Regularization

In [None]:
# Engineered feature (re-created here so the cell does not depend on the
# previous experiment having been run)
dataset['Temp_diff'] = dataset['Process temperature [K]'] - dataset['Air temperature [K]']

# Splitting data into features (X) and target (y)
X = dataset.drop(columns=['Machine failure', 'UDI', 'Product ID', 'Type',
                          'TWF', 'HDF', 'PWF', 'OSF', 'RNF', 'Labled Type', 'Failure',
                          'Air temperature [K]', 'Process temperature [K]'])

# XGBoost rejects feature names containing '[', ']' or '<'; replace those
# characters (and '>' / spaces) with underscores
X.columns = X.columns.str.replace(r"[\[\]<> ]", "_", regex=True)

y = dataset['Machine failure']

# Splitting into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Applying SMOTE for oversampling the minority class (training split only)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# XGBoost with regularization
# NOTE(review): the training data is already balanced by SMOTE, so
# scale_pos_weight=10 weights the positive class up a second time. Confirm this
# extra emphasis on failures is intended.
xgb_params = dict(
    max_depth=10,
    n_estimators=200,
    learning_rate=0.1,
    reg_alpha=1,  # L1 Regularization (LASSO)
    reg_lambda=1,  # L2 Regularization (Ridge)
    scale_pos_weight=10,  # Extra weight on the positive class
    random_state=42,
)
xgb_model = XGBClassifier(**xgb_params)

xgb_model.fit(X_train_resampled, y_train_resampled)

# Predicting the test data
y_pred = xgb_model.predict(X_test)
y_prob = xgb_model.predict_proba(X_test)[:, 1]

# Evaluation
print("Classification Report:\n", classification_report(y_test, y_pred))
auc_score = roc_auc_score(y_test, y_prob)
print("AUC-ROC Score:", auc_score)

# Cross-validation for AUC-ROC.
# SMOTE is part of the pipeline so every fold is oversampled on its training
# part only; cross-validating on the pre-resampled data would leak synthetic
# samples into the validation folds and overstate the score.
cv_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('xgb', XGBClassifier(**xgb_params)),
])
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='roc_auc')
print(f"Cross-validated AUC-ROC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


# Model Based on Undersampled Dataset

In [None]:
# Features and target
X = undersampled_dataset.drop(columns=['Machine failure', 'UDI', 'Product ID', 'Type',
                                       'TWF', 'HDF', 'PWF', 'OSF', 'RNF', 'Labled Type', 'Failure'])
y = undersampled_dataset['Machine failure']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predictions
y_predict = rf.predict(X_test)

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_predict))

# Calculate AUC-ROC
y_pred_prob = rf.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"AUC-ROC: {roc_auc:.4f}")

# Cross-Validation Score to Check for Overfitting
cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='roc_auc')  # Use `X_train` and `y_train`
print(f"Cross-validated AUC-ROC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


The above model has the best result so far, predicting both the majority and the minority class correctly. Hence I am choosing this model for deployment.

# Model Deployment

In [None]:
# Save the model
# `rf` is whichever estimator was last bound to that name; when the notebook is
# run top to bottom, that is the Random Forest fitted on the undersampled
# dataset in the preceding cell.
# The file is written with joblib (pickle-based): only load it back from a
# trusted source.
joblib.dump(rf, 'random_forest_model.pkl')