##**Predictive Maintenance ANOMALY DETECTION WITH PYOD**

#IMPORTING PYOD AND DATA SET

In [10]:
#code snippet 1
#Installing pyod package for anormaly detection algorithms (knn)
!pip install pyod



In [11]:
#code snippet 2
#Importing pyod
from pyod.models.knn import KNN

In [12]:
# Code snippet 3
# Importing the dataset
import pandas as pd
data = pd.read_csv('https://raw.githubusercontent.com/Imjuandiaz/MTTF-Predictive-Maintenance-analysis/refs/heads/main/Data/ai4i2020.csv')


data

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


In [13]:
# Code snippet 4
# Create new dataset with only numerical columns
numeric_dataset = data.select_dtypes(include=['float64', 'int64'])
numeric_dataset


Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,9997,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,9998,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,9999,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


In [14]:
# Code snippet 5
# Training with the KNN algorithm to identify the anomalies
ad_model = KNN()

# Handle missing values by imputing with the mean
numeric_dataset_cleaned = numeric_dataset.fillna(numeric_dataset.mean())

ad_model.fit(numeric_dataset_cleaned)

KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2,
  radius=1.0)

In [15]:
# Code snippet 6
#Predict anomalies (1 = normal, -1 = Anomaly)
anomaly_labels = ad_model.labels_
anomaly_scores = ad_model.decision_scores_


#ADDING LABEL TO ANOMALY LABEL AND SCORE


In [16]:
# Code snippet 7
# Adding the anomaly results to the original dataframe
data['Anomaly_Label'] = anomaly_labels
data['Anomaly_Score'] = anomaly_scores
data


Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,Anomaly_Label,Anomaly_Score
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0,0,74.884578
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0,0,46.872060
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0,0,65.787765
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0,0,48.608847
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0,0,40.606157
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0,0,77.772874
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0,0,78.717851
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0,0,81.238230
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0,1,86.577653


#IMPORTING DATA SET WITH ANOMALY COLUMNS

In [17]:
# Code snippet 8
# Exporting the dataset to a csv file with anomaly info
data.to_csv('Predictive_Maintenance_KNN_Anomalies.csv', index=False)

#TOTAL COUNT OF ANOMALY ON DATASET

In [18]:
# Code snippet 9
# Displaying the number of anomalies
num_anomalies = (data['Anomaly_Label'] == 1).sum()
print(f"Number of anomalies: {num_anomalies}")

Number of anomalies: 1000


##**Predictive Maintenance Clustering With h2o**

#IMPORTING H2O AND DATA SET

In [19]:
#code snippet 1
#Installing h2o
!pip install h2o

Collecting h2o
  Downloading h2o-3.46.0.8-py2.py3-none-any.whl.metadata (2.1 kB)
Downloading h2o-3.46.0.8-py2.py3-none-any.whl (266.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m266.0/266.0 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: h2o
Successfully installed h2o-3.46.0.8


In [20]:
#code snippet 2
#Importing h2o
import h2o

#initialize h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "17.0.16" 2025-07-15; OpenJDK Runtime Environment (build 17.0.16+8-Ubuntu-0ubuntu122.04.1); OpenJDK 64-Bit Server VM (build 17.0.16+8-Ubuntu-0ubuntu122.04.1, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.12/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp1nqi0jyi
  JVM stdout: /tmp/tmp1nqi0jyi/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp1nqi0jyi/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,06 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.8
H2O_cluster_version_age:,1 month and 15 days
H2O_cluster_name:,H2O_from_python_unknownUser_gbyio8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.147 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [21]:
# Code snippet 3
# Importing dataset
data = h2o.import_file('https://raw.githubusercontent.com/Imjuandiaz/MTTF-Predictive-Maintenance-analysis/refs/heads/main/Data/ai4i2020.csv')


data


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
6,M14865,M,298.1,308.6,1425,41.9,11,0,0,0,0,0,0
7,L47186,L,298.1,308.6,1558,42.4,14,0,0,0,0,0,0
8,L47187,L,298.1,308.6,1527,40.2,16,0,0,0,0,0,0
9,M14868,M,298.3,308.7,1667,28.6,18,0,0,0,0,0,0
10,M14869,M,298.5,309.0,1741,28.0,21,0,0,0,0,0,0


In [22]:
#code snippet 4
#Importing Kmeans Estimator
from h2o.estimators.kmeans import H2OKMeansEstimator

In [46]:
#code snippet 5
#setting the number of clusters in the data train as 2
kmeans_model = H2OKMeansEstimator(k=2)

#Train the clustering model
kmeans_model.train(training_frame=data)

kmeans Model Build progress: |███████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_rows,number_of_clusters,number_of_categorical_columns,number_of_iterations,within_cluster_sum_of_squares,total_sum_of_squares,between_cluster_sum_of_squares
,10000.0,2.0,1.0,3.0,107738.3402142,123988.0,16249.6597858

Unnamed: 0,centroid,size,within_cluster_sum_of_squares
,1.0,9819.0,93950.782001
,2.0,181.0,13787.5570052

Unnamed: 0,timestamp,duration,iterations,number_of_reassigned_observations,within_cluster_sum_of_squares
,2025-11-23 18:19:17,0.021 sec,0.0,,
,2025-11-23 18:19:18,0.211 sec,1.0,10000.0,173916.2758276
,2025-11-23 18:19:18,0.231 sec,2.0,31.0,108596.7801699
,2025-11-23 18:19:18,0.249 sec,3.0,0.0,107738.3402142


In [47]:
#code snippet 6
#Displaying the cluster size
kmeans_model.size()

[9819.0, 181.0]

In [48]:
# Code Snippet 7
# Import libraries
import pandas as pd

# Predict clusters for each row
pred = kmeans_model.predict(data)

# Join prediction to the original dataset
data_with_clusters = data.cbind(pred)

# Convert to pandas DataFrame
df = data_with_clusters.as_data_frame()

# Clean column names to remove any invisible characters (like BOM)
df.columns = df.columns.str.strip().str.replace('\ufeff', '')

# ==========================
# Numeric variables (según tu dataset)
# ==========================
numeric_cols = [
    "UDI",
    "Air temperature [K]",
    "Process temperature [K]",
    "Rotational speed [rpm]",
    "Torque [Nm]",
    "Tool wear [min]",
    "Machine failure",
    "TWF",
    "HDF",
    "PWF",
    "OSF",
    "RNF"
]

numeric_summary = df.groupby("predict")[numeric_cols].mean()

# ==========================
# Categorical variables (según tu dataset)
# ==========================
categorical_cols = [
    "Product ID",
    "Type"
]

categorical_summary = {}
for col in categorical_cols:
    categorical_summary[col] = (
        df.groupby("predict")[col]
        .value_counts(normalize=True)
        .unstack(fill_value=0)
    )

kmeans prediction progress: |████████████████████████████████████████████████████| (done) 100%





In [49]:
# ==========================
# Code Snippet 8
# ==========================
# Combine summaries
# ==========================

# Merge all categorical summary DataFrames into a single one
categorical_df = pd.concat(categorical_summary.values(), axis=1)

# Combine with numeric summary
cluster_profiles = numeric_summary.join(categorical_df, how="left")

# Export to CSV
cluster_profiles.to_csv("Predictive_Maintenance_cluster_profiles.csv", index=True)

# Display in console
print(cluster_profiles)


                 UDI  Air temperature [K]  Process temperature [K]  \
predict                                                              
0        5007.841939           300.003687               310.005123   
1        4602.209945           300.072376               310.029282   

         Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  \
predict                                                         
0                   1538.245952    39.754130       107.123027   
1                   1567.535912    52.614917       152.867403   

         Machine failure       TWF       HDF       PWF  ...    M24846  \
predict                                                 ...             
0               0.016091  0.004379  0.010795  0.000000  ...  0.000102   
1               1.000000  0.016575  0.049724  0.524862  ...  0.000000   

           M24847    M24849    M24851    M24855    M24857    M24859         H  \
predict                                                                         
0  