In [3]:
# imports
import warnings
import os
import pandas as pd
from src.utils import transform_dataframe, plot_history, regression_metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Activation, Dropout, LSTM, Dense, TimeDistributed
from tensorflow.keras.ops import concatenate
from tensorflow.keras.callbacks import ModelCheckpoint, BackupAndRestore, EarlyStopping
from tensorflow.keras import Input, Model
from tensorflow.keras.optimizers import Adam
from sklearn.dummy import DummyRegressor
from sklearn.metrics import make_scorer, r2_score, mean_squared_error, mean_absolute_error

In [4]:
# configure general settings: silence all warnings and let pandas show every column
pd.set_option("display.max_columns", None)
warnings.filterwarnings(action="ignore")

In [5]:
# directory that contains the raw data folder: the parent of the current working directory
raw_data_dir, _ = os.path.split(os.getcwd())

In [6]:
# load data
def _read_raw_csv(relative_path):
    """Read one CSV file located under `raw_data_dir` into a DataFrame."""
    return pd.read_csv(os.path.join(raw_data_dir, relative_path))


df_telemetry = _read_raw_csv("raw_data/PdM_telemetry.csv")
df_errors = _read_raw_csv("raw_data/PdM_errors.csv")
df_failures = _read_raw_csv("raw_data/PdM_failures.csv")
df_machines = _read_raw_csv("raw_data/PdM_machines.csv")
# the maintenance file is kept under the name `df_components`
df_components = _read_raw_csv("raw_data/PdM_maint.csv")

# Feature Engineering

In [7]:
# count the maintenance records ("comp" entries) for each datetime and machine id pair
# NOTE: this cell overwrites its own input, so it must be run only once per kernel session
df_components = (
    df_components
    .groupby(["datetime", "machineID"], as_index=False)
    .count()
    .rename(columns={"comp": "comp_count"})
)
df_components


Unnamed: 0,datetime,machineID,comp_count
0,2014-06-01 06:00:00,1,1
1,2014-06-01 06:00:00,6,1
2,2014-06-01 06:00:00,9,2
3,2014-06-01 06:00:00,11,1
4,2014-06-01 06:00:00,16,1
...,...,...,...
2523,2016-01-01 06:00:00,35,1
2524,2016-01-01 06:00:00,42,1
2525,2016-01-01 06:00:00,43,1
2526,2016-01-01 06:00:00,55,1


In [8]:
# count the logged errors ("errorID" entries) for each datetime and machine id pair
# NOTE: this cell overwrites its own input, so it must be run only once per kernel session
df_errors = (
    df_errors
    .groupby(["datetime", "machineID"], as_index=False)
    .count()
    .rename(columns={"errorID": "error_count"})
)
df_errors

Unnamed: 0,datetime,machineID,error_count
0,2015-01-01 06:00:00,24,1
1,2015-01-01 06:00:00,73,1
2,2015-01-01 06:00:00,81,1
3,2015-01-01 07:00:00,43,1
4,2015-01-01 08:00:00,14,1
...,...,...,...
3611,2015-12-31 18:00:00,91,1
3612,2015-12-31 20:00:00,23,1
3613,2015-12-31 23:00:00,94,1
3614,2016-01-01 05:00:00,8,1


In [9]:
# count the failed components ("failure" entries) for each datetime and machine id pair
# NOTE: this cell overwrites its own input, so it must be run only once per kernel session
df_failures = (
    df_failures
    .groupby(["datetime", "machineID"], as_index=False)
    .count()
    .rename(columns={"failure": "failure_component_count"})
)
df_failures

Unnamed: 0,datetime,machineID,failure_component_count
0,2015-01-02 03:00:00,16,2
1,2015-01-02 03:00:00,17,1
2,2015-01-02 03:00:00,22,1
3,2015-01-02 03:00:00,35,1
4,2015-01-02 03:00:00,45,1
...,...,...,...
714,2015-12-30 06:00:00,88,1
715,2015-12-31 06:00:00,15,1
716,2015-12-31 06:00:00,64,1
717,2015-12-31 06:00:00,90,1


In [10]:
# join dataframes: telemetry is the base table, everything else is left-joined onto it
event_keys = ["datetime", "machineID"]
df_maintenance = (
    df_telemetry
    .merge(df_machines, on="machineID", how="left")      # static machine attributes
    .merge(df_components, on=event_keys, how="left")     # comp_count
    .merge(df_errors, on=event_keys, how="left")         # error_count
    .merge(df_failures, on=event_keys, how="left")       # failure_component_count
)
df_maintenance

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,model,age,comp_count,error_count,failure_component_count
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686,model3,18,,,
1,2015-01-01 07:00:00,1,162.879223,402.747490,95.460525,43.413973,model3,18,,,
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847,model3,18,,,
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144,model3,18,,,
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511,model3,18,,,
...,...,...,...,...,...,...,...,...,...,...,...
876095,2016-01-01 02:00:00,100,179.438162,395.222827,102.290715,50.771941,model4,5,,,
876096,2016-01-01 03:00:00,100,189.617555,446.207972,98.180607,35.123072,model4,5,,,
876097,2016-01-01 04:00:00,100,192.483414,447.816524,94.132837,48.314561,model4,5,,,
876098,2016-01-01 05:00:00,100,165.475310,413.771670,104.081073,44.835259,model4,5,,,


In [11]:
# rows without a matching event have NaN in the joined count columns; replace NaN by 0
df_maintenance = df_maintenance.fillna(0)

In [12]:
# cast the age column to float
df_maintenance = df_maintenance.astype({"age": float})

In [13]:
# parse the datetime column into real timestamps, then use it as the index
df_maintenance = (
    df_maintenance
    .assign(datetime=pd.to_datetime(df_maintenance["datetime"]))
    .set_index("datetime")
)

In [None]:
# Build the modelling table with the project helper `transform_dataframe` (src.utils).
# NOTE(review): downstream cells expect the result to contain a "RUL" target column and a
# "model" column; the exact layout is defined by the helper — confirm there.
df_transformed = transform_dataframe(df_maintenance) # transform dataframe
df_transformed.reset_index(inplace=True, drop=True) # discard the existing index and renumber rows from 0
df_transformed

In [13]:
# sanity check: per-model count of rows with a negative RUL
negative_rul_mask = df_transformed["RUL"] < 0
df_transformed.loc[negative_rul_mask, "model"].value_counts()

Series([], Name: count, dtype: int64)

# Data Preprocessing

In [14]:
# target: the RUL column; features: every other column
y = df_transformed["RUL"]
X = df_transformed.drop(columns=["RUL"])

In [15]:
# split into 70% training data and a 30% hold-out, then halve the hold-out
# into validation and test data (15% each)
# NOTE(review): this is a random row-wise split; if transform_dataframe builds overlapping
# time windows per machine, neighbouring windows may land in different splits — confirm.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_validation, X_test, y_validation, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [16]:
# Column lists are derived from the feature frame X (not from df_transformed, which still
# contains the target "RUL"), so the target can never be selected for scaling/encoding and
# cause fit_transform(X_train) to fail on a missing column.
# define numerical columns (float dtype)
num_cols = X.select_dtypes(float).columns
# define categorical columns (object dtype)
cat_cols = X.select_dtypes(object).columns
# create a column transformer for features: output is the scaled numerical columns
# followed by the one-hot encoded categorical columns
cols_transformer = ColumnTransformer(
    transformers=[
        ("num_transformer", StandardScaler(), num_cols),
        ("cat_transformer", OneHotEncoder(sparse_output=False), cat_cols),
    ]
)

In [17]:
# fit the transformer on the training split only, then apply the fitted
# transformer to the validation and test splits
X_train_transformed = cols_transformer.fit_transform(X_train)
X_validation_transformed, X_test_transformed = (
    cols_transformer.transform(split) for split in (X_validation, X_test)
)

# Modeling & Evaluation

In [18]:
# define a baseline model that always predicts the mean of the training target
baseline_reg = DummyRegressor(
    strategy="mean",
)

In [19]:
# fit the baseline regressor on the training data
baseline_reg.fit(X_train_transformed, y_train)

# report the baseline metrics on the training and test sets
baseline_metrics = regression_metrics(
    baseline_reg,
    X_train_transformed,
    X_test_transformed,
    y_train,
    y_test,
)
print(baseline_metrics)

Metric    Training Set    Test Set
--------  --------------  -----------
r2        0               -5.2454e-05
MSE       1604.4          1607.51
MAE       28.0813         28.4118
RMSE      40.055          40.0937


In [20]:
# define two sets of inputs representing time steps and non time steps features
def _split_model_inputs(features):
    """Split a transformed feature matrix into the two model inputs.

    All columns except the last 5 are reshaped to (-1, 30, 7) for the LSTM branch;
    the last 5 columns are returned unchanged as the non time steps input.
    """
    time_steps_part = features[:, :-5].reshape((-1, 30, 7))
    non_time_steps_part = features[:, -5:]
    return time_steps_part, non_time_steps_part


# train inputs
time_steps_train_input, non_time_steps_train_input = _split_model_inputs(X_train_transformed)

# validation inputs
time_steps_validation_input, non_time_steps_validation_input = _split_model_inputs(X_validation_transformed)

# test inputs
time_steps_test_input, non_time_steps_test_input = _split_model_inputs(X_test_transformed)

In [22]:
# define two sets of inputs: the time steps sequence and the non time steps vector.
# Shapes are read from the prepared training arrays ((30, 7) and (5,) with the current
# reshape) so the model cannot drift out of sync with the input preparation cell.
time_steps_inputs = Input(shape=time_steps_train_input.shape[1:])
non_time_steps_inputs = Input(shape=non_time_steps_train_input.shape[1:])

# define the lstm layers for the time steps inputs; the first two layers return the full
# sequence so the next LSTM can consume it, the last one returns a single vector
lstm_features = LSTM(64, activation="relu", return_sequences=True)(time_steps_inputs)
lstm_features = LSTM(64, activation="relu", return_sequences=True)(lstm_features)
lstm_features = LSTM(16, activation="relu")(lstm_features)

# combine the output of the lstm layers with the non time steps inputs
combined = concatenate([lstm_features, non_time_steps_inputs], axis=1)

# define dense layers
# (kept in their own variable: the original reused `y`, which overwrote the target
# Series defined in the features/target cell and broke re-running earlier cells)
rul_output = Dense(64, activation="relu")(combined)
rul_output = Dense(64, activation="relu")(rul_output)
rul_output = Dense(4, activation="relu")(rul_output)
rul_output = Dense(1, activation="relu")(rul_output)

# define the model: two inputs, one regression output
model = Model(inputs=[time_steps_inputs, non_time_steps_inputs], outputs=rul_output)

In [31]:
# define callbacks for the LSTM model training
# Artifact paths are built from the project root (`raw_data_dir`, the parent of the working
# directory) instead of a machine-specific absolute path, so the notebook runs on any checkout.
# NOTE(review): assumes the notebook is executed from a direct subfolder of the project
# root, so that this resolves to <project>/artifacts/callbacks — confirm.
callbacks_dir = os.path.join(raw_data_dir, "artifacts", "callbacks")
backup_dir_path = os.path.join(callbacks_dir, "backup")
model_checkpoint_path = os.path.join(callbacks_dir, "model_checkpoint.keras")
# make sure the target folders exist before training starts (also creates callbacks_dir)
os.makedirs(backup_dir_path, exist_ok=True)

# keep only the model with the lowest validation loss seen so far
model_checkpoint = ModelCheckpoint(
    filepath=model_checkpoint_path,
    monitor="val_loss",
    mode="min",
    save_best_only=True)
# back up the training state so an interrupted fit can be resumed
backup = BackupAndRestore(backup_dir=backup_dir_path)
model_callbacks=[backup, model_checkpoint]

In [24]:
# compile model: optimise mean squared error with Adam, track mean absolute error
model.compile(
    optimizer=Adam(learning_rate=0.0005),
    loss="mse",
    metrics=["mae"],
)

In [None]:
# train model
# NOTE(review): epochs is set to 10000 and no EarlyStopping callback is registered (it is
# imported but unused), so the run only ends when it is interrupted — confirm this is intended.
train_inputs = [time_steps_train_input, non_time_steps_train_input]
validation_inputs = [time_steps_validation_input, non_time_steps_validation_input]

model_history = model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=32,
    epochs=10000,
    validation_data=(validation_inputs, y_validation),
    validation_batch_size=32,
    callbacks=model_callbacks,
)

In [33]:
# restore the model saved by the checkpoint callback (lowest validation loss)
trained_model = load_model(
    model_checkpoint_path,
)

In [36]:
# report the trained model's metrics on the training and test sets
trained_model_metrics = regression_metrics(
    trained_model,
    [time_steps_train_input, non_time_steps_train_input],
    [time_steps_test_input, non_time_steps_test_input],
    y_train,
    y_test,
)
print(trained_model_metrics)

[1m627/627[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 22ms/step
[1m135/135[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step
Metric    Training Set    Test Set
--------  --------------  ----------
r2        0.99967         0.982749
MSE       0.528809        27.7295
MAE       0.548513        1.77508
RMSE      0.727193        5.26588


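The R2 score shows that the model explains about 98% of the variance in the target on the test set, which is a good result. According to the mean absolute error, the model's predictions are off by about 2 days on average, and since the root mean squared error (about 5.3) is significantly higher than the mean absolute error (about 1.8), the model has made some larger errors on some samples. Note that the training-set errors are much lower than the test-set errors (MSE of 0.53 versus 27.7), which indicates that the model overfits the training data to some degree.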