In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.fft import fft
import matplotlib.dates as mdates
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb

### Reading the data

In [4]:
df = pd.read_csv("/mnt/d/ML-Datasets/MetroPT/clean_data.csv")
df.head()

Unnamed: 0,timestamp,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Flowmeter,Motor_current,COMP,...,Pressure_switch,Oil_level,Caudal_impulses,failure_type_No Failure,failure_type_Air Leak,failure_type_Oil Leak,failure_component_No Failur Component,failure_component_Clients,failure_component_Air Dryer,failure_component_Compressor
0,2022-01-01 06:00:00,-0.012,9.76,9.76,-0.028,1.576,63.34,19.05,3.955,1,...,0,0,0,1,0,0,1,0,0,0
1,2022-01-01 06:00:01,-0.012,9.76,9.76,-0.028,1.578,63.25,19.05,4.027,1,...,0,0,0,1,0,0,1,0,0,0
2,2022-01-01 06:00:02,-0.01,9.76,9.76,-0.028,1.578,63.3,19.05,3.945,1,...,0,0,0,1,0,0,1,0,0,0
3,2022-01-01 06:00:03,-0.012,9.76,9.76,-0.03,1.576,63.2,19.05,3.93,1,...,0,0,0,1,0,0,1,0,0,0
4,2022-01-01 06:00:04,-0.012,9.76,9.76,-0.03,1.578,63.16,19.05,3.994,1,...,0,0,0,1,0,0,1,0,0,0


In [5]:
df.columns

Index(['timestamp', 'TP2', 'TP3', 'H1', 'DV_pressure', 'Reservoirs',
       'Oil_temperature', 'Flowmeter', 'Motor_current', 'COMP', 'DV_eletric',
       'Towers', 'MPG', 'LPS', 'Pressure_switch', 'Oil_level',
       'Caudal_impulses', 'failure_type_No Failure', 'failure_type_Air Leak',
       'failure_type_Oil Leak', 'failure_component_No Failur Component',
       'failure_component_Clients', 'failure_component_Air Dryer',
       'failure_component_Compressor'],
      dtype='object')

In [6]:
sensor_columns = ['TP2', 'TP3', 'H1', 'DV_pressure', 'Reservoirs',
                  'Oil_temperature', 'Flowmeter', 'Motor_current', 'COMP', 'DV_eletric',
                  'Towers', 'MPG', 'LPS', 'Pressure_switch', 'Oil_level',
                  'Caudal_impulses']

# Find the data types of the sensor columns
print("Data types of sensor columns:")
print(df[sensor_columns].dtypes)

Data types of sensor columns:
TP2                float64
TP3                float64
H1                 float64
DV_pressure        float64
Reservoirs         float64
Oil_temperature    float64
Flowmeter          float64
Motor_current      float64
COMP                 int64
DV_eletric           int64
Towers               int64
MPG                  int64
LPS                  int64
Pressure_switch      int64
Oil_level            int64
Caudal_impulses      int64
dtype: object


In [7]:
# Filter the integer columns
integer_columns = df[sensor_columns].select_dtypes(include=['int64']).columns

# Get the minimum and maximum values for the integer columns
print("Minimum values of integer columns:")
print(df[integer_columns].min())

print("Maximum values of integer columns:")
print(df[integer_columns].max())

Minimum values of integer columns:
COMP               0
DV_eletric         0
Towers             0
MPG                0
LPS                0
Pressure_switch    0
Oil_level          0
Caudal_impulses    0
dtype: int64
Maximum values of integer columns:
COMP               1
DV_eletric         1
Towers             1
MPG                1
LPS                1
Pressure_switch    0
Oil_level          1
Caudal_impulses    1
dtype: int64


In [6]:
# Remove the 'Pressure_switch' column
df = df.drop('Pressure_switch', axis=1)

# Specify the columns to be scaled
columns_to_scale = ['TP2', 'TP3', 'H1', 'DV_pressure', 'Reservoirs',
                    'Oil_temperature', 'Flowmeter', 'Motor_current']

# Create a scaler object
scaler = MinMaxScaler()

# Fit the scaler on the specified columns
scaler.fit(df[columns_to_scale])

# Transform the specified columns using the fitted scaler
df[columns_to_scale] = scaler.transform(df[columns_to_scale])

# Print the updated DataFrame
df.head()

Unnamed: 0,timestamp,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Flowmeter,Motor_current,COMP,...,LPS,Oil_level,Caudal_impulses,failure_type_No Failure,failure_type_Air Leak,failure_type_Oil Leak,failure_component_No Failur Component,failure_component_Clients,failure_component_Air Dryer,failure_component_Compressor
0,2022-01-01 06:00:00,0.001651,0.937524,0.937404,0.001195,0.320567,0.588694,0.00908,0.408915,1,...,0,0,0,1,0,0,1,0,0,0
1,2022-01-01 06:00:01,0.001651,0.937524,0.937404,0.001195,0.323404,0.587623,0.00908,0.416336,1,...,0,0,0,1,0,0,1,0,0,0
2,2022-01-01 06:00:02,0.001834,0.937524,0.937404,0.001195,0.323404,0.588218,0.00908,0.407885,1,...,0,0,0,1,0,0,1,0,0,0
3,2022-01-01 06:00:03,0.001651,0.937524,0.937404,0.000956,0.320567,0.587028,0.00908,0.406339,1,...,0,0,0,1,0,0,1,0,0,0
4,2022-01-01 06:00:04,0.001651,0.937524,0.937404,0.000956,0.323404,0.586552,0.00908,0.412935,1,...,0,0,0,1,0,0,1,0,0,0


In [8]:
# Assuming your DataFrame is named 'df'
X = df[['TP2', 'TP3', 'H1', 'DV_pressure', 'Reservoirs', 'Oil_temperature', 'Flowmeter', 'Motor_current', 'COMP', 'DV_eletric', 'Towers', 'MPG', 'LPS', 'Oil_level', 'Caudal_impulses']]
y = df['failure_type_No Failure']

# Set the number of folds (e.g., 5)
n_splits = 5

# Create TimeSeriesSplit object
tscv = TimeSeriesSplit(n_splits=n_splits)

# Initialize variables to store the start and end index of fold 5
fold_5_start_index = None
fold_5_end_index = None

# Iterate over each fold
for fold, (train_index, test_index) in enumerate(tscv.split(X), 1):
    print(f"Fold {fold}:")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Print the distribution of failure_type_No Failure for each fold
    train_normal = y_train.sum()
    train_faulty = len(y_train) - train_normal
    test_normal = y_test.sum()
    test_faulty = len(y_test) - test_normal
    
    print(f"  Train: Normal={train_normal}, Faulty={train_faulty}")
    print(f"  Test: Normal={test_normal}, Faulty={test_faulty}")
    print()
    
    # Check if the current fold is fold 5
    if fold == 5:
        fold_5_start_index = test_index[0]
        fold_5_end_index = test_index[-1]

# Print the start and end index of fold 5
if fold_5_start_index is not None and fold_5_end_index is not None:
    print(f"Fold 5 Start Index: {fold_5_start_index}")
    print(f"Fold 5 End Index: {fold_5_end_index}")
else:
    print("Fold 5 not found.")

Fold 1:
  Train: Normal=1795598, Faulty=0
  Test: Normal=1795598, Faulty=0

Fold 2:
  Train: Normal=3591196, Faulty=0
  Test: Normal=1780778, Faulty=14820

Fold 3:
  Train: Normal=5371974, Faulty=14820
  Test: Normal=1793798, Faulty=1800

Fold 4:
  Train: Normal=7165772, Faulty=16620
  Test: Normal=1795598, Faulty=0

Fold 5:
  Train: Normal=8961370, Faulty=16620
  Test: Normal=1600115, Faulty=195483

Fold 5 Start Index: 8977990
Fold 5 End Index: 10773587


In [None]:
import gc
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Assuming your DataFrame is named 'df'
X = df[['TP2', 'TP3', 'H1', 'DV_pressure', 'Reservoirs', 'Oil_temperature', 'Flowmeter', 'Motor_current', 'COMP', 'DV_eletric', 'Towers', 'MPG', 'LPS', 'Oil_level', 'Caudal_impulses']]
y = df[['failure_type_No Failure', 'failure_type_Air Leak', 'failure_type_Oil Leak', 'failure_component_No Failur Component', 'failure_component_Clients', 'failure_component_Air Dryer', 'failure_component_Compressor']]

# Extract the data for fold 5
fold_5_start_index = 8977990
fold_5_end_index = 10773587
X_fold_5 = X.iloc[fold_5_start_index:fold_5_end_index+1]
y_fold_5 = y.iloc[fold_5_start_index:fold_5_end_index+1]

# Delete the original DataFrame to free up memory
del df
gc.collect()

# Split the data within fold 5 into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_fold_5, y_fold_5, test_size=0.4, random_state=42)

# Delete the fold 5 data to free up memory
del X_fold_5, y_fold_5
gc.collect()

# Define the parameter grid for XGBoost
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 200],
    'objective': ['binary:logistic'],
    'eval_metric': ['logloss']
}

# Define the scoring metrics
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

# Perform grid search for each binary variable
for column in y_train.columns:
    print(f"Optimizing hyperparameters for {column}:")
    y_train_column = y_train[column]
    y_test_column = y_test[column]
    
    # Create the XGBoost classifier
    model = xgb.XGBClassifier(random_state=42)
    
    # Perform grid search
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, refit='f1', cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train_column)
    
    # Print the best hyperparameters and scores
    print("Best hyperparameters:", grid_search.best_params_)
    print("Best F1 score:", grid_search.best_score_)
    
    # Evaluate the best model on the test set
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test_column, y_pred)
    test_f1 = f1_score(y_test_column, y_pred)
    print("Test Accuracy:", test_accuracy)
    print("Test F1 Score:", test_f1)
    print()

# Release memory
del X_train, X_test, y_train, y_test
gc.collect()

Optimizing hyperparameters for failure_type_No Failure:
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best hyperparameters: {'eval_metric': 'logloss', 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'objective': 'binary:logistic'}
Best F1 score: 0.9511743268037038
Test Accuracy: 0.9089983849409669
Test F1 Score: 0.9511819304203772

Optimizing hyperparameters for failure_type_Air Leak:
Fitting 5 folds for each of 27 candidates, totalling 135 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Best hyperparameters: {'eval_metric': 'logloss', 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'objective': 'binary:logistic'}
Best F1 score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Test Accuracy: 1.0
Test F1 Score: 0.0

Optimizing hyperparameters for failure_type_Oil Leak:
Fitting 5 folds for each of 27 candidates, totalling 135 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize