In [1]:
import pandas as pd
import numpy as np

### Reading the data

In [2]:
!wget https://zenodo.org/records/6854240/files/dataset_train.csv?download=1

--2024-05-21 21:44:45--  https://zenodo.org/records/6854240/files/dataset_train.csv?download=1
Resolving zenodo.org (zenodo.org)... 188.184.98.238, 188.185.79.172, 188.184.103.159, ...
Connecting to zenodo.org (zenodo.org)|188.184.98.238|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1646201046 (1.5G) [text/plain]
Saving to: ‘dataset_train.csv?download=1’


2024-05-21 21:46:10 (18.6 MB/s) - ‘dataset_train.csv?download=1’ saved [1646201046/1646201046]



In [3]:
df = pd.read_csv('/content/dataset_train.csv')
df.head()

Unnamed: 0,timestamp,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Flowmeter,Motor_current,COMP,...,Towers,MPG,LPS,Pressure_switch,Oil_level,Caudal_impulses,gpsLong,gpsLat,gpsSpeed,gpsQuality
0,2022-01-01 06:00:00,-0.012,9.758,9.76,-0.028,1.576,63.35,19.049625,3.955,1,...,1,1,0,0,0,0,-8.65934,41.2124,0,1
1,2022-01-01 06:00:01,-0.012,9.76,9.76,-0.028,1.578,63.25,19.049625,4.0275,1,...,1,1,0,0,0,0,-8.65934,41.2124,0,1
2,2022-01-01 06:00:02,-0.01,9.76,9.76,-0.028,1.578,63.325,19.040281,3.945,1,...,1,1,0,0,0,0,-8.65934,41.2124,0,1
3,2022-01-01 06:00:03,-0.012,9.756,9.756,-0.03,1.576,63.2,19.040281,3.93,1,...,1,1,0,0,0,0,-8.65934,41.2124,0,1
4,2022-01-01 06:00:04,-0.012,9.756,9.756,-0.03,1.578,63.15,19.049625,3.995,1,...,1,1,0,0,0,0,-8.65934,41.2124,0,1


In [4]:
len(df)

10773588

In [5]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler


df.drop(['gpsLong', 'gpsLat', 'gpsSpeed', 'gpsQuality'], axis=1, inplace=True)

#####################

# Calculate the initial memory usage
initial_memory = df.memory_usage(deep=True).sum()
print(f'Initial memory usage: {initial_memory // 1e6} Mb')

# Convert integer columns to the smallest possible int type
for col in df.select_dtypes(include='int64').columns:
    df[col] = pd.to_numeric(df[col], downcast='integer')

# Convert float columns to float32
for col in df.select_dtypes(include='float64').columns:
    df[col] = df[col].astype('float16')

# Calculate the final memory usage after type conversion
final_memory = df.memory_usage(deep=True).sum()
print(f'Final memory usage: {final_memory // 1e6} Mb')

# Calculate and print the difference in memory usage
memory_difference = initial_memory - final_memory
print(f'Memory saved: {memory_difference // 1e3} byte')



###########################################################

# Add 'Failure Type' and 'Failure Component' columns with default values
df['Failure Type'] = 'No Failure'
df['Failure Component'] = 'No Failur Component'

# Define conditions and corresponding values
conditions_values = [
    ((df['timestamp'] >= '2022-02-28 21:53:00') &
     (df['timestamp'] < '2022-03-01 02:00:00'), 'Air Leak', 'Clients'),

    ((df['timestamp'] >= '2022-03-23 14:54:00') &
     (df['timestamp'] < '2022-03-23 15:24:00'), 'Air Leak', 'Air Dryer'),

    ((df['timestamp'] >= '2022-05-30 12:00:00') &
     (df['timestamp'] < '2022-06-02 06:18:00'), 'Oil Leak', 'Compressor')
]

# Loop through conditions and update values
for condition, failure_type, failure_component in conditions_values:
    df.loc[condition, 'Failure Type'] = failure_type
    df.loc[condition, 'Failure Component'] = failure_component



#############################################################

# Convert 'timestamp' column to datetime and set it as the index
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.set_index('timestamp', inplace=True)

################################################################

# Define the time window (2 hours)
time_window = pd.Timedelta(hours=2)

# Calculate the average time difference between consecutive rows
freq = pd.Timedelta(df.index.to_series().diff().mean())

# Convert the time window to the number of periods based on the calculated frequency
periods = int(time_window / freq)

# Calculate the future failure and future failure component using shift
df['future_failure'] = df['Failure Type'].shift(-periods)
df['future_failure_component'] = df['Failure Component'].shift(-periods)

# Fill the missing values (NaN) with the last available data using ffill()
df['future_failure'] = df['future_failure'].ffill()
df['future_failure_component'] = df['future_failure_component'].ffill()

# Replace remaining NaN values with 'No Failure' and 'No Failur Component'
df['future_failure'] = df['future_failure'].fillna('No Failure')
df['future_failure_component'] = df['future_failure_component'].fillna('No Failur Component')



########################################################################

# Remove the 'Failure Type' and 'Failure Component' columns
df = df.drop(['Failure Type', 'Failure Component'], axis=1)

# Create binary columns for each unique value in 'future_failure'
for failure_type in df['future_failure'].unique():
    if failure_type != 0:  # Exclude 'No Failure' (0)
        column_name = f"failure_type_{failure_type}"
        df[column_name] = df['future_failure'].apply(lambda x: 1 if x == failure_type else 0)

# Create binary columns for each unique value in 'future_failure_component'
for failure_component in df['future_failure_component'].unique():
    if failure_component != 0:  # Exclude 'No Failur Component' (0)
        column_name = f"failure_component_{failure_component}"
        df[column_name] = df['future_failure_component'].apply(lambda x: 1 if x == failure_component else 0)

# Drop the original 'future_failure' and 'future_failure_component' columns
df = df.drop(['future_failure', 'future_failure_component'], axis=1)

# Print the updated DataFrame
print("Updated DataFrame:")


############################################################################






Initial memory usage: 2197.0 Mb
Final memory usage: 1077.0 Mb
Memory saved: 1120453.0 byte
Updated DataFrame:


In [8]:
# Remove the 'Pressure_switch' column
df = df.drop(['Pressure_switch'], axis=1)

# Specify the columns to be scaled
columns_to_scale = ['TP2', 'TP3', 'H1', 'DV_pressure', 'Reservoirs',
                    'Oil_temperature', 'Flowmeter', 'Motor_current']

# Create a scaler object
scaler = MinMaxScaler()

# Fit the scaler on the specified columns
scaler.fit(df[columns_to_scale])

# Transform the specified columns using the fitted scaler
df[columns_to_scale] = scaler.transform(df[columns_to_scale])


In [9]:
len(df)

10773588

In [10]:
# Assuming df is your original DataFrame
start_date = pd.Timestamp('2022-02-25 21:53:00')
end_date = pd.Timestamp('2022-03-26 21:53:00')
df = df.loc[start_date:end_date]

In [11]:
len(df)

1883126

In [13]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard, CSVLogger
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os

# Assuming X and y are already defined
X = df[['TP2', 'TP3', 'H1', 'DV_pressure', 'Reservoirs', 'Oil_temperature', 'Flowmeter', 'Motor_current', 'COMP', 'DV_eletric', 'Towers', 'MPG', 'LPS', 'Oil_level', 'Caudal_impulses']]
y = df[['failure_type_No Failure', 'failure_type_Air Leak', 'failure_type_Oil Leak', 'failure_component_No Failur Component', 'failure_component_Clients', 'failure_component_Air Dryer', 'failure_component_Compressor']]

# Calculate the size of the training set (60% of the data)
train_set_size = int(X.shape[0] * 0.6)

# Split the data into training and validation sets
X_train, X_val = X.iloc[:train_set_size], X.iloc[train_set_size:]
y_train, y_val = y.iloc[:train_set_size], y.iloc[train_set_size:]

# Reshape the input data for LSTM
X_train = X_train.values.reshape((-1, X_train.shape[1], 1))
X_val = X_val.values.reshape((-1, X_val.shape[1], 1))

# Define the LSTM model
model = Sequential()
model.add(LSTM(64, input_shape=(X_train.shape[1], 1)))
model.add(Dense(y_train.shape[1], activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Create a directory for TensorBoard logs
log_dir = 'logs'
if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# Create TensorBoard and CSVLogger callbacks
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
csv_logger = CSVLogger(os.path.join(log_dir, 'training_log.csv'), append=True)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.fit(X_train, y_train, epochs=1, batch_size=32, validation_data=(X_val, y_val), verbose=1, callbacks=[tensorboard_callback, csv_logger])

    # Evaluate the model on the validation set
    y_pred = model.predict(X_val)
    y_pred = (y_pred > 0.5).astype(int)

    accuracy = accuracy_score(y_val.values, y_pred)
    precision = precision_score(y_val.values, y_pred, average='macro', zero_division='warn')
    recall = recall_score(y_val.values, y_pred, average='macro', zero_division='warn')
    f1 = f1_score(y_val.values, y_pred, average='macro', zero_division='warn')

    _, val_loss = model.evaluate(X_val, y_val, verbose=0)

    print(f'Epoch [{epoch+1}/{num_epochs}], Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}')



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Epoch [1/10], Val Loss: 0.0040, Accuracy: 0.9976, Precision: 0.2850, Recall: 0.2857, F1-score: 0.2854


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Epoch [2/10], Val Loss: 0.7055, Accuracy: 0.9976, Precision: 0.2850, Recall: 0.2857, F1-score: 0.2854


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Epoch [3/10], Val Loss: 0.7513, Accuracy: 0.9975, Precision: 0.2850, Recall: 0.2857, F1-score: 0.2854
 5478/35309 [===>..........................] - ETA: 2:42 - loss: 0.0149 - accuracy: 0.6751

KeyboardInterrupt: 