Let's try a regression model on this problem.
Now the question is "How many more cycles will the machine last before it fails?" instead of "Will the machine fail within a certain number of cycles?"

The code is largely the same except for how the model is built. More specifically, the difference is the activation function on the final dense layer: it is now linear instead of the sigmoid used in the classification model. There are some other changes as well: the optimization algorithm is RMSprop and the loss function is MSE.

Below is the code:


In [26]:
import os
import glob
import re
import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout, LSTM
import keras.backend as K
from keras.layers.core import Activation


# Setting seed for reproducibility
# Seeds NumPy's global random generator only.
np.random.seed(1234)  
# NOTE(review): this binds an ordinary Python variable; it does not set the
# PYTHONHASHSEED environment variable, which the interpreter reads at startup.
PYTHONHASHSEED = 0
# Empty path placeholder; not read by any live code in the visible part of the file.
model_path = ''

# Sort key that orders file names by the numbers embedded in them, so that
# e.g. "run2" comes before "run10" and machine IDs are assigned in that order.
numbers = re.compile(r'(\d+)')
def numericalSort(value):
    """Split *value* into alternating text and integer pieces.

    Because the pattern has a capturing group, ``split`` keeps the digit
    runs; they always land on the odd positions and are converted to ``int``.
    """
    tokens = numbers.split(value)
    return [int(token) if position % 2 else token
            for position, token in enumerate(tokens)]

def filter(df):
    """Keep, per column, only the values inside that column's 1%-99% quantile band.

    Each column is trimmed independently and the trimmed columns are
    re-assembled by ``DataFrame.apply``; positions removed from one column
    but present in another come back as NaN after index alignment.

    Note: this function shadows the builtin ``filter`` within this module.
    """
    lower_q, upper_q = .01, .99
    bounds = df.quantile([lower_q, upper_q])

    def keep_in_band(column):
        # Both limits are inclusive.
        floor = bounds.loc[lower_q, column.name]
        ceiling = bounds.loc[upper_q, column.name]
        return column[(column >= floor) & (column <= ceiling)]

    return df.apply(keep_in_band, axis=0)

##########################
## Data Preprocessing  ##
##########################

# One cleaned DataFrame per training file is collected here.
dfTrain_List = []
# Running machine identifier, advanced once per file.
ID = 0

for train_file in sorted(glob.glob('smalldata/train/*_rms.csv'), key=numericalSort):
    trainDf = pd.read_csv(train_file).drop('timestamp', axis=1)
    # Trim outliers, bridge the resulting gaps by interpolation,
    # then zero-fill whatever is still missing.
    trainDf = filter(trainDf).interpolate().fillna(0)
    # Tag every row with the machine it came from.
    trainDf["id"] = ID
    ID += 1
    # Number the monitoring cycles starting from 1 (derived from the row index).
    trainDf["cycle"] = trainDf.index + 1
    # TTF (time to failure): the cycle numbers in reverse order.
    trainDf["TTF"] = trainDf["cycle"].values[::-1]
    dfTrain_List.append(trainDf)


# Stack all training runs into one frame.
concatTrain = pd.concat(dfTrain_List, axis=0)

# Binary label: is the machine within x cycles of the end of its run?
# Monitors run ~150 cycles per day, so x = 150 asks "will it fail within a day?".
# Any other horizon can be chosen here.
x = 150
concatTrain['label'] = np.where(concatTrain['TTF'] <= x, 1, 0 )

# Copy of the cycle counter that will be scaled along with the sensor columns.
concatTrain['cycle_norm'] = concatTrain['cycle']

# Everything except the bookkeeping/target columns gets MinMax-normalized.
# (columns.difference returns the columns NOT named in its argument.)
cols_normalize = concatTrain.columns.difference(['id','cycle','TTF','label'])

min_max_scaler = preprocessing.MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(concatTrain[cols_normalize])
norm_train_df = pd.DataFrame(scaled_values,
                             columns=cols_normalize,
                             index=concatTrain.index)

# Re-attach the untouched columns and restore the original column order.
untouched_cols = concatTrain.columns.difference(cols_normalize)
join_df = pd.concat([concatTrain[untouched_cols], norm_train_df], axis=1)
concatTrain = join_df.reindex(columns = concatTrain.columns)


# Apply the same preprocessing to the test data.
dfTest_List = []

# NOTE(review): test files are read from 'data/test/' while the training loop
# reads from 'smalldata/train/' -- confirm that the two roots are intended.
for test_file in sorted(glob.glob('data/test/*_rms.csv'), key=numericalSort):
    testDf = pd.read_csv(test_file)
    testDf.drop('timestamp', axis=1, inplace=True)
    testDf = filter(testDf)
    # fill NaN: interpolate first, then zero-fill whatever remains
    testDf = testDf.interpolate()
    testDf = testDf.fillna(0)
    # ID keeps counting from where the training loop stopped, so test
    # machines never share an id with a training machine.
    testDf["id"] = ID
    ID += 1
    # cycle numbers start at 1; TTF is the cycle column reversed
    testDf["cycle"] = testDf.index + 1
    testDf["TTF"] = testDf["cycle"].values[::-1]

    dfTest_List.append(testDf)

concatTest = pd.concat(dfTest_List, axis=0)

concatTest['cycle_norm'] = concatTest['cycle']

# Scale with the statistics learned from the TRAINING data. Calling
# fit_transform here would re-fit the scaler on the test set, so train and
# test features would be mapped with different min/max values.
norm_test_df = pd.DataFrame(min_max_scaler.transform(concatTest[cols_normalize]),
                             columns=cols_normalize,
                             index=concatTest.index)

# Re-attach the non-normalized columns and restore the original column order.
join_df = pd.concat([concatTest[concatTest.columns.difference(cols_normalize)], norm_test_df], axis=1)

concatTest = join_df.reindex(columns = concatTest.columns)

In [27]:
##############################
##  Generate data sequence  ##
##############################

# Number of consecutive cycles in each model input window.
sequence_length = 150

def gen_sequence(id_df, seq_length, seq_cols):
    """Yield sliding windows of ``seq_length`` rows over the selected columns.

    Windows start at rows 0, 1, ... and there are ``rows - seq_length`` of
    them, so the window ending on the very last row is not produced.
    Nothing is yielded when the frame has ``seq_length`` rows or fewer.
    """
    values = id_df[seq_cols].values
    window_count = values.shape[0] - seq_length
    for first_row in range(window_count):
        yield values[first_row:first_row + seq_length, :]
        
# Feature columns fed to the network at every time step.
sequence_cols = ['rpm', 'motor_voltage', 'motor_current', 'motor_temp',
                 'inlet_temp', 'cycle_norm']

# Build the list of windows for each machine. A machine with no more than
# `sequence_length` rows produces no windows; its empty list is skipped,
# because np.concatenate cannot join an empty 1-D entry with the 3-D window
# stacks of the other machines.
seq_gen = (
    machine_windows
    for machine_windows in (
        list(gen_sequence(concatTrain[concatTrain['id'] == machine_id],
                          sequence_length, sequence_cols))
        for machine_id in concatTrain['id'].unique()
    )
    if machine_windows
)

# One float32 array holding every window of every machine, stacked along axis 0.
seq_array = np.concatenate(list(seq_gen)).astype(np.float32)

def gen_labels(id_df, seq_length, label):
    """Return the label rows from position ``seq_length`` to the end.

    ``label`` is a list of column names, so the result stays two-dimensional
    (one row per remaining cycle, one column per label).
    """
    label_values = id_df[label].values
    return label_values[seq_length:, :]

# Regression targets (TTF) for every machine, in the same machine order
# used when building the input windows.
label_gen = [
    gen_labels(concatTrain[concatTrain['id'] == machine_id], sequence_length, ['TTF'])
    for machine_id in concatTrain['id'].unique()
]

# Stack the per-machine targets into a single float32 array.
label_array = np.concatenate(label_gen).astype(np.float32)

In [28]:
##################################
##  Model training and testing  ##
##################################
def r2_keras(y_true, y_pred):
    """Coefficient of determination (R^2) written with Keras backend ops.

    Computes ``1 - SS_res / (SS_tot + epsilon)``; the backend epsilon keeps
    the division defined when the total sum of squares is zero.
    """
    residual_ss = K.sum(K.square(y_true - y_pred))
    deviation_from_mean = y_true - K.mean(y_true)
    total_ss = K.sum(K.square(deviation_from_mean))
    return 1 - residual_ss / (total_ss + K.epsilon())


# Input width (features per time step) and output width (number of targets)
# are read off the prepared arrays.
nb_features = seq_array.shape[2]
nb_out = label_array.shape[1]

# Two stacked LSTM layers, each followed by dropout, then a dense layer
# with a linear activation for the regression output.
regression_layers = [
    LSTM(units=100,
         return_sequences=True,
         input_shape=(sequence_length, nb_features)),
    Dropout(0.2),
    LSTM(units=50,
         return_sequences=False),
    Dropout(0.2),
    Dense(units=nb_out),
    Activation("linear"),
]

model = Sequential()
for layer in regression_layers:
    model.add(layer)

# MSE loss with RMSprop; MAE and R^2 are tracked as additional metrics.
model.compile(loss='mean_squared_error', optimizer='rmsprop',metrics=['mae', r2_keras])


print("Regression Model Summary:")
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would emit a stray "None" line).
model.summary()

# Hold out the last 5% of the samples for validation and stop as soon as the
# validation loss stops improving.
model.fit(seq_array, label_array, epochs=1, batch_size=2000, validation_split=0.05, verbose=1,
          callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=0,
                                                     verbose=0, mode='min')])

# evaluate() returns the loss followed by the compiled metrics, in order:
# [MSE loss, MAE, R^2]. This is a regression model, so there is no accuracy.
scores = model.evaluate(seq_array, label_array, verbose=1, batch_size=200)
print('MSE: {}'.format(scores[0]))
print('MAE: {}'.format(scores[1]))
print('R^2: {}'.format(scores[2]))

# Use predict(), not predict_classes(): the latter turns the output into
# class labels, which is meaningless for a remaining-cycles regression.
y_pred = model.predict(seq_array, verbose=1, batch_size=200)
y_true = label_array


# For every test machine that has at least `sequence_length` rows, keep only
# its final window of feature values; shorter machines are left out.
seq_array_test_last = []
for machine_id in concatTest['id'].unique():
    machine_rows = concatTest[concatTest['id'] == machine_id]
    if len(machine_rows) >= sequence_length:
        seq_array_test_last.append(machine_rows[sequence_cols].values[-sequence_length:])

seq_array_test_last = np.asarray(seq_array_test_last).astype(np.float32)

print(seq_array_test_last.shape)

# Predict the remaining cycles for each retained test machine.
# (A previously saved model could instead be restored with
#  load_model(model_path, custom_objects={'r2_keras': r2_keras}).)
y_pred_test = model.predict(seq_array_test_last)

print("PREDIECTED TEST remaining cycles:")
print(y_pred_test)


Regression Model Summary:
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_21 (LSTM)               (None, 150, 100)          42800     
_________________________________________________________________
dropout_21 (Dropout)         (None, 150, 100)          0         
_________________________________________________________________
lstm_22 (LSTM)               (None, 50)                30200     
_________________________________________________________________
dropout_22 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 51        
_________________________________________________________________
activation_6 (Activation)    (None, 1)                 0         
Total params: 73,051
Trainable params: 73,051
Non-trainable params: 0
______________________________________________