In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score, log_loss
import matplotlib.pyplot as plt
import os
import math
%matplotlib inline


import warnings
warnings.filterwarnings('ignore')


In [2]:
%load_ext autoreload

In [3]:
def plot_predictions(rul_filename, predictions):
    rul_true = pd.read_csv(rul_filename, sep=" ", header=None)
    rul_true.columns = ['True', '']
    rul_true['Predicted'] = predictions
    rul_true = rul_true[['True', 'Predicted']]

    plt.figure(figsize=(7,4))
    plt.scatter(rul_true['True'], rul_true['Predicted'])
    plt.ylabel('Predicted RUL')
    plt.xlabel('True RUL')
    plt.title('Comparing Model Projections to Baseline')

def run(file_number):
    test_df = pd.read_csv('Data/test_FD00{}.txt'.format(file_number), sep=" ", header=None)
    test_df = RUL_Model.preprocess_and_predict(test_df)
    predictions = RUL_Model.summarize_predictions_by_unit(test_df)
    print("Number of predictions:", len(predictions))
    plot_predictions('Data/RUL_FD00{}.txt'.format(file_number), predictions=predictions)

In [4]:
%autoreload 1

In [5]:
%aimport

Modules to reload:


Modules to skip:



In [6]:
%aimport RUL_Model

In [7]:
%aimport

Modules to reload:
RUL_Model

Modules to skip:



In [8]:
w = [5] #Bin definitions associated with current models

### Scores for Test File 1

In [None]:
file_number = 1
test_df = pd.read_csv('Data/test_FD00{}.txt'.format(file_number), sep=" ", header=None)
test_df = RUL_Model.preprocess_and_predict(test_df)
predictions = RUL_Model.summarize_predictions_by_unit(test_df)

rul_filename = 'Data/RUL_FD00{}.txt'.format(file_number)
rul_true = pd.read_csv(rul_filename, sep=" ", header=None)
rul_true.columns = ['True', '']
rul_true['Predicted'] = predictions
rul_true = rul_true[['True', 'Predicted']]

rul_true['True_label'] = np.where(rul_true['True'] <= w[0], 1, 0 )
for i in range(1,len(w)):
    rul_true.loc[rul_true['True'] <= w[i], 'True_label'] = i+1
    
rul_true['Predicted_label'] = np.where(rul_true['Predicted'] <= w[0], 1, 0 )
for i in range(1,len(w)):
    rul_true.loc[rul_true['Predicted'] <= w[i], 'Predicted_label'] = i+1
    
print("recall_score:", recall_score(rul_true['True_label'], rul_true['Predicted_label'], average="weighted"))
print("precision_score:", precision_score(rul_true['True_label'], rul_true['Predicted_label'], average="weighted"))
print("f1_score:", f1_score(rul_true['True_label'], rul_true['Predicted_label'], average="weighted"))

### Scores for Test File 2

In [None]:
file_number = 2
test_df = pd.read_csv('Data/test_FD00{}.txt'.format(file_number), sep=" ", header=None)
test_df = RUL_Model.preprocess_and_predict(test_df)
predictions = RUL_Model.summarize_predictions_by_unit(test_df)

rul_filename = 'Data/RUL_FD00{}.txt'.format(file_number)
rul_true = pd.read_csv(rul_filename, sep=" ", header=None)
rul_true.columns = ['True', '']
rul_true['Predicted'] = predictions
rul_true = rul_true[['True', 'Predicted']]

rul_true['True_label'] = np.where(rul_true['True'] <= w[0], 1, 0 )
for i in range(1,len(w)):
    rul_true.loc[rul_true['True'] <= w[i], 'True_label'] = i+1
    
rul_true['Predicted_label'] = np.where(rul_true['Predicted'] <= w[0], 1, 0 )
for i in range(1,len(w)):
    rul_true.loc[rul_true['Predicted'] <= w[i], 'Predicted_label'] = i+1
    
print("recall_score:", recall_score(rul_true['True_label'], rul_true['Predicted_label'], average="weighted"))
print("precision_score:", precision_score(rul_true['True_label'], rul_true['Predicted_label'], average="weighted"))
print("f1_score:", f1_score(rul_true['True_label'], rul_true['Predicted_label'], average="weighted"))

## Break Training Data into Batches to Simulate Streaming Data

In [36]:
# lookback period
history_size = 50

# how often will we train it
retrain_periodicity = 10

# cycle window for binary failure prediction
failure_prediction_size = w[0]

file_number = 1
train_df = pd.read_csv('Data/train_FD00{}.txt'.format(file_number), sep=" ", header=None)
train_df.drop(train_df.columns[[26, 27]], axis=1, inplace=True)
train_df.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3',
                     's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11', 's12', 's13', 's14',
                     's15', 's16', 's17', 's18', 's19', 's20', 's21']
max_cycle = train_df['cycle'].max()

num_retrain_periods = math.ceil(max_cycle/retrain_periodicity)
unit_end_cycle = train_df.groupby('id')['cycle'].max()

train_df.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [45]:
for i in range(0, num_retrain_periods):
    start = (i * retrain_periodicity) + 1
    end = (start+history_size) - 1 #(start+history_size-failure_prediction_size) - 1
    
    dir = 'Data/train_FD00{}'.format(file_number)
    if not os.path.exists(dir):
        os.mkdir(dir)
    batch_df = train_df[(train_df.cycle >= start) & (train_df.cycle <= end)] #Subset batch based on cycle number
    batch_df['MinRUL']=batch_df.groupby(['id'])['cycle'].transform(max)-batch_df['cycle']

    batch_df['label'] = np.where(batch_df['MinRUL'] <= failure_prediction_size, 1, 0 )
    end = end-failure_prediction_size
    batch_df = batch_df[(batch_df.cycle <= end)]
    batch_df = batch_df.drop(['MinRUL'],axis=1)
    
#Previous formulation
#     for i in range(1,len(w)):
#         train_df.loc[train_df['RUL'] <= w[i], 'label'] = i+1
    batch_df.to_csv('{}/cycles_{}_to_{}.txt'.format(dir, start,end), sep=" ", header=None, index=False)

In [46]:
ls -l Data/train_FD001/

total 13912
-rw-r--r-- 1 jovyan users 744876 Nov 20 15:40 cycles_101_to_145.txt
-rw-r--r-- 1 jovyan users 734031 Nov 20 15:40 cycles_111_to_155.txt
-rw-r--r-- 1 jovyan users 747553 Nov 20 15:40 cycles_11_to_55.txt
-rw-r--r-- 1 jovyan users 708521 Nov 20 15:40 cycles_121_to_165.txt
-rw-r--r-- 1 jovyan users 672196 Nov 20 15:40 cycles_131_to_175.txt
-rw-r--r-- 1 jovyan users 629396 Nov 20 15:40 cycles_141_to_185.txt
-rw-r--r-- 1 jovyan users 575042 Nov 20 15:40 cycles_151_to_195.txt
-rw-r--r-- 1 jovyan users 505082 Nov 20 15:40 cycles_161_to_205.txt
-rw-r--r-- 1 jovyan users 431281 Nov 20 15:40 cycles_171_to_215.txt
-rw-r--r-- 1 jovyan users 350514 Nov 20 15:40 cycles_181_to_225.txt
-rw-r--r-- 1 jovyan users 276284 Nov 20 15:40 cycles_191_to_235.txt
-rw-r--r-- 1 jovyan users 746521 Nov 20 15:40 cycles_1_to_45.txt
-rw-r--r-- 1 jovyan users 213266 Nov 20 15:40 cycles_201_to_245.txt
-rw-r--r-- 1 jovyan users 171817 Nov 20 15:40 cycles_211_to_255.txt
-rw-r--r-- 1 jovyan users 

In [52]:

sorted(list(set(train_df.columns) - set(['id','cycle','label'])))

['s1',
 's10',
 's11',
 's12',
 's13',
 's14',
 's15',
 's16',
 's17',
 's18',
 's19',
 's2',
 's20',
 's21',
 's3',
 's4',
 's5',
 's6',
 's7',
 's8',
 's9',
 'setting1',
 'setting2',
 'setting3']

## Iterate through Stream and generate Streaming Models

In [79]:
previous_data = []
for i in range(0, num_retrain_periods):
    #Calculate start and end cycles
    start = (i * retrain_periodicity) + 1
    end = (start+history_size-failure_prediction_size) - 1
    
    #Generate file path
    dir = 'Data/train_FD00{}'.format(file_number)
    print('{}/cycles_{}_to_{}.txt'.format(dir, start,end))
    
    #Read in batch data
    batch_df = pd.read_csv('{}/cycles_{}_to_{}.txt'.format(dir, start,end), sep=" ", header=None)
    
    if batch_df[0].nunique()<=5:
        print('Data is too limited for training')
    else:
        #Append new batch data to historical
    #     previous_data.append(batch_df)
    #     historical_df = pd.merge(previous_data)

        #Preprocess Input
        x_train_img, y_train = RUL_Model.preprocess_train(batch_df)
        #Train Model
        model = RUL_Model.train_model(x_train_img, y_train)
        #Save Model to File
        dir = 'Data/train_FD00{}_model'.format(file_number)
        if not os.path.exists(dir):
            os.mkdir(dir)
        model.save('{}/cycles_{}_to_{}.h5'.format(dir, start,end))

Data/train_FD001/cycles_1_to_45.txt
#id: 100
Epoch 1/25
2/2 - 1s - loss: 1.1080 - accuracy: 0.5750 - val_loss: 0.5672 - val_accuracy: 0.9000
Epoch 2/25
2/2 - 1s - loss: 0.7088 - accuracy: 0.9000 - val_loss: 0.3393 - val_accuracy: 0.9000
Epoch 3/25
2/2 - 1s - loss: 0.3627 - accuracy: 0.9000 - val_loss: 0.4728 - val_accuracy: 0.9000
Epoch 4/25
2/2 - 1s - loss: 0.4011 - accuracy: 0.9000 - val_loss: 0.4436 - val_accuracy: 0.9000
Epoch 5/25
2/2 - 1s - loss: 0.3739 - accuracy: 0.9000 - val_loss: 0.3505 - val_accuracy: 0.9000
Epoch 6/25
2/2 - 1s - loss: 0.3413 - accuracy: 0.9000 - val_loss: 0.3410 - val_accuracy: 0.9000
Epoch 7/25
Restoring model weights from the end of the best epoch.
2/2 - 1s - loss: 0.3713 - accuracy: 0.9000 - val_loss: 0.3362 - val_accuracy: 0.9000
Epoch 00007: early stopping
Data/train_FD001/cycles_11_to_55.txt
#id: 100
Epoch 1/25
2/2 - 1s - loss: 1.0373 - accuracy: 0.7475 - val_loss: 0.8227 - val_accuracy: 0.9000
Epoch 2/25
2/2 - 1s - loss: 0.9208 - accuracy: 0.9000 - v

ValueError: Training data contains 1 samples, which is not sufficient to split it into a validation and training set as specified by `validation_split=0.2`. Either provide more data, or a different value for the `validation_split` argument.

## Comparing Updated Models on Old Data

In [None]:
# Calculate various metrics for a model

In [None]:
print(x_train_img.shape)
print(y_train.shape)

### Outline:

1. Iterater Over Future Train Data for first n cycles:
    1. Through cycle 50, pause generate predictions
    2. Over the next 50 cycles, calculate model drift between predictions from above and what came to be.
        * Assumptions: mechanics can determine whether motor needed service during maintenance for motors flagged
        * This will be used to determine precision and model drift
        
        * !!! Warning:  Any motors not flagged that fail in the next n cycles will affect the recall score, and must trigger an immediate model retuning. This Drift is unacceptable!
        

If Recall score ever falls below 1: Immediate retraining and [Jira ticket] (Phase 1.5 tbd)
If precision falls below .8 or .9? then wasted resources, and also retraining needed
Alternatively, precision can be tracked as a cost score based on total maintenance cost