In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report


import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [3]:
BATCH_SIZE = 64

In [4]:
path = "/content/drive/MyDrive/Thesis/Datasets/Turbofan_Dataset/final_datasets_normalized/"

In [5]:
# Load data and drop irrelevant columns

df_train = pd.read_csv(path + "TRAINING_SET_FULL.csv")
df_test = pd.read_csv(path + "TEST_SET_FULL.csv")

drop_cols = ["cycle","setting3","s1","s5","s10","s16","s18","s19","RUL"]
corr_cols = ["s11","s4","s15","s17","s2","s3","s8","s13","s9","s14","s12","s7","s20"]
feature_cols = ['cycle_norm', 'setting1', 'setting2', 's2', 's3', 's4', 's6', 's7',
       's8', 's9', 's11', 's12', 's13', 's14', 's15', 's17', 's20', 's21']
prediction_col = 'fail_30'

train_set = df_train.drop(drop_cols, axis=1)
test_set = df_test.drop(drop_cols, axis=1)

In [6]:
# Move cycle_norm column first for convenience

# column_to_move = train_set.pop("cycle_norm")
# train_set.insert(0, "cycle_norm", column_to_move)

# column_to_move = test_set.pop("cycle_norm")
# test_set.insert(0, "cycle_norm", column_to_move)

In [7]:
# Receives single engine dataframe, window size and features -> sequences of length==window_size
def gen_train_data(df, sequence_length, columns):
    data = df[columns].values
    num_elements = data.shape[0]

    # -1 and +1 because of Python indexing
    for start, stop in zip(range(0, num_elements-(sequence_length-1)), range(sequence_length, num_elements+1)):
        yield data[start:stop, :]

In [8]:
# gen = gen_train_data(train_set[train_set['id']==1], sequence_length=4, columns=feature_cols)
# engines = list(gen)

In [9]:
# Generates sequences for multiple engines
def gen_data_wrapper(df, sequence_length, columns, ids=np.array([])):
    if ids.size <= 0:
        ids = df['id'].unique()
        
    data_gen = (list(gen_train_data(df[df['id']==id], sequence_length, columns))
               for id in ids)
    data_array = np.concatenate(list(data_gen)).astype(np.float32)
    return data_array

In [10]:
data_array = gen_data_wrapper(train_set, sequence_length=4, columns=feature_cols)
data_array.shape

(20331, 4, 18)

In [11]:
# Functions to generate sequences for the labals
def gen_labels(df, sequence_length, label):
    data_matrix = df[label].values
    num_elements = data_matrix.shape[0]

    # -1 because I want to predict the rul of that last row in the sequence, not the next row
    return data_matrix[sequence_length-1:num_elements, :]  

def gen_label_wrapper(df, sequence_length, label, ids=np.array([])):
    if ids.size <= 0:
        ids = df['id'].unique()
        
    label_gen = [gen_labels(df[df['id']==id], sequence_length, label) 
                for id in ids]
    label_array = np.concatenate(label_gen).astype(np.float32)
    return label_array

In [12]:
def gen_test_data(df, sequence_length, columns, mask_value):
    if df.shape[0] < sequence_length:
        data_matrix = np.full(shape=(sequence_length, len(columns)), fill_value=mask_value) # pad
        idx = data_matrix.shape[0] - df.shape[0]
        data_matrix[idx:,:] = df[columns].values  # fill with available data
    else:
        data_matrix = df[columns].values
        
    # specifically yield the last possible sequence
    stop = num_elements = data_matrix.shape[0]
    start = stop - sequence_length
    for i in list(range(1)):
        yield data_matrix[start:stop, :]
def gen_test_label_wrapper(df, sequence_length, label, ids=np.array([])):
    if ids.size <= 0:
        ids = df['id'].unique()
    
    label_gen = [gen_labels(df[df['id']==id], sequence_length, label) 
                for id in ids]
    # keep only last window
    if sequence_length > 31:
      print("Too big window")
    else:
      last_labels = [label[-1] for label in label_gen] 
      



    last_labels = np.concatenate(last_labels).astype(np.float32)
    # return label_array
    return last_labels


In [13]:
sequence_length = 20

gss = GroupShuffleSplit(n_splits=1, train_size=0.80, random_state=42)


for train_unit, val_unit in gss.split(train_set['id'].unique(), groups=train_set['id'].unique()):
    train_unit = train_set['id'].unique()[train_unit]  # gss returns indexes and index starts at 1
    val_unit = train_set['id'].unique()[val_unit]

    train_split_array = gen_data_wrapper(train_set, sequence_length, feature_cols, train_unit)
    train_split_label = gen_label_wrapper(train_set, sequence_length, ['fail_30'], train_unit)
    
    val_split_array = gen_data_wrapper(train_set, sequence_length, feature_cols, val_unit)
    val_split_label = gen_label_wrapper(train_set, sequence_length, ['fail_30'], val_unit)

# create sequences train, test 
X_train = gen_data_wrapper(train_set, sequence_length, feature_cols)
y_train = gen_label_wrapper(train_set, sequence_length, ['fail_30'])

test_gen = (list(gen_test_data(test_set[test_set['id']==id], sequence_length, feature_cols, -99.))
           for id in test_set['id'].unique())
X_test = np.concatenate(list(test_gen)).astype(np.float32)

y_test = gen_test_label_wrapper(test_set, sequence_length, ['fail_30'])


In [14]:
y_test

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
       1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.],
      dtype=float32)

In [15]:
# Defining custom dataset class for convenience

class CustomDataset(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [16]:
# Initialize custom datasets

train_data = CustomDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
test_data = CustomDataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test))

In [17]:
# Initialize dataloaders

train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=False)

In [27]:
for i,j in train_loader:
  print(i.shape,j.shape)
  break

torch.Size([64, 20, 18]) torch.Size([64, 1])


In [19]:
class BinaryClassification(nn.Module):
    def __init__(self):
        super(BinaryClassification, self).__init__()
        self.num_features = 18
        self.hidden_units = 32
        self.num_layers = 1

        self.lstm = nn.LSTM(
            input_size=self.num_features,
            hidden_size=self.hidden_units,
            batch_first=True,
            num_layers=self.num_layers) 
        
        self.linear = nn.Linear(in_features=self.hidden_units, out_features=1)

        self.tanh = nn.Tanh()
        # self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
      batch_size = x.shape[0]
      h0 = torch.zeros(self.num_layers, batch_size, self.hidden_units).requires_grad_()
      c0 = torch.zeros(self.num_layers, batch_size, self.hidden_units).requires_grad_()
      _, (hn, _) = self.lstm(x, (h0, c0))
      out = self.linear(hn[0]).flatten()  # First dim of Hn is num_layers, which is set to 1 above.

      return out
        # x, (hn, cn) = self.lstm1(inputs)
        # x = self.linear(x)
        # x = self.tanh(x) 
        # return x

In [20]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [21]:
EPOCHS = 20
LEARNING_RATE = 0.001

In [22]:
model = BinaryClassification()
model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

BinaryClassification(
  (lstm): LSTM(18, 32, batch_first=True)
  (linear): Linear(in_features=32, out_features=1, bias=True)
  (tanh): Tanh()
)


In [23]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [24]:
# Model training

model.train() #tells pytorch that we are in training mode

y_pred_train_list = []

for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        
        
        y_pred = model(X_batch)


        y_train_pred = torch.sigmoid(y_pred)
        y_pred_tag = torch.round(y_train_pred)
        y_pred_train_list.append(y_pred_tag.cpu().detach().numpy())
        loss = criterion(y_pred, y_batch.view(-1))
        acc = binary_acc(y_pred, y_batch.view(-1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

y_pred_train_list = [a.squeeze().tolist() for a in y_pred_train_list]
y_pred_train_list = [item for sublist in y_pred_train_list for item in sublist]

Epoch 001: | Loss: 0.32786 | Acc: 87.857
Epoch 002: | Loss: 0.17089 | Acc: 93.464
Epoch 003: | Loss: 0.15031 | Acc: 94.089
Epoch 004: | Loss: 0.13785 | Acc: 94.638
Epoch 005: | Loss: 0.12989 | Acc: 94.857
Epoch 006: | Loss: 0.12334 | Acc: 95.024
Epoch 007: | Loss: 0.12010 | Acc: 95.092
Epoch 008: | Loss: 0.11734 | Acc: 95.266
Epoch 009: | Loss: 0.11158 | Acc: 95.601
Epoch 010: | Loss: 0.10863 | Acc: 95.703
Epoch 011: | Loss: 0.10612 | Acc: 95.823
Epoch 012: | Loss: 0.10368 | Acc: 95.932
Epoch 013: | Loss: 0.10144 | Acc: 96.020
Epoch 014: | Loss: 0.09932 | Acc: 96.119
Epoch 015: | Loss: 0.09733 | Acc: 96.212
Epoch 016: | Loss: 0.09539 | Acc: 96.307
Epoch 017: | Loss: 0.09343 | Acc: 96.362
Epoch 018: | Loss: 0.09143 | Acc: 96.358
Epoch 019: | Loss: 0.08944 | Acc: 96.471
Epoch 020: | Loss: 0.08763 | Acc: 96.454


In [25]:
# Model testing

model.eval()

y_pred_test_list = []


with torch.no_grad():

  test_loss = 0
  test_accuracy = 0

  for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        
        y_test_pred = model(X_batch)
        
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_test_list.append(y_pred_tag.cpu().numpy())
        
        loss = criterion(y_pred_tag, y_batch.view(-1))
        acc = binary_acc(y_pred_tag, y_batch.view(-1))
        
        test_loss += loss.item()
        test_accuracy += acc.item()

y_pred_test_list = [a.squeeze().tolist() for a in y_pred_test_list]
y_pred_test_list = [item for sublist in y_pred_test_list for item in sublist]


print(f'Test set evaluation : | Loss: {test_loss/len(test_loader):.5f} | Acc: {test_accuracy/len(test_loader):.3f}')

Test set evaluation : | Loss: 0.61988 | Acc: 96.000


In [26]:
print(classification_report(y_test, y_pred_test_list))

              precision    recall  f1-score   support

         0.0       0.97      0.97      0.97        75
         1.0       0.92      0.92      0.92        25

    accuracy                           0.96       100
   macro avg       0.95      0.95      0.95       100
weighted avg       0.96      0.96      0.96       100

