In [None]:
!pip install syft==0.2.9

In [None]:
import pandas as pd
import torch as th
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
from sklearn.metrics import confusion_matrix, classification_report
import syft

In [None]:
# Mini-batch size intended for the DataLoader definitions further down.
BATCH_SIZE = 64

In [None]:
# Directory prefix that is prepended to every CSV file name read below.
# NOTE(review): looks like a Google Drive mount inside Colab — confirm Drive is mounted first.
path = "/content/drive/MyDrive/Thesis/Datasets/Turbofan_Dataset/"

In [None]:
# Hook PySyft into the torch module imported as `th`.
hook = syft.TorchHook(th)

# One virtual worker per data owner, created in the same order as before (bob, then alice).
bob, alice = (syft.VirtualWorker(hook, id=owner) for owner in ("bob", "alice"))

In [None]:
# Load data and drop irrelevant columns

drop_cols = ["id","cycle","setting3","s1","s5","s10","s16","s18","s19","RUL"]


def _read_without_dropped(csv_name):
    """Read `csv_name` from the `path` directory and return it without the `drop_cols` columns."""
    return pd.read_csv(path + csv_name).drop(drop_cols, axis=1)


alice_set = _read_without_dropped("TRAINING_SET_1.csv")
bob_set = _read_without_dropped("TRAINING_SET_2.csv")

test_set = _read_without_dropped("TEST_SET_FULL.csv")

In [None]:
# Move cycle_norm column first for convenience

for frame in (alice_set, bob_set, test_set):
    # pop() takes the column out of the frame in place; insert() puts it back at position 0.
    frame.insert(0, "cycle_norm", frame.pop("cycle_norm"))

In [None]:
# Convert pandas dataframes to numpy arrays

def _features_and_label(frame):
    """Return (every column but the last, the last column) of `frame` as numpy arrays."""
    return frame.iloc[:, :-1].to_numpy(), frame.iloc[:, -1].to_numpy()


X_alice, y_alice = _features_and_label(alice_set)
X_bob, y_bob = _features_and_label(bob_set)

X_test, y_test = _features_and_label(test_set)

In [None]:
# Defining custom dataset class for convenience

class CustomDataset(Dataset):
    """Pair a container of features with a container of labels, row by row.

    Args:
        X_data: indexable container of samples (anything supporting `len()` and `[]`).
        y_data: indexable container of labels with the same length as `X_data`.

    Raises:
        ValueError: if `X_data` and `y_data` do not have the same length.
    """

    def __init__(self, X_data, y_data):
        # Fail at construction time: a mismatch would otherwise only show up as an
        # IndexError (or as silently ignored labels) somewhere in the middle of an epoch.
        if len(X_data) != len(y_data):
            raise ValueError(
                f"X_data and y_data must have the same length, "
                f"got {len(X_data)} and {len(y_data)}"
            )
        self.X_data = X_data
        self.y_data = y_data

    def __getitem__(self, index):
        """Return the `(sample, label)` pair stored at `index`."""
        return self.X_data[index], self.y_data[index]

    def __len__(self):
        """Number of samples, taken from the feature container."""
        return len(self.X_data)

In [None]:
# Initialize custom datasets

# train_alice = CustomDataset(torch.FloatTensor(X_alice), torch.FloatTensor(y_alice))
# train_bob = CustomDataset(torch.FloatTensor(X_bob), torch.FloatTensor(y_bob))

# test_data = CustomDataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test))

In [None]:
# Initialize dataloaders
#
# Later cells read `train_loader_alice` and `test_loader`, so these definitions have to be
# live code. The cell is self-contained: it wraps the numpy arrays directly and uses the
# `th` alias under which torch is imported in this notebook.

def _make_loader(features, labels):
    """Wrap numpy features/labels as float32 tensors in a CustomDataset and batch them in order."""
    tensor_dataset = CustomDataset(
        th.tensor(features, dtype=th.float32),
        th.tensor(labels, dtype=th.float32),
    )
    return DataLoader(dataset=tensor_dataset, batch_size=BATCH_SIZE, shuffle=False)


train_loader_alice = _make_loader(X_alice, y_alice)
train_loader_bob = _make_loader(X_bob, y_bob)

test_loader = _make_loader(X_test, y_test)

In [None]:
# Build the federated training data: each partition becomes float32 tensors that are sent to
# a virtual worker, so every entry of `datasets` is a (data, target) pair of remote pointers.
# Targets get a trailing dimension so that they line up with the model's (N, 1) output.
# NOTE(review): X_alice/y_alice are assumed to belong on worker `alice` and X_bob/y_bob on
# worker `bob`, going by the variable names — confirm.
data_alice = th.tensor(X_alice, dtype=th.float32).send(alice)
target_alice = th.tensor(y_alice, dtype=th.float32).unsqueeze(1).send(alice)
data_bob = th.tensor(X_bob, dtype=th.float32).send(bob)
target_bob = th.tensor(y_bob, dtype=th.float32).unsqueeze(1).send(bob)

datasets = [(data_alice,target_alice),(data_bob,target_bob)]

In [None]:
# Check how many inputs and outputs the model should have
# (read straight from the feature matrix, so no DataLoader has to exist for this check)

print(f"Our model should have {X_alice.shape[-1]} inputs and 1 output")

Our model should have 18 inputs and 1 output


In [None]:
class BinaryClassification(nn.Module):
    """Fully connected network with two hidden layers that emits one raw logit per row.

    Args:
        n_inputs: Number of input features (18 for the feature matrix used in this notebook).
        n_hidden: Width of both hidden layers.
        dropout_p: Drop probability of the dropout layer.
        use_batchnorm: If True, apply batch normalisation after each hidden activation.
        use_dropout: If True, apply dropout right before the output layer.

    With the default arguments the network is exactly the original fixed 18-64-64-1
    architecture with batch norm and dropout disabled. The batch-norm and dropout modules are
    always created, so the set of registered sub-modules does not depend on the flags.
    """

    def __init__(self, n_inputs=18, n_hidden=64, dropout_p=0.1,
                 use_batchnorm=False, use_dropout=False):
        super(BinaryClassification, self).__init__()
        self.use_batchnorm = use_batchnorm
        self.use_dropout = use_dropout

        self.layer_1 = nn.Linear(n_inputs, n_hidden)
        self.layer_2 = nn.Linear(n_hidden, n_hidden)
        self.layer_out = nn.Linear(n_hidden, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_p)
        self.batchnorm1 = nn.BatchNorm1d(n_hidden)
        self.batchnorm2 = nn.BatchNorm1d(n_hidden)

    def forward(self, inputs):
        """Map a `(batch, n_inputs)` tensor to `(batch, 1)` logits (no sigmoid is applied)."""
        x = self.relu(self.layer_1(inputs))
        if self.use_batchnorm:
            x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        if self.use_batchnorm:
            x = self.batchnorm2(x)
        if self.use_dropout:
            x = self.dropout(x)
        x = self.layer_out(x)

        return x

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
# torch is imported as `th` in this notebook, so the alias is used here.
device = th.device("cuda:0" if th.cuda.is_available() else "cpu")
print(device)

cuda:0


In [None]:
# Number of passes over the training data made by the training loops below.
EPOCHS = 20
# Step size handed to the optimizers (Adam below, SGD inside train()).
LEARNING_RATE = 0.001

In [None]:
# Create the classifier together with the loss and the optimizer used by the batch training loop.
model = BinaryClassification()
# model.to(device)
print(model)
# BCEWithLogitsLoss applies the sigmoid internally, so it is fed the raw `layer_out` values.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

BinaryClassification(
  (layer_1): Linear(in_features=18, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [None]:
def binary_acc(y_pred, y_test):
    """Return the accuracy of one batch as a percentage rounded to a whole number.

    Args:
        y_pred: raw model outputs (logits); a sample is predicted positive when the rounded
            sigmoid of its logit is 1.
        y_test: ground-truth labels with as many elements as `y_pred`.

    Returns:
        0-dim float tensor holding the rounded percentage of correct predictions.
    """
    y_pred_tag = th.round(th.sigmoid(y_pred))

    # Align the label shape with the predictions; comparing (N, 1) against (N,) would
    # otherwise broadcast to an (N, N) matrix and inflate the count.
    y_test = y_test.reshape(y_pred_tag.shape)

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = th.round(acc * 100)

    return acc

In [None]:
def train():
    """Train the global `model` on every worker's data in turn for `EPOCHS` epochs.

    For each (data, target) pair in `datasets` the model is sent to the worker holding the
    data, takes one SGD step on that worker's full tensor and is fetched back. The loss is
    the notebook's `criterion` (BCEWithLogitsLoss): the model emits raw logits and is
    evaluated with sigmoid/BCE everywhere else, so a squared error on the logits would train
    towards the wrong objective. The loss of every step is printed.
    """
    # Training Logic
    opt = optim.SGD(params=model.parameters(),lr=LEARNING_RATE)
    for e in range(1, EPOCHS+1):

        # NEW) iterate through each worker's dataset separately
        for data,target in datasets:

            # NEW) send model to correct worker - either Alice or Bob
            model.send(data.location)

            # 1) Reset the optimizer so that we can develop a new model
            opt.zero_grad()

            # 2) Predict on new (unseen) data using the model from the cloud
            pred = model(data)

            # 3) See how well (or not) we did on that prediction
            #    (target is reshaped to (N, 1) so it matches the model output exactly
            #    instead of broadcasting)
            loss = criterion(pred, target.view(-1, 1))

            # 4) Figure out why we performed poorly
            loss.backward()

            # 5) Update the model's weights
            opt.step()

            # NEW) Get the new model, to be tested and improved on a new, separate dataset
            model.get()

            # 6) print our progress
            print(loss.get()) # NEW) slight edit... need to call .get() on loss

In [None]:
train()

In [None]:
# Model training
# NOTE(review): `train_loader` is not assigned in any visible cell — it must exist before this
# cell runs; confirm where it is supposed to come from.

model.train() #tells pytorch that we are in training mode

# Batches follow the model: moving them to whatever device the weights live on avoids a
# CPU/GPU mismatch when the model has not been moved to `device`.
model_device = next(model.parameters()).device

y_pred_train_list = []

for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    # Predictions of the current epoch only, so the final list has one entry per sample
    # instead of EPOCHS entries per sample.
    epoch_preds = []
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(model_device), y_batch.to(model_device)
        y_true = y_batch.unsqueeze(1)
        optimizer.zero_grad()

        y_pred = model(X_batch)

        y_pred_tag = th.round(th.sigmoid(y_pred))
        # reshape(-1) keeps a 1-D array even for a batch of a single sample.
        epoch_preds.append(y_pred_tag.detach().cpu().numpy().reshape(-1))

        loss = criterion(y_pred, y_true)
        acc = binary_acc(y_pred, y_true)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    y_pred_train_list = epoch_preds

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

# Flatten the per-batch arrays of the last epoch into one flat list of labels.
y_pred_train_list = [label for batch in y_pred_train_list for label in batch.tolist()]

Epoch 001: | Loss: 0.27169 | Acc: 89.158
Epoch 002: | Loss: 0.12309 | Acc: 94.876
Epoch 003: | Loss: 0.11607 | Acc: 95.149
Epoch 004: | Loss: 0.11303 | Acc: 95.232
Epoch 005: | Loss: 0.11106 | Acc: 95.316
Epoch 006: | Loss: 0.10954 | Acc: 95.415
Epoch 007: | Loss: 0.10837 | Acc: 95.514
Epoch 008: | Loss: 0.10738 | Acc: 95.563
Epoch 009: | Loss: 0.10657 | Acc: 95.598
Epoch 010: | Loss: 0.10585 | Acc: 95.632
Epoch 011: | Loss: 0.10524 | Acc: 95.656
Epoch 012: | Loss: 0.10468 | Acc: 95.690
Epoch 013: | Loss: 0.10417 | Acc: 95.684
Epoch 014: | Loss: 0.10371 | Acc: 95.706
Epoch 015: | Loss: 0.10333 | Acc: 95.728
Epoch 016: | Loss: 0.10287 | Acc: 95.740
Epoch 017: | Loss: 0.10257 | Acc: 95.718
Epoch 018: | Loss: 0.10213 | Acc: 95.740
Epoch 019: | Loss: 0.10176 | Acc: 95.746
Epoch 020: | Loss: 0.10136 | Acc: 95.783


In [None]:
# print(classification_report(y_train, y_pred_train_list))

In [None]:
# Number of training predictions collected by the training cell above.
len(y_pred_train_list)

412620

In [None]:
# Number of training labels, for comparison with the prediction count above.
# NOTE(review): `y_train` is not assigned in any visible cell — confirm where it comes from.
len(y_train)

20631

In [None]:
# Model testing
# Requires `test_loader` from the "Initialize dataloaders" cell.

model.eval()

y_pred_test_list = []

# Batches follow the model: moving them to whatever device the weights live on avoids a
# CPU/GPU mismatch when the model has not been moved to `device`.
model_device = next(model.parameters()).device

with th.no_grad():

    test_loss = 0
    test_accuracy = 0

    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(model_device), y_batch.to(model_device)
        y_true = y_batch.unsqueeze(1)

        y_test_logits = model(X_batch)

        y_pred_tag = th.round(th.sigmoid(y_test_logits))
        # reshape(-1) keeps a 1-D array even for a batch of a single sample.
        y_pred_test_list.append(y_pred_tag.cpu().numpy().reshape(-1))

        # The loss and the accuracy helper both expect raw logits. Passing the rounded 0/1
        # predictions makes BCEWithLogitsLoss treat them as logits and report a
        # meaningless loss.
        loss = criterion(y_test_logits, y_true)
        acc = binary_acc(y_test_logits, y_true)

        test_loss += loss.item()
        test_accuracy += acc.item()

# Flatten the per-batch arrays into one flat list of predicted labels.
y_pred_test_list = [label for batch in y_pred_test_list for label in batch.tolist()]


print(f'Test set evaluation : | Loss: {test_loss/len(test_loader):.5f} | Acc: {test_accuracy/len(test_loader):.3f}')

Test set evaluation : | Loss: 0.69030 | Acc: 98.541


In [None]:
# no batchnorm and dropout
# Per-class report comparing the test labels with the flattened test predictions.
print(classification_report(y_test, y_pred_test_list))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     12764
           1       0.74      0.67      0.70       332

    accuracy                           0.99     13096
   macro avg       0.87      0.83      0.85     13096
weighted avg       0.98      0.99      0.99     13096



In [None]:
# batchnorm and dropout
# print(classification_report(y_test, y_pred_test_list))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99     12764
           1       0.46      0.75      0.57       332

    accuracy                           0.97     13096
   macro avg       0.73      0.86      0.78     13096
weighted avg       0.98      0.97      0.97     13096

