In [1]:
# !pip install syft==0.2.9

In [2]:
import pandas as pd
import torch as th
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
from sklearn.metrics import confusion_matrix, classification_report
import syft

In [3]:
BATCH_SIZE = 64

In [4]:
path = "/content/drive/MyDrive/Thesis/Datasets/Turbofan_Dataset/final_datasets_normalized/"

In [5]:
hook = syft.TorchHook(th)

alice = syft.VirtualWorker(hook, id="alice")
bob = syft.VirtualWorker(hook, id="bob")

In [6]:
# Load data and drop irrelevant columns

alice_set = pd.read_csv(path + "TRAINING_SET_1.csv")
bob_set = pd.read_csv(path + "TRAINING_SET_2.csv")

test_set = pd.read_csv(path + "TEST_SET_FULL.csv")

drop_cols = ["id","cycle","setting3","s1","s5","s10","s16","s18","s19","RUL"]

alice_set = alice_set.drop(drop_cols, axis=1)
bob_set = bob_set.drop(drop_cols, axis=1)

test_set = test_set.drop(drop_cols, axis=1)

In [7]:
# Move cycle_norm column first for convenience

column_to_move = alice_set.pop("cycle_norm")
alice_set.insert(0, "cycle_norm", column_to_move)
column_to_move = bob_set.pop("cycle_norm")
bob_set.insert(0, "cycle_norm", column_to_move)

column_to_move = test_set.pop("cycle_norm")
test_set.insert(0, "cycle_norm", column_to_move)

In [8]:
def tensor_convert(data):
  return th.FloatTensor(data.to_numpy()).requires_grad_()

In [9]:
# Convert pandas dataframes to numpy arrays

X_alice = tensor_convert(alice_set.iloc[:, 0:-1])
y_alice = tensor_convert(alice_set.iloc[:, -1])
X_bob = tensor_convert(bob_set.iloc[:, 0:-1])
y_bob = tensor_convert(bob_set.iloc[:, -1])

X_test = tensor_convert(test_set.iloc[:, 0:-1])
y_test = tensor_convert(test_set.iloc[:, -1])

In [10]:
# Defining custom dataset class

class CustomDataset(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [11]:
# use BaseDataset class which takes data, targets as args
# This class has .data attribute , .targets attribute to make it able to manipulate
# It has .send attibute to send to workers
alice_train_dataset = syft.BaseDataset(X_alice, y_alice).send(alice)
bob_train_dataset = syft.BaseDataset(X_bob, y_bob).send(bob)

In [12]:
test_dataset = CustomDataset(th.FloatTensor(X_test), th.FloatTensor(y_test))
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [13]:
print(alice_train_dataset.data)
print(alice_train_dataset.targets)

print(bob_train_dataset.data)
print(bob_train_dataset.targets)

(Wrapper)>[PointerTensor | me:7167473598 -> alice:40768333104]
(Wrapper)>[PointerTensor | me:70691717458 -> alice:10414746032]
(Wrapper)>[PointerTensor | me:37210034927 -> bob:49764933992]
(Wrapper)>[PointerTensor | me:26844288109 -> bob:8050794697]


In [14]:
# FederatedDataset class takes a list of remote datasets as args
# It takes the datasets already sent to remote workers
# This also acts as input to FederatedDataLoader
federated_train_dataset = syft.FederatedDataset([alice_train_dataset,bob_train_dataset])

In [15]:
print(federated_train_dataset)

FederatedDataset
    Distributed accross: alice, bob
    Number of datapoints: 20631



In [16]:
# This FederatedDataLoader class iterates over the objects created by FederatedDataset class
federated_train_loader = syft.FederatedDataLoader(federated_train_dataset, shuffle=False, batch_size=128)

In [17]:
# for inputs, labels in federated_train_loader:
#   print(inputs,labels)

In [18]:
# inputs.get()

In [19]:
class BinaryClassification(nn.Module):
    def __init__(self):
        super(BinaryClassification, self).__init__()
        # Number of input features is 18.
        self.layer_1 = nn.Linear(18, 128) 
        self.layer_2 = nn.Linear(128, 128)
        self.layer_out = nn.Linear(128, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(128)
        self.batchnorm2 = nn.BatchNorm1d(128)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [20]:
device = th.device("cuda:0" if th.cuda.is_available() else "cpu")
print(device)

cpu


In [21]:
model = BinaryClassification()
# model.to(device)
print(model)
criterion = nn.BCEWithLogitsLoss()
# optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

BinaryClassification(
  (layer_1): Linear(in_features=18, out_features=128, bias=True)
  (layer_2): Linear(in_features=128, out_features=128, bias=True)
  (layer_out): Linear(in_features=128, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [22]:
def binary_acc(y_pred, y_test):
    y_pred_tag = th.round(th.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = th.round(acc * 100)
    
    return acc

In [23]:
# model.to(device)

In [24]:
EPOCHS = 50
LEARNING_RATE = 0.0005

In [25]:
def train():
    # Training Logic
    opt = optim.SGD(params=model.parameters(),lr=LEARNING_RATE)
    for e in range(1, EPOCHS+1):
        epoch_loss = 0
        epoch_acc = 0
        # NEW) iterate through each worker's dataset separately
        for data,target in federated_train_loader:

            # data, target = data.to(device), target.to(device)
            # print(data.location , target.location)
            # NEW) send model to correct worker - either Alice or Bob
            model.send(data.location)
            # 1) Reset the optimizer so that we can develop a new model
            opt.zero_grad()

            # 2) Predict on new (unseen) data using the model from the cloud
            pred = model(data)

            # 3) See how well (or not) we did on that prediction
            loss = criterion(pred, target.unsqueeze(1))
            acc = binary_acc(pred, target.unsqueeze(1))

            # 4) Figure out why we performed poorly
            loss.backward()

            # 5) Update the model's weights 
            opt.step()
            
            # NEW) Get the new model, to be tested and improved on a new, separate dataset 
            model.get()

            epoch_loss += loss.get()
            epoch_acc += acc.get()

            # 6) print our progress
            # print(loss.get()) # NEW) slight edit... need to call .get() on loss\
        print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(federated_train_loader):.5f} | Acc: {epoch_acc/len(federated_train_loader):.3f}')


train()

Epoch 001: | Loss: 0.58772 | Acc: 69.784
Epoch 002: | Loss: 0.52026 | Acc: 78.611
Epoch 003: | Loss: 0.48523 | Acc: 81.716
Epoch 004: | Loss: 0.46210 | Acc: 83.488
Epoch 005: | Loss: 0.44417 | Acc: 84.741
Epoch 006: | Loss: 0.42672 | Acc: 86.401
Epoch 007: | Loss: 0.41335 | Acc: 87.025
Epoch 008: | Loss: 0.40083 | Acc: 87.704
Epoch 009: | Loss: 0.38888 | Acc: 88.438
Epoch 010: | Loss: 0.37891 | Acc: 88.864
Epoch 011: | Loss: 0.36924 | Acc: 89.346
Epoch 012: | Loss: 0.36044 | Acc: 89.679
Epoch 013: | Loss: 0.35227 | Acc: 90.130
Epoch 014: | Loss: 0.34507 | Acc: 90.500
Epoch 015: | Loss: 0.33778 | Acc: 90.642
Epoch 016: | Loss: 0.33048 | Acc: 91.185
Epoch 017: | Loss: 0.32324 | Acc: 91.364
Epoch 018: | Loss: 0.31800 | Acc: 91.586
Epoch 019: | Loss: 0.31213 | Acc: 91.648
Epoch 020: | Loss: 0.30641 | Acc: 91.864
Epoch 021: | Loss: 0.30139 | Acc: 91.938
Epoch 022: | Loss: 0.29621 | Acc: 92.309
Epoch 023: | Loss: 0.29111 | Acc: 92.395
Epoch 024: | Loss: 0.28657 | Acc: 92.599
Epoch 025: | Los

In [26]:
# Model testing

model.eval()

y_pred_test_list = []

# model.to(device)

with th.no_grad():

  test_loss = 0
  test_accuracy = 0

  for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        
        y_test_pred = model(data)
        
        y_test_pred = th.sigmoid(y_test_pred)
        y_pred_tag = th.round(y_test_pred)
        y_pred_test_list.append(y_pred_tag.cpu().numpy())
        
        loss = criterion(y_pred_tag, target.unsqueeze(1))
        acc = binary_acc(y_pred_tag, target.unsqueeze(1))
        
        test_loss += loss.item()
        test_accuracy += acc.item()

y_pred_test_list = [a.squeeze().tolist() for a in y_pred_test_list]
y_pred_test_list = [item for sublist in y_pred_test_list for item in sublist]


print(f'Test set evaluation : | Loss: {test_loss/len(test_loader):.5f} | Acc: {test_accuracy/len(test_loader):.3f}')

Test set evaluation : | Loss: 0.69219 | Acc: 98.346


In [27]:
# 64 64
# print(classification_report(y_test.detach().numpy(), y_pred_test_list))

In [28]:
# 128 64
# print(classification_report(y_test.detach().numpy(), y_pred_test_list))

In [29]:
# 128 64
print(classification_report(y_test.detach().numpy(), y_pred_test_list))

              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99     12764
         1.0       0.65      0.78      0.71       332

    accuracy                           0.98     13096
   macro avg       0.82      0.88      0.85     13096
weighted avg       0.99      0.98      0.98     13096

