In [None]:
# !pip install syft==0.5.0
# !pip install torch==1.8.1

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report

import syft as sy
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [None]:
# Two in-process PySyft virtual machines stand in for the federated participants.
alice = sy.VirtualMachine(name="alice")
bob = sy.VirtualMachine(name="bob")

# Root clients are the handles used to talk to each VM.
alice_client = alice.get_root_client()
alice_torch = alice_client.torch

bob_client = bob.get_root_client()
# Fix: Bob's torch reference must come from Bob's own client
# (it was previously taken from alice_client, so both entries pointed at Alice).
bob_torch = bob_client.torch

# Per-client registry; later cells add more keys ('trainset', 'model', 'optim').
clients = [
    {'hook': alice_client, 'torch': alice_torch},
    {'hook': bob_client, 'torch': bob_torch},
]

In [None]:
# Directory prefix that the pd.read_csv calls in the data-loading cell prepend to each file name.
# NOTE(review): hard-coded absolute path (looks like a Colab Google Drive mount) — the notebook
# only runs where this folder exists; consider making it configurable.
path = "/content/drive/MyDrive/Thesis/Datasets/Turbofan_Dataset/final_datasets_normalized/"

In [None]:
class Arguments():
    """Plain container for the experiment's configuration values."""

    def __init__(self):
        # Data volume and number of participating clients
        self.images = 60000
        self.clients = 2

        # Training schedule
        self.rounds = 5
        self.epochs = 5
        self.local_batches = 64
        self.lr = 0.01

        # Client sampling / dropout settings
        self.C = 0.9
        self.drop_rate = 0.1

        # Reproducibility and logging
        self.torch_seed = 0
        self.log_interval = 10

        # Data distribution across clients
        self.iid = 'iid'
        self.split_size = self.images // self.clients  # samples per client
        self.samples = self.split_size / self.images   # fraction of the data per client

        # Runtime switches
        self.use_cuda = False
        self.save_model = False

args = Arguments()

# CUDA is used only when it is both requested in the config and available on this machine.
use_cuda = args.use_cuda and th.cuda.is_available()
if use_cuda:
    device = th.device("cuda")
    kwargs = {'num_workers': 1, 'pin_memory': True}
else:
    device = th.device("cpu")
    kwargs = {}

In [None]:
def binary_acc(y_pred, y_test):
    """Return the rounded percentage (0-100) of correct binary predictions.

    Args:
        y_pred: tensor of raw logits; a sigmoid and rounding turn them into 0/1 tags.
        y_test: tensor of 0/1 targets with the same number of elements as ``y_pred``.

    Returns:
        A scalar tensor holding the accuracy in percent, rounded to the nearest integer.
    """
    # Flatten both sides before comparing: a (N,) prediction vector against (N, 1) labels
    # would otherwise broadcast to an (N, N) matrix and inflate the number of "matches".
    y_pred_tag = th.round(th.sigmoid(y_pred)).reshape(-1)
    y_true = y_test.reshape(-1)

    correct_results_sum = (y_pred_tag == y_true).sum().float()
    acc = correct_results_sum / y_true.shape[0]
    acc = th.round(acc * 100)

    return acc

In [None]:
# Column bookkeeping first, then load each CSV and drop the unused columns in one step

drop_cols = ["cycle","setting3","s1","s5","s10","s16","s18","s19","RUL"]
corr_cols = ["s11","s4","s15","s17","s2","s3","s8","s13","s9","s14","s12","s7","s20"]
feature_cols = ['cycle_norm', 'setting1', 'setting2', 's2', 's3', 's4', 's6', 's7',
       's8', 's9', 's11', 's12', 's13', 's14', 's15', 's17', 's20', 's21']
prediction_col = 'fail_30'

# One training file per federated client
alice_set = pd.read_csv(path + "TRAINING_SET_1.csv").drop(columns=drop_cols)
bob_set = pd.read_csv(path + "TRAINING_SET_2.csv").drop(columns=drop_cols)

# Shared evaluation data
test_set = pd.read_csv(path + "TEST_SET_FULL.csv").drop(columns=drop_cols)

In [None]:
def gen_train_data(df, sequence_length, columns):
    """Yield every sliding window of ``sequence_length`` consecutive rows of one engine.

    Args:
        df: data frame holding the rows of a single engine.
        sequence_length: number of rows per window.
        columns: list of feature column names to extract.

    Yields:
        2-D arrays of shape (sequence_length, len(columns)), one per window position.
        Nothing is yielded when the frame has fewer rows than ``sequence_length``.
    """
    values = df[columns].values
    # Number of start positions at which a full window still fits
    window_count = values.shape[0] - sequence_length + 1
    for first_row in range(window_count):
        yield values[first_row:first_row + sequence_length, :]

In [None]:
def gen_data_wrapper(df, sequence_length, columns, ids=np.array([])):
    """Build the sliding-window feature array for several engines.

    Args:
        df: data frame with an 'id' column identifying the engine of each row.
        sequence_length: number of rows per window.
        columns: list of feature column names to extract.
        ids: engine ids to include; when empty, every id found in ``df`` is used.

    Returns:
        float32 array of shape (num_windows, sequence_length, len(columns)) with the
        windows of all selected engines stacked in the order of ``ids``.
    """
    ids = np.asarray(ids)  # also accept plain lists/tuples of ids
    if ids.size <= 0:
        ids = df['id'].unique()

    per_engine = (list(gen_train_data(df[df['id'] == engine_id], sequence_length, columns))
                  for engine_id in ids)
    # An engine with fewer than sequence_length rows yields no windows. Its empty list would
    # turn into a 1-D array and make np.concatenate fail on a dimension mismatch, so such
    # engines are skipped (gen_label_wrapper contributes zero labels for them as well).
    windows = [engine_windows for engine_windows in per_engine if engine_windows]
    if not windows:
        return np.empty((0, sequence_length, len(columns)), dtype=np.float32)

    data_array = np.concatenate(windows).astype(np.float32)
    return data_array

In [None]:
# Functions to generate sequences for the labels
def gen_labels(df, sequence_length, label):
    """Return the labels aligned with the sliding windows of one engine.

    Args:
        df: data frame holding the rows of a single engine.
        sequence_length: window length used for the features.
        label: list of label column names (a list, so the result stays 2-D).

    Returns:
        2-D array with the label rows from index ``sequence_length - 1`` to the end.
    """
    label_values = df[label].values
    # Each window is labelled with the value of its last row, not the row after it,
    # hence the first label sits at index sequence_length - 1.
    first_labelled_row = sequence_length - 1
    return label_values[first_labelled_row:, :]

def gen_label_wrapper(df, sequence_length, label, ids=np.array([])):
    """Stack the window labels of several engines into one float32 array.

    Args:
        df: data frame with an 'id' column identifying the engine of each row.
        sequence_length: window length used for the features.
        label: list of label column names.
        ids: engine ids to include; when empty, every id found in ``df`` is used.

    Returns:
        float32 array with the labels of all selected engines, in the order of the ids.
    """
    engine_ids = ids if ids.size > 0 else df['id'].unique()

    per_engine_labels = []
    for engine_id in engine_ids:
        engine_rows = df[df['id'] == engine_id]
        per_engine_labels.append(gen_labels(engine_rows, sequence_length, label))

    return np.concatenate(per_engine_labels).astype(np.float32)

In [None]:
def gen_test_data(df, sequence_length, columns, mask_value):
    """Yield the single, last window of one engine, left-padded when it is too short.

    Args:
        df: data frame holding the rows of a single engine.
        sequence_length: number of rows in the window.
        columns: list of feature column names to extract.
        mask_value: value written into the padding rows when the engine has fewer
            than ``sequence_length`` rows.

    Yields:
        Exactly one 2-D array of shape (sequence_length, len(columns)).
    """
    values = df[columns].values
    if values.shape[0] < sequence_length:
        # Pad at the top so the real rows stay at the end of the window. The buffer is
        # always float64: with an integer mask_value np.full would otherwise create an
        # integer array and silently truncate the float features copied into it.
        data_matrix = np.full(shape=(sequence_length, len(columns)),
                              fill_value=mask_value, dtype=np.float64)
        idx = sequence_length - values.shape[0]
        data_matrix[idx:, :] = values  # fill with available data
    else:
        data_matrix = values

    # specifically yield the last possible sequence
    stop = data_matrix.shape[0]
    start = stop - sequence_length
    yield data_matrix[start:stop, :]
def gen_test_label_wrapper(df, sequence_length, label, ids=np.array([])):
    """Return the label of the last window of every selected engine.

    Args:
        df: data frame with an 'id' column identifying the engine of each row.
        sequence_length: window length used for the features (at most 31).
        label: list of label column names.
        ids: engine ids to include; when empty, every id found in ``df`` is used.

    Returns:
        float32 array with the last label row of each engine, concatenated in id order.

    Raises:
        ValueError: if ``sequence_length`` exceeds 31. The previous version only printed
            a message and then crashed with UnboundLocalError on the next statement.
    """
    # NOTE(review): 31 is presumably the length of the shortest test engine — TODO confirm.
    max_window = 31
    if sequence_length > max_window:
        raise ValueError(
            f"Too big window: sequence_length={sequence_length} exceeds {max_window}")

    if ids.size <= 0:
        ids = df['id'].unique()

    label_gen = [gen_labels(df[df['id'] == engine_id], sequence_length, label)
                 for engine_id in ids]
    # keep only last window
    last_labels = [engine_labels[-1] for engine_labels in label_gen]
    last_labels = np.concatenate(last_labels).astype(np.float32)
    return last_labels


In [None]:
# Functions to generate sequences for the labels
# NOTE(review): an earlier cell already defines gen_labels; this later definition is the one
# in effect after a full run. Consider keeping a single copy.
def gen_labels(df, sequence_length, label):
    """Return the labels aligned with the sliding windows of one engine.

    Args:
        df: data frame holding the rows of a single engine.
        sequence_length: window length used for the features.
        label: list of label column names (a list, so the result stays 2-D).

    Returns:
        2-D array with the label rows from index ``sequence_length - 1`` to the end.
    """
    label_values = df[label].values
    # Each window is labelled with the value of its last row, not the row after it,
    # hence the first label sits at index sequence_length - 1.
    first_labelled_row = sequence_length - 1
    return label_values[first_labelled_row:, :]

# NOTE(review): an earlier cell already defines gen_label_wrapper; this later definition is
# the one in effect after a full run. Consider keeping a single copy.
def gen_label_wrapper(df, sequence_length, label, ids=np.array([])):
    """Stack the window labels of several engines into one float32 array.

    Args:
        df: data frame with an 'id' column identifying the engine of each row.
        sequence_length: window length used for the features.
        label: list of label column names.
        ids: engine ids to include; when empty, every id found in ``df`` is used.

    Returns:
        float32 array with the labels of all selected engines, in the order of the ids.
    """
    engine_ids = ids if ids.size > 0 else df['id'].unique()

    per_engine_labels = []
    for engine_id in engine_ids:
        engine_rows = df[df['id'] == engine_id]
        per_engine_labels.append(gen_labels(engine_rows, sequence_length, label))

    return np.concatenate(per_engine_labels).astype(np.float32)

In [None]:
# NOTE(review): an earlier cell also defines gen_test_data; this later definition is the one
# in effect after a full run. Consider keeping a single copy.
def gen_test_data(df, sequence_length, columns, mask_value):
    """Yield the single, last window of one engine, left-padded when it is too short.

    Args:
        df: data frame holding the rows of a single engine.
        sequence_length: number of rows in the window.
        columns: list of feature column names to extract.
        mask_value: value written into the padding rows when the engine has fewer
            than ``sequence_length`` rows.

    Yields:
        Exactly one 2-D array of shape (sequence_length, len(columns)).
    """
    values = df[columns].values
    if values.shape[0] < sequence_length:
        # Pad at the top so the real rows stay at the end of the window. The buffer is
        # always float64: with an integer mask_value np.full would otherwise create an
        # integer array and silently truncate the float features copied into it.
        data_matrix = np.full(shape=(sequence_length, len(columns)),
                              fill_value=mask_value, dtype=np.float64)
        idx = sequence_length - values.shape[0]
        data_matrix[idx:, :] = values  # fill with available data
    else:
        data_matrix = values

    # specifically yield the last possible sequence
    stop = data_matrix.shape[0]
    start = stop - sequence_length
    yield data_matrix[start:stop, :]
def gen_test_label_wrapper(df, sequence_length, label, ids=np.array([])):
    """Return the label of the last window of every selected engine.

    NOTE(review): an earlier cell also defines a function with this name; this later
    definition is the one in effect after a full run. Consider keeping a single copy.

    Args:
        df: data frame with an 'id' column identifying the engine of each row.
        sequence_length: window length used for the features (at most 31).
        label: list of label column names.
        ids: engine ids to include; when empty, every id found in ``df`` is used.

    Returns:
        float32 array with the last label row of each engine, concatenated in id order.

    Raises:
        ValueError: if ``sequence_length`` exceeds 31. The previous version only printed
            a message and then crashed with UnboundLocalError on the next statement.
    """
    # NOTE(review): 31 is presumably the length of the shortest test engine — TODO confirm.
    max_window = 31
    if sequence_length > max_window:
        raise ValueError(
            f"Too big window: sequence_length={sequence_length} exceeds {max_window}")

    if ids.size <= 0:
        ids = df['id'].unique()

    label_gen = [gen_labels(df[df['id'] == engine_id], sequence_length, label)
                 for engine_id in ids]
    # keep only last window
    last_labels = [engine_labels[-1] for engine_labels in label_gen]
    last_labels = np.concatenate(last_labels).astype(np.float32)
    return last_labels


In [None]:
sequence_length = 20

gss = GroupShuffleSplit(n_splits=1, train_size=0.80, random_state=42)


def _split_engine_ids(client_set, splitter):
    """Return (train_ids, val_ids): disjoint engine ids of one client's data frame."""
    engine_ids = client_set['id'].unique()
    # split() yields positional, 0-based indexes into engine_ids; n_splits=1 gives one pair.
    train_pos, val_pos = next(splitter.split(engine_ids, groups=engine_ids))
    return engine_ids[train_pos], engine_ids[val_pos]


def _windows_and_labels(client_set, unit_ids):
    """Build the (windows, labels) pair for the given engine ids of one client."""
    windows = gen_data_wrapper(client_set, sequence_length, feature_cols, unit_ids)
    labels = gen_label_wrapper(client_set, sequence_length, ['fail_30'], unit_ids)
    return windows, labels


# Per-client train/validation splits, grouped by engine so that no engine is in both parts.
# Each client now keeps its own arrays; previously both loops wrote to the same variables,
# so Alice's split was silently overwritten by Bob's.
alice_train_unit, alice_val_unit = _split_engine_ids(alice_set, gss)
alice_train_array, alice_train_label = _windows_and_labels(alice_set, alice_train_unit)
alice_val_array, alice_val_label = _windows_and_labels(alice_set, alice_val_unit)

bob_train_unit, bob_val_unit = _split_engine_ids(bob_set, gss)
bob_train_array, bob_train_label = _windows_and_labels(bob_set, bob_train_unit)
bob_val_array, bob_val_label = _windows_and_labels(bob_set, bob_val_unit)

# Backward-compatible aliases: these names held Bob's split after the original cell ran.
train_split_array, train_split_label = bob_train_array, bob_train_label
val_split_array, val_split_label = bob_val_array, bob_val_label

# create sequences train, test
# NOTE(review): X_alice / X_bob are built from every engine of each client, i.e. they also
# contain the validation engines selected above — confirm this is intended.
X_alice = gen_data_wrapper(alice_set, sequence_length, feature_cols)
X_bob = gen_data_wrapper(bob_set, sequence_length, feature_cols)

y_alice = gen_label_wrapper(alice_set, sequence_length, ['fail_30'])
y_bob = gen_label_wrapper(bob_set, sequence_length, ['fail_30'])


# One (last) window per test engine; short engines are padded with -99.
test_gen = (list(gen_test_data(test_set[test_set['id'] == engine_id], sequence_length, feature_cols, -99.))
            for engine_id in test_set['id'].unique())
X_test = np.concatenate(list(test_gen)).astype(np.float32)

y_test = gen_test_label_wrapper(test_set, sequence_length, ['fail_30'])


In [None]:
class CustomDataset(Dataset):
    """Minimal map-style dataset pairing each feature entry with its label by index."""

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # The dataset length is driven by the feature container
        return len(self.X_data)

    def __getitem__(self, index):
        sample = self.X_data[index]
        target = self.y_data[index]
        return sample, target

In [None]:
# Wrap each party's windows and labels so they can be fed to a DataLoader
alice_dataset, bob_dataset, test_dataset = (
    CustomDataset(features, targets)
    for features, targets in ((X_alice, y_alice), (X_bob, y_bob), (X_test, y_test))
)

In [None]:
# Same batch size for every loader; samples are served in their stored order (no shuffling)
alice_loader, bob_loader, test_loader = (
    DataLoader(dataset=wrapped, batch_size=args.local_batches, shuffle=False)
    for wrapped in (alice_dataset, bob_dataset, test_dataset)
)

In [None]:
# Attach each client's own training loader to its registry entry (0 = alice, 1 = bob)
clients[0].update(trainset=alice_loader)
clients[1].update(trainset=bob_loader)


In [None]:
class BinaryClassification(sy.Module):
    """Single-layer LSTM followed by a linear head that emits one logit per sequence.

    The forward pass returns raw logits (no sigmoid is applied here).
    """

    def __init__(self, torch_ref):
        """Create the LSTM and the linear head.

        Args:
            torch_ref: torch reference forwarded to ``sy.Module``.

        NOTE(review): the layers below are built from the module-level ``nn`` import,
        not from ``torch_ref`` — confirm this is intended for models created with a
        client's torch reference.
        """
        super(BinaryClassification, self).__init__(torch_ref=torch_ref)
        # 18 equals the number of entries in feature_cols defined in the data-loading cell
        self.num_features = 18
        self.hidden_units = 32
        self.num_layers = 1

        # batch_first=True: the LSTM input is laid out as (batch, sequence, features)
        self.lstm = nn.LSTM(
            input_size=self.num_features,
            hidden_size=self.hidden_units,
            batch_first=True,
            num_layers=self.num_layers) 
        
        # Maps the final hidden state to a single output value
        self.linear = nn.Linear(in_features=self.hidden_units, out_features=1)

    def forward(self, x):
      """Return a flat tensor with one logit per input sequence.

      Args:
          x: input batch; its first dimension is used as the batch size.
      """
      batch_size = x.shape[0]
      # Zero initial hidden/cell states, created with torch defaults (not tied to x's device)
      h0 = th.zeros(self.num_layers, batch_size, self.hidden_units).requires_grad_()
      c0 = th.zeros(self.num_layers, batch_size, self.hidden_units).requires_grad_()
      # Only the final hidden state is kept; per-step outputs and the cell state are discarded
      _, (hn, _) = self.lstm(x, (h0, c0))
      out = self.linear(hn[0]).flatten()  # First dim of Hn is num_layers, which is set to 1 above.

      return out

In [None]:
# Training hyper-parameters and loss function.
# NOTE(review): the Arguments class in an earlier cell also defines epochs=5 and lr=0.01;
# LEARNING_RATE here is 0.001 — confirm which value the training loop should use.
EPOCHS = 5
LEARNING_RATE = 0.001
# BCEWithLogitsLoss expects raw logits, which is what BinaryClassification.forward returns
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Each client has its own torch reference (stored under clients[i]['torch'])

In [None]:
# Display the client registry to inspect the keys stored for each participant
clients

In [None]:
# Global model, built right after seeding
th.manual_seed(args.torch_seed)
global_model = BinaryClassification(th)


# Every client model is created right after re-seeding with the same seed as the global
# model; the optimizer slot is left empty for now.
for client in clients:
    th.manual_seed(args.torch_seed)
    client.update(model=BinaryClassification(client['torch']), optim=None)

# clients[0]
# clients[0]['optim'] = optim.SGD(clients[0]['model'].parameters(), lr=args.lr)

# clients[0]['model'].send(clients[0]['hook'])



In [None]:
# print(model.parameters()[-1].grad) # exists

# model_ptr = model.send(alice_client)
# data_ptr = data.send(alice_client)
# labels_ptr = labels.send(alice_client)
# results_ptr = model_ptr(data_ptr)
# remote_loss_func = alice_client.torch.nn.L1Loss()
# remote_loss = remote_loss_func(results_ptr, labels_ptr)
# remote_loss.backward()

In [None]:
# input = X_train[0].view(1,20,18)




In [None]:
# Scratch cell: send the tail of the data/labels (from index 18667 on) to Alice's VM.
# NOTE(review): X_train / y_train are not defined in any visible cell, and the recorded
# output of this cell is a NameError — presumably X_alice / y_alice were meant; confirm.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = X_train[18667:]
output = y_train[18667:]
input_ptr = input.send(alice_client)
output_ptr = output.send(alice_client)

NameError: ignored

In [None]:
# Fetch the shape of the remote result.
# NOTE(review): result_ptr is not defined in any visible cell — this fails on a fresh kernel.
result_ptr.shape.get()

torch.Size([64])

In [None]:
# Fetch the shape of the remote labels after flattening them to one dimension
output_ptr.view(-1).shape.get()

torch.Size([64])