# Federated Learning on NER





### Imports and model specifications

First we make the official imports

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

And than those specific to PySyft. In particular we define remote workers `alice` and `bob`.

In [24]:
import syft as sy  # <-- NEW: import the Pysyft library
hook = sy.TorchHook(torch)  # <-- NEW: hook PyTorch ie add extra functionalities to support Federated Learning
bob = sy.VirtualWorker(hook, id="bob")  # <-- NEW: define remote worker bob
alice = sy.VirtualWorker(hook, id="alice")  # <-- NEW: and alice



We define the setting of the learning task

In [137]:
class Arguments():
    def __init__(self):
        self.batch_size = 1
        self.test_batch_size = 1
        self.epochs = 50
        self.lr = 0.01
        self.momentum = 0.5
        self.no_cuda = False
        self.seed = 1
        self.log_interval = 30
        self.save_model = False

args = Arguments()

use_cuda = not args.no_cuda and torch.cuda.is_available()

torch.manual_seed(args.seed)

device = torch.device("cuda" if use_cuda else "cpu")

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

### Data loading and sending to workers
We first load the data and transform the training Dataset into a Federated Dataset split across the workers using the `.federate` method. This federated dataset is now given to a Federated DataLoader. The test dataset remains unchanged.

In [64]:
import os
from preprocessing import read_files, prepare_embeddings, text_to_indices, pad_sentences
train_data_path = "data/ner/train"

train_sentences = read_files([os.path.join(train_data_path, p) for p in os.listdir(train_data_path)])
word_embeddings, word2Idx, label2Idx, idx2Label = prepare_embeddings(train_sentences,"../embeddings/german.model" )
X,Y = text_to_indices(train_sentences, word2Idx, label2Idx)
X = flatten_list(X)
Y = flatten_list(Y)
num_classes = len(label2Idx)
Y_one_hot = F.one_hot(torch.tensor(Y), num_classes)

#X,Y = pad_sentences(X,Y, word2Idx, label2Idx)

Extracting words and labels...
Extracted 42 words and 7 labels.
Loading embeddings...
Completed in 4.1234214305877686 seconds.
Found embeddings for 53 of 42 words.
(53, 300)


In [65]:
def flatten_list(l):
    return [[item] for sublist in l for item in sublist]

In [66]:
from torch.utils.data import Dataset
class NERDataset(Dataset):

    def __init__(self, X, Y, word2Idx, label2Idx):
        self.X = X
        self.Y = Y
        self.word2Idx = word2Idx
        self.label2Idx = label2Idx
        
        

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.LongTensor(self.X[idx]), torch.LongTensor(self.Y[idx])

dataset = NERDataset(X, Y, word2Idx, label2Idx)

In [67]:
federated_train_loader = sy.FederatedDataLoader( # <-- this is now a FederatedDataLoader 
    dataset
    .federate((bob, alice)), # <-- NEW: we distribute the dataset across all the workers, it's now a FederatedDataset
    batch_size=args.batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(dataset,
    batch_size=args.batch_size, shuffle=True, **kwargs)

### CNN specification
Here we use exactly the same CNN as in the official example.

In [151]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    
    def __init__(self,  embedding_dim, hidden_dim, vocab_size, num_classes, word_embeddings=None):
        super(Net, self).__init__()
  
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes

        self.embedding = nn.Embedding(vocab_size, self.embedding_dim)
        self.fc1 = nn.Linear(self.embedding_dim, 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 30)
        self.fc4 = nn.Linear(30, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        #print(x.shape)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x
    
    def forward2(self, x):
        
        #self.embedding = nn.Embedding.from_pretrained(word_embeddings)
        #self.lstm = nn.LSTM(self.embedding_dim, self.hidden_dim, bidirectional=True, batch_first=True)
        #self.linear = nn.Linear(self.hidden_dim* 2, num_classes)
        #self.linear = nn.Linear(self.embedding_dim, num_classes)
        print(x.shape)
        x = self.embedding(x)
        print(x.shape)
        self.hidden = self._init_hidden(x.shape[0])
        print(x.shape)
        print(x.size() , x.shape[-1])
        x, self.hidden = self.lstm(x)
        print(x.shape)
        x = self.linear(x)

        return x
    
    def _init_hidden(self, batch_size):
        return (torch.randn(2, batch_size, self.hidden_dim),
                torch.randn(2, batch_size, self.hidden_dim))

In [158]:
def softmax_cross_entropy_with_logits(logits, targets, batch_size=1):
    """ Calculates softmax entropy
        Args:
            * logits: (NxC) outputs of dense layer
            * targets: (NxC) one-hot encoded labels
            * batch_size: value of N, temporarily required because Plan cannot trace .shape
    """
    # numstable logsoftmax
    norm_logits = logits - logits.max()
    log_probs = norm_logits - norm_logits.exp().sum(dim=1, keepdim=True).log()
    # NLL, reduction = mean
    return -(targets * log_probs).sum() / batch_size


In [167]:
def cross_entropy(input, target):
    #return -torch.gather(y_hat, 1, y.unsqueeze(dim=1)).log()
    logsoftmax = nn.LogSoftmax()
    return torch.mean(torch.sum(-target * logsoftmax(input), dim=1))

### Define the train and test functions
For the train function, because the data batches are distributed across `alice` and `bob`, you need to send the model to the right location for each batch. Then, you perform all the operations remotely with the same syntax like you're doing local PyTorch. When you're done, you get back the model updated and the loss to look for improvement.

In [168]:
def train(args, model, device, federated_train_loader, optimizer, criterion, epoch):
    model.train()
    
    for batch_idx, (data, target) in enumerate(federated_train_loader): # <-- now it is a distributed dataset
        model.send(data.location) # <-- NEW: send the model to the right location
        data, target = data.to(device), target.to(device)
        #print("Data" , data.shape,"Target", target.shape)
        optimizer.zero_grad()
        output = model.forward(data)
        loss = criterion(output, target)
        #print(loss)
        #loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        model.get() # <-- NEW: get the model back
        if batch_idx % args.log_interval == 0:
            loss = loss.get() # <-- NEW: get the loss back
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * args.batch_size, len(federated_train_loader) * args.batch_size,
                100. * batch_idx / len(federated_train_loader), loss.item()))

In [169]:
def test(args, model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            #print(output.shape, target.shape)
            #test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            test_loss += criterion(output, target)
            pred = output.argmax(2, keepdim=True) # get the index of the max log-probability 
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

### Launch the training !

In [171]:
%%time

embedding_dim=300
hidden_dim=10
vocab_size=len(word2Idx)
num_classes=len(label2Idx)
criterion = softmax_cross_entropy_with_logits#cross_entropy#nn.CrossEntropyLoss()

#torch.FloatTensor(word_embeddings)
model = Net(embedding_dim, hidden_dim, vocab_size, num_classes).to(device)
optimizer = optim.SGD(model.parameters(), lr=args.lr) # TODO momentum is not supported at the moment

for epoch in range(1, args.epochs + 1):
    train(args, model, device, federated_train_loader, optimizer,criterion, epoch)
    test(args, model, device, test_loader, criterion)

if (args.save_model):
    torch.save(model.state_dict(), "net.pt")




Test set: Average loss: 0.0000, Accuracy: 8/100 (8%)


Test set: Average loss: 0.0000, Accuracy: 8/100 (8%)


Test set: Average loss: 0.0000, Accuracy: 8/100 (8%)



KeyboardInterrupt: 

# 