In [10]:
from openfl.interface.interactive_api.federation import Federation
from openfl.interface.interactive_api.experiment import ModelInterface, FLExperiment
from openfl.utilities.optimizers.torch import FedProxOptimizer, FedProxAdam
import torch
import torch.nn as nn
import torch.optim as optim
from copy import deepcopy

from gear_shard_dataset import GearSD
from kvasir_shard_dataset import KvasirSD

from loss import *
from models import *
from tasks import Task

import matplotlib.pyplot as plt
import time
import os
import copy

# Number of output classes handed to DeepLabv3.build_deeplab() below.
NUM_CLASSES=1
# Number of federation rounds requested when the first experiment is started.
ROUND_TO_TRAIN=4

# Connection settings used to build the Federation object (director address and client identity).
client_id = 'frontend'
director_node_fqdn = 'localhost'
director_port = 50053
# Name under which the experiment is registered via FLExperiment.
# NOTE(review): the name mentions "gear" while the data loader used below is KvasirSD — confirm this is intended.
experiment_name = 'gear_test_experiment'
# Learning rate passed to optim.Adam.
LEARNING_RATE=5e-4
# Batch sizes passed to the federated dataset (train_bs / valid_bs).
TRAIN_BS=4
VALID_BS=8

# Training criterion and validation metric forwarded to Task.createTask().
# Both names come from the star import of `loss`.
CRITERION=soft_dice_loss
CRITERION_VAL=soft_dice_coef


## don't forget to launch envoy service
### bash start_envoy.sh env_on localhost

In [11]:
# Connect to the director.
# Please use the same identifier that was used in the signed certificate.
federation = Federation(
    client_id=client_id,
    director_node_fqdn=director_node_fqdn,
    director_port=director_port,
    tls=False
)

# Jupyter only echoes the LAST expression of a cell, so intermediate values
# are printed explicitly instead of being left as bare (discarded) expressions.
shard_registry = federation.get_shard_registry()
print(shard_registry)
print(federation.target_shape)

# Build a dummy shard so the data pipeline can be exercised locally.
dummy_shard_desc = federation.get_dummy_shard_descriptor(size=10)
dummy_shard_dataset = dummy_shard_desc.get_dataset('train')
sample, target = dummy_shard_dataset[0]
print(f"Sample shape: {sample.shape}, target shape: {target.shape}")

# Plug the dummy shard into the federated dataset and iterate over the
# training loader once to check the batch shapes.
fed_dataset = KvasirSD(train_bs=TRAIN_BS, valid_bs=VALID_BS)
fed_dataset.shard_descriptor = dummy_shard_desc
for sample, target in fed_dataset.get_train_loader():
    print(f"Sample shape : {sample.shape}")
    print(f"Target shape : {target.shape}")

Sample shape : torch.Size([4, 3, 332, 332])
Target shape : torch.Size([4, 1, 332, 332])
Sample shape : torch.Size([4, 3, 332, 332])
Target shape : torch.Size([4, 1, 332, 332])
Sample shape : torch.Size([1, 3, 332, 332])
Target shape : torch.Size([1, 1, 332, 332])


In [12]:
# Build the network through the DeepLabv3 helper imported from `models`.
# Per the recorded output of this cell, alpha is a "freezing coef" that controls how many layers are frozen.
d = DeepLabv3()
model= d.build_deeplab(NUM_CLASSES, alpha=0.7)
# Use a low learning rate so that the current trained weights are not changed too much.
# NOTE(review): the original note mentions Tversky loss, but CRITERION is set to soft_dice_loss — confirm which loss is intended.
optimizer_adam = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Wrap model + optimizer in a ModelInterface using the PyTorch framework adapter plugin.
framework_adapter = 'openfl.plugins.frameworks_adapters.pytorch_adapter.FrameworkAdapterPlugin'
MI = ModelInterface(model=model, optimizer=optimizer_adam, framework_plugin=framework_adapter)
# Save the initial model state (deep copy taken before the experiment starts, used later for comparison)
initial_model = deepcopy(model)

# Task.createTask returns the task interface (TI) and a standalone `validate` callable.
TI, validate = Task.createTask(CRITERION, CRITERION_VAL, d)

# Create an experiment in the federation
fl_experiment = FLExperiment(federation=federation, experiment_name=experiment_name)

# The following command zips the workspace and python requirements to be transferred to collaborator nodes
fl_experiment.start(model_provider=MI, 
                    task_keeper=TI,
                    data_loader=fed_dataset,
                    rounds_to_train=ROUND_TO_TRAIN,
                    opt_treatment='CONTINUE_GLOBAL',
                    device_assignment_policy='CUDA_PREFERRED')

[*] Changing head for 1 classes and removing aux classifier
[!] This model will be trained using alpha freezing coef = 0.7 meaning 135/193 layers will be freeze


### You may use the same federation object to report another experiment or even schedule several experiments that will be executed in series.

# Stream from tensorboard


In [13]:
# We can stream the metrics of the experiment started above.
fl_experiment.stream_metrics()

In [None]:

# Fetch the best model from the experiment.
best_model = fl_experiment.get_best_model()
# We can also retrieve the last model with get_last_model()

# Remove the experiment data from the director
fl_experiment.remove_experiment_data()

# Compare the initial model (deep copy taken before the experiment was started);
# 'cpu' is passed as the third argument — presumably the device, confirm against Task.createTask.
validate(initial_model, fed_dataset.get_valid_loader(), 'cpu')

# With the best model
# NOTE(review): only the last expression of a cell is echoed; if validate() returns its
# score instead of printing it, the initial-model result above is not displayed — confirm.
validate(best_model, fed_dataset.get_valid_loader(), 'cpu')

# We can save the best and use it on runtime 
# TODO save ..

# We can also improve the model 

In [None]:
# Continue training starting from the best model of the previous experiment.
# The optimizer has to be created from best_model's own parameters: `optimizer_adam`
# was built from `model.parameters()`, so pairing it with `best_model` would make the
# optimizer track tensors that belong to a different model.
optimizer_best = optim.Adam(best_model.parameters(), lr=LEARNING_RATE)
MI = ModelInterface(model=best_model, optimizer=optimizer_best, framework_plugin=framework_adapter)
fl_experiment.start(model_provider=MI,
                    task_keeper=TI,
                    data_loader=fed_dataset,
                    rounds_to_train=ROUND_TO_TRAIN,
                    opt_treatment='CONTINUE_GLOBAL')

# Optimizer treatment options:
#   RESET: the optimizer state is initialized each round from noise
#   CONTINUE_LOCAL: the optimizer state will be reused locally by every collaborator
#   CONTINUE_GLOBAL: the optimizer's state will be aggregated

## We can create specific task assigners

In [None]:
# `TI` is the task interface created by Task.createTask() and passed to
# fl_experiment.start(task_keeper=TI); no variable named `task_keeper` is defined
# in the visible cells, so the lookup has to go through `TI`.
tasks = TI.get_registered_tasks()


In [14]:
def filter_assigner(collaborators, round_number, **kwargs):
    """Assign the full task set to every collaborator that is not excluded.

    Collaborators named 'env_two' or 'env_three' are left out of the returned
    mapping. Every other collaborator is mapped to a new list holding the
    'train', 'locally_tuned_model_validate' and 'aggregated_model_validate'
    entries of the module-level `tasks` mapping.

    Args:
        collaborators: iterable of collaborator names.
        round_number: accepted but not used by this assigner.
        **kwargs: accepted but not used by this assigner.

    Returns:
        dict mapping collaborator name -> list of tasks.
    """
    skipped = ['env_two', 'env_three']
    return {
        name: [
            tasks['train'],
            tasks['locally_tuned_model_validate'],
            tasks['aggregated_model_validate'],
        ]
        for name in collaborators
        if name not in skipped
    }

In [None]:
def random_assigner(collaborators, round_number, **kwargs):
    """Assign task groups randomly while ensuring the target distribution.

    The collaborators are shuffled; those in the first 70% of the shuffled
    order receive every registered task, the rest receive only the
    'aggregated_model_validate' task. Tasks are read from the module-level
    `tasks` mapping.

    The shuffle is done on a copy, so the caller's `collaborators` sequence is
    left untouched (and any sequence type, not only a list, is accepted).

    Args:
        collaborators: sequence of collaborator names.
        round_number: accepted but not used by this assigner.
        **kwargs: accepted but not used by this assigner.

    Returns:
        dict mapping collaborator name -> list of tasks.
    """
    import random

    # Shuffle a private copy: random.shuffle works in place and would otherwise
    # reorder the list owned by the caller.
    shuffled = list(collaborators)
    random.shuffle(shuffled)
    total = len(shuffled)

    collaborator_task_map = {}
    for idx, col in enumerate(shuffled):
        # select only 70% collaborators for training and validation, 30% for validation
        if (idx + 1) / total <= 0.7:
            # Materialize the dict view so every entry is an independent list,
            # like the entries produced by the other branch.
            collaborator_task_map[col] = list(tasks.values())  # all three tasks
        else:
            collaborator_task_map[col] = [tasks['aggregated_model_validate']]
    return collaborator_task_map

# Exclude collaborators based on the shard registry.
# The registry is fetched here and stored in a module-level variable that the
# assigner defined below reads at call time.

shard_registry = federation.get_shard_registry()
def filter_by_shard_registry_assigner(collaborators, round_number, **kwargs):
    """Assign tasks according to each collaborator's entry in `shard_registry`.

    Collaborators that have no (or a falsy) registry entry, or whose entry has a
    falsy 'is_online' value, are left out of the result. A remaining collaborator
    gets the train + both validation tasks when its first listed CUDA device
    reports `memory_total` above 8 * 1024**3; otherwise it only gets the
    'aggregated_model_validate' task. `shard_registry` and `tasks` are read from
    module level.

    Args:
        collaborators: iterable of collaborator names.
        round_number: accepted but not used by this assigner.
        **kwargs: accepted but not used by this assigner.

    Returns:
        dict mapping collaborator name -> list of tasks.
    """
    assignment = {}
    for name in collaborators:
        status = shard_registry.get(name)
        if status and status['is_online']:
            node_info = status['shard_info'].node_info
            # Assign the train task only if the first GPU has more than 8 GB of total memory
            if len(node_info.cuda_devices) > 0 and node_info.cuda_devices[0].memory_total > 8 * 1024**3:
                selected = [
                    tasks['train'],
                    tasks['locally_tuned_model_validate'],
                    tasks['aggregated_model_validate'],
                ]
            else:
                selected = [tasks['aggregated_model_validate']]
            assignment[name] = selected
    return assignment

In [None]:
## Additional validation round

# Number of rounds that include training; one extra round is added on top of it.
rounds_to_train = 3
total_rounds = rounds_to_train + 1 # use fl_experiment.start(..., rounds_to_train=total_rounds,...)

def assigner_with_last_round_validation(collaborators, round_number, **kwargs):
    """Assign the full task set, except in the last round which only validates.

    When `round_number` equals `total_rounds - 1` (module-level `total_rounds`),
    every collaborator is mapped to the 'aggregated_model_validate' task alone.
    In any other round every collaborator gets 'train',
    'locally_tuned_model_validate' and 'aggregated_model_validate'. Tasks are
    read from the module-level `tasks` mapping.

    Args:
        collaborators: iterable of collaborator names.
        round_number: index of the current round.
        **kwargs: accepted but not used by this assigner.

    Returns:
        dict mapping collaborator name -> list of tasks.
    """
    return {
        name: (
            [tasks['aggregated_model_validate']]
            if round_number == total_rounds - 1
            else [
                tasks['train'],
                tasks['locally_tuned_model_validate'],
                tasks['aggregated_model_validate'],
            ]
        )
        for name in collaborators
    }