In [None]:
%cd logs

In [None]:
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
from processfedndata import read_status, smoothen

In [None]:
import datetime

In [None]:
from collections import defaultdict
import numpy as np

In [None]:
def smoothen(xs, ys, size=10):
    """Average ``xs`` and the paired values in ``ys`` over consecutive windows.

    NOTE(review): this definition shadows the ``smoothen`` imported from
    ``processfedndata`` earlier in the notebook.

    Args:
        xs: iterable of x values.
        ys: iterable of 2-element items ``(y1, y2)``, aligned with ``xs``.
        size: number of consecutive points folded into one output point.

    Returns:
        ``(mean_xs, mean_ys)`` where ``mean_xs`` is a list of window means of
        ``xs`` and ``mean_ys`` a list of ``(mean_y1, mean_y2)`` tuples. Only
        complete windows are emitted; a trailing run of fewer than ``size``
        points is dropped.
    """
    mean_xs, mean_ys = [], []
    acc_x = acc_a = acc_b = 0
    for count, (x, (a, b)) in enumerate(zip(xs, ys), start=1):
        acc_x, acc_a, acc_b = acc_x + x, acc_a + a, acc_b + b
        if count % size:
            # Window not full yet, keep accumulating.
            continue
        mean_xs.append(acc_x / size)
        mean_ys.append((acc_a / size, acc_b / size))
        acc_x = acc_a = acc_b = 0
    return mean_xs, mean_ys

In [None]:
def plot_status(results,
                steps_per_round,
                metric="loss",
                time=False,
                smooth=0,
                fig=None,
                ax=None,
                do_plot=True,
                a_name=""):
    """Build, and optionally draw, the two metric curves of one run.

    Args:
        results: iterable of ``(values, name, timestamp)`` records (what
            ``read_status`` returns). ``values[metric]`` is collected per
            ``name`` in order of first appearance. Every ``name`` must end up
            with exactly two values; the first is drawn as the "sv" curve and
            the second as the "no" curve. Presumably there is one ``name`` per
            aggregation round; verify against ``read_status``.
        steps_per_round: spacing of the x axis when ``time`` is false.
        metric: key looked up in each record's ``values``.
        time: if true, x is the number of minutes between the first record and
            the last record seen for each ``name``; timestamps are parsed with
            ``'%Y-%m-%d %H:%M:%S.%f'``. Otherwise x is
            ``0, steps_per_round, 2 * steps_per_round, ...``.
        smooth: window size passed to ``smoothen``; ``0`` disables smoothing.
        fig: existing figure to reuse.
        ax: existing axes to draw on.
        do_plot: if false, nothing is drawn and no figure is created.
        a_name: suffix used in the two legend labels.

    Returns:
        ``(xs, ys, fig, ax)``. ``xs`` is a NumPy array (a list after
        smoothing), ``ys`` a list of per-name value pairs. ``fig`` and ``ax``
        are the objects drawn on, or the values passed in when ``do_plot`` is
        false.

    Raises:
        ValueError: if ``results`` contains no records, or if some ``name``
            does not have exactly two values.
    """
    series = defaultdict(list)
    minutes = {}
    start = None
    # Single pass, so one-shot iterables work and the `time` flag is never rebound.
    for values, name, timestamp in results:
        series[name].append(values[metric])
        if time:
            stamp = datetime.datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S.%f')
            if start is None:
                start = stamp
            # Later records overwrite earlier ones: x is the last timestamp per name.
            minutes[name] = (stamp - start).total_seconds() / 60
    if not series:
        raise ValueError("plot_status: `results` contains no records")

    if time:
        # A real array instead of a dict view, so callers can plot it and do arithmetic on it.
        xs = np.array(list(minutes.values()))
    else:
        xs = np.arange(0, len(series) * steps_per_round, steps_per_round)
    ys = list(series.values())

    if do_plot:
        # Honour whatever the caller handed in; only create what is missing.
        if ax is None:
            if fig is None:
                fig, ax = plt.subplots(figsize=(19, 6))
            else:
                ax = fig.gca()
        elif fig is None:
            fig = ax.figure
    if smooth:
        xs, ys = smoothen(xs, ys, smooth)
    ys1, ys2 = zip(*ys)
    if do_plot:
        ax.plot(xs, ys1, "-", label=f"sv - {a_name}")
        ax.plot(xs, ys2, "-", label=f"no - {a_name}")
    return xs, ys, fig, ax

In [None]:
def main(fn1, l1, fn2, l2, time_spent, smooth, title, xlabel, ylabel="Loss", a_name="", b_name=""):
    """Draw the loss curves of two runs into one figure, save it as a PDF and show it.

    Args:
        fn1, fn2: status file names handed to ``read_status``.
        l1, l2: ``steps_per_round`` for the first and second run.
        time_spent: forwarded to ``plot_status`` as ``time`` (x axis in minutes
            instead of steps).
        smooth: if truthy, the FIRST run is smoothed with a fixed window of 10;
            the second run is never smoothed.
        title, xlabel, ylabel: figure title and axis labels. ``title`` and
            ``xlabel`` also form the output file name.
        a_name, b_name: legend suffixes for the first and second run.

    The figure is written to ``../figs/<title>_<xlabel>.pdf``, with both parts
    lower-cased and whitespace runs replaced by ``_``.
    """
    run_a = read_status(fn1)
    run_b = read_status(fn2)
    # Fixed window; the length ratio of the two runs was considered as an alternative.
    window = 10 if smooth else 0
    print(len(run_a), len(run_b))

    _, _, fig, ax = plot_status(run_a, l1, "loss", time=time_spent, smooth=window, a_name=a_name)
    # Second run goes onto the same figure/axes, unsmoothed.
    _, _, fig, ax = plot_status(run_b, l2, "loss", time=time_spent, smooth=0, fig=fig, ax=ax, a_name=b_name)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()

    title_slug = "_".join(title.lower().split())
    xlabel_slug = "_".join(xlabel.lower().split())
    plt.savefig(f"../figs/{title_slug}_{xlabel_slug}.pdf")
    plt.show()

## Experiments

In order to test the effect of the number of steps between each model aggregation step, we run ELECTRA training
with 100, 1000, 2000, and 5000 steps for every aggregation loop.

In [None]:
# Status log file names, resolved relative to the logs/ directory entered by
# `%cd logs` at the top of the notebook; each is read with read_status() below.
# The step count in a name is presumably the number of training steps per
# aggregation round of that run — TODO confirm against the experiment setup.
l1k = "electra_small_sv+no_1000-step-round.control.status"
# Continuation of the l1k run; plotted appended to l1k in the optimizer comparison.
l1k_ctd = "electra_small_sv+no_1000-step-round.control.status.continued"
# Run labelled "Local Adam" in the optimizer comparison plots.
l1k_extra = "electra_small_sv+no_1000-step-no_extra_round.control.status"
l100 = "electra_small_sv+no_100-step-round.control.status"
l2k = "electra_small_sv+no_2000-step-round.control.status"
l5k = "electra_small_sv+no_5000-step-round.control.status"

### 100 vs. 1000 Steps

While more frequent aggregations result in better model performance in the same total number of steps, this takes
much longer.

In [None]:
# 100-step rounds (smoothed) vs. 1000-step rounds, over training steps.
main(
    l100, 100,
    l1k, 1000,
    time_spent=False,
    smooth=True,
    title="100 vs. 1000 Steps",
    xlabel="Training Steps",
    a_name="100",
    b_name="1000",
)

Plotting performance over time spent instead of the number of steps shows that it is reasonable to trade some
performance for the sake of speed.

In [None]:
# 100-step rounds (smoothed) vs. 1000-step rounds, over wall-clock minutes.
main(
    l100, 100,
    l1k, 1000,
    time_spent=True,
    smooth=True,
    title="100 vs. 1000 Steps",
    xlabel="Training Time in Minutes",
    a_name="100",
    b_name="1000",
)

### 1000 vs. 2000 Steps

To see how much performance degrades while gaining speed in return we compare 1000 and 2000 steps per federated learning round.

In [None]:
# 1000-step vs. 2000-step rounds, unsmoothed, over training steps.
main(
    l1k, 1000,
    l2k, 2000,
    time_spent=False,
    smooth=False,
    title="1000 vs. 2000 Steps",
    xlabel="Training Steps",
    a_name="1000",
    b_name="2000",
)

The drop in performance is small but visible; measured over time spent, however, we do not get any real benefit.

In [None]:
# 1000-step vs. 2000-step rounds, unsmoothed, over wall-clock minutes.
main(
    l1k, 1000,
    l2k, 2000,
    time_spent=True,
    smooth=False,
    title="1000 vs. 2000 Steps",
    xlabel="Training Time in Minutes",
    a_name="1000",
    b_name="2000",
)

### 1000 vs. 5000 Steps

To see if the speed gain justifies a performance drop we further increase the number of steps per round.

In [None]:
# 1000-step vs. 5000-step rounds, unsmoothed, over training steps.
main(
    l1k, 1000,
    l5k, 5000,
    time_spent=False,
    smooth=False,
    title="1000 vs. 5000 Steps",
    xlabel="Training Steps",
    a_name="1000",
    b_name="5000",
)

While the model is trained for about 300,000 more steps in nearly the same amount of time, the performance
drop again argues against this higher number of steps per round.

In [None]:
# 1000-step vs. 5000-step rounds, unsmoothed, over wall-clock minutes.
main(
    l1k, 1000,
    l5k, 5000,
    time_spent=True,
    smooth=False,
    title="1000 vs. 5000 Steps",
    xlabel="Training Time in Minutes",
    a_name="1000",
    b_name="5000",
)

### With and without optimizer parameters

Language models such as ELECTRA use the Adam optimizer for training, which requires keeping track of additional momentum variables for every parameter.
When training large LMs in a federated fashion, the increased number of parameters that must be aggregated can result in long waiting times when sending the clients' model data to the combiner/reducer.

We therefore test training ELECTRA without aggregating the optimizer parameters, federating only model parameters, while keeping the client's local optimization variables.

In [None]:
# Loss over training steps: the 1000-step run plus its continuation
# ("Federated Adam") against the no_extra run ("Local Adam").
# NOTE(review): the next cell repeats this one almost verbatim with time=True;
# a shared helper taking the time flag, x label and output path would remove
# the duplication.
smv = 10
data1 = read_status(l1k)
data2 = read_status(l1k_ctd)
# do_plot=False: only the smoothed series are wanted here, so the returned
# fig/ax are None and get replaced by plt.subplots() below.
xs1, ys1, fig, ax = plot_status(data1, 1000, do_plot=False, smooth=smv)
xs2, ys2, _, _ = plot_status(data2, 1000, do_plot=False, smooth=smv)
# Append the continuation after the first run by shifting its x values by the
# last smoothed x of the first run.
xs = np.concatenate([xs1, xs1[-1] + xs2])
ys = np.concatenate([ys1, ys2])
fig, ax = plt.subplots(figsize=(19, 6))
xs3, ys3, _, _ = plot_status(read_status(l1k_extra), 1000, do_plot=False, smooth=smv)
plt.title("1000 Steps: With vs. Without Optimization Parameters")
plt.xlabel("Training Steps")
plt.ylabel("Loss")
# From here on ys1/ys2 are rebound: they become the two columns of the
# concatenated ys (drawn as "sv" and "no"), no longer the per-run series.
ys1, ys2 = zip(*ys)
plt.plot(xs, ys1, label="sv - Federated Adam")
plt.plot(xs, ys2, label="no - Federated Adam")
# Same split for the comparison run; ys3 is rebound to its first column.
ys3, ys4 = zip(*ys3)
plt.plot(xs3, ys3, label="sv - Local Adam")
plt.plot(xs3, ys4, label="no - Local Adam")
plt.legend()
plt.savefig("../figs/local_v_global_steps.pdf")

In [None]:
# Same comparison as the previous cell, but over elapsed minutes (time=True):
# the 1000-step run plus its continuation ("Federated Adam") against the
# no_extra run ("Local Adam").
# NOTE(review): near-verbatim copy of the previous cell; a shared helper taking
# the time flag, x label and output path would remove the duplication.
smv = 10
data1 = read_status(l1k)
data2 = read_status(l1k_ctd)
# do_plot=False: only the smoothed series are wanted here, so the returned
# fig/ax are None and get replaced by plt.subplots() below.
xs1, ys1, fig, ax = plot_status(data1, 1000, do_plot=False, smooth=smv, time=True)
xs2, ys2, _, _ = plot_status(data2, 1000, do_plot=False, smooth=smv, time=True)
# Convert to arrays so the scalar offset below is added element-wise.
xs1 = np.array(xs1)
xs2 = np.array(xs2)
# Each run's minutes start at 0, so the continuation is shifted by the last
# smoothed x of the first run to place it after that run.
xs = np.concatenate([xs1, xs1[-1] + xs2])
ys = np.concatenate([ys1, ys2])
fig, ax = plt.subplots(figsize=(19, 6))
xs3, ys3, _, _ = plot_status(read_status(l1k_extra), 1000, do_plot=False, smooth=smv, time=True)
plt.title("1000 Steps: With vs. Without Optimization Parameters")
plt.xlabel("Training Time in Minutes")
plt.ylabel("Loss")
# From here on ys1/ys2 are rebound: they become the two columns of the
# concatenated ys (drawn as "sv" and "no"), no longer the per-run series.
ys1, ys2 = zip(*ys)
plt.plot(xs, ys1, label="sv - Federated Adam")
plt.plot(xs, ys2, label="no - Federated Adam")
# Same split for the comparison run; ys3 is rebound to its first column.
ys3, ys4 = zip(*ys3)
plt.plot(xs3, ys3, label="sv - Local Adam")
plt.plot(xs3, ys4, label="no - Local Adam")
plt.legend()
plt.savefig("../figs/local_v_global_time.pdf")