In [5]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Python version: 3.6

import argparse
import multiprocessing as mp
import os

import os
import copy
import time
import pickle
import numpy as np
from tqdm import tqdm

import torch
from tensorboardX import SummaryWriter

from options import args_parser
from update import LocalUpdate, test_inference, ASRLocalUpdate
from models import MLP, CNNMnist, CNNFashion_Mnist, CNNCifar, Data2VecAudioForCTC, DataCollatorCTCWithPadding
from utils import get_dataset, average_weights, exp_details

from transformers import Data2VecAudioConfig, Wav2Vec2Processor
from multiprocessing import Pool
from collections import OrderedDict
# Command-line options for the federated ASR experiment. Inside the notebook
# they are parsed from an empty argument list (see the last line of this
# block), so every option takes the default declared here.
parser = argparse.ArgumentParser()

# federated arguments (Notation for the arguments followed from paper)
parser.add_argument('--epochs', type=int, default=2,
                    help="number of rounds of training")
parser.add_argument('--num_users', type=int, default=2,
                    help="number of users: K")
parser.add_argument('--frac', type=float, default=1.0,
                    help='the fraction of clients: C')
parser.add_argument('--local_ep', type=int, default=1,
                    help="the number of local epochs: E")

parser.add_argument('--model', type=str, default='data2vec', help='model name')


# other arguments
# Help texts are written as single literals: a backslash continuation inside a
# string keeps the next line's indentation as part of the stored text.
parser.add_argument('--dataset', type=str, default='adress',
                    help="name of dataset")  # cifar
# parser.add_argument('--num_classes', type=int, default=10,
#                     help="number of classes")
# No `type=` is given, so a value supplied on the command line arrives as a
# string while the default stays the integer 1. The help text now reports the
# actual default instead of claiming the default is CPU.
parser.add_argument('--gpu', default=1,
                    help="To use cuda, set to a specific GPU ID (default: 1).")

# additional arguments
parser.add_argument('--pretrain_name', type=str, default='facebook/data2vec-audio-large-960h', help="str used to load pretrain model")
parser.add_argument('-lam', '--LAMBDA', type=float, default=0.5, help="Lambda for GRL")
parser.add_argument('-st', '--STAGE', type=int, default=2, help="Current training stage")
parser.add_argument('-GRL', '--GRL', action='store_true', default=False, help="True: GRL")
parser.add_argument('-model_in', '--model_in_path', type=str, default="/mnt/Internal/FedASR/weitung/HuggingFace/Pretrain/saves/data2vec-audio-large-960h_new1_recall/final/", help="Where the model is saved")
parser.add_argument('-model_out', '--model_out_path', type=str, default="./save/data2vec-audio-large-960h_new2_recall_FL", help="Where to save the model")
parser.add_argument('-log', '--log_path', type=str, default="data2vec-audio-large-960h_new2_recall_FL.txt", help="name for the txt file")
# 2023/01/08: loss type
parser.add_argument('-ad_loss', '--AD_loss', type=str, default="recall", help="loss to use for AD classifier")
# 2023/01/18: ckpt
parser.add_argument('-ckpt', '--checkpoint', type=str, default=None, help="path to checkpoint")
# 2023/02/13: TOGGLE_RATIO
parser.add_argument('-toggle_rt', '--TOGGLE_RATIO', type=float, default=0, help="To toggle more or less")
# 2023/02/15: GS_TAU, loss weight
parser.add_argument('-gs_tau', '--GS_TAU', type=float, default=1, help="Tau for gumbel_softmax")
parser.add_argument('-w_loss', '--W_LOSS', type=float, default=None, nargs='+', help="weight for HC and AD")

# An explicit empty list keeps argparse from reading the Jupyter kernel's own
# sys.argv; the result is a namespace holding only the defaults above.
args = parser.parse_args(args=[])  # for jupyter notebook


# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"  # or whichever GPU IDs you want to use
lock = mp.Lock()
logger = SummaryWriter('../logs')

# Step 1: the logger was removed from the arguments handed to the workers.
# Passing the logger raised
#   RuntimeError: Queue objects should only be shared between processes through inheritance
#
# Before:
#     final_result = pool.starmap_async(
#         client_train, [(args, train_dataset, logger,
#                         test_dataset, idx, epoch, global_weights)
#                     for idx in idxs_users])
# After (logger slot set to None):
#     final_result = pool.starmap_async(
#         client_train, [(args, train_dataset, None,
#                         test_dataset, idx, epoch, global_weights)
#                     for idx in idxs_users])
#
# The snippet is kept as a comment only. None of pool, client_train,
# train_dataset, test_dataset, idxs_users, epoch or global_weights is defined
# in this cell, so executing it here raises NameError on a fresh kernel. The
# live call is made inside the training loop.

In [6]:
import os
import multiprocessing
from client_train import client_train
#!!! [NOTE] 3. client_train has to live in a separate module and be brought in with an import
# Per-round bookkeeping, one binding per line. Every list gets its own fresh
# object, exactly as with the tuple-assignment form.
train_loss = []
test_wer = []
val_acc_list = []
net_list = []
cv_loss = []
cv_acc = []

print_every = 2
val_loss_pre = 0
counter = 0

# No aggregated weights exist before the first round.
global_weights = None

# get_dataset is expected to hand back exactly three values.
train_dataset, test_dataset, user_groups = get_dataset(args)
if __name__ == "__main__":
    # [NOTE] 2. The multiprocessing start method has to be switched from
    # 'fork' to 'spawn', and client_train has to live in its own module and be
    # imported from there. force=True lets this cell be re-run without
    # "context has already been set" being raised.
    multiprocessing.set_start_method('spawn', force=True)

    # One federated round per iteration. The count comes from --epochs
    # (default 2) instead of a literal 2.
    for epoch in range(args.epochs):
        m = max(int(args.frac * args.num_users), 1)                                 # num of clients to train, min:1
        idxs_users = np.random.choice(range(args.num_users), m, replace=False)      # select by client_id

        # [NOTE] 1. The logger slot is None: passing the logger raised
        # "RuntimeError: Queue objects should only be shared between processes
        # through inheritance".
        client_jobs = [(args, train_dataset, None,
                        test_dataset, idx, epoch, global_weights)
                       for idx in idxs_users]

        pool = multiprocessing.Pool(processes=m)
        try:
            final_result = pool.starmap_async(client_train, client_jobs)
            # get() blocks until every client is done and re-raises the first
            # exception thrown inside a worker, so it belongs inside the try.
            results = final_result.get()
        except Exception as e:
            print(f"An error occurred while running local_model.update_weights(): {str(e)}")
            # Without results there is nothing to aggregate; stop here rather
            # than continuing with unbound or stale data.
            raise
        finally:
            # Release this round's worker processes; a new pool is created for
            # the next round.
            pool.close()
            pool.join()

        # Each client returns a (weights, loss) pair.
        local_weights = [w for w, _ in results]
        local_losses = [loss for _, loss in results]
        print("local weights: ", local_weights)
        # get global weights by averaging local weights
        global_weights = average_weights(local_weights)
        print("global weights: ", global_weights)
        # update global weights
        #global_model.load_state_dict(global_weights)
        loss_avg = sum(local_losses) / len(local_losses)                # average losses from participated client
        train_loss.append(loss_avg)                                     # save loss for this round
        print("All results done")


Loading cached processed dataset at /home/FedASR/dacs/federated/dataset/train/cache-b6f4c0d2143105d5_*_of_00010.arrow
Loading cached processed dataset at /home/FedASR/dacs/federated/dataset/test/cache-f07dc6d726972452_*_of_00010.arrow


Load data from local...
Load data from local...


2023-04-15 17:36:15.434395: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-15 17:36:15.743094: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  wer_metric = load_metric("wer")
  wer_metric = load_metric("wer")


load from  /mnt/Internal/FedASR/weitung/HuggingFace/Pretrain/saves/data2vec-audio-large-960h_new1_recall/final/
load from  /mnt/Internal/FedASR/weitung/HuggingFace/Pretrain/saves/data2vec-audio-large-960h_new1_recall/final/
model loaded
model loaded
initialize ASRLocalUpdate
Generating client training set for client  0 ...
load model


Loading cached processed dataset at /home/FedASR/dacs/federated/dataset/train/cache-5c6af673ecec18e7.arrow


initialize ASRLocalUpdate
Generating client training set for client  1 ...
load model


Loading cached processed dataset at /home/FedASR/dacs/federated/dataset/train/cache-fb985c2cd39883d3.arrow
Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `Data2VecAudioForCTC.forward` and have been ignored: array, path, text. If array, path, text are not expected by `Data2VecAudioForCTC.forward`,  you can safely ignore this message.


0  ready to train!


Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `Data2VecAudioForCTC.forward` and have been ignored: array, path, text. If array, path, text are not expected by `Data2VecAudioForCTC.forward`,  you can safely ignore this message.


1  ready to train!


***** Running training *****
  Num examples = 419
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 105
***** Running training *****
  Num examples = 543
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 136
100%|██████████| 105/105 [02:22<00:00,  1.07it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 105/105 [02:22<00:00,  1.36s/it]
Saving model checkpoint to ./save/data2vec-audio-large-960h_new2_recall_FL_client1_round0/final
Configuration saved in ./save/data2vec-audio-large-960h_new2_recall_FL_client1_round0/final/config.json


{'train_runtime': 142.7757, 'train_samples_per_second': 2.935, 'train_steps_per_second': 0.735, 'train_loss': 20.432831101190477, 'epoch': 1.0}


 81%|████████  | 110/136 [02:22<00:22,  1.17it/s]Model weights saved in ./save/data2vec-audio-large-960h_new2_recall_FL_client1_round0/final/pytorch_model.bin
Feature extractor saved in ./save/data2vec-audio-large-960h_new2_recall_FL_client1_round0/final/preprocessor_config.json
 82%|████████▏ | 111/136 [02:22<00:20,  1.24it/s]loading configuration file https://huggingface.co/facebook/data2vec-audio-large-960h/resolve/main/config.json from cache at /home/FedASR/.cache/huggingface/transformers/a5e291023d6dd7ec0034390cee6d97f07e340fb24c68c7b5f3ec8d017a6fd29d.ed9b9e83fb80348aa91a073138fc7a0f44e669fc412c9c4bc98857f45bfd4330
Model config Data2VecAudioConfig {
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Data2VecAudioForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "co

PID 3115950 Getting  Done


100%|██████████| 136/136 [02:38<00:00,  1.23it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 136/136 [02:38<00:00,  1.17s/it]
Saving model checkpoint to ./save/data2vec-audio-large-960h_new2_recall_FL_client0_round0/final
Configuration saved in ./save/data2vec-audio-large-960h_new2_recall_FL_client0_round0/final/config.json


{'train_runtime': 158.4837, 'train_samples_per_second': 3.426, 'train_steps_per_second': 0.858, 'train_loss': 51.411369772518384, 'epoch': 1.0}


Model weights saved in ./save/data2vec-audio-large-960h_new2_recall_FL_client0_round0/final/pytorch_model.bin
Feature extractor saved in ./save/data2vec-audio-large-960h_new2_recall_FL_client0_round0/final/preprocessor_config.json
loading configuration file https://huggingface.co/facebook/data2vec-audio-large-960h/resolve/main/config.json from cache at /home/FedASR/.cache/huggingface/transformers/a5e291023d6dd7ec0034390cee6d97f07e340fb24c68c7b5f3ec8d017a6fd29d.ed9b9e83fb80348aa91a073138fc7a0f44e669fc412c9c4bc98857f45bfd4330
Model config Data2VecAudioConfig {
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Data2VecAudioForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ]

PID 3115949 Getting  Done
local weights:  [OrderedDict([('weight', tensor([[-1.0249e-02,  2.9718e-02,  2.0970e-02,  ..., -3.0150e-02,
          2.3154e-02,  2.9893e-03],
        [ 1.7420e-02,  1.5693e-02,  3.6381e-02,  ...,  4.0969e-02,
         -1.9403e-03,  5.6305e-02],
        [-1.0268e-02, -4.6560e-02, -9.3584e-03,  ...,  4.6585e-03,
         -5.3232e-03, -8.8113e-04],
        ...,
        [ 3.0720e-03, -5.8826e-03, -1.4689e-02,  ...,  2.6226e-02,
         -1.4056e-02,  4.5269e-05],
        [-2.6933e-02, -2.4981e-02,  2.8530e-02,  ...,  6.8621e-03,
          2.6581e-02,  1.4394e-02],
        [-2.8212e-02,  1.9695e-02,  3.6847e-02,  ...,  2.2175e-02,
         -5.9058e-03,  1.0088e-02]])), ('bias', tensor([ 0.0009, -0.0022,  0.0030,  ...,  0.0086, -0.0057,  0.0071]))]), OrderedDict([('weight', tensor([[-0.0110,  0.0302,  0.0225,  ..., -0.0308,  0.0234,  0.0027],
        [ 0.0181,  0.0147,  0.0369,  ...,  0.0426, -0.0014,  0.0568],
        [-0.0089, -0.0472, -0.0091,  ...,  0.0045, -0

All model checkpoint weights were used when initializing Data2VecAudioForCTC.

All the weights of Data2VecAudioForCTC were initialized from the model checkpoint at ./save/data2vec-audio-large-960h_new2_recall_FL_client0_round0/final.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Data2VecAudioForCTC for predictions without further training.
2023-04-15 17:40:14.235971: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-15 17:40:14.235971: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable th

load from  /mnt/Internal/FedASR/weitung/HuggingFace/Pretrain/saves/data2vec-audio-large-960h_new1_recall/final/
load from  /mnt/Internal/FedASR/weitung/HuggingFace/Pretrain/saves/data2vec-audio-large-960h_new1_recall/final/
model loaded
model loaded
initialize ASRLocalUpdate
Generating client training set for client  1 ...
load model


Loading cached processed dataset at /home/FedASR/dacs/federated/dataset/train/cache-fb985c2cd39883d3.arrow


initialize ASRLocalUpdate
Generating client training set for client  0 ...
load model


Loading cached processed dataset at /home/FedASR/dacs/federated/dataset/train/cache-5c6af673ecec18e7.arrow
Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `Data2VecAudioForCTC.forward` and have been ignored: array, text, path. If array, text, path are not expected by `Data2VecAudioForCTC.forward`,  you can safely ignore this message.


1  ready to train!


Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `Data2VecAudioForCTC.forward` and have been ignored: text, path, array. If text, path, array are not expected by `Data2VecAudioForCTC.forward`,  you can safely ignore this message.


0  ready to train!


***** Running training *****
  Num examples = 419
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 105
***** Running training *****
  Num examples = 543
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 136
100%|██████████| 105/105 [02:35<00:00,  1.50s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 105/105 [02:35<00:00,  1.48s/it]
Saving model checkpoint to ./save/data2vec-audio-large-960h_new2_recall_FL_client1_round1/final
Configuration saved in ./save/data2vec-audio-large-960h_new2_recall_FL_client1_round1/final/config.json


{'train_runtime': 155.0405, 'train_samples_per_second': 2.703, 'train_steps_per_second': 0.677, 'train_loss': 440.5902157738095, 'epoch': 1.0}


 87%|████████▋ | 118/136 [02:48<00:25,  1.40s/it]Model weights saved in ./save/data2vec-audio-large-960h_new2_recall_FL_client1_round1/final/pytorch_model.bin
Feature extractor saved in ./save/data2vec-audio-large-960h_new2_recall_FL_client1_round1/final/preprocessor_config.json
loading configuration file https://huggingface.co/facebook/data2vec-audio-large-960h/resolve/main/config.json from cache at /home/FedASR/.cache/huggingface/transformers/a5e291023d6dd7ec0034390cee6d97f07e340fb24c68c7b5f3ec8d017a6fd29d.ed9b9e83fb80348aa91a073138fc7a0f44e669fc412c9c4bc98857f45bfd4330
Model config Data2VecAudioConfig {
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Data2VecAudioForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512

PID 3123215 Getting  Done


100%|██████████| 136/136 [03:05<00:00,  1.06s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 136/136 [03:05<00:00,  1.37s/it]
Saving model checkpoint to ./save/data2vec-audio-large-960h_new2_recall_FL_client0_round1/final
Configuration saved in ./save/data2vec-audio-large-960h_new2_recall_FL_client0_round1/final/config.json


{'train_runtime': 185.774, 'train_samples_per_second': 2.923, 'train_steps_per_second': 0.732, 'train_loss': 378.6973517922794, 'epoch': 1.0}


Model weights saved in ./save/data2vec-audio-large-960h_new2_recall_FL_client0_round1/final/pytorch_model.bin
Feature extractor saved in ./save/data2vec-audio-large-960h_new2_recall_FL_client0_round1/final/preprocessor_config.json
loading configuration file https://huggingface.co/facebook/data2vec-audio-large-960h/resolve/main/config.json from cache at /home/FedASR/.cache/huggingface/transformers/a5e291023d6dd7ec0034390cee6d97f07e340fb24c68c7b5f3ec8d017a6fd29d.ed9b9e83fb80348aa91a073138fc7a0f44e669fc412c9c4bc98857f45bfd4330
Model config Data2VecAudioConfig {
  "activation_dropout": 0.1,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Data2VecAudioForCTC"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ]

PID 3123214 Getting  Done
local weights:  [OrderedDict([('weight', tensor([[-0.0101,  0.0304,  0.0224,  ..., -0.0317,  0.0236,  0.0016],
        [ 0.0186,  0.0189,  0.0364,  ...,  0.0410, -0.0014,  0.0538],
        [-0.0100, -0.0479, -0.0108,  ...,  0.0059, -0.0054, -0.0010],
        ...,
        [ 0.0041, -0.0082, -0.0170,  ...,  0.0289, -0.0169,  0.0021],
        [-0.0290, -0.0239,  0.0330,  ...,  0.0013,  0.0308,  0.0093],
        [-0.0287,  0.0206,  0.0351,  ...,  0.0232, -0.0080,  0.0120]])), ('bias', tensor([ 0.0005, -0.0032,  0.0046,  ...,  0.0112, -0.0101,  0.0085]))]), OrderedDict([('weight', tensor([[-0.0079,  0.0306,  0.0205,  ..., -0.0311,  0.0234,  0.0026],
        [ 0.0187,  0.0223,  0.0359,  ...,  0.0393, -0.0015,  0.0530],
        [-0.0105, -0.0478, -0.0117,  ...,  0.0051, -0.0041, -0.0011],
        ...,
        [ 0.0021, -0.0081, -0.0190,  ...,  0.0318, -0.0178,  0.0043],
        [-0.0302, -0.0224,  0.0354,  ..., -0.0007,  0.0335,  0.0069],
        [-0.0291,  0.0202,  

All model checkpoint weights were used when initializing Data2VecAudioForCTC.

All the weights of Data2VecAudioForCTC were initialized from the model checkpoint at ./save/data2vec-audio-large-960h_new2_recall_FL_client0_round1/final.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Data2VecAudioForCTC for predictions without further training.
