In [1]:
from __future__ import absolute_import, division, print_function
import nest_asyncio

nest_asyncio.apply()

import time
import numpy as np
import tensorflow as tf
import tensorflow_federated as tff
import pandas as pd
import matplotlib.pyplot as plt

from user import User
from average import Average
from tensorflow import keras
from sklearn.model_selection import train_test_split

import collections
import warnings

from six.moves import range
import six

SEED = 0

# import os
# os.environ['PYTHONHASHSEED']=str(SEED)
# np.random.seed(SEED)
# import random
# random.seed(SEED)
# tf.set_random_seed(SEED)
# could need to force keras to not use parallelism, see documentation

%load_ext autoreload
%autoreload 2
%matplotlib inline

#@test {"skip": true}

# NOTE: If you are running a Jupyter notebook, and installing a locally built
# pip package, you may need to edit the following to point to the '.whl' file
# on your local filesystem.

# NOTE: The high-performance executor components used in this tutorial are not
# yet included in the released pip package; you may need to compile from source.

# NOTE: Jupyter requires a patch to asyncio.


warnings.simplefilter('ignore')

tf.compat.v1.enable_v2_behavior()

np.random.seed(0)

# NOTE: If the statement below fails, it means that you are
# using an older version of TFF without the high-performance
# executor stack. Call `tff.framework.set_default_executor()`
# instead to use the default reference runtime.
if six.PY3:
    tff.framework.set_default_executor(tff.framework.create_local_executor())

tff.federated_computation(lambda: 'Hello, World!')()

b'Hello, World!'

In [2]:
def read_file(file):
    """Load a CSV file as an all-float32 DataFrame with missing values set to 0.

    Args:
        file: path or file-like object accepted by ``pd.read_csv``.

    Returns:
        A 2-D DataFrame in which every "?" cell and every NaN has been
        replaced by 0 and every column has dtype float32.
    """
    raw = pd.read_csv(file)

    # "?" marks a missing value in the source data; turn it into NaN first so
    # the whole frame can be cast to float32.
    numeric = raw.replace(["?"], np.nan).astype(np.float32)

    # Impute: every missing value becomes 0.
    return numeric.fillna(0)

def shuffle_df(df, seed = None):
    """Return a row-shuffled copy of ``df`` with a fresh 0..n-1 index.

    Args:
        df: DataFrame to shuffle; it is not modified.
        seed: seed for the permutation. When None, the notebook-wide ``SEED``
            constant is used, which keeps the previous default behaviour.

    Returns:
        A new DataFrame holding the rows of ``df`` in permuted order.
    """
    # Honour the caller's seed; previously the argument was silently ignored
    # and SEED was always used.
    rng = np.random.RandomState(seed=SEED if seed is None else seed)
    shuffled = df.take(rng.permutation(df.shape[0]))
    return shuffled.reset_index(drop=True)

def acquire_user_data(df, for_user = None, seed = None):
    """Split a frame into feature columns and the "Class" label column.

    Args:
        df: DataFrame containing a "Class" column and, when ``for_user`` is
            given, a "User" column.
        for_user: if not None, only rows whose "User" value equals this are kept.
        seed: unused; kept so existing callers keep working.

    Returns:
        ``(features, target)``: ``features`` is the selected rows without the
        "Class" and "User" columns, and ``target`` is the "Class" column of
        those rows. If no rows are selected, two empty DataFrames are returned.
    """
    if for_user is not None:
        df = df[df["User"] == for_user]
    if df.shape[0] == 0:
        # Nothing to return for this selection: hand back two independent
        # empty frames so callers can test `.shape[0] == 0` on either.
        return pd.DataFrame(), pd.DataFrame()
    target = df["Class"]

    # Drop the label and user-identifier columns by name rather than by
    # position, so the result does not depend on the column order of the frame.
    features = df.drop(columns=["Class", "User"], errors="ignore")
    return features, target





def create_hdf5(df,name, seed=None):
    """Write one HDF5 group per user and return how many users were written.

    The file is laid out as:

        examples
            <user id>
                points   (feature values of that user's rows)
                label    ("Class" values of that user's rows)

    Args:
        df: DataFrame with "User" and "Class" columns plus feature columns.
        name: path of the HDF5 file to create; an existing file is overwritten.
        seed: forwarded to ``acquire_user_data``.

    Returns:
        The number of user groups created in the file.
    """
    n = 0
    with h5py.File(name, "w") as f:
        examples = f.create_group("examples")
        for user_id in df["User"].unique():
            user_df, target = acquire_user_data(df = df, for_user=user_id, seed = seed)
            if user_df.shape[0]==0:
                print(f"User {user_id} has no data, no instance created...")
                continue
            # Create the group only once we know the user has rows, so a
            # skipped user never leaves an empty group behind in the file.
            grp = examples.create_group(str(user_id))
            grp.create_dataset('points',data=user_df.values)
            grp.create_dataset('label',data=target.values)
            n+=1
    return n

In [3]:
import numpy as np
import h5py
from tensorflow_federated.python.simulation import hdf5_client_data
# https://github.com/tensorflow/federated/blob/master/tensorflow_federated/python/simulation/hdf5_client_data_test.py
# https://github.com/tensorflow/federated/blob/v0.11.0/tensorflow_federated/python/simulation/hdf5_client_data.py
# http://docs.h5py.org/en/stable/high/group.html#Group.create_dataset
# https://stackoverflow.com/questions/55434004/create-a-custom-federated-data-set-in-tensorflow-federated
# https://stackoverflow.com/questions/58965488/how-to-create-federated-dataset-from-a-csv-file

file = "dataset.hdf5"

# Load the raw CSV and write one HDF5 group per user; NUM_CLIENTS is the number
# of user groups that were actually written.
df = read_file("../dataset/allUsers.lcl.csv")
NUM_CLIENTS = create_hdf5(df,file,0)

# To inspect the written layout, open `file` read-only with h5py.File and call
# `.visit(print)` on it.

'\ndef printname(name):\n    print(name)\nwith h5py.File(file,"r") as f:\n    pass\n    f.visit(printname)\n'

In [4]:
# Wrap the HDF5 file as TFF client data; this stands in for the tutorial's
# `tff.simulation.datasets.emnist.load_data()`.
myclient = hdf5_client_data.HDF5ClientData(file)
train = myclient
example_dataset = train.create_tf_dataset_for_client(
    train.client_ids[0])

# Use the built-in next(): the `.next()` method is a Python 2 leftover and is
# not part of the Python 3 iterator protocol.
example_element = next(iter(example_dataset))

print(example_element['points'].numpy())

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [5]:
# NUM_CLIENTS is not set here: it is assigned from create_hdf5()'s return value.
NUM_EPOCHS = 10  # passed to dataset.repeat() in preprocess()
BATCH_SIZE = 20  # passed to dataset.batch() in preprocess()
SHUFFLE_BUFFER = 500  # passed to dataset.shuffle() in preprocess()

def preprocess(dataset):
    """Turn a client's raw dataset into repeated, shuffled (x, y) batches.

    Each element's 'points' entry is flattened into 'x' and its 'label' entry
    is reshaped to a length-1 tensor 'y'. The dataset is repeated NUM_EPOCHS
    times, mapped, shuffled with a SHUFFLE_BUFFER-sized buffer and finally
    batched into groups of BATCH_SIZE.
    """

    def to_xy(record):
        features = tf.reshape(record['points'], [-1])
        target = tf.reshape(record['label'], [1])
        return collections.OrderedDict([('x', features), ('y', target)])

    repeated = dataset.repeat(NUM_EPOCHS)
    converted = repeated.map(to_xy)
    shuffled = converted.shuffle(SHUFFLE_BUFFER)
    return shuffled.batch(BATCH_SIZE)

In [6]:
# Quick check of the concrete class behind `train`.
type(train)

tensorflow_federated.python.simulation.hdf5_client_data.HDF5ClientData

In [7]:
preprocessed_example_dataset = preprocess(example_dataset)

# Pull the first batch and convert every tensor in it to a NumPy array.
# The built-in next() replaces the Python 2 style `.next()` method call.
sample_batch = tf.nest.map_structure(
    lambda x: x.numpy(), next(iter(preprocessed_example_dataset)))

sample_batch

OrderedDict([('x',
              array([[ 54.67766  ,  72.19174  , -63.592896 ,  86.29736  ,  69.0259   ,
                      -71.57185  ,  77.674194 ,  43.537884 , -71.30554  ,  61.061195 ,
                       11.303073 , -69.65476  ,  35.26109  ,  81.730225 , -54.04047  ,
                        0.       ,   0.       ,   0.       ,   0.       ,   0.       ,
                        0.       ,   0.       ,   0.       ,   0.       ,   0.       ,
                        0.       ,   0.       ,   0.       ,   0.       ,   0.       ,
                        0.       ,   0.       ,   0.       ,   0.       ,   0.       ,
                        0.       ],
                     [ 37.39168  ,  81.85215  , -51.971466 ,  62.47534  ,  11.078077 ,
                      -68.00474  ,  77.56011  ,  42.85021  , -71.73097  ,  86.40394  ,
                       68.36682  , -71.948814 ,  55.353733 ,  71.76356  , -63.652836 ,
                        0.       ,   0.       ,   0.       ,   0.       ,  

In [8]:
def make_federated_data(client_data, client_ids):
    """Build one preprocessed dataset per client id, in the order given.

    Args:
        client_data: object exposing ``create_tf_dataset_for_client(client_id)``.
        client_ids: iterable of client ids to build datasets for.

    Returns:
        A list with ``preprocess(...)`` applied to each client's dataset.
    """
    datasets = []
    for client_id in client_ids:
        raw = client_data.create_tf_dataset_for_client(client_id)
        datasets.append(preprocess(raw))
    return datasets

# Take the first NUM_CLIENTS client ids and build their training datasets.
sample_clients = train.client_ids[:NUM_CLIENTS]

federated_train_data = make_federated_data(
    client_data=train, client_ids=sample_clients)

# Show how many client datasets were built and what the first one looks like.
(len(federated_train_data), federated_train_data[0])

(14,
 <BatchDataset shapes: OrderedDict([(x, (None, 36)), (y, (None, 1))]), types: OrderedDict([(x, tf.float32), (y, tf.float32)])>)

In [9]:
def create_compiled_keras_model():
    """Build and compile a single-layer softmax classifier with 10 outputs.

    The only layer is a zero-initialised Dense layer with softmax activation.
    The model is compiled with sparse categorical cross-entropy loss, SGD at a
    learning rate of 0.02, and sparse categorical accuracy as its metric.
    """
    softmax_layer = tf.keras.layers.Dense(
        10, activation=tf.nn.softmax, kernel_initializer='zeros')
    classifier = tf.keras.models.Sequential([softmax_layer])

    classifier.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.SGD(learning_rate=0.02),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
    return classifier
def init_model(init_seed=None):
    """Build and compile the classifier used for federated training.

    Architecture: Flatten -> Dense(32, relu) -> Dense(6, softmax). Both Dense
    layers use a Glorot-uniform kernel initializer seeded with ``init_seed``.
    The model is compiled with the 'adam' optimizer, sparse categorical
    cross-entropy loss and sparse categorical accuracy as its metric.

    Args:
        init_seed: seed passed to both kernel initializers (None = unseeded).

    Returns:
        The compiled Keras model.
    """
    flatten = keras.layers.Flatten()
    hidden = keras.layers.Dense(
        32,
        activation='relu',
        kernel_initializer=keras.initializers.glorot_uniform(seed=init_seed))
    output = keras.layers.Dense(
        6,
        activation='softmax',
        kernel_initializer=keras.initializers.glorot_uniform(seed=init_seed))

    network = keras.Sequential([flatten, hidden, output])
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    return network


def model_fn():
    """Return a TFF model wrapping a freshly built, compiled Keras model.

    ``sample_batch`` is passed alongside the model as the example batch.
    """
    return tff.learning.from_compiled_keras_model(init_model(), sample_batch)

In [10]:
# Build the federated averaging process around model_fn.
iterative_process = tff.learning.build_federated_averaging_process(model_fn)

# Show the type signature of the initialisation computation.
print(iterative_process.initialize.type_signature)

# Create the initial server state.
state = iterative_process.initialize()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


( -> <model=<trainable=<sequential/dense/kernel=float32[36,32],sequential/dense/bias=float32[32],sequential/dense_1/kernel=float32[32,6],sequential/dense_1/bias=float32[6]>,non_trainable=<>>,optimizer_state=<int64>,delta_aggregate_state=<>,model_broadcast_state=<>>@SERVER)


In [11]:
# Exclusive upper bound of the round counter: rounds 1 .. NUM_ROUNDS - 1 run.
NUM_ROUNDS = 11
for round_num in range(1, NUM_ROUNDS):
    # Each call runs one round over all client datasets and returns the
    # updated state together with that round's metrics.
    state, metrics = iterative_process.next(state, federated_train_data)
    print('round {:2d}, metrics={}'.format(round_num, metrics))

round  1, metrics=<sparse_categorical_accuracy=0.9383515119552612,loss=0.5496358871459961>
round  2, metrics=<sparse_categorical_accuracy=0.9629532694816589,loss=0.18527714908123016>
round  3, metrics=<sparse_categorical_accuracy=0.9692147970199585,loss=0.13197511434555054>
round  4, metrics=<sparse_categorical_accuracy=0.9709460139274597,loss=0.11831852793693542>
round  5, metrics=<sparse_categorical_accuracy=0.9749155044555664,loss=0.10432572662830353>
round  6, metrics=<sparse_categorical_accuracy=0.9751062989234924,loss=0.10409539192914963>
round  7, metrics=<sparse_categorical_accuracy=0.9757708311080933,loss=0.09923163801431656>
round  8, metrics=<sparse_categorical_accuracy=0.975578784942627,loss=0.09917626529932022>
round  9, metrics=<sparse_categorical_accuracy=0.9780449867248535,loss=0.09046626836061478>
round 10, metrics=<sparse_categorical_accuracy=0.9793152213096619,loss=0.0887361615896225>
