In [1]:
import io
import pickle
import requests
import json

import numpy as np
import pandas as pd

from minio import Minio
from mlflow import MlflowClient



## Preprocessing

In [11]:
source_data_df = pd.read_csv('data/Fraud_Detection.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/Fraud_Detection.csv'

In [None]:
def formatting(
    source_df: any
) -> any:
    print('Formatting data')
    formated_df = source_df.copy()
    
    irrelevant_columns = [
        'oldbalanceOrg',
        'newbalanceOrig',
        'oldbalanceDest',
        'newbalanceDest'
    ]
    formated_df.drop(
        columns = irrelevant_columns, 
        inplace = True
    )
    print('Columns dropped')
    formated_df = pd.get_dummies(
        data = formated_df, 
        columns = ['type']
    )
    
    for column in formated_df.columns:
        if 'type' in column:
            formated_df[column] = formated_df[column].astype(int)
    print('One hot coded type')

    unique_values_orig = formated_df['nameOrig'].unique()
    unique_values_dest = formated_df['nameDest'].unique()
    
    unique_value_list_orig = unique_values_orig.tolist()
    unique_value_list_dest = unique_values_dest.tolist()

    print('Orig amount:', len(unique_value_list_orig))
    print('Dest amount:', len(unique_value_list_dest))
    
    set_orig_ids = set(unique_value_list_orig)
    set_dest_ids = set(unique_value_list_dest)
    intersection = set_dest_ids.intersection(set_orig_ids)

    print('Orig and Dest duplicates', len(intersection))
    
    set_dest_ids.difference_update(intersection)
    fixed_unique_value_list_dest = list(set_dest_ids)
    print('Fixed Dest amount:',len(fixed_unique_value_list_dest))
    
    orig_encoding_dict = {}
    index = 1
    for string in unique_value_list_orig:
        if not string in orig_encoding_dict:
            orig_encoding_dict[string] = index
            index = index + 1

    dest_encoding_dict = {}
    cont_index = len(orig_encoding_dict) + 1
    for string in fixed_unique_value_list_dest:
        if not string in dest_encoding_dict:
            dest_encoding_dict[string] = cont_index
            cont_index = cont_index + 1
    print('Orig dict amount:', len(orig_encoding_dict))
    print('Dest dict amount:', len(dest_encoding_dict))
    
    print('Orig and dest string-integer encodings created')

    string_orig_values = formated_df['nameOrig'].tolist()
    string_dest_values = formated_df['nameDest'].tolist()

    orig_encoded_values = []
    for string in string_orig_values:
        orig_encoded_values.append(orig_encoding_dict[string])

    dest_encoded_values = []
    for string in string_dest_values:
        if not string in dest_encoding_dict:
            dest_encoded_values.append(orig_encoding_dict[string])
            continue
        dest_encoded_values.append(dest_encoding_dict[string])

    formated_df['nameOrig'] = orig_encoded_values
    formated_df['nameDest'] = dest_encoded_values

    print('Orig encoded values amount:', len(orig_encoded_values))
    print('Dest encoded values amount:', len(dest_encoded_values))
    
    print('Orig and dest encodings set')

    formated_df['amount'] = formated_df['amount'].round(0).astype(int)
    print('Amount rounded')

    column_order = [
        'step',
        'amount',
        'nameOrig',
        'nameDest',
        'type_CASH_IN',
        'type_CASH_OUT',
        'type_DEBIT',
        'type_PAYMENT',
        'type_TRANSFER',
        'isFraud',
        'isFlaggedFraud'
    ]
    formated_df = formated_df[column_order]
    print('Columns reordered')
    print('Dataframe shape:', formated_df.shape)
    print('Formatting done')
    return formated_df

In [None]:
formated_data_df = formatting(
    source_df = source_data_df
)

In [None]:
formated_data_df.to_csv('data/Formated_Fraud_Detection_Data.csv', index = False)

## Training Context

In [51]:
formated_data_df = pd.read_csv('../data/Formated_Fraud_Detection_Data.csv')

In [132]:
experiment = {
    'name': 'federated-learning',
    'tags': {}
}

In [133]:
parameters = {
    'model':{
        'seed': 42,
        'used-columns': [
            'amount',
            'type_CASH_IN',
            'type_CASH_OUT',
            'type_DEBIT',
            'type_PAYMENT',
            'type_TRANSFER',
            'isFraud'
        ],
        'input-size': 6,
        'target-column': 'isFraud',
        'scaled-columns': [
            'amount'
        ],
        'learning-rate': 0.15,
        'sample-rate': 0.10,
        'optimizer':'SGD',
        'epochs': 10
    },
    'central':{
        'sample-pool': 50000,
        'data-augmentation': {
            'active': True,
            'sample-pool': 200000,
            '1-0-ratio': 0.2
        },
        'eval-ratio': 0.5,
        'train-ratio': 0.8,
        'min-update-amount': 5,
        'max-cycles':5,
        'min-metric-success': 9,
        'metric-thresholds': {
            'true-positives': 1000,
            'false-positives': 1000,
            'true-negatives': 1000, 
            'false-negatives': 1000,
            'recall': 0.20,
            'selectivity': 0.50,
            'precision': 0.60,
            'miss-rate': 0.50,
            'fall-out': 0.25,
            'balanced-accuracy': 0.60,
            'accuracy': 0.60
        },
        'metric-conditions': {
            'true-positives': '>=',
            'false-positives': '<=',
            'true-negatives': '>=', 
            'false-negatives': '<=',
            'recall': '>=',
            'selectivity': '>=',
            'precision': '>=',
            'miss-rate': '<=',
            'fall-out': '<=',
            'balanced-accuracy': '>=',
            'accuracy': '>='
        }
    },
    'worker':{
        'sample-pool': 50000,
        'data-augmentation': {
            'active': True,
            'sample-pool': 200000,
            '1-0-ratio': 0.2
        },
        'eval-ratio': 0.5,
        'train-ratio': 0.8
    }
}

In [134]:
data = formated_data_df.iloc[:100000].values.tolist()
columns = formated_data_df.columns.tolist()

## Starting Training

In [135]:
context = {
    'experiment': experiment,
    'parameters': parameters,
    'data': data,
    'columns': columns
}

payload = json.dumps(context)

In [16]:
len(payload)

4933793

In [136]:
response = requests.post(
    url = 'http://127.0.0.1:7500/start',
    json = payload
)

print(response.status_code)

200


## Inference

In [11]:
def central_worker_inference(
    address: str,
    experiment_name: str,
    experiment: str,
    cycle: str,
    data_df: any,
    relevant_columns: list,
    rows: int
):
    sample_df = data_df.iloc[:rows,:]
    relevant_df = sample_df[relevant_columns]
    input_df = relevant_df.iloc[:rows,:-2]
    mean = input_df['amount'].mean()
    std_dev = input_df['amount'].std()
    input_df['amount'] = (input_df['amount'] - mean)/std_dev

    payload = {
        'experiment-name': experiment_name,
        'experiment': experiment,
        'cycle': cycle,
        'input': input_df.values.tolist()
    }
    payload = json.dumps(payload)
    central_address = address + '/predict' 
    response = requests.post(
        url = central_address,
        json = payload
    )

    text_output = json.loads(response.text)
    sample_df['pred'] = np.array(text_output['predictions']).astype(int)
    return sample_df

In [25]:
inference_df = central_worker_inference(
    address = 'http://127.0.0.1:7501',
    experiment_name = 'central-federated-learning-test',
    experiment = '2',
    cycle = '4',
    data_df = formated_data_df,
    relevant_columns = [
        'amount',
        'type_CASH_IN',
        'type_CASH_OUT',
        'type_DEBIT',
        'type_PAYMENT',
        'type_TRANSFER',
        'isFraud',
        'isFlaggedFraud'
    ],
    rows = 50
)
inference_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df['pred'] = np.array(text_output['predictions']).astype(int)


Unnamed: 0,step,amount,nameOrig,nameDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,isFraud,isFlaggedFraud,pred
0,1,9840,1,7233461,0,0,0,1,0,0,0,0
1,1,1864,2,7735206,0,0,0,1,0,0,0,0
2,1,181,3,8598945,0,0,0,0,1,1,0,1
3,1,181,4,7880837,0,1,0,0,0,1,0,0
4,1,11668,5,7670940,0,0,0,1,0,0,0,0
5,1,7818,6,6477257,0,0,0,1,0,0,0,0
6,1,7108,7,8194799,0,0,0,1,0,0,0,0
7,1,7862,8,8738506,0,0,0,1,0,0,0,0
8,1,4024,9,6735336,0,0,0,1,0,0,0,0
9,1,5338,10,6427877,0,0,1,0,0,0,0,0


## MinIO Interaction

In [2]:
minio_client = Minio(
    endpoint = "127.0.0.1:9000", 
    access_key = '23034opsdjhksd', 
    secret_key = 'sdkl3slömdm',
    secure = False
)

In [3]:
def create_bucket(
    minio_client: any,
    bucket_name: str
) -> bool:
    MINIO_CLIENT = minio_client 
    try:
        MINIO_CLIENT.make_bucket(
            bucket_name = bucket_name
        )
        return True
    except Exception as e:
        print(e)
        return False
    
def check_bucket(
    minio_client: any,
    bucket_name:str
) -> bool:
    MINIO_CLIENT = minio_client
    try:
        status = MINIO_CLIENT.bucket_exists(bucket_name = bucket_name)
        return status
    except Exception as e:
        print(e)
        return False 
       
def delete_bucket(
    minio_client: any,
    bucket_name:str
) -> bool:
    MINIO_CLIENT = minio_client
    try:
        MINIO_CLIENT.remove_bucket(
            bucket_name = bucket_name
        )
        return True
    except Exception as e:
        print(e)
        return False
# Works
def create_object(
    minio_client: any,
    bucket_name: str, 
    object_path: str, 
    data: any,
    metadata: dict
) -> bool: 
    # Be aware that MinIO objects have a size limit of 1GB, 
    # which might result to large header error
    MINIO_CLIENT = minio_client
    
    pickled_data = pickle.dumps(data)
    length = len(pickled_data)
    buffer = io.BytesIO()
    buffer.write(pickled_data)
    buffer.seek(0)
    try:
        MINIO_CLIENT.put_object(
            bucket_name = bucket_name,
            object_name = object_path + '.pkl',
            data = buffer,
            length = length,
            metadata = metadata
        )
        return True
    except Exception as e:
        print(e)
        return False
# Works
def check_object(
    minio_client: any,
    bucket_name: str, 
    object_path: str
) -> bool: 
    MINIO_CLIENT = minio_client
    try:
        object_info = MINIO_CLIENT.stat_object(
            bucket_name = bucket_name,
            object_name = object_path + '.pkl'
        )      
        return True
    except Exception as e:
        return False 
# Works
def delete_object(
    minio_client: any,
    bucket_name: str, 
    object_path: str
) -> bool: 
    MINIO_CLIENT = minio_client
    try:
        MINIO_CLIENT.remove_object(
            bucket_name = bucket_name, 
            object_name = object_path + '.pkl'
        )
        return True
    except Exception as e:
        print(e)
        return False
# Works
def update_object(
    minio_client: any,
    bucket_name: str, 
    object_path: str, 
    data: any,
    metadata: dict
) -> bool:  
    remove = delete_object(minio_client,bucket_name, object_path)
    if remove:
        create = create_object(minio_client, bucket_name, object_path, data, metadata)
        if create:
            return True
    return False
# works
def create_or_update_object(
    minio_client: any,
    bucket_name: str, 
    object_path: str, 
    data: any, 
    metadata: dict
) -> any:
    bucket_status = check_bucket(minio_client,bucket_name)
    if not bucket_status:
        creation_status = create_bucket(minio_client,bucket_name)
        if not creation_status:
            return None
    object_status = check_object(minio_client,bucket_name, object_path)
    if not object_status:
        return create_object(minio_client,bucket_name, object_path, data, metadata)
    else:
        return update_object(minio_client,bucket_name, object_path, data, metadata)

def get_object_data_and_metadata(
    minio_client: any,
    bucket_name: str, 
    object_path: str
) -> dict:
    MINIO_CLIENT = minio_client
    
    try:
        given_object_info = MINIO_CLIENT.stat_object(
            bucket_name = bucket_name, 
            object_name = object_path + '.pkl'
        )
        # There seems to be some kind of a limit
        # with the amount of request a client 
        # can make, which is why this variable
        # is set here to give more time got the client
        # to complete the request
        given_metadata = given_object_info.metadata
        
        given_object_data = MINIO_CLIENT.get_object(
            bucket_name = bucket_name, 
            object_name = object_path + '.pkl'
        )
        given_pickled_data = given_object_data.data
        
        try:
            given_data = pickle.loads(given_pickled_data)
            relevant_metadata = {} 
            for key, value in given_metadata.items():
                if 'x-amz-meta' in key:
                    key_name = key[11:]
                    relevant_metadata[key_name] = value
            return {'data': given_data, 'metadata': relevant_metadata}
        except Exception as e:
            print('MinIO object pickle decoding error')
            print(e)
            return None 
    except Exception as e:
        print('MinIO object fetching error')
        print(e)
        return None
# Works
def get_object_list(
    minio_client: any,
    bucket_name: str,
    path_prefix: str
) -> dict:
    MINIO_CLIENT = minio_client
    try:
        objects = MINIO_CLIENT.list_objects(bucket_name = bucket_name, prefix = path_prefix, recursive = True)
        object_dict = {}
        for obj in objects:
            object_name = obj.object_name
            object_info = MINIO_CLIENT.stat_object(
                bucket_name = bucket_name,
                object_name = object_name
            )
            given_metadata = {} 
            for key, value in object_info.metadata.items():
                if 'X-Amz-Meta' in key:
                    key_name = key[11:]
                    given_metadata[key_name] = value
            object_dict[obj.object_name] = given_metadata
        return object_dict
    except Exception as e:
        return None  

In [137]:
minio_object = get_object_data_and_metadata(
    minio_client = minio_client,
    bucket_name = 'central', 
    object_path = 'experiments/status'
)
minio_object

{'data': {'experiment-name': 'federated-learning',
  'experiment': 2,
  'experiment-id': '1',
  'start': True,
  'data-split': True,
  'preprocessed': True,
  'trained': True,
  'worker-split': True,
  'sent': True,
  'updated': True,
  'evaluated': True,
  'complete': True,
  'train-amount': 80000,
  'test-amount': 20000,
  'eval-amount': 100000,
  'collective-amount': 400000,
  'worker-updates': 4,
  'cycle': 6,
  'run-id': 'df8eee217bea4c0695ebd3ebfa54921b'},
 'metadata': {}}

In [138]:
minio_object = get_object_data_and_metadata(
    minio_client = minio_client,
    bucket_name = 'workers', 
    object_path = '1314ab77-8d29-4ed0-87fe-ea42eb8a4816/experiments/status'
)
minio_object

{'data': {'worker-id': '1314ab77-8d29-4ed0-87fe-ea42eb8a4816',
  'network-id': '3',
  'central-address': '172.28.0.8',
  'central-port': '7500',
  'worker-address': '172.28.0.12',
  'worker-port': '7501',
  'experiment-name': 'federated-learning',
  'experiment': 2,
  'experiment-id': '5',
  'stored': True,
  'preprocessed': False,
  'trained': False,
  'updated': False,
  'complete': True,
  'train-amount': 80000,
  'test-amount': 20000,
  'eval-amount': 100000,
  'cycle': 6},
 'metadata': {}}

In [73]:
modified_object = minio_object['data']
modified_object['min-update-amount'] = 1

In [74]:
modified_object

{'sample-pool': 50000,
 'data-augmentation': {'active': True,
  'sample-pool': 100000,
  '1-0-ratio': 0.5},
 'eval-ratio': 0.5,
 'train-ratio': 0.8,
 'min-update-amount': 1,
 'max-cycles': 5,
 'min-metric-success': 6,
 'metric-thresholds': {'true-positives': 1000,
  'false-positives': 100,
  'true-negatives': 1000,
  'false-negatives': 100,
  'recall': 0.4,
  'selectivity': 0.5,
  'precision': 0.6,
  'miss-rate': 0.25,
  'fall-out': 0.25,
  'balanced-accuracy': 0.5,
  'accuracy': 0.6},
 'metric-conditions': {'true-positives': '>=',
  'false-positives': '<=',
  'true-negatives': '>=',
  'false-negatives': '<=',
  'recall': '>=',
  'selectivity': '>=',
  'precision': '>=',
  'miss-rate': '<=',
  'fall-out': '<=',
  'balanced-accuracy': '>=',
  'accuracy': '>='}}

In [75]:
create_or_update_object(
    minio_client = minio_client,
    bucket_name = 'central', 
    object_path = 'experiments/federated-learning/1/parameters/central',
    data = modified_object, 
    metadata = {}
)

True

In [21]:
minio_object = get_object_data_and_metadata(
    minio_client = minio_client,
    bucket_name = 'central', 
    object_path = 'experiments/central-federated-learning-test/1/1/workers'
)
minio_object

{'data': {'8681149e-b5c1-440e-830f-43355110017f': {'worker-id': '8681149e-b5c1-440e-830f-43355110017f',
   'network-id': '1',
   'central-address': '10.30.10.07',
   'central-port': '7500',
   'worker-address': '10.18.0.7',
   'worker-port': '7501',
   'status': 'waiting',
   'experiment-name': 'central-federated-learning-test',
   'experiment': 1,
   'experiment-id': '',
   'stored': False,
   'preprocessed': False,
   'trained': False,
   'updated': False,
   'complete': False,
   'train-amount': 0,
   'test-amount': 0,
   'eval-amount': 0,
   'cycle': 1},
  'c9edb895-9a90-44f0-ac3f-0feda8740a7b': {'worker-id': 'c9edb895-9a90-44f0-ac3f-0feda8740a7b',
   'network-id': '2',
   'central-address': '10.30.10.07',
   'central-port': '7500',
   'worker-address': '10.18.0.10',
   'worker-port': '7501',
   'status': 'waiting',
   'experiment-name': 'central-federated-learning-test',
   'experiment': 1,
   'experiment-id': '2',
   'stored': True,
   'preprocessed': True,
   'trained': False,
 

## MLflow Interactions

In [39]:
import os
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://127.0.0.1:9000"
os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'
mlflow_client = MlflowClient(
    tracking_uri = "http://127.0.0.1:5000"
)

In [43]:
# Refactored and works
def start_experiment(
    mlflow_client: any,
    experiment_name: str,
    experiment_tags: dict
) -> int:
    MLFLOW_CLIENT = mlflow_client
    try:
        experiment_id = MLFLOW_CLIENT.create_experiment(
            name = experiment_name,
            tags = experiment_tags,
            artifact_location="s3://mlflow/mlruns"
        )
        return experiment_id
    except Exception as e:
        print(e)
        return None
# Refactored
def check_experiment(
    mlflow_client: any,
    experiment_name: str
) -> dict:
    MLFLOW_CLIENT = mlflow_client
    try:
        experiment_object = MLFLOW_CLIENT.get_experiment_by_name(
            name = experiment_name
        )
        return experiment_object
    except Exception as e:
        print(e)
        return None
# Refactored
def get_experiments(
    mlflow_client: any,
    experiment_type: int,
    max_amount: int,
    filter: str
) -> dict:
    # Types are ACTIVE_ONLY = 1, DELETED_ONLY = 2 and ALL = 3
    MLFLOW_CLIENT = mlflow_client
    try:
        experiment_objects = MLFLOW_CLIENT.search_experiments(
            view_type = experiment_type,
            max_results = max_amount,
            filter_string = filter
        )
        experiment_dict = {}
        for experiment in experiment_objects:
            experiment_dict[experiment.name] = {
                'id': experiment.experiment_id,
                'stage': experiment.lifecycle_stage,
                'tags': experiment.tags,
                'location': experiment.artifact_location,
                'created': experiment.creation_time,
                'updated': experiment.last_update_time
            }
        return experiment_dict
    except Exception as e:
        print(e)
        return None
# Refactored
def start_run(
    mlflow_client: any,
    experiment_id: str,
    tags: dict,
    name: str
) -> dict:
    MLFLOW_CLIENT = mlflow_client 
    try:
        run_object = MLFLOW_CLIENT.create_run(
            experiment_id = experiment_id,
            tags = tags,
            run_name = name
        )
        run_dict = {
            'e_id': run_object.info.experiment_id,
            'id': run_object.info.run_id,
            'name': run_object.info.run_name,
            'stage': run_object.info.lifecycle_stage,
            'status': run_object.info.status
        }
        return run_dict
    except Exception as e:
        print(e)
        return None
# Refactored
def check_run(
    mlflow_client: any,
    run_id: str
) -> dict:
    MLFLOW_CLIENT = mlflow_client
    try:
        run_object = MLFLOW_CLIENT.get_run(
            run_id = run_id
        )
        run_dict = {
            'e_id': run_object.info.experiment_id,
            'id': run_object.info.run_id,
            'name': run_object.info.run_name,
            'stage': run_object.info.lifecycle_stage,
            'status': run_object.info.status,
            'parameters': run_object.data.params,
            'metrics': run_object.data.metrics,
            'start_time': run_object.info.start_time,
            'end_time': run_object.info.end_time
        }
        return run_dict
    except Exception as e:
        print(e)
        return None
# Refactored
def update_run(
    logger: any,
    mlflow_client: any,
    run_id: str,
    parameters: dict,
    metrics: dict,
    artifacts: dict
) -> bool:
    MLFLOW_CLIENT = mlflow_client 
    try:
        for param_key, param_value in parameters.items():
            MLFLOW_CLIENT.log_param(
                run_id = run_id,
                key = param_key,
                value = param_value
            )
        for metric_key,metric_value in metrics.items():
            MLFLOW_CLIENT.log_metric(
                run_id = run_id,
                key = metric_key,
                value = metric_value
            )
        for path in artifacts:
            MLFLOW_CLIENT.log_artifact(
                run_id = run_id,
                local_path = path
            )
        return True
    except Exception as e:
        print(e)
        return False
# Recatored
def end_run(
    logger: any,
    mlflow_client: any,
    run_id: str,
    status: str
) -> bool:
    # run status are FAILED = 4, FINISHED = 3, KILLED = 5, RUNNING = 1 and SCHEDULED = 2
    MLFLOW_CLIENT = mlflow_client
    try:
        MLFLOW_CLIENT.set_terminated(
            run_id = run_id,
            status = status
        )
        return True
    except Exception as e:
        print(e)
        return False
# Refactored
def get_runs(
    logger: any,
    mlflow_client: any,
    experiment_ids: list,
    filter: str,
    type: int,
    max_amount: int
) -> dict:
    MLFLOW_CLIENT = mlflow_client
    try:
        runs = MLFLOW_CLIENT.search_runs(
            experiment_ids = experiment_ids,
            filter_string = filter,
            run_view_type = type,
            max_results = max_amount
        )
        run_dict = {}
        for run in runs:
            run_dict[run.info.run_id] = {
                'e_id': run.info.experiment_id,
                'id': run.info.run_id,
                'name': run.info.run_name,
                'stage': run.info.lifecycle_stage,
                'status': run.info.status,
                'start_time': run.info.start_time,
                'end_time': run.info.end_time,
                'parameters': run.data.params,
                'metrics': run.data.metrics
            }
        return run_dict
    except Exception as e:
        print(e)
        return None

In [45]:
run_dict = check_run(
    mlflow_client = mlflow_client,
    run_id = 'e139ce671ad44909a8f58175adb2e2b9'
)
print(run_dict['status'])

FINISHED


## Experiment Results

In [8]:
bucket = 'central'
experiment_name = 'federated-learning'
experiment = '1'

experiment_objects = get_object_list(
    minio_client = minio_client,
    bucket_name = bucket,
    path_prefix = 'experiments/central-federated-learning/1'
)

In [40]:
collected_central_objects = {
    'status': None,
    'specifications': None,
    'workers': None,
    'central-pool': None,
    'workers-pool': None,
    'global-models': pd.DataFrame(),
    'local-models': pd.DataFrame(),
    'data': pd.DataFrame(),
    'times': pd.DataFrame(),
    'function': pd.DataFrame(),
    'network': pd.DataFrame(),
    'training': pd.DataFrame(),
    'inference': pd.DataFrame(),
    'metrics': pd.DataFrame(),
    'system': pd.DataFrame(),
    'server': pd.DataFrame()
}

In [60]:
experiments_folder = 'experiments'
experiment_name = 'central-federated-learning'
experiment = '1'

central_object_paths = {
    'status': experiments_folder + '/status',
    'specifications': experiments_folder + '/specifications',
    'workers': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/c/workers',
    'central-pool': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/data/central-pool',
    'workers-pool': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/data/workers-pool',
    'global-models': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/c/global-model',
    'local-models': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/c/local-models',
    'data': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/c/data', # worker-id
    'times': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/times',
    'function': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/c/times/function',
    'network': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/c/times/network',
    'training': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/c/times/training',
    'inference': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/c/times/inference',
    'metrics': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/c/metrics',
    'system': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/c/resources/system',
    'server': experiments_folder + '/' + str(experiment_name) + '/' + str(experiment) + '/c/resources/server'
}

In [64]:
max_cycles = 4
for name in collected_central_objects.keys():
    whole_path = central_object_paths[name]
    path_split = whole_path.split('/')
    
    if 4 < len(path_split):
        if path_split[3] == 'c':
            #print(path_split)
            for cycle in range(1,max_cycles + 1):
                path_split[3] = str(cycle)
                cycle_path = '/'.join(path_split)
                #print(cycle_path)
            continue
    #print(whole_path)
    wanted_object = get_object_data_and_metadata(
        minio_client = minio_client,
        bucket_name = 'central', 
        object_path = whole_path
    )
    wanted_data = wanted_object['data']
    collected_central_objects[path_split[-1]] = wanted_data

['experiments', 'central-federated-learning', '1', 'c', 'workers']
['experiments', 'central-federated-learning', '1', 'c', 'global-model']
['experiments', 'central-federated-learning', '1', 'c', 'local-models']
['experiments', 'central-federated-learning', '1', 'c', 'data']
['experiments', 'central-federated-learning', '1', 'c', 'times', 'function']
['experiments', 'central-federated-learning', '1', 'c', 'times', 'network']
['experiments', 'central-federated-learning', '1', 'c', 'times', 'training']
['experiments', 'central-federated-learning', '1', 'c', 'times', 'inference']
['experiments', 'central-federated-learning', '1', 'c', 'metrics']
['experiments', 'central-federated-learning', '1', 'c', 'resources', 'system']
['experiments', 'central-federated-learning', '1', 'c', 'resources', 'server']


In [65]:
collected_central_objects['central-pool']

[[22345, 24978, 7695769, 0, 0, 0, 1, 0, 0, 0],
 [274, 86866, 8386081, 0, 0, 0, 1, 0, 0, 0],
 [309147, 57776, 7754521, 0, 1, 0, 0, 0, 0, 0],
 [223779, 98845, 6670676, 0, 1, 0, 0, 0, 0, 0],
 [387462, 41457, 8430822, 0, 1, 0, 0, 0, 0, 0],
 [105790, 55008, 7071156, 0, 1, 0, 0, 0, 0, 0],
 [13110, 40862, 6913088, 0, 0, 0, 1, 0, 0, 0],
 [626539, 63035, 7135680, 0, 0, 0, 0, 1, 0, 0],
 [37753, 62651, 8185503, 1, 0, 0, 0, 0, 0, 0],
 [35461, 77831, 6956576, 1, 0, 0, 0, 0, 0, 0],
 [7593, 43336, 7271255, 0, 0, 0, 1, 0, 0, 0],
 [61579, 47959, 8746806, 0, 1, 0, 0, 0, 0, 0],
 [334335, 67192, 6709521, 1, 0, 0, 0, 0, 0, 0],
 [21159, 11454, 6717415, 0, 0, 0, 1, 0, 0, 0],
 [146286, 38754, 8432674, 1, 0, 0, 0, 0, 0, 0],
 [6726, 27430, 7203005, 0, 0, 0, 1, 0, 0, 0],
 [41950, 18408, 7111473, 1, 0, 0, 0, 0, 0, 0],
 [118181, 80794, 7056122, 0, 1, 0, 0, 0, 0, 0],
 [1037, 90978, 6704639, 0, 0, 0, 1, 0, 0, 0],
 [13110, 59071, 7375470, 0, 0, 0, 1, 0, 0, 0],
 [59228, 44725, 7973906, 0, 1, 0, 0, 0, 0, 0],
 [107472, 

In [46]:
cycle_paths

[]

In [16]:
relevant_object_names = [
    'central',
    'model',
    'worker',
    'times',
    'function',
    'network',
    'training',
    'server',
    'system',
    'global-model',
    'local-models'
]

objects = ['times']

wanted_paths = []

for path in experiment_objects.keys():
    path_split = path.split('.')[0].split('/')
    experiment = None
    cycle = None
    if path_split[2].isnumeric() and path_split[3].isnumeric():
        experiment = path_split[2]
        cycle = path_split[3]
        name = path_split[4]
        if name in objects:
            wanted_paths.append(path.split('.')[0])

In [18]:
function_times = wanted_paths[0]
function_object = get_object_data_and_metadata(
    minio_client = minio_client,
    bucket_name = 'central', 
    object_path = function_times
)

function_times_data = function_object['data']

In [20]:
function_times_df = pd.DataFrame.from_dict(function_times_data,orient='index')

In [21]:
function_times_df

Unnamed: 0,name,action-time-start,action-time-end,action-total-seconds
1,store-worker-2107ac55-1ff2-4ae8-a597-50538c49696e,1.712836e+09,1.712836e+09,0.18951
2,store-worker-5dd2e69e-a1ec-4bcb-8387-e71b960bf04c,1.712836e+09,1.712836e+09,0.09705
3,store-worker-5ad482cd-ee0c-42e9-b262-d39a653f2d81,1.712836e+09,1.712836e+09,0.24430
4,store-worker-6221634a-7c71-4d38-aa36-f2b293736118,1.712836e+09,1.712836e+09,1.12745
5,store-worker-5dd2e69e-a1ec-4bcb-8387-e71b960bf04c,1.712836e+09,1.712836e+09,0.11219
...,...,...,...,...
288,store-worker-e5c5fe7c-f3d0-4bca-a410-b4351b225fce,1.712837e+09,1.712837e+09,0.20629
289,store-worker-98aec564-6715-43e3-9c88-1d51f42de2b8,1.712837e+09,1.712837e+09,0.18161
290,store-worker-317bc003-4005-4c75-a785-ad9f210c9fa8,1.712837e+09,1.712837e+09,0.34402
291,update-global-model,1.712837e+09,1.712837e+09,1.18833


In [22]:
network_times = wanted_paths[1]
network_object = get_object_data_and_metadata(
    minio_client = minio_client,
    bucket_name = 'central', 
    object_path = network_times
)

network_times_data = network_object['data']

In [24]:
network_times_df = pd.DataFrame.from_dict(network_times_data,orient='index')

In [25]:
network_times_df

Unnamed: 0,name,status-code,payload-size-bytes,processing-time-seconds,elapsed-time-seconds,action-time-start,action-time-end,action-total-seconds
1,sending-context-to-worker-5ad482cd-ee0c-42e9-b...,200,4584293,1.842412,0.809238,1712837000.0,1712837000.0,4.23333
2,sending-context-to-worker-6221634a-7c71-4d38-a...,200,4585204,1.401571,0.59903,1712837000.0,1712837000.0,5.73104
3,sending-context-to-worker-5dd2e69e-a1ec-4bcb-8...,200,4584674,1.827662,0.850947,1712837000.0,1712837000.0,7.66621
4,sending-context-to-worker-2107ac55-1ff2-4ae8-a...,200,4584999,1.70673,0.801461,1712837000.0,1712837000.0,9.42017
5,sending-context-to-worker-98aec564-6715-43e3-9...,200,4584083,2.099613,1.278047,1712837000.0,1712837000.0,11.63626
6,sending-context-to-worker-317bc003-4005-4c75-a...,200,4585078,3.462919,2.620762,1712837000.0,1712837000.0,15.42791
7,sending-context-to-worker-5952b17d-9084-41d5-a...,200,4585350,2.313907,1.259625,1712837000.0,1712837000.0,17.81718
8,sending-context-to-worker-22368b0f-3b75-4677-a...,200,4584766,4.218611,3.235667,1712837000.0,1712837000.0,22.08499
9,sending-context-to-worker-e5c5fe7c-f3d0-4bca-a...,200,4584911,2.202625,1.170221,1712837000.0,1712837000.0,24.34652


In [34]:
collected_dfs = {
    'function': pd.DataFrame(),
    'network': pd.DataFrame(),
    'training': pd.DataFrame()
}

#function_times_dfs = pd.DataFrame()
for path in wanted_paths:
    path_split = path.split('.')[0].split('/')
    fetched_object = get_object_data_and_metadata(
        minio_client = minio_client,
        bucket_name = 'central', 
        object_path = path
    )
    created_df = pd.DataFrame.from_dict(fetched_object['data'],orient='index')
    created_df['cycle'] = int(path_split[3])
    collected_dfs[path_split[-1]] = pd.concat([collected_dfs[path_split[-1]],created_df])

In [35]:
collected_dfs['function']

Unnamed: 0,name,action-time-start,action-time-end,action-total-seconds,cycle
1,store-worker-2107ac55-1ff2-4ae8-a597-50538c49696e,1.712836e+09,1.712836e+09,0.18951,1
2,store-worker-5dd2e69e-a1ec-4bcb-8387-e71b960bf04c,1.712836e+09,1.712836e+09,0.09705,1
3,store-worker-5ad482cd-ee0c-42e9-b262-d39a653f2d81,1.712836e+09,1.712836e+09,0.24430,1
4,store-worker-6221634a-7c71-4d38-aa36-f2b293736118,1.712836e+09,1.712836e+09,1.12745,1
5,store-worker-5dd2e69e-a1ec-4bcb-8387-e71b960bf04c,1.712836e+09,1.712836e+09,0.11219,1
...,...,...,...,...,...
356,store-worker-22368b0f-3b75-4677-a23e-71d81865f7b0,1.712839e+09,1.712839e+09,0.22648,4
357,store-worker-5952b17d-9084-41d5-aae1-1b0820f1367d,1.712839e+09,1.712839e+09,0.13205,4
358,store-worker-317bc003-4005-4c75-a785-ad9f210c9fa8,1.712839e+09,1.712839e+09,0.11808,4
359,store-worker-e5c5fe7c-f3d0-4bca-a410-b4351b225fce,1.712839e+09,1.712839e+09,0.12699,4


In [36]:
collected_dfs['network']

Unnamed: 0,name,status-code,payload-size-bytes,processing-time-seconds,elapsed-time-seconds,action-time-start,action-time-end,action-total-seconds,cycle
1,sending-context-to-worker-5ad482cd-ee0c-42e9-b...,200,4584293,1.842412,0.809238,1.712837e+09,1.712837e+09,4.23333,1
2,sending-context-to-worker-6221634a-7c71-4d38-a...,200,4585204,1.401571,0.599030,1.712837e+09,1.712837e+09,5.73104,1
3,sending-context-to-worker-5dd2e69e-a1ec-4bcb-8...,200,4584674,1.827662,0.850947,1.712837e+09,1.712837e+09,7.66621,1
4,sending-context-to-worker-2107ac55-1ff2-4ae8-a...,200,4584999,1.706730,0.801461,1.712837e+09,1.712837e+09,9.42017,1
5,sending-context-to-worker-98aec564-6715-43e3-9...,200,4584083,2.099613,1.278047,1.712837e+09,1.712837e+09,11.63626,1
...,...,...,...,...,...,...,...,...,...
46,sending-context-to-worker-5952b17d-9084-41d5-a...,200,413,0.029724,0.027156,1.712837e+09,1.712837e+09,1.17027,4
47,sending-context-to-worker-5952b17d-9084-41d5-a...,200,413,0.021980,0.019747,1.712837e+09,1.712837e+09,1.27268,4
48,sending-context-to-worker-5952b17d-9084-41d5-a...,200,413,0.022083,0.019490,1.712837e+09,1.712837e+09,1.43205,4
49,sending-context-to-worker-5952b17d-9084-41d5-a...,200,413,0.021021,0.018611,1.712837e+09,1.712837e+09,1.50781,4


In [37]:
collected_dfs['training']

Unnamed: 0,name,epochs,batches,average-batch-size,action-time-start,action-time-end,action-total-seconds,cycle
1,logistic-regression-training,5.0,10,20000.0,1712836000.0,1712836000.0,5.73558,1
2,logistic-regression-testing,,782,63.938619,1712836000.0,1712836000.0,2.20654,1
3,logistic-regression-evaluation,,782,63.938619,1712836000.0,1712836000.0,1.96068,1
4,logistic-regression-evaluation,,782,63.938619,1712837000.0,1712837000.0,3.49708,1
1,logistic-regression-evaluation,,782,63.938619,1712837000.0,1712837000.0,2.06692,2
1,logistic-regression-evaluation,,782,63.938619,1712837000.0,1712837000.0,2.43345,3


In [None]:
collected_cycle_dfs = {
    'global-models': pd.DataFrame(),
    'local-models': pd.DataFrame(),
    'metrics': pd.DataFrame()
}

In [5]:
minio_object = get_object_data_and_metadata(
    minio_client = minio_client,
    bucket_name = 'central', 
    object_path = 'experiments/central-federated-learning/3/1/metrics'
)
minio_object

{'data': {'1': {'name': 'logistic-regression-testing',
   'true-positives': 0,
   'false-positives': 0,
   'true-negatives': 99946,
   'false-negatives': 54,
   'recall': 0.0,
   'selectivity': 1.0,
   'precision': 0.0,
   'miss-rate': 0.0,
   'fall-out': 0.0,
   'balanced-accuracy': 0.0,
   'accuracy': 0.0,
   'train-amount': 80000,
   'test-amount': 100000,
   'eval-amount': 0},
  '2': {'name': 'logistic-regression-evaluation',
   'true-positives': 0,
   'false-positives': 0,
   'true-negatives': 99946,
   'false-negatives': 54,
   'recall': 0.0,
   'selectivity': 1.0,
   'precision': 0.0,
   'miss-rate': 0.0,
   'fall-out': 0.0,
   'balanced-accuracy': 0.0,
   'accuracy': 0.0,
   'train-amount': 80000,
   'test-amount': 0,
   'eval-amount': 100000},
  '3': {'name': 'logistic-regression-evaluation',
   'true-positives': 0,
   'false-positives': 0,
   'true-negatives': 99946,
   'false-negatives': 54,
   'recall': 0.0,
   'selectivity': 1.0,
   'precision': 0.0,
   'miss-rate': 0.0,
 

In [6]:
metrics_data = minio_object['data']['3']

In [7]:
metrics_data

{'name': 'logistic-regression-evaluation',
 'true-positives': 0,
 'false-positives': 0,
 'true-negatives': 99946,
 'false-negatives': 54,
 'recall': 0.0,
 'selectivity': 1.0,
 'precision': 0.0,
 'miss-rate': 0.0,
 'fall-out': 0.0,
 'balanced-accuracy': 0.0,
 'accuracy': 0.0,
 'train-amount': 80000,
 'test-amount': 0,
 'eval-amount': 100000}

In [12]:
parameters['central']['metric-thresholds']

{'true-positives': 50,
 'false-positives': 100,
 'true-negatives': 1000,
 'false-negatives': 100,
 'recall': 0.4,
 'selectivity': 0.99,
 'precision': 0.8,
 'miss-rate': 0.05,
 'fall-out': 0.05,
 'balanced-accuracy': 0.85,
 'accuracy': 0.99}

In [13]:
parameters['central']['metric-conditions']

{'true-positives': '>=',
 'false-positives': '<=',
 'true-negatives': '>=',
 'false-negatives': '<=',
 'recall': '>=',
 'selectivity': '>=',
 'precision': '>=',
 'miss-rate': '<=',
 'fall-out': '<=',
 'balanced-accuracy': '>=',
 'accuracy': '>='}

In [22]:
succesful_metrics = 0
eval_metrics = metrics_data
thresholds = parameters['central']['metric-thresholds']
conditions = parameters['central']['metric-conditions']
# This doesn't work as expected
print('Metric evalution (value condition threshold):')
for key,value in eval_metrics.items():
    if 'amount' in key or 'name' in key:
        continue
    message = str(key)
    
    #print(conditions[key],thresholds[key],value)
    # >= means that the metric should be smaller or equal to threshold
    # <= means that the metric should be smaller or equal to threshold
    if conditions[key] == '>=' and value >= thresholds[key]:
        message = message + ' succeeded: ' + str(value) + str(conditions[key]) + str(thresholds[key])
        print(message)
        succesful_metrics += 1
        continue
    if conditions[key] == '<=' and thresholds[key] <= value:
        message = message + ' succeeded: ' + str(value) + str(conditions[key]) + str(thresholds[key])
        print(message)
        succesful_metrics += 1
        continue
    message = message + ' failed: ' + str(value) + str(conditions[key]) + str(thresholds[key])
    print(message)
print(succesful_metrics)

Metric evalution (value condition threshold):
true-positives failed: 0>=50
false-positives failed: 0<=100
true-negatives succeeded: 99946>=1000
false-negatives failed: 54<=100
recall failed: 0.0>=0.4
selectivity succeeded: 1.0>=0.99
precision failed: 0.0>=0.8
miss-rate failed: 0.0<=0.05
fall-out failed: 0.0<=0.05
balanced-accuracy failed: 0.0>=0.85
accuracy failed: 0.0>=0.99
2


In [35]:
experiment_objects = get_object_list(
    minio_client = minio_client,
    bucket_name = 'central',
    path_prefix = 'experiments/central-federated-learning/3/2/data'
)

In [36]:
experiment_objects

{}

In [29]:
formatted_paths = []
for path in experiment_objects.keys():
    pkl_split = path.split('.')[0]
    formatted_paths.append(pkl_split)
print(formatted_paths)

['experiments/central-federated-learning/3/1/data/03814f81-12c3-4367-9bd5-b187d0cfce95', 'experiments/central-federated-learning/3/1/data/20539e2d-074f-4584-abc8-5d146c5a3220', 'experiments/central-federated-learning/3/1/data/7ea2b86a-e68b-4914-ab24-ef85bee924b6', 'experiments/central-federated-learning/3/1/data/96b44ace-0389-4d7c-aa33-dcb30bbcbbf7', 'experiments/central-federated-learning/3/1/data/c890a70e-5d92-4e32-937a-a657579dc8b1']


In [32]:
workers = []
for path in formatted_paths:
    path_split = path.split('/')
    workers.append(path_split[-1])
print(workers)

['03814f81-12c3-4367-9bd5-b187d0cfce95', '20539e2d-074f-4584-abc8-5d146c5a3220', '7ea2b86a-e68b-4914-ab24-ef85bee924b6', '96b44ace-0389-4d7c-aa33-dcb30bbcbbf7', 'c890a70e-5d92-4e32-937a-a657579dc8b1']


In [40]:
available = set([
    '03814f81-12c3-4367-9bd5-b187d0cfce95', 
    '20539e2d-074f-4584-abc8-5d146c5a3220', 
    '7ea2b86a-e68b-4914-ab24-ef85bee924b6', 
    '96b44ace-0389-4d7c-aa33-dcb30bbcbbf7', 
    'c890a70e-5d92-4e32-937a-a657579dc8b1'
])
splitted = set([
    '03814f81-12c3-4367-9bd5-b187d0cfce95', 
    '20539e2d-074f-4584-abc8-5d146c5a3220', 
    '7ea2b86a-e68b-4914-ab24-ef85bee924b6', 
])

difference = list(available-splitted)
print(difference)

['96b44ace-0389-4d7c-aa33-dcb30bbcbbf7', 'c890a70e-5d92-4e32-937a-a657579dc8b1']


In [41]:
available = set([
    '03814f81-12c3-4367-9bd5-b187d0cfce95', 
    '20539e2d-074f-4584-abc8-5d146c5a3220', 
    '7ea2b86a-e68b-4914-ab24-ef85bee924b6', 
    '96b44ace-0389-4d7c-aa33-dcb30bbcbbf7', 
    'c890a70e-5d92-4e32-937a-a657579dc8b1'
])
splitted = set([])

difference = list(available-splitted)
print(difference)

['03814f81-12c3-4367-9bd5-b187d0cfce95', '7ea2b86a-e68b-4914-ab24-ef85bee924b6', 'c890a70e-5d92-4e32-937a-a657579dc8b1', '96b44ace-0389-4d7c-aa33-dcb30bbcbbf7', '20539e2d-074f-4584-abc8-5d146c5a3220']


In [64]:
thing = {}

thing.add(4)

AttributeError: 'dict' object has no attribute 'add'