## Central Worker Experiments

In [2]:
import requests
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Setup

### Context

In [79]:
model_parameters = {
    'seed': 42,
    'used-columns': [
        'amount',
        'type_CASH_IN',
        'type_CASH_OUT',
        'type_DEBIT',
        'type_PAYMENT',
        'type_TRANSFER',
        'isFraud'
    ],
    'input-size': 6,
    'target-column': 'isFraud',
    'scaled-columns': [
        'amount'
    ],
    'learning-rate': 0.05,
    'sample-rate': 0.10,
    'optimizer':'SGD',
    'epochs': 20
}

In [80]:
central_parameters = {
    'sample-pool': 50000,
    'data-augmentation': {
        'active': True,
        'sample-pool': 100000,
        '1-0-ratio': 0.2
    },
    'eval-ratio': 0.2,
    'train-ratio': 0.8,
    'min-update-amount': 1,
    'max-cycles': 10,
    'min-metric-success': 7,
    'metric-thresholds': {
        'true-positives': 50,
        'false-positives': 100,
        'true-negatives': 1000, 
        'false-negatives': 100,
        'recall': 0.40,
        'selectivity': 0.99,
        'precision': 0.80,
        'miss-rate': 0.05,
        'fall-out': 0.05,
        'balanced-accuracy': 0.80,
        'accuracy': 0.90
    },
    'metric-conditions': {
        'true-positives': '>=',
        'false-positives': '<=',
        'true-negatives': '>=', 
        'false-negatives': '<=',
        'recall': '>=',
        'selectivity': '>=',
        'precision': '>=',
        'miss-rate': '<=',
        'fall-out': '<=',
        'balanced-accuracy': '>=',
        'accuracy': '>='
    }
}

In [81]:
worker_parameters = {
    'data-augemntation': True,
    'sample-pool': 50000,
    'data-augmentation': {
        'active': True,
        'sample-pool': 100000,
        '1-0-ratio': 0.2
    },
    'eval-ratio': 0.2,
    'train-ratio': 0.8
}

In [82]:
parameters = {
    'model' : model_parameters,
    'central': central_parameters,
    'worker': worker_parameters 
}

### Data

#### Formatting

In [None]:
source_data_df = pd.read_csv('data/fraud_detection.csv')

In [None]:
def formatting(
    source_df: any
) -> any:
    print('Formatting data')
    formated_df = source_df.copy()
    
    irrelevant_columns = [
        'oldbalanceOrg',
        'newbalanceOrig',
        'oldbalanceDest',
        'newbalanceDest'
    ]
    formated_df.drop(
        columns = irrelevant_columns, 
        inplace = True
    )
    print('Columns dropped')
    formated_df = pd.get_dummies(
        data = formated_df, 
        columns = ['type']
    )
    
    for column in formated_df.columns:
        if 'type' in column:
            formated_df[column] = formated_df[column].astype(int)
    print('One hot coded type')

    unique_values_orig = formated_df['nameOrig'].unique()
    unique_values_dest = formated_df['nameDest'].unique()
    
    unique_value_list_orig = unique_values_orig.tolist()
    unique_value_list_dest = unique_values_dest.tolist()

    print('Orig amount:', len(unique_value_list_orig))
    print('Dest amount:', len(unique_value_list_dest))
    
    set_orig_ids = set(unique_value_list_orig)
    set_dest_ids = set(unique_value_list_dest)
    intersection = set_dest_ids.intersection(set_orig_ids)

    print('Orig and Dest duplicates', len(intersection))
    
    set_dest_ids.difference_update(intersection)
    fixed_unique_value_list_dest = list(set_dest_ids)
    print('Fixed Dest amount:',len(fixed_unique_value_list_dest))
    
    orig_encoding_dict = {}
    index = 1
    for string in unique_value_list_orig:
        if not string in orig_encoding_dict:
            orig_encoding_dict[string] = index
            index = index + 1

    dest_encoding_dict = {}
    cont_index = len(orig_encoding_dict) + 1
    for string in fixed_unique_value_list_dest:
        if not string in dest_encoding_dict:
            dest_encoding_dict[string] = cont_index
            cont_index = cont_index + 1
    print('Orig dict amount:', len(orig_encoding_dict))
    print('Dest dict amount:', len(dest_encoding_dict))
    
    print('Orig and dest string-integer encodings created')

    string_orig_values = formated_df['nameOrig'].tolist()
    string_dest_values = formated_df['nameDest'].tolist()

    orig_encoded_values = []
    for string in string_orig_values:
        orig_encoded_values.append(orig_encoding_dict[string])

    dest_encoded_values = []
    for string in string_dest_values:
        if not string in dest_encoding_dict:
            dest_encoded_values.append(orig_encoding_dict[string])
            continue
        dest_encoded_values.append(dest_encoding_dict[string])

    formated_df['nameOrig'] = orig_encoded_values
    formated_df['nameDest'] = dest_encoded_values

    print('Orig encoded values amount:', len(orig_encoded_values))
    print('Dest encoded values amount:', len(dest_encoded_values))
    
    print('Orig and dest encodings set')

    formated_df['amount'] = formated_df['amount'].round(0).astype(int)
    print('Amount rounded')

    column_order = [
        'step',
        'amount',
        'nameOrig',
        'nameDest',
        'type_CASH_IN',
        'type_CASH_OUT',
        'type_DEBIT',
        'type_PAYMENT',
        'type_TRANSFER',
        'isFraud',
        'isFlaggedFraud'
    ]
    formated_df = formated_df[column_order]
    print('Columns reordered')
    print('Dataframe shape:', formated_df.shape)
    print('Formatting done')
    return formated_df

In [None]:
formated_data_df = formatting(
    source_df = source_data_df
)

In [None]:
formated_data_df.to_csv('data/Formated_Fraud_Detection_Data.csv', index = False)

#### Sampling

In [57]:
formated_data_df = pd.read_csv('data/Formated_Fraud_Detection_Data.csv')
formated_data_df

Unnamed: 0,step,amount,nameOrig,nameDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,isFraud,isFlaggedFraud
0,1,9840,1,7233461,0,0,0,1,0,0,0
1,1,1864,2,7735206,0,0,0,1,0,0,0
2,1,181,3,8598945,0,0,0,0,1,1,0
3,1,181,4,7880837,0,1,0,0,0,1,0
4,1,11668,5,7670940,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,339682,6353303,6895525,0,1,0,0,0,1,0
6362616,743,6311409,6353304,7179249,0,0,0,0,1,1,0
6362617,743,6311409,6353305,7452283,0,1,0,0,0,1,0
6362618,743,850003,6353306,7284324,0,0,0,0,1,1,0


In [83]:
data = formated_data_df.iloc[:100000].values.tolist()
columns = formated_data_df.columns.tolist()

#### Sending

In [84]:
context = {
    'parameters': parameters,
    'data': data,
    'columns': columns
}

payload = json.dumps(context)

response = requests.post(
    url = 'http://127.0.0.1:7600/context',
    json = payload
)

print(response.status_code)
print(response.text)

200
{"stored":true}



#### Pointing

In [67]:
address = {
    'central-address': 'http://127.0.0.1:7600' 
}

payload = json.dumps(address)

response = requests.post(
    url = 'http://127.0.0.1:7500/point',
    json = payload
)

print(response.status_code)

200


#### Starting

In [85]:
response = requests.post(
    url = 'http://127.0.0.1:7600/start'
)

print(response.status_code)
message = json.loads(response.text)
message

200


{'training': True}

## Logs

In [70]:
specifics = {
    'type': 'resources',
    'experiment': 1,
    'subject': 'central'
}

payload = json.dumps(specifics)

response = requests.get(
    url = 'http://127.0.0.1:7600/storage',
    json = payload
)

print(response.status_code)
central_logs = json.loads(response.text)
central_logs

200


{'data': {'function': {'1': {'1': {'cpu-percentage': 0.0,
     'disk-bytes': 0,
     'name': 'store-worker-1',
     'ram-bytes': 3887104,
     'time-seconds': 0.00304},
    '10': {'cpu-percentage': 0.0,
     'disk-bytes': 0,
     'name': 'store-worker-1',
     'ram-bytes': -16838656,
     'time-seconds': 0.00497},
    '11': {'cpu-percentage': 0.0,
     'disk-bytes': -151552,
     'name': 'store-worker-1',
     'ram-bytes': -8192,
     'time-seconds': 0.00303},
    '12': {'cpu-percentage': 0.0,
     'disk-bytes': 0,
     'name': 'store-worker-1',
     'ram-bytes': -1064960,
     'time-seconds': 0.00347},
    '13': {'cpu-percentage': -5.0,
     'disk-bytes': 4096,
     'name': 'store-worker-1',
     'ram-bytes': -5947392,
     'time-seconds': 0.03003},
    '14': {'cpu-percentage': -5.0,
     'disk-bytes': 20480,
     'name': 'initial-model-training',
     'ram-bytes': 1781760,
     'time-seconds': 6.511945962905884},
    '15': {'cpu-percentage': 0.0,
     'disk-bytes': 0,
     'name': 's

In [47]:
specifics = {
    'type': 'status',
    'experiment': 1,
    'subject': 'worker'
}

payload = json.dumps(specifics)

response = requests.get(
    url = 'http://127.0.0.1:7500/storage',
    json = payload
)

print(response.status_code)
workers_logs = json.loads(response.text)
workers_logs

200


{'data': {'central-address': 'http://127.0.0.1:7600',
  'complete': True,
  'cycle': 6,
  'eval-amount': 10000,
  'id': '1',
  'preprocessed': True,
  'status': 'waiting',
  'stored': True,
  'test-amount': 2000,
  'train-amount': 8000,
  'trained': True,
  'updated': True,
  'worker-address': 'http://127.0.0.1:7500'}}

## Models

In [19]:
specifics = {
    'experiment': 0,
    'subject': 'global'
}

payload = json.dumps(specifics)

response = requests.get(
    url = 'http://127.0.0.1:7600/models',
    json = payload
    
)

print(response.status_code)
models = json.loads(response.text)
models

200


{'data': {'global': {'1': {'0': {'bias': [-0.6406737565994263],
     'collective-samples': '20000',
     'update-amount': '0',
     'weights': [[0.36179283261299133,
       0.17071613669395447,
       -0.0830765962600708,
       0.35710176825523376,
       -0.3915247619152069,
       0.11602921038866043]]},
    '1': {'bias': [-0.8535194396972656],
     'collective-samples': '8000',
     'update-amount': '1',
     'weights': [[0.3789110779762268,
       0.04992080107331276,
       -0.08393267542123795,
       0.3440307080745697,
       -0.5913933515548706,
       0.23777452111244202]]},
    '2': {'bias': [-0.9805437922477722],
     'collective-samples': '8000',
     'update-amount': '1',
     'weights': [[0.39772936701774597,
       -0.05259191617369652,
       -0.06293722242116928,
       0.3316356837749481,
       -0.7445551156997681,
       0.35782429575920105]]},
    '3': {'bias': [-1.0514317750930786],
     'collective-samples': '8000',
     'update-amount': '1',
     'weights': [[

## File sizes

In [31]:
response = requests.get(
    url = 'http://127.0.0.1:7600/files',
    json = payload
    
)

print(response.status_code)
files = json.loads(response.text)
files

200


{'data': {'data': {'experiment_1': {'central_pool.csv': 335949,
    'source.csv': 1067414,
    'worker_1_1_20000.csv': 669642,
    'worker_1_2_20000.csv': 669790,
    'worker_1_3_20000.csv': 669743,
    'worker_1_4_20000.csv': 669653,
    'worker_1_5_20000.csv': 669881,
    'worker_pool.csv': 335796},
   'experiment_1-size': 5087868,
   'experiment_2': {'central_pool.csv': 335782,
    'source.csv': 1067414,
    'worker_1_1_20000.csv': 671273,
    'worker_1_2_20000.csv': 671700,
    'worker_1_3_20000.csv': 671776,
    'worker_pool.csv': 335813},
   'experiment_2-size': 3753758,
   'experiment_3': {'central_pool.csv': 336041,
    'source.csv': 1067414,
    'worker_1_1_20000.csv': 671697,
    'worker_1_2_20000.csv': 671713,
    'worker_1_3_20000.csv': 671815,
    'worker_1_4_20000.csv': 671539,
    'worker_1_5_20000.csv': 671582,
    'worker_1_6_20000.csv': 671853,
    'worker_pool.csv': 335954},
   'experiment_3-size': 5769608},
  'data-size': 14611234,
  'logs': {'central.log': 264},
  

In [39]:
response = requests.get(
    url = 'http://127.0.0.1:7500/files',
    json = payload
    
)

print(response.status_code)
files = json.loads(response.text)
files

200


{'data': {'data': {'experiment_1': {'sample_1.csv': 669642,
    'sample_2.csv': 669790,
    'sample_3.csv': 669743,
    'sample_4.csv': 669653,
    'sample_5.csv': 669881},
   'experiment_1-size': 3348709,
   'experiment_2': {'sample_1.csv': 671273,
    'sample_2.csv': 671700,
    'sample_3.csv': 671776},
   'experiment_2-size': 2014749,
   'experiment_3': {'sample_1.csv': 671697,
    'sample_2.csv': 671713,
    'sample_3.csv': 671815,
    'sample_4.csv': 671539,
    'sample_5.csv': 671582,
    'sample_6.csv': 671853},
   'experiment_3-size': 4030199},
  'data-size': 9393657,
  'logs': {'worker.log': 737},
  'logs-size': 737,
  'metrics': {'experiment_1': {'local.txt': 4421},
   'experiment_1-size': 4421,
   'experiment_2': {'local.txt': 2590},
   'experiment_2-size': 2590,
   'experiment_3': {'local.txt': 5306},
   'experiment_3-size': 5306,
   'templates': {'local.txt': 484},
   'templates-size': 484},
  'metrics-size': 12801,
  'models': {'experiment_1': {'global_0.pth': 1438,
    '

## Inference

In [33]:
def central_worker_inference(
    address: str,
    experiment: str,
    subject: str,
    cycle: int,
    data_df: any,
    relevant_columns: list,
    rows: int
):
    sample_df = data_df.iloc[:rows,:]
    relevant_df = sample_df[relevant_columns]
    input_df = relevant_df.iloc[:rows,:-2]
    mean = input_df['amount'].mean()
    std_dev = input_df['amount'].std()
    input_df['amount'] = (input_df['amount'] - mean)/std_dev

    payload = {
        'experiment-id': experiment,
        'subject': subject,
        'cycle': cycle,
        'input': input_df.values.tolist()
    }
    payload = json.dumps(payload)
    central_address = address + '/predict' 
    response = requests.post(
        url = central_address,
        json = payload
    )

    text_output = json.loads(response.text)
    sample_df['pred'] = np.array(text_output['predictions']).astype(int)
    return sample_df

In [38]:
central_inference_df = central_worker_inference(
    address = 'http://127.0.0.1:7600',
    experiment = 3,
    subject = 'global',
    cycle = 6,
    data_df = formated_data_df,
    relevant_columns = [
        'amount',
        'type_CASH_IN',
        'type_CASH_OUT',
        'type_DEBIT',
        'type_PAYMENT',
        'type_TRANSFER',
        'isFraud',
        'isFlaggedFraud'
    ],
    rows = 50
)
central_inference_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df['pred'] = np.array(text_output['predictions']).astype(int)


Unnamed: 0,step,amount,nameOrig,nameDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,isFraud,isFlaggedFraud,pred
0,1,9840,1,7233461,0,0,0,1,0,0,0,0
1,1,1864,2,7735206,0,0,0,1,0,0,0,0
2,1,181,3,8598945,0,0,0,0,1,1,0,0
3,1,181,4,7880837,0,1,0,0,0,1,0,0
4,1,11668,5,7670940,0,0,0,1,0,0,0,0
5,1,7818,6,6477257,0,0,0,1,0,0,0,0
6,1,7108,7,8194799,0,0,0,1,0,0,0,0
7,1,7862,8,8738506,0,0,0,1,0,0,0,0
8,1,4024,9,6735336,0,0,0,1,0,0,0,0
9,1,5338,10,6427877,0,0,1,0,0,0,0,0


In [41]:
worker_inference_df = central_worker_inference(
    address = 'http://127.0.0.1:7500',
    experiment = 3,
    subject = 'global',
    cycle = 6,
    data_df = formated_data_df,
    relevant_columns = [
        'amount',
        'type_CASH_IN',
        'type_CASH_OUT',
        'type_DEBIT',
        'type_PAYMENT',
        'type_TRANSFER',
        'isFraud',
        'isFlaggedFraud'
    ],
    rows = 50
)
worker_inference_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df['pred'] = np.array(text_output['predictions']).astype(int)


Unnamed: 0,step,amount,nameOrig,nameDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,isFraud,isFlaggedFraud,pred
0,1,9840,1,7233461,0,0,0,1,0,0,0,0
1,1,1864,2,7735206,0,0,0,1,0,0,0,0
2,1,181,3,8598945,0,0,0,0,1,1,0,0
3,1,181,4,7880837,0,1,0,0,0,1,0,0
4,1,11668,5,7670940,0,0,0,1,0,0,0,0
5,1,7818,6,6477257,0,0,0,1,0,0,0,0
6,1,7108,7,8194799,0,0,0,1,0,0,0,0
7,1,7862,8,8738506,0,0,0,1,0,0,0,0
8,1,4024,9,6735336,0,0,0,1,0,0,0,0
9,1,5338,10,6427877,0,0,1,0,0,0,0,0


In [52]:
from datetime import datetime
datetime.now().strftime('%Y-%m-%d-%H:%M:%S.%f')

'2024-03-10-14:07:17.033502'