In [2]:
from scipy.io import loadmat
from scipy.sparse import issparse
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from Data import VFLDataset
from torch.utils.data import DataLoader
import VFL
import torch
import os
DIR = "Data"

# Arcene

* Epochs: all models are trained for 40 epochs
* Train-Test-Split: 0.5 0.5
* Network Architecture: (8 -> 8)
* Lam = 0.2
* Top_lam = 0.3
* num_clients = 2 (client 1, client 2) plus the server (server embedding model and server top model)
* embedding_size = 8
* input_feature_size = 10000

In [6]:
# Load the Arcene data set from DIR, relabel it to {0, 1}, scale the features,
# and build the train / validation / test loaders used by every experiment below.
file_name = 'arcene.mat'
mat = loadmat(os.path.join(DIR, file_name))
X = mat["X"]
y = mat["Y"]
print(X.shape)
if issparse(X):
    # toarray() yields a plain ndarray; todense() would return numpy.matrix,
    # which scikit-learn estimators do not accept as input.
    X = X.toarray()
y = y.flatten()
print(file_name, X.shape, y.shape)
# The labels are stored as -1 / 1; CrossEntropyLoss needs class indices starting at 0.
y[y == -1] = 0
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
# NOTE(review): the notebook header documents 2 clients and a later cell divides
# by 3*8*40 (3 parties), but this call uses 9 clients -- confirm which setup is intended.
dataset = VFLDataset(
    data_source=(X, y),
    num_clients=9,
    gini_portion=None,
    insert_noise=False,
    test_size=0.5,
)
train_loader = DataLoader(dataset.train(), batch_size=128, shuffle=True)
val_loader = DataLoader(dataset.valid(), batch_size=1000, shuffle=True)
test_loader = DataLoader(dataset.test(), batch_size=1000, shuffle=True)
input_dim_list = dataset.get_input_dim_list()
# One output unit per distinct label value.
output_dim = np.unique(y).size
criterion = torch.nn.CrossEntropyLoss()

(200, 10000)
arcene.mat (200, 10000) (200,)
Client 0: Feature Index 0-999
Client 1: Feature Index 1000-1999
Client 2: Feature Index 2000-2999
Client 3: Feature Index 3000-3999
Client 4: Feature Index 4000-4999
Client 5: Feature Index 5000-5999
Client 6: Feature Index 6000-6999
Client 7: Feature Index 7000-7999
Client 8: Feature Index 8000-8999
Server : Feature Index 9000-9999


fnn

In [204]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='FNN',
                            emb_dim=8,
                            output_dim=output_dim, hidden_dims=[8, 8],
                            activation='relu')
fnn_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=40,  optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [205]:
# Show the last five epochs of the FNN training history.
fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
35,0.368368,0.93,0.94,0.94
36,0.350603,0.94,0.94,0.94
37,0.332023,0.94,0.94,0.94
38,0.313225,0.94,0.95,0.95
39,0.294042,0.95,0.95,0.95


STG

In [206]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='STG',
                            emb_dim=8,
                            output_dim=output_dim, hidden_dims=[8, 8],
                            activation='relu', lam=0.2)
stg_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=40, , optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [207]:
# Show the last five epochs of the STG training history.
stg_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
35,0.124657,1.0,0.99,0.97,10000
36,0.12467,1.0,1.0,0.99,10000
37,0.127946,1.0,0.97,0.97,10000
38,0.12453,1.0,0.98,0.97,10000
39,0.124354,1.0,1.0,0.97,10000


STG with GINI Initialization

In [7]:
# STG run whose `mus` are initialised from the data set's Gini filter.
# `gini_labels` and `feat_idx_list` are reused by the DualSTG cells further down.
gini_labels = dataset.gini_filter(0.5)
feat_idx_list = dataset.get_feature_index_list()
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    type='STG',
    emb_dim=8,
    output_dim=output_dim,
    hidden_dims=[8, 8],
    activation='relu',
    lam=0.2,
    mus=mus,
)
stg_gini_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=40,
    optimizer='Adam',
    verbose=True,
    save_mask_at=10000,
    criterion=criterion,
)

Epoch: 1, Train Loss: 0.8401, Train Acc: 0.4000, Val Acc 0.6000, Test Acc: 0.6000, Best Acc: 0.6, Num Feats: 10000.0000
Epoch: 2, Train Loss: 0.8032, Train Acc: 0.6000, Val Acc 0.6000, Test Acc: 0.6000, Best Acc: 0.6, Num Feats: 10000.0000
Epoch: 3, Train Loss: 0.7982, Train Acc: 0.6000, Val Acc 0.6000, Test Acc: 0.6000, Best Acc: 0.6, Num Feats: 10000.0000
Epoch: 4, Train Loss: 0.7744, Train Acc: 0.6000, Val Acc 0.6000, Test Acc: 0.6000, Best Acc: 0.6, Num Feats: 10000.0000
Epoch: 5, Train Loss: 0.7466, Train Acc: 0.6000, Val Acc 0.6000, Test Acc: 0.6000, Best Acc: 0.6, Num Feats: 10000.0000
Epoch: 6, Train Loss: 0.7235, Train Acc: 0.6000, Val Acc 0.6800, Test Acc: 0.7000, Best Acc: 0.68, Num Feats: 10000.0000
Epoch: 7, Train Loss: 0.6976, Train Acc: 0.6600, Val Acc 0.7100, Test Acc: 0.6700, Best Acc: 0.71, Num Feats: 9999.0000
Epoch: 8, Train Loss: 0.6864, Train Acc: 0.7600, Val Acc 0.8000, Test Acc: 0.7900, Best Acc: 0.8, Num Feats: 9959.0000
Epoch: 9, Train Loss: 0.6720, Train Acc:

In [209]:
# Show the last five epochs of the STG + Gini-initialisation training history.
stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats
35,0.138181,1.0,1.0,0.99,6701
36,0.209717,0.95,0.98,1.0,6674
37,0.130873,1.0,1.0,1.0,6668
38,0.12937,1.0,1.0,1.0,6654
39,0.127612,1.0,1.0,1.0,6637


Dual STG with GINI Initialization

In [214]:
%%capture
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list,
    type="DualSTG",
    emb_dim=8,
    output_dim=output_dim,
    hidden_dims=[8, 8],
    activation="relu",
    mus=mus, top_lam=0.8, lam=0.2)
dual_stg_gini_history = VFL.train(
    models,
    top_model,
    train_loader,
    val_loader,
    test_loader,
    epochs=40,
    optimizer='Adam',
    criterion=criterion,
    verbose=True,
    save_mask_at=100000, freeze_top_till=0)

In [215]:
# Show the final epochs of the DualSTG + Gini training history.
dual_stg_gini_history.tail()

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
35,0.695254,0.99,0.99,0.99,5878,24
36,0.738949,0.99,1.0,0.99,5884,24
37,0.637581,1.0,0.98,1.0,5883,24
38,1.018261,0.79,0.99,0.92,5886,24
39,0.639038,0.96,1.0,1.0,5893,24


In [217]:
# Ratio of the `num_emb` values summed over all epochs to the constant 3*8*40.
# Presumably 3 parties (2 clients + server) x 8 embedding dims x 40 epochs -- TODO confirm.
# NOTE(review): these are hard-coded; keep them in sync with num_clients, emb_dim and epochs.
print( dual_stg_gini_history['num_emb'].sum()/(3*8*40))

1.0


dual stg with longer training

In [9]:
# Rebuild the DualSTG models with the same settings as the 40-epoch run above.
mus = VFL.initialize_mu(gini_labels, feat_idx_list)
models, top_model = VFL.make_binary_models(
    input_dim_list=input_dim_list, type="DualSTG",
    emb_dim=8, output_dim=output_dim, hidden_dims=[8, 8],
    activation="relu",
    mus=mus, top_lam=0.8, lam=0.2,
)

# Size of the top model in memory: bytes held by its parameters plus its buffers.
# ref: https://discuss.pytorch.org/t/finding-model-size/130275
param_size = sum(p.nelement() * p.element_size() for p in top_model.parameters())
buffer_size = sum(b.nelement() * b.element_size() for b in top_model.buffers())

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

# NOTE(review): the longer training run is disabled, so `longer_dual_stg_gini_history`
# is not created by this cell; the next cell only works if that name already exists
# in the kernel (e.g. from an earlier run or a restored session).
# longer_dual_stg_gini_history = VFL.train(
#     models,
#     top_model,
#     train_loader,
#     val_loader,
#     test_loader,
#     epochs=80,
#     optimizer='Adam',
#     criterion=criterion,
#     verbose=True,
#     save_mask_at=100000, freeze_top_till=0)

model size: 0.010MB


In [243]:
# Show the last five epochs of the longer DualSTG run.
# NOTE(review): the only visible assignment of `longer_dual_stg_gini_history` is
# commented out, so this cell raises NameError on a fresh kernel unless the name
# is restored some other way.
longer_dual_stg_gini_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc,num_feats,num_emb
95,0.516584,1.0,1.0,1.0,6154,14
96,0.539191,0.99,0.99,1.0,6152,14
97,0.614073,0.97,1.0,1.0,6151,14
98,0.527423,1.0,1.0,1.0,6147,14
99,0.509614,1.0,1.0,1.0,6143,14


SFFS Filtered (0.5)

In [218]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from SFFS import get_f_stat_index
# Slow cell: the recorded output reports roughly 244 s for the pinv step alone.
# `index` is used in the next cell to select feature columns of X.
index = get_f_stat_index(X, y)

total computation time for pinv is: 244.09113812446594


  f_statistics[j] = theta_param[j] ** 2 / diag_x[j]
  f_statistics[j] = theta_param[j] ** 2 / diag_x[j]


In [223]:
# Keep only the columns named by the first half of `index`
# (presumably the top-ranked features -- confirm the ordering returned by get_f_stat_index).
X_filtered = X[:, index[:len(index) // 2]]
print(X_filtered.shape)

(200, 5000)


In [229]:
# Rebuild the data set and loaders on the filtered features, this time with 2 clients.
# This rebinds `dataset`, the loaders, `input_dim_list`, `output_dim` and `criterion`
# that the earlier cells created.
dataset = VFLDataset(
    data_source=(X_filtered, y),
    num_clients=2,
    gini_portion=None,
    insert_noise=False,
    test_size=0.5,
)
train_loader = DataLoader(dataset.train(), batch_size=128, shuffle=True)
val_loader = DataLoader(dataset.valid(), batch_size=1000, shuffle=True)
test_loader = DataLoader(dataset.test(), batch_size=1000, shuffle=True)
input_dim_list = dataset.get_input_dim_list()
output_dim = np.unique(y).size
criterion = torch.nn.CrossEntropyLoss()

Client 0: Feature Index 0-1666
Client 1: Feature Index 1667-3333
Server : Feature Index 3334-4999


In [230]:
%%capture
models, top_model = VFL.make_binary_models(
                            input_dim_list=input_dim_list,
                            type='FNN',
                            emb_dim=8,
                            output_dim=output_dim, hidden_dims=[8, 8],
                            activation='relu')
sffs_fnn_history = VFL.train(models, top_model, train_loader, val_loader, test_loader,
                epochs=40, optimizer='Adam',verbose=True, save_mask_at=10000, 
                criterion=criterion)

In [231]:
# Show the last five epochs of the FNN run on the SFFS-filtered features.
sffs_fnn_history.tail(5)

Unnamed: 0,train_loss,train_acc,val_acc,test_acc
35,0.495296,0.95,0.95,0.95
36,0.480403,0.95,0.95,0.95
37,0.464686,0.95,0.95,0.95
38,0.448242,0.95,0.95,0.95
39,0.431058,0.95,0.97,0.97


---
# Summary

| Model        | # Features | Test Acc | Ratio Embedding  | 
|--------------|------------|----------|-------------------|  
| FNN          | 10000      | 0.95     | 1                 |  
| STG          | 10000      | 0.97     | 1                 |
| STG+GINI     | 6637       | 1        | 1                 |
| DualSTG+GINI | 5893       | 1        | 1                 |
| DualSTG+GINI (double) | 6154 | 1     | 0.5833            |
| SFFS->FNN    | 5000       | 0.97     | 1                 | 

In [245]:
# Save the current interpreter session to 'ArceneDataExperiments.db' so it can be restored later.
# NOTE: the saved file is pickle-based; only restore session files from a trusted source.
import dill
dill.dump_session('ArceneDataExperiments.db')