# Fine Tuning the MoE models
This notebooks performs several calculations to find the best model architectures within our different moe models.
We consider dense and Bidirectional LSTM moe models, with both soft gating and top k gating. 

All evaluations are performed for 3 training round with 100 epochs each, calculating the MSE. 
As a final result the MSE for 3 examplary users is calculated (mean) and the standard deviation is given for the 3 training rounds

In [1]:
#Imports
import pandas as pd
import os
import tensorflow as tf
from keras import layers, models

from utils.modelgenerator import *
from utils.modelhandler import *
from utils.datahandler import *

In [3]:
dense_results = pd.read_csv('evaluations/moe_bilstm_benchmark_results.csv')
dense_results

Unnamed: 0.1,Unnamed: 0,architecture,Loss@User10,std@User10,Loss@User11,std@User11,Loss@User12,std@User12,mean
0,0,Bi20_sexp4d8,0.016075,0.001194,0.014661,0.000493,0.011256,0.00111,0.013997
1,1,1xBi20_top2exp5d8_d16,0.016893,0.001499,0.014476,0.000408,0.0103,0.000619,0.01389
2,2,Bi10_sexp4d8,0.016085,0.000419,0.014669,0.000775,0.011186,0.001014,0.01398
3,3,1xBi10_top2exp5d8_d16,0.016815,0.000191,0.014956,0.000667,0.012526,0.000992,0.014766
4,4,Bi8_sexp8d16,0.015341,0.002136,0.014728,0.001041,0.010751,0.000751,0.013606
5,5,Bi8_top2exp10d16_d16,0.016634,0.002735,0.015349,0.000627,0.011403,0.001181,0.014462


In [4]:
dense_results = pd.read_csv('evaluations/moe_soft_dense_benchmark_results.csv')
dense_results

Unnamed: 0.1,Unnamed: 0,architecture,Loss@User10,std@User10,Loss@User11,std@User11,Loss@User12,std@User12,mean
0,0,2xd16_sE3d4_d16,0.014863,0.000865,0.015716,0.000373,0.011766,0.000291,0.014115
1,1,2xd16_sE3d8_d16,0.014991,0.001748,0.015563,0.000569,0.011207,0.000437,0.013921
2,2,2xd16_sE3d16_d16,0.014703,0.001334,0.014851,0.000446,0.011391,0.002108,0.013648
3,3,2xd16_sE3d32_d16,0.014385,0.000669,0.015349,0.000435,0.012665,0.001137,0.014133
4,4,2xd16_sE4d4_d16,0.016658,0.002067,0.015469,0.000796,0.011775,0.001496,0.014634
5,5,2xd16_sE4d8_d16,0.013657,0.001151,0.015522,0.000894,0.01057,0.000482,0.01325
6,6,2xd16_sE4d16_d16,0.015293,0.002308,0.014989,0.001056,0.011152,0.001178,0.013811
7,7,2xd16_sE8d8_d16,0.015531,0.002475,0.015491,0.000597,0.012214,0.002272,0.014412
8,8,2xd16_sE8d16_d16,0.014905,0.001718,0.015811,0.000193,0.012741,0.00048,0.014486
9,9,2xd16_sE10d8_d16,0.014596,0.000723,0.015427,0.000218,0.010958,0.001739,0.01366


In [5]:
dense_results = pd.read_csv('evaluations/moe_topk_dense_benchmark_results.csv')
dense_results


Unnamed: 0.1,Unnamed: 0,architecture,Loss@User10,std@User10,Loss@User11,std@User11,Loss@User12,std@User12,mean
0,0,top1_exp3_d8,0.014745,0.000583,0.015301,0.000701,0.011411,0.000999,0.013819
1,1,top1_exp3_d16,0.014932,0.001491,0.014849,0.000776,0.011906,0.001783,0.013896
2,2,top1_exp5_d8,0.014287,0.001308,0.014642,0.000111,0.011462,0.000453,0.013463
3,3,top2_exp5_d8,0.014162,0.000396,0.015219,0.001199,0.010901,0.000754,0.013427
4,4,top3_exp5_d8,0.015286,0.000848,0.01608,0.000992,0.011258,0.001074,0.014208
5,5,top1_exp5_d16,0.014293,0.00077,0.015046,0.000523,0.011782,0.000764,0.013707
6,6,top2_exp5_d16,0.014098,0.001321,0.015763,0.001226,0.011265,0.000629,0.013709
7,7,top3_exp5_d16,0.014096,0.001128,0.01571,0.001202,0.011096,0.000298,0.013634
8,8,top1_exp10_d8,0.014841,0.001264,0.015565,0.000665,0.011061,0.001489,0.013822
9,9,top2_exp10_d8,0.014477,0.000688,0.01454,0.000294,0.012062,0.001433,0.013693


In [2]:
#Get data 
cwd = os.path.normpath(os.getcwd())
df = pd.read_csv(cwd+'/data/df_with_final_features.csv', index_col='Date') #df = pd.read_csv('user5.csv')
df.index = pd.to_datetime(df.index)
#df = df[['User5', 'temp', 'rhum']]
df.fillna(0, inplace=True)

#Select only 3 User for testing
df_user10 = df[['User10', 'temp', 'rhum', 'wspd', 'PC1', 'hour sin', 'hour cos', 'User10_lag_24hrs']]
df_user11 = df[['User11', 'temp', 'rhum', 'wspd', 'PC1', 'hour sin', 'hour cos', 'User11_lag_24hrs']]
df_user12 = df[['User12', 'temp', 'rhum', 'wspd', 'PC1', 'hour sin', 'hour cos', 'User12_lag_24hrs']]
df_array = [df_user10, df_user11, df_user12]
df_array[1].head(3)

Unnamed: 0_level_0,User11,temp,rhum,wspd,PC1,hour sin,hour cos,User11_lag_24hrs
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-07-08 00:00:00,0.312,9.8,85.0,0.0,-2.453691,0.0,1.0,0.667
2012-07-08 01:00:00,0.263,9.8,85.0,0.0,-2.453691,0.258819,0.965926,0.316
2012-07-08 02:00:00,0.257,9.8,85.0,0.0,-2.453691,0.5,0.866025,0.356


In [3]:
#Train, Validation and Test datasets
sequence_length = 25
batch_size = 16
num_features = df_array[0].shape[1]

dh = Datahandler()

X_train, y_train, X_val, y_val, X_test, y_test = {}, {}, {}, {}, {}, {}

#Create Train, Validation and Test datasets
for idx, df in enumerate(df_array):
    n = len(df)
    train_df = df[0:int(n*0.7)]
    val_df = df[int(n*0.7):int(n*0.9)]
    test_df = df[int(n*0.9):]

    # Min max sclaing
    train_df = dh.min_max_scaling(train_df)
    val_df = dh.min_max_scaling(val_df)
    test_df = dh.min_max_scaling(test_df)

    # Sequencing
    train_sequences = dh.create_sequences(train_df, sequence_length)
    val_sequences = dh.create_sequences(val_df, sequence_length)
    test_sequences = dh.create_sequences(test_df, sequence_length)

    #Split into feature and label
    X_train[f'user1{idx}'], y_train[f'user1{idx}'] = dh.prepare_data(train_sequences, batch_size)
    X_val[f'user1{idx}'], y_val[f'user1{idx}'] = dh.prepare_data(val_sequences, batch_size)
    X_test[f'user1{idx}'], y_test[f'user1{idx}'] = dh.prepare_data(test_sequences, batch_size)

In [4]:
#General Hyperparameters
# #All models
horizon = 1
max_epochs = 100
m1 = ModelGenerator()
mh = Modelhandler()

loss = tf.keras.losses.MeanSquaredError()
metrics=[
    tf.keras.metrics.RootMeanSquaredError(), 
    tf.keras.metrics.MeanAbsolutePercentageError(),
    tf.keras.metrics.MeanAbsoluteError(),
]

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=10,mode='min')
timing_callback = TimingCallback()
custom_callback = CustomCallback()
callbacks=[early_stopping, timing_callback, custom_callback]

### 1. Dense mixture of experts model with soft gating

In [30]:
   #Builds a MoE model with soft gating
def build_soft_dense_moe_model(X_train, batch_size, horizon, dense_units,  expert_units, num_experts, m1):
    #Input of shape (batch_size, sequence_length, features)
    inputs = layers.Input(shape=(X_train.shape[1], X_train.shape[2]), batch_size=batch_size, name='input_layer') 
    x = inputs


    #EMBEDDED MOE LAYER
    # Gating network (Routing Softmax)
    routing_logits = layers.Dense(num_experts, activation='softmax')(x)
    #experts
    experts = [m1.build_expert_network(expert_units=expert_units)(x) for _ in range(num_experts)]
    expert_outputs = tf.stack(experts, axis=1)
    #Add and Multiply expert models with router probability
    moe_output = tf.einsum('bsn,bnse->bse', routing_logits, expert_outputs)
    #END MOE LAYER

    x = layers.Dense(dense_units, activation="relu")(moe_output)
    x = layers.Dense(dense_units, activation="relu")(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Flatten()(x)
    outputs = layers.Dense(horizon)(x)
    softgated_moe_model = models.Model(inputs=inputs, outputs=outputs, name="soft_dense_moe")

    return softgated_moe_model

In [27]:
dense_moe_results = pd.DataFrame(columns=['architecture', 'Loss@User10','std@User10', 'Loss@User11','std@User11', 'Loss@User12','std@User12'])

In [33]:
#dense_moe_architectures to test: 

#dense_moe_architectures - tested: #e3_d4, e3_d8, e3_d16, e3_d32

#Dense Hyperparameter
dense_moe_architecture = "2xd16_sE4d8_d16"
dense_units = 16

num_experts = 4
expert_units = 8

dense_moe_all_results = pd.DataFrame(columns=["user", "architecture", "mse"])
#For each of the 3 user
for idx in range(3):
    print("User: ", idx)
    for round in range(3):
        #print("Round: ", round) 
        dense_moe_model = build_soft_dense_moe_model(X_train[f'user1{idx}'], batch_size, horizon, dense_units, expert_units, num_experts, m1)
        dense_histroy, dense_moe_user_results = mh.compile_fit_evaluate_model(
            model=dense_moe_model, 
            loss=loss, 
            metrics=metrics, 
            X_train=X_train[f'user1{idx}'],
            y_train = y_train[f'user1{idx}'], 
            max_epochs = max_epochs, 
            batch_size=batch_size, 
            X_val=X_val[f'user1{idx}'], 
            y_val=y_val[f'user1{idx}'], 
            X_test=X_test[f'user1{idx}'], 
            y_test=y_test[f'user1{idx}'], 
            callbacks=callbacks, 
            user=f'user1{idx}', 
            hyper=dense_moe_architecture,
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
        )
        # Add the 'architecture' column from dense_user_results to dense_results
        dense_moe_all_results = pd.merge(dense_moe_all_results, dense_moe_user_results, how='outer')   

new_row = {
    'architecture': dense_moe_all_results["architecture"][0],
    'Loss@User10': dense_moe_all_results[dense_moe_all_results["user"]=="user10"]["mse"].mean(),
    'std@User10' : dense_moe_all_results[dense_moe_all_results["user"]=="user10"]["mse"].std(),
    'Loss@User11': dense_moe_all_results[dense_moe_all_results["user"]=="user11"]["mse"].mean(),
    'std@User11' : dense_moe_all_results[dense_moe_all_results["user"]=="user11"]["mse"].std(),
    'Loss@User12': dense_moe_all_results[dense_moe_all_results["user"]=="user12"]["mse"].mean(),
    'std@User12' : dense_moe_all_results[dense_moe_all_results["user"]=="user12"]["mse"].std(),
}
dense_moe_results.loc[len(dense_moe_results)] = new_row

User:  0
User:  1
User:  2


In [34]:
dense_moe_results["mean"] = dense_moe_results[['Loss@User10', 'Loss@User11', 'Loss@User12']].mean(axis=1)
#lstm_results.drop(labels=['Unnamed: 0', 'sum'], axis=1,  inplace=True)
dense_moe_results

Unnamed: 0,architecture,Loss@User10,std@User10,Loss@User11,std@User11,Loss@User12,std@User12,mean
0,2xd16_sE4d8_d16,0.01479,0.001334,0.015348,0.000157,0.0111,0.001163,0.013746
1,2xd16_sE4d8_d16,0.015137,0.001225,0.015056,0.000696,0.010635,0.000699,0.013609
2,2xd16_sE4d8_d16,0.013936,0.00156,0.015738,0.000228,0.010132,0.000289,0.013269


In [39]:
dense_moe_results["mean"] = dense_moe_results[['Loss@User10', 'Loss@User11', 'Loss@User12']].mean(axis=1)
#lstm_results.drop(labels=['Unnamed: 0', 'sum'], axis=1,  inplace=True)
dense_moe_results

Unnamed: 0,architecture,Loss@User10,std@User10,Loss@User11,std@User11,Loss@User12,std@User12,mean
0,2xd16_sE3d4_d16,0.014863,0.000865,0.015716,0.000373,0.011766,0.000291,0.014115
1,2xd16_sE3d8_d16,0.014991,0.001748,0.015563,0.000569,0.011207,0.000437,0.013921
2,2xd16_sE3d16_d16,0.014703,0.001334,0.014851,0.000446,0.011391,0.002108,0.013648
3,2xd16_sE3d32_d16,0.014385,0.000669,0.015349,0.000435,0.012665,0.001137,0.014133
4,2xd16_sE4d4_d16,0.016658,0.002067,0.015469,0.000796,0.011775,0.001496,0.014634
5,2xd16_sE4d8_d16,0.013657,0.001151,0.015522,0.000894,0.01057,0.000482,0.01325
6,2xd16_sE4d16_d16,0.015293,0.002308,0.014989,0.001056,0.011152,0.001178,0.013811
7,2xd16_sE8d8_d16,0.015531,0.002475,0.015491,0.000597,0.012214,0.002272,0.014412
8,2xd16_sE8d16_d16,0.014905,0.001718,0.015811,0.000193,0.012741,0.00048,0.014486
9,2xd16_sE10d8_d16,0.014596,0.000723,0.015427,0.000218,0.010958,0.001739,0.01366


In [35]:
#dense_moe_results.to_csv('evaluations/moe_soft_dense_benchmark_results.csv')

#### 1.1 Dense mixture of experts model with top k gating

In [5]:
dense_moe_results = pd.DataFrame(columns=['architecture', 'Loss@User10','std@User10', 'Loss@User11','std@User11', 'Loss@User12','std@User12'])

In [44]:
#dense_moe_architectures to test: 

#Done: t1e3d8, t1e3d16, t1e5d8, t2e5d8, t3e5d8, t1e5d16, t2e5d16, t3e5d16, t1e10d8, t2e10d8, t3e10d8, t5e10d8, t1e10d16, t2e10d16, t3e10d16, t5e10d8

#Dense Hyperparameter
dense_moe_architecture = "top5_exp10_d16"
dense_units = 16

top_k = 5
num_experts = 10
expert_units = 16

dense_moe_all_results = pd.DataFrame(columns=["user", "architecture", "mse"])
#For each of the 3 user
for idx in range(3):
    print("User: ", idx)
    for round in range(3):
        #print("Round: ", round)  
        dense_moe_model = m1.build_topk_dense_moe_model(X_train[f'user1{idx}'], batch_size, horizon, dense_units, num_experts, top_k, expert_units, m1)
        dense_histroy, dense_moe_user_results = mh.compile_fit_evaluate_model(
            model=dense_moe_model, 
            loss=loss, 
            metrics=metrics, 
            X_train=X_train[f'user1{idx}'],
            y_train = y_train[f'user1{idx}'], 
            max_epochs = max_epochs, 
            batch_size=batch_size, 
            X_val=X_val[f'user1{idx}'], 
            y_val=y_val[f'user1{idx}'], 
            X_test=X_test[f'user1{idx}'], 
            y_test=y_test[f'user1{idx}'], 
            callbacks=callbacks, 
            user=f'user1{idx}', 
            hyper=dense_moe_architecture,
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
        )
        # Add the 'architecture' column from dense_user_results to dense_results
        dense_moe_all_results = pd.merge(dense_moe_all_results, dense_moe_user_results, how='outer')   

new_row = {
    'architecture': dense_moe_all_results["architecture"][0],
    'Loss@User10': dense_moe_all_results[dense_moe_all_results["user"]=="user10"]["mse"].mean(),
    'std@User10' : dense_moe_all_results[dense_moe_all_results["user"]=="user10"]["mse"].std(),
    'Loss@User11': dense_moe_all_results[dense_moe_all_results["user"]=="user11"]["mse"].mean(),
    'std@User11' : dense_moe_all_results[dense_moe_all_results["user"]=="user11"]["mse"].std(),
    'Loss@User12': dense_moe_all_results[dense_moe_all_results["user"]=="user12"]["mse"].mean(),
    'std@User12' : dense_moe_all_results[dense_moe_all_results["user"]=="user12"]["mse"].std(),
}
dense_moe_results.loc[len(dense_moe_results)] = new_row

User:  0
User:  1
User:  2


In [45]:
dense_moe_results["mean"] = dense_moe_results[['Loss@User10', 'Loss@User11', 'Loss@User12']].mean(axis=1)
#lstm_results.drop(labels=['Unnamed: 0', 'sum'], axis=1,  inplace=True)
dense_moe_results

Unnamed: 0,architecture,Loss@User10,std@User10,Loss@User11,std@User11,Loss@User12,std@User12,mean
0,top1_exp3_d8,0.014745,0.000583,0.015301,0.000701,0.011411,0.000999,0.013819
1,top1_exp3_d16,0.014932,0.001491,0.014849,0.000776,0.011906,0.001783,0.013896
2,top1_exp5_d8,0.014287,0.001308,0.014642,0.000111,0.011462,0.000453,0.013463
3,top2_exp5_d8,0.014162,0.000396,0.015219,0.001199,0.010901,0.000754,0.013427
4,top3_exp5_d8,0.015286,0.000848,0.01608,0.000992,0.011258,0.001074,0.014208
5,top1_exp5_d16,0.014293,0.00077,0.015046,0.000523,0.011782,0.000764,0.013707
6,top2_exp5_d16,0.014098,0.001321,0.015763,0.001226,0.011265,0.000629,0.013709
7,top3_exp5_d16,0.014096,0.001128,0.01571,0.001202,0.011096,0.000298,0.013634
8,top1_exp10_d8,0.014841,0.001264,0.015565,0.000665,0.011061,0.001489,0.013822
9,top2_exp10_d8,0.014477,0.000688,0.01454,0.000294,0.012062,0.001433,0.013693


In [46]:
dense_moe_results.to_csv('evaluations/moe_topk_dense_benchmark_results.csv')

### 2. Bidirectional LSTM mixture of experts model with soft gating

In [35]:
bilstm_moe_results = pd.DataFrame(columns=['architecture', 'Loss@User10','std@User10', 'Loss@User11','std@User11', 'Loss@User12','std@User12'])

In [36]:
#dense_moe_architectures to test: 
# , , , t2e5d8, t3e5d8, t1e5d16, t2e5d16, t3e5d16
# t1e10d8, t2e10d8, t3e10d8, t5e10d8, t1e10d16, t2e10d16, t3e10d16, t5e10d8

#Done: t1e3d8, t1e3d16, t1e5d8

#Dense Hyperparameter
bilstm_moe_architecture = "sexp4d8_Bi4"
lstm_units = 4

num_experts = 4
expert_units = 8

bilstm_moe_all_results = pd.DataFrame(columns=["user", "architecture", "mse"])
#For each of the 3 user
for idx in range(3):
    print("User: ", idx)
    for round in range(3):
        #print("Round: ", round)  build_soft_biLSTM_moe_model(self, X_train, batch_size, horizon, lstm_units, num_experts, expert_units, m1)
        bilstm_moe_model = m1.build_soft_biLSTM_moe_model(X_train[f'user1{idx}'], batch_size, horizon, lstm_units, num_experts, expert_units, m1)
        bilstm_histroy, bilstm_moe_user_results = mh.compile_fit_evaluate_model(
            model=bilstm_moe_model, 
            loss=loss, 
            metrics=metrics, 
            X_train=X_train[f'user1{idx}'],
            y_train = y_train[f'user1{idx}'], 
            max_epochs = max_epochs, 
            batch_size=batch_size, 
            X_val=X_val[f'user1{idx}'], 
            y_val=y_val[f'user1{idx}'], 
            X_test=X_test[f'user1{idx}'], 
            y_test=y_test[f'user1{idx}'], 
            callbacks=callbacks, 
            user=f'user1{idx}', 
            hyper=bilstm_moe_architecture,
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
        )
        # Add the 'architecture' column from dense_user_results to dense_results
        bilstm_moe_all_results = pd.merge(bilstm_moe_all_results, bilstm_moe_user_results, how='outer')   

new_row = {
    'architecture': bilstm_moe_all_results["architecture"][0],
    'Loss@User10': bilstm_moe_all_results[bilstm_moe_all_results["user"]=="user10"]["mse"].mean(),
    'std@User10' : bilstm_moe_all_results[bilstm_moe_all_results["user"]=="user10"]["mse"].std(),
    'Loss@User11': bilstm_moe_all_results[bilstm_moe_all_results["user"]=="user11"]["mse"].mean(),
    'std@User11' : bilstm_moe_all_results[bilstm_moe_all_results["user"]=="user11"]["mse"].std(),
    'Loss@User12': bilstm_moe_all_results[bilstm_moe_all_results["user"]=="user12"]["mse"].mean(),
    'std@User12' : bilstm_moe_all_results[bilstm_moe_all_results["user"]=="user12"]["mse"].std(),
}
bilstm_moe_results.loc[len(bilstm_moe_results)] = new_row

User:  0
User:  1
User:  2


In [37]:
bilstm_moe_results["mean"] = bilstm_moe_results[['Loss@User10', 'Loss@User11', 'Loss@User12']].mean(axis=1)
bilstm_moe_results


Unnamed: 0,architecture,Loss@User10,std@User10,Loss@User11,std@User11,Loss@User12,std@User12,mean
0,sexp4d8_Bi4,0.014184,0.000263,0.015205,0.000712,0.010184,0.00086,0.013191


#### 2.1 Bidirectional LSTM mixture of experts model with top k gating

In [23]:
#Dense Hyperparameter
bilstm_moe_architecture = "top2exp4d8_bi4"
lstm_units = 8

num_experts = 4
top_k = 2
expert_units = 8

bilstm_moe_all_results = pd.DataFrame(columns=["user", "architecture", "mse"])
#For each of the 3 user
for idx in range(3):
    print("User: ", idx)
    for round in range(3):
        #print("Round: ", round)  build_topk_bilstm_moe_model(self, X_train, batch_size, horizon, lstm_units, num_experts, top_k, expert_units, m1)
        bilstm_moe_model = build_topk_bilstm_moe_model(X_train[f'user1{idx}'], batch_size, horizon, lstm_units, num_experts, top_k, expert_units, m1)
        bilstm_histroy, bilstm_moe_user_results = mh.compile_fit_evaluate_model(
            model=bilstm_moe_model, 
            loss=loss, 
            metrics=metrics, 
            X_train=X_train[f'user1{idx}'],
            y_train = y_train[f'user1{idx}'], 
            max_epochs = max_epochs, 
            batch_size=batch_size, 
            X_val=X_val[f'user1{idx}'], 
            y_val=y_val[f'user1{idx}'], 
            X_test=X_test[f'user1{idx}'], 
            y_test=y_test[f'user1{idx}'], 
            callbacks=callbacks, 
            user=f'user1{idx}', 
            hyper=bilstm_moe_architecture,
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
        )
        # Add the 'architecture' column from dense_user_results to dense_results
        bilstm_moe_all_results = pd.merge(bilstm_moe_all_results, bilstm_moe_user_results, how='outer')   

new_row = {
    'architecture': bilstm_moe_all_results["architecture"][0],
    'Loss@User10': bilstm_moe_all_results[bilstm_moe_all_results["user"]=="user10"]["mse"].mean(),
    'std@User10' : bilstm_moe_all_results[bilstm_moe_all_results["user"]=="user10"]["mse"].std(),
    'Loss@User11': bilstm_moe_all_results[bilstm_moe_all_results["user"]=="user11"]["mse"].mean(),
    'std@User11' : bilstm_moe_all_results[bilstm_moe_all_results["user"]=="user11"]["mse"].std(),
    'Loss@User12': bilstm_moe_all_results[bilstm_moe_all_results["user"]=="user12"]["mse"].mean(),
    'std@User12' : bilstm_moe_all_results[bilstm_moe_all_results["user"]=="user12"]["mse"].std(),
}
bilstm_moe_results.loc[len(bilstm_moe_results)] = new_row

User:  0
User:  1
User:  2


In [25]:
bilstm_moe_results["mean"] = bilstm_moe_results[['Loss@User10', 'Loss@User11', 'Loss@User12']].mean(axis=1)
#lstm_results.drop(labels=['Unnamed: 0', 'sum'], axis=1,  inplace=True)
bilstm_moe_results["architecture"][0] = "sexp4d8_Bi8"

bilstm_moe_results

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bilstm_moe_results["architecture"][0] = "sexp4d8_Bi8"


Unnamed: 0,architecture,Loss@User10,std@User10,Loss@User11,std@User11,Loss@User12,std@User12,mean
0,sexp4d8_Bi8,0.013683,0.000557,0.015055,0.000388,0.010755,0.000414,0.013164
1,sexp4d8_Bi8,0.014493,0.000739,0.014866,0.00025,0.009939,0.000556,0.013099
2,sexp4d8_Bi4,0.014654,0.001256,0.014599,0.000251,0.010496,0.00069,0.01325
3,top2exp10d16_bi8,0.014964,0.001961,0.015414,0.000535,0.011637,0.001384,0.014005
4,top2exp10d16_bi4,0.015599,0.001142,0.015771,0.00032,0.010956,0.000841,0.014109
5,top2exp4d8_bi4,0.013957,0.00032,0.014925,0.000381,0.01076,0.000594,0.013214


In [17]:
bilstm_moe_results.to_csv('evaluations/moe_bilstm_benchmark_results.csv')

# Summary

In [None]:
cnn_results = pd.read_csv('evaluations/cnn_benchmark_results.csv')
cnn_results["sum"] = cnn_results[['Loss@User10', 'Loss@User11', 'Loss@User12']].sum(axis=1)
#cnn_results.drop(labels=['Unnamed: 0'], axis=1,  inplace=True)
cnn_results
#cnn_results

In [50]:
moe_soft_dense_results = pd.read_csv('evaluations/moe_soft_dense_benchmark_results.csv')
moe_soft_dense_results

Unnamed: 0.1,Unnamed: 0,architecture,Loss@User10,std@User10,Loss@User11,std@User11,Loss@User12,std@User12,mean
0,0,2xd16_sE3d4_d16,0.014863,0.000865,0.015716,0.000373,0.011766,0.000291,0.014115
1,1,2xd16_sE3d8_d16,0.014991,0.001748,0.015563,0.000569,0.011207,0.000437,0.013921
2,2,2xd16_sE3d16_d16,0.014703,0.001334,0.014851,0.000446,0.011391,0.002108,0.013648
3,3,2xd16_sE3d32_d16,0.014385,0.000669,0.015349,0.000435,0.012665,0.001137,0.014133
4,4,2xd16_sE4d4_d16,0.016658,0.002067,0.015469,0.000796,0.011775,0.001496,0.014634
5,5,2xd16_sE4d8_d16,0.013657,0.001151,0.015522,0.000894,0.01057,0.000482,0.01325
6,6,2xd16_sE4d16_d16,0.015293,0.002308,0.014989,0.001056,0.011152,0.001178,0.013811
7,7,2xd16_sE8d8_d16,0.015531,0.002475,0.015491,0.000597,0.012214,0.002272,0.014412
8,8,2xd16_sE8d16_d16,0.014905,0.001718,0.015811,0.000193,0.012741,0.00048,0.014486
9,9,2xd16_sE10d8_d16,0.014596,0.000723,0.015427,0.000218,0.010958,0.001739,0.01366


In [52]:
moe_soft_dense_results["mean"].min() #2xd16_sE4d8_d16 mit 0.0132496367312139

0.0132496367312139

In [51]:
moe_topk_dense_results = pd.read_csv('evaluations/moe_topk_dense_benchmark_results.csv')
moe_topk_dense_results

Unnamed: 0.1,Unnamed: 0,architecture,Loss@User10,std@User10,Loss@User11,std@User11,Loss@User12,std@User12,mean
0,0,top1_exp3_d8,0.014745,0.000583,0.015301,0.000701,0.011411,0.000999,0.013819
1,1,top1_exp3_d16,0.014932,0.001491,0.014849,0.000776,0.011906,0.001783,0.013896
2,2,top1_exp5_d8,0.014287,0.001308,0.014642,0.000111,0.011462,0.000453,0.013463
3,3,top2_exp5_d8,0.014162,0.000396,0.015219,0.001199,0.010901,0.000754,0.013427
4,4,top3_exp5_d8,0.015286,0.000848,0.01608,0.000992,0.011258,0.001074,0.014208
5,5,top1_exp5_d16,0.014293,0.00077,0.015046,0.000523,0.011782,0.000764,0.013707
6,6,top2_exp5_d16,0.014098,0.001321,0.015763,0.001226,0.011265,0.000629,0.013709
7,7,top3_exp5_d16,0.014096,0.001128,0.01571,0.001202,0.011096,0.000298,0.013634
8,8,top1_exp10_d8,0.014841,0.001264,0.015565,0.000665,0.011061,0.001489,0.013822
9,9,top2_exp10_d8,0.014477,0.000688,0.01454,0.000294,0.012062,0.001433,0.013693


In [53]:
moe_topk_dense_results["mean"].min() #top2_exp5_d8 mit 0.0134272127308779

0.0134272127308779

In [54]:
moe_bilstm_results = pd.read_csv('evaluations/moe_bilstm_benchmark_results.csv')
moe_bilstm_results

Unnamed: 0.1,Unnamed: 0,architecture,Loss@User10,std@User10,Loss@User11,std@User11,Loss@User12,std@User12,mean
0,0,Bi20_sexp4d8,0.016075,0.001194,0.014661,0.000493,0.011256,0.00111,0.013997
1,1,1xBi20_top2exp5d8_d16,0.016893,0.001499,0.014476,0.000408,0.0103,0.000619,0.01389
2,2,Bi10_sexp4d8,0.016085,0.000419,0.014669,0.000775,0.011186,0.001014,0.01398
3,3,1xBi10_top2exp5d8_d16,0.016815,0.000191,0.014956,0.000667,0.012526,0.000992,0.014766
4,4,Bi8_sexp8d16,0.015341,0.002136,0.014728,0.001041,0.010751,0.000751,0.013606
5,5,Bi8_top2exp10d16_d16,0.016634,0.002735,0.015349,0.000627,0.011403,0.001181,0.014462


In [55]:
moe_bilstm_results["mean"].min() 
# Top 1: Bi8_sexp8d16 mit 0.0136064913951688
# Top 2: 1xBi20_top2exp5d8_d16 mit 0.013890

0.0136064913951688

In [3]:
#dense_results = pd.read_csv('evaluations/local_learning/dense_local_learning_results.csv')
dense_results = pd.read_csv('evaluations/local_learning/moe_soft_dense_local_learning_results.csv')
dense_results


Unnamed: 0.1,Unnamed: 0,architecture,train_time,avg_time_epoch,mse,mse_std,rmse,rmse_std,mape,mape_std,mae,mae_std
0,0,2xd16_sE4d8_d16,94.014312,2.03907,0.03367,0.000641,0.183488,0.001743,69654.58,9837.284989,0.122516,0.000569
1,1,2xd16_sE4d8_d16,52.153175,1.949474,0.017896,0.000383,0.133771,0.00143,174345.1,72228.646975,0.079514,0.00542
2,2,2xd16_sE4d8_d16,43.482136,1.982685,0.020041,0.000664,0.141552,0.002354,45820.16,28997.492101,0.081765,0.009918
3,3,2xd16_sE4d8_d16,48.717956,1.89608,0.023646,0.001071,0.153745,0.003498,121542.7,13889.035962,0.069786,0.006521
4,4,2xd16_sE4d8_d16,60.132965,1.86464,0.014527,0.000374,0.120523,0.001551,124823.8,28074.649558,0.08454,0.002385
5,5,2xd16_sE4d8_d16,55.440249,1.972699,0.020247,0.000394,0.142289,0.00138,108427.1,6041.949009,0.104712,0.00203
6,6,2xd16_sE4d8_d16,87.748468,1.847504,0.020676,0.001104,0.143757,0.003824,85107.33,11466.412348,0.080189,0.002633
7,7,2xd16_sE4d8_d16,52.946954,1.928415,0.026181,0.001887,0.161737,0.005812,51837.05,22855.995976,0.104406,0.003303
8,8,2xd16_sE4d8_d16,44.16517,2.023682,0.031291,0.001148,0.176873,0.003259,64.12129,0.985042,0.10228,0.00186
9,9,2xd16_sE4d8_d16,63.738232,1.824571,0.014299,0.001059,0.119521,0.004478,2512015.0,124984.056216,0.072786,0.001851


In [4]:
dense_results["mse"].mean() 

0.018560692843877555

In [3]:
#dense_results = pd.read_csv('evaluations/local_learning/dense_local_learning_results.csv')
dense_results = pd.read_csv('evaluations/local_learning/moe_topk_dense_local_learning_results.csv')
dense_results


Unnamed: 0.1,Unnamed: 0,architecture,train_time,avg_time_epoch,mse,mse_std,rmse,rmse_std,mape,mape_std,mae,mae_std
0,0,2xd16_sE4d8_d16,83.040874,1.977245,0.035007,0.000859,0.187092,0.002288,72630.71,16750.254311,0.124065,0.002834
1,1,2xd16_sE4d8_d16,34.115855,2.075183,0.017599,0.000449,0.132655,0.001686,196543.4,25578.935295,0.076366,0.001862
2,2,2xd16_sE4d8_d16,69.371979,2.030803,0.019578,0.000554,0.139912,0.001982,47009.35,6826.545823,0.074854,0.000331
3,3,2xd16_sE4d8_d16,41.559633,1.990049,0.02211,0.000633,0.148684,0.002122,144955.1,20660.772009,0.064685,0.002472
4,4,2xd16_sE4d8_d16,41.952918,1.914348,0.014271,0.000414,0.119454,0.001736,98372.61,34341.315016,0.075848,0.002346
5,5,2xd16_sE4d8_d16,63.257136,1.912281,0.020263,0.000166,0.142347,0.000585,98608.62,12421.175159,0.10203,0.003183
6,6,2xd16_sE4d8_d16,61.831269,1.861274,0.02064,0.001232,0.143624,0.004283,73174.6,11194.008835,0.082122,0.003818
7,7,2xd16_sE4d8_d16,47.181396,2.078284,0.026975,0.001475,0.164199,0.004507,43045.5,11147.308603,0.109392,0.003029
8,8,2xd16_sE4d8_d16,31.721055,1.886508,0.030928,0.003649,0.175655,0.010454,68.94953,10.395231,0.10266,0.00388
9,9,2xd16_sE4d8_d16,54.580453,1.785817,0.014278,0.000805,0.119458,0.003386,2361718.0,367272.659594,0.072956,0.003198


In [4]:
dense_results["mse"].mean() 

0.01877490349320897