### Demonstration of Aggregate Prediction Algorithms (Weighted Score and Majority Voting) Using Different Classifiers

In [1]:
import helper, algorithm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
import pandas as pd
import joblib
import time
import warnings
warnings.simplefilter('ignore')

0. Train client and server models

In [2]:
# Temporarily switch the root directory to the upper level to avoid not finding the dataset files.
import os
original_dir = os.getcwd()
os.chdir('..')

X_train1, y_train1 = helper.load_sensor_train_set(1)
X_train2, y_train2 = helper.load_sensor_train_set(2)
X_train3, y_train3 = helper.load_sensor_train_set(3)
X_train4, y_train4 = helper.load_network_train_set()
sensor_test, network_test = helper.load_test_set()

os.chdir(original_dir) # Switch back to the original directory

In [3]:
rf = RandomForestClassifier()
rf.fit(X_train1, y_train1)

rf_filename = './new_models/client_1.joblib'
joblib.dump(rf, filename=rf_filename)

['./new_models/client_1.joblib']

In [4]:
lr = LogisticRegression()
lr.fit(X_train2, y_train2)

lr_filename = './new_models/client_2.joblib'
joblib.dump(lr, filename=lr_filename)

['./new_models/client_2.joblib']

In [5]:
gb = GradientBoostingClassifier()
gb.fit(X_train3, y_train3)

gb_filename = './new_models/client_3.joblib'
joblib.dump(gb, filename=gb_filename)

['./new_models/client_3.joblib']

In [6]:
sv = svm.SVC()
sv.fit(X_train4, y_train4)

sv_filename = './new_models/server.joblib'
joblib.dump(sv, filename=sv_filename)

['./new_models/server.joblib']

1. Testing of each model and demonstration of model aggregate prediction algorithms

In [7]:
# Load the models
sensor_models = [rf_filename, lr_filename, gb_filename]
network_model = sv_filename

# Test models 
models_predictions, models_metrics = algorithm.get_predictions_and_metrics(
    local_models=sensor_models,
    sensor_test=sensor_test,
    global_model=network_model,
    network_test=network_test,
    roc=False
)


Client 1 Prediction Results:
*******************
Accuracy : 0.911647
Precision: 0.911929
Recall   : 0.911647
F1 Score : 0.911759
*******************


Client 2 Prediction Results:
*******************
Accuracy : 0.617661
Precision: 0.591625
Recall   : 0.617661
F1 Score : 0.533447
*******************


Client 3 Prediction Results:
*******************
Accuracy : 0.725896
Precision: 0.725676
Recall   : 0.725896
F1 Score : 0.725784
*******************


Server Prediction Results:
*******************
Accuracy : 0.971000
Precision: 0.970982
Recall   : 0.971000
F1 Score : 0.970987
*******************



In [8]:
# Load network testing set, set the weights
y_test = network_test.iloc[:, -1]
model_weights = [0.2, 0.2, 0.2, 0.4]

In [9]:
print(f"Aggregate Prediction by Weighted Score:")

score = algorithm.aggregate_predict_by_score(
    models_predictions=models_predictions,
    models_metrics=models_metrics,
    models_weights=model_weights)

helper.get_metrics(y_test, score, printout=True)

Aggregate Prediction by Weighted Score:
*******************
Accuracy : 0.953771
Precision: 0.954912
Recall   : 0.953771
F1 Score : 0.953393
*******************



(0.9537712895377128,
 0.9549118136813081,
 0.9537712895377128,
 0.9533932867229846)

In [10]:
print(f"Aggregate Prediction by Majority Voting:")

vote = algorithm.aggregate_predict_by_vote(models_predictions=models_predictions)

helper.get_metrics(y_test, vote, printout=True)

Aggregate Prediction by Majority Voting:
*******************
Accuracy : 0.937329
Precision: 0.943410
Recall   : 0.937329
F1 Score : 0.937893
*******************



(0.937329499373295, 0.9434096937078937, 0.937329499373295, 0.937893072417127)

2. Simulation of intrusion detection

In [11]:
sensor_test['target'].value_counts()

target
0    24859
1    15830
Name: count, dtype: int64

In [12]:
# Select 10000 equally distributed samples from the testing set
s0 = sensor_test[sensor_test['target'] == 0].sample(10000)
s1 = sensor_test[sensor_test['target'] == 1].sample(10000)

sampled_sensor_data = pd.concat([s0, s1])
sampled_network_data = network_test.loc[sampled_sensor_data.index]

sampled_sensor_data = sampled_sensor_data.reset_index(drop=True)
sampled_network_data = sampled_network_data.reset_index(drop=True)

print(sampled_sensor_data['target'].value_counts())
print(sampled_network_data['target'].value_counts())

target
0    10000
1    10000
Name: count, dtype: int64
target
0    10000
1    10000
Name: count, dtype: int64


In [13]:
# Use Client 1 as an example
client_1_model = joblib.load(rf_filename)

test_X = sampled_sensor_data.iloc[:, :-1]
test_y = sampled_sensor_data.iloc[:, -1]

client_1_predict = client_1_model.predict(test_X)

helper.get_metrics(test_y, client_1_predict, printout=True)

*******************
Accuracy : 0.909100
Precision: 0.909585
Recall   : 0.909100
F1 Score : 0.909073
*******************



(0.9091, 0.9095846861341838, 0.9091, 0.9090731001859589)

In [14]:
client_1_predict_df = pd.DataFrame(client_1_predict, columns=['Pred'])
client_1_predict_df['Pred'].value_counts()

Pred
0    10344
1     9656
Name: count, dtype: int64

In [15]:
# Client 1 will test the sensor data locally, only those that pass will be sent to the server
# Drop false data, target = 1
rows_to_keep = client_1_predict_df['Pred'] == 0
filtered_sensor_data = sampled_sensor_data[rows_to_keep]
filtered_network_data = sampled_network_data[rows_to_keep]

print(filtered_sensor_data['target'].value_counts())
print(filtered_network_data['target'].value_counts())

target
0    9263
1    1081
Name: count, dtype: int64
target
0    9263
1    1081
Name: count, dtype: int64


In [16]:
# Test Client 1 data using all models
new_models_predictions, new_models_metrics = algorithm.get_predictions_and_metrics(
    local_models=sensor_models,
    sensor_test=filtered_sensor_data,
    global_model=network_model,
    network_test=filtered_network_data,
    roc=False
)


Client 1 Prediction Results:
*******************
Accuracy : 0.895495
Precision: 0.801911
Recall   : 0.895495
F1 Score : 0.846123
*******************


Client 2 Prediction Results:
*******************
Accuracy : 0.849381
Precision: 0.814884
Recall   : 0.849381
F1 Score : 0.830957
*******************


Client 3 Prediction Results:
*******************
Accuracy : 0.767595
Precision: 0.839911
Recall   : 0.767595
F1 Score : 0.798144
*******************


Server Prediction Results:
*******************
Accuracy : 0.972448
Precision: 0.974675
Recall   : 0.972448
F1 Score : 0.973176
*******************



In [17]:
# Load Client 1 network testing set
new_y_test = filtered_network_data.iloc[:, -1]

In [18]:
print(f"Client 1 Aggregate Prediction by Weighted Score:")
start_time = time.time()

new_score = algorithm.aggregate_predict_by_score(
    models_predictions=new_models_predictions,
    models_metrics=new_models_metrics,
    models_weights=model_weights)

end_time = time.time()
rows = len(new_models_predictions[0])
time_spend = end_time - start_time

helper.get_metrics(new_y_test, new_score, printout=True)
print(f"Time spent predicting {rows} samples: {time_spend:.4f}s, average time spent: {time_spend / rows:.4f}s.")

Client 1 Aggregate Prediction by Weighted Score:
*******************
Accuracy : 0.928074
Precision: 0.925984
Recall   : 0.928074
F1 Score : 0.913639
*******************

Time spent predicting 10344 samples: 0.0010s, average time spent: 0.0000s.


In [19]:
print(f"Client 1 Aggregate Prediction by Majority Voting:")
start_time = time.time()

new_vote = algorithm.aggregate_predict_by_vote(models_predictions=new_models_predictions)

end_time = time.time()
time_spend = end_time - start_time

helper.get_metrics(new_y_test, new_vote, printout=True)
print(f"Time spent predicting {rows} samples: {time_spend:.4f}s, average time spent: {time_spend / rows:.4f}s.")

Client 1 Aggregate Prediction by Majority Voting:
*******************
Accuracy : 0.955530
Precision: 0.964695
Recall   : 0.955530
F1 Score : 0.958161
*******************

Time spent predicting 10344 samples: 0.0060s, average time spent: 0.0000s.
