In [1]:
# Custom
# The parent directory is appended to the import path before the `utils`
# imports below (presumably that is where the project-local package lives;
# confirm against the repository layout).
import sys
sys.path.append('../')
from utils.dataset_manager import fit_dataset
from utils.constant import FEATURES, LABELS, ATTACKS 

# General
import os
import warnings
from joblib import dump

# Model and Metrics
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

# Warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning
# raised by later cells (fitting, prediction, metrics) is hidden for the rest
# of the session. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Dataset

In [2]:
# Passed to fit_dataset and reused later in the saved model's filename
# (presumably the number of input files to load; confirm in utils.dataset_manager).
n_files = 2

# fit_dataset hands back the training frame and the testing frame.
df_train, df_test = fit_dataset(n_files, ATTACKS)

# Training inputs and targets, selected by the project-wide column constants.
X_train = df_train[FEATURES]
y_train = df_train[LABELS]

# Report the size of each split.
print('Training Population: {}'.format(len(df_train)))
print('Testing Population: {}'.format(len(df_test)))

100%|██████████| 2/2 [00:03<00:00,  1.97s/it]
100%|██████████| 1/1 [00:01<00:00,  1.88s/it]


Training Population: 457492
Testing Population: 275258


# Model

In [3]:
# Classifiers
xgb_model = xgb.XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
logistic_model = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42)

# Create an ensemble using VotingClassifier
ensemble_model = VotingClassifier(estimators=[
    ('xgb', xgb_model),
    ('rf', rf_model),
    ('logistic', logistic_model)],
    voting='soft')

# Fit the ensemble model on the training data
ensemble_model.fit(X_train, y_train)

In [4]:
# Save the model
name = f"../outputs/voting_classifier_{n_files}.joblib"
# Create the target directory first: without it, dump() fails on a missing
# folder and the fitted ensemble from the training cell is never persisted.
os.makedirs(os.path.dirname(name), exist_ok=True)
dump(ensemble_model, name)

['../outputs/voting_classifier_2.joblib']

# Evaluation

In [5]:
# Held-out inputs and targets, selected with the same column constants used
# for training.
X_test = df_test[FEATURES]
y_test = df_test[LABELS]

# Predict
y_pred = ensemble_model.predict(X_test)

# Evaluate
# Scores are printed with an explicit six-decimal precision ('.6f'). A bare
# '4f' only sets a minimum field width of 4, which never takes effect on a
# fixed-point float, so it looks like a precision without being one.
# recall/precision/F1 are called with scikit-learn's default binary settings
# and therefore describe label 1 only; the classification report printed
# last breaks the results down per class.
print('Accuracy: {:.6f}'.format(accuracy_score(y_test, y_pred)))
print('Recall: {:.6f}'.format(recall_score(y_test, y_pred)))
print('Precision: {:.6f}'.format(precision_score(y_test, y_pred)))
print('F1: {:.6f}'.format(f1_score(y_test, y_pred)))
print("Classification Report:\n{}".format(classification_report(y_test, y_pred)))

Accuracy: 0.996178
Recall: 0.997847
Precision: 0.998240
F1: 0.998043
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      6387
           1       1.00      1.00      1.00    268871

    accuracy                           1.00    275258
   macro avg       0.95      0.96      0.96    275258
weighted avg       1.00      1.00      1.00    275258

