In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
import pickle

def hex_to_log(x):
    try:
        if isinstance(x, str):  
            return np.log1p(float(int(x, 16)))  
        else:
            return 0  
    except ValueError:
        return 0  

file_path = './datasets/1029_labeled_re.csv'
data = pd.read_csv(file_path)

print(data.dtypes)

Time                 float64
Source                 int64
Destination            int64
Protocol              object
Length                 int64
ID                    object
Data                  object
Same Data              int64
Strange Data           int64
Entropy              float64
Dos Attack            object
Fuzzing Attack        object
Replaying Attack      object
label                  int64
IAT                  float64
IAT_Anomaly            int64
Message_Frequency      int64
Frequency_Anomaly      int64
dtype: object


In [3]:
data['ID'] = data['ID'].apply(hex_to_log)
data['Data'] = data['Data'].apply(hex_to_log)

features = ['Time', 'Length', 'ID', 'Data', 'Same Data', 'Entropy','IAT_Anomaly','Frequency_Anomaly']  
X = data[features]
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# CatBoost
cat_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    loss_function='MultiClass',
    random_seed=42,
    verbose=200
)

# XgBoost
xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',  
    num_class=len(y.unique()),
    max_depth=6,
    learning_rate=0.1,
    n_estimators=500,
    random_state=42
)


#RandomForest
rf_model = RandomForestClassifier(random_state = 42)

cat_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

ensemble_model = VotingClassifier(
    estimators=[('cat', cat_model), ('xgb', xgb_model), ('rf', rf_model)],
    voting='soft'  
)

ensemble_model.fit(X_train, y_train)

y_pred = ensemble_model.predict(X_test)
print("Ensemble Model Evaluation:")
print(classification_report(y_test, y_pred))

model_filename = 'ensem_cb_xgb_model.pkl'

with open(model_filename, 'wb') as file:
    pickle.dump(ensemble_model, file)
print(f"Ensemble model saved to {model_filename}")

with open(model_filename, 'rb') as file:
    loaded_model = pickle.load(file)
print("Ensemble model loaded successfully")

loaded_y_pred = loaded_model.predict(X_test)
print("Loaded model prediction complete")

print("Classification report for loaded model:")
print(classification_report(y_test, loaded_y_pred))

0:	learn: 1.1207457	total: 161ms	remaining: 1m 20s
200:	learn: 0.0590474	total: 3.41s	remaining: 5.08s
400:	learn: 0.0548781	total: 6.5s	remaining: 1.6s
499:	learn: 0.0532856	total: 8.1s	remaining: 0us
0:	learn: 1.1207457	total: 15.3ms	remaining: 7.66s
200:	learn: 0.0590474	total: 3.43s	remaining: 5.1s
400:	learn: 0.0548781	total: 6.74s	remaining: 1.66s
499:	learn: 0.0532856	total: 8.34s	remaining: 0us
Ensemble Model Evaluation:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     39906
           1       1.00      1.00      1.00      7059
           2       0.79      0.77      0.78      2065
           3       0.64      0.67      0.66       159

    accuracy                           0.98     49189
   macro avg       0.85      0.86      0.86     49189
weighted avg       0.98      0.98      0.98     49189

Ensemble model saved to ensem_cb_xgb_model.pkl
Ensemble model loaded successfully
Loaded model prediction complete
Classification re