In [3]:
# Install CatBoost into the environment of the running kernel.
# %pip (rather than a bare `pip` / `!pip`) targets the kernel's own interpreter;
# the version is pinned to the one this notebook was originally run with.
%pip install -q catboost==1.2.7

Collecting catboost
  Obtaining dependency information for catboost from https://files.pythonhosted.org/packages/1c/e1/78e635a1e5f0066bd02a1ecfd658ad09fe30d275c65c2d0dd76fe253e648/catboost-1.2.7-cp311-cp311-win_amd64.whl.metadata
  Downloading catboost-1.2.7-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Obtaining dependency information for graphviz from https://files.pythonhosted.org/packages/00/be/d59db2d1d52697c6adc9eacaf50e8965b6345cc143f671e1ed068818d5cf/graphviz-0.20.3-py3-none-any.whl.metadata
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp311-cp311-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
    --------------------------------------- 1.5/101.7 MB 49.5 MB/s eta 0:00:03
   - -------------------------------------- 3.7/101.7 MB 59.3 MB/s eta 0:00:02
   -- ------------------------------------- 5.3/101.7 MB 56.1 MB/s eta 0:00:02
   -- -----------------

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import pickle
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

def hex_to_log(x):
    """Convert a hexadecimal string to a log-scaled numeric value.

    Parameters
    ----------
    x : object
        Value to convert. Only ``str`` inputs are parsed; they are read as
        base-16 integers (``int(x, 16)``).

    Returns
    -------
    float or int
        ``np.log1p`` of the parsed integer for a valid hex string; ``0`` when
        ``x`` is not a string or cannot be parsed as hexadecimal.
    """
    try:
        if isinstance(x, str):  # only string values are treated as hex
            # Parse the hex string as an integer, then compress its range with log(1 + v).
            return np.log1p(float(int(x, 16)))
        else:
            # Non-string input (e.g. NaN or an already numeric value) maps to 0.
            return 0
    except ValueError:
        # Raised by int(x, 16) when the string is not valid hexadecimal.
        return 0
    
# Load the labeled dataset from a path relative to the notebook's working directory.
file_path = './datasets/1029_labeled_re.csv'
# 'ID' and 'Data' hold hexadecimal text. Force them to be read as strings so that
# values made up only of decimal digits are not inferred as integers (pandas can
# infer dtypes per chunk and leave an object column with mixed int/str values),
# which would make hex_to_log() treat them as non-strings and return 0.
data = pd.read_csv(file_path, dtype={'ID': str, 'Data': str})

# Show the inferred dtype of every column as a quick schema check.
print(data.dtypes)

Time                 float64
Source                 int64
Destination            int64
Protocol              object
Length                 int64
ID                    object
Data                  object
Same Data              int64
Strange Data           int64
Entropy              float64
Dos Attack            object
Fuzzing Attack        object
Replaying Attack      object
label                  int64
IAT                  float64
IAT_Anomaly            int64
Message_Frequency      int64
Frequency_Anomaly      int64
dtype: object


In [5]:
# Convert the hex-encoded ID and Data columns to log-scaled numeric values.
# NOTE(review): the columns are overwritten in place, so re-running this cell on
# already-converted data passes non-string values to hex_to_log(), which returns 0
# for them. Reload `data` before re-running this cell.
data['ID'] = data['ID'].apply(hex_to_log)
data['Data'] = data['Data'].apply(hex_to_log)

# Feature columns fed to the model, and the target column.
features = ['Time','Length','ID','Data','Same Data','Entropy','IAT_Anomaly','Frequency_Anomaly']
X = data[features]
y = data['label']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# CatBoost Classifier
model = CatBoostClassifier(
    iterations = 1000,
    learning_rate = 0.1,
    depth = 6,
    loss_function = 'MultiClass',
    random_seed = 42,
    verbose = 200
)

model.fit(X_train, y_train)

# Store the model's predictions for the held-out X_test rows in y_pred.
y_pred = model.predict(X_test)

# classification_report() comes from scikit-learn and prints a per-class evaluation:
# precision, recall and F1 score, plus overall accuracy.
# Fixed: the ground-truth labels are y_test (the original referenced an undefined `y_est`).
print(classification_report(y_test, y_pred))

model_filename = 'cb_model.pkl'

with open(model_filename, 'wb') as file:
    # Serialize the trained model object to a binary file with pickle.dump.
    # Reason: the saved model can be reused later without being retrained.
    pickle.dump(model, file)
print(f"Model saved to {model_filename}")

with open(model_filename, 'rb') as file:
    # Deserialize the model from the saved file with pickle.load.
    # SECURITY: pickle.load can execute arbitrary code; only load files that this
    # notebook (or another trusted source) produced.
    loaded_model = pickle.load(file)
print("Model Loaded Successfully")

# Sanity check: the restored model should reproduce the predictions of the original.
loaded_y_pred = loaded_model.predict(X_test)
print("Loaded Model Prediciton complete")

print(classification_report(y_test, loaded_y_pred))

0:	learn: 1.1207457	total: 148ms	remaining: 2m 27s
200:	learn: 0.0590474	total: 3.41s	remaining: 13.5s
400:	learn: 0.0548781	total: 6.5s	remaining: 9.71s
600:	learn: 0.0517782	total: 9.61s	remaining: 6.38s
800:	learn: 0.0491207	total: 12.7s	remaining: 3.16s
999:	learn: 0.0467919	total: 15.7s	remaining: 0us
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     39906
           1       1.00      1.00      1.00      7059
           2       0.65      0.64      0.64      2065
           3       0.68      0.83      0.75       159

    accuracy                           0.97     49189
   macro avg       0.83      0.86      0.84     49189
weighted avg       0.97      0.97      0.97     49189

Model saved to cb_model.pkl
Model loaded successfully
Loaded model prediction complete
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     39906
           1       1.00      1.00      1.00      7059
       