In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

import time
import sys
import os

from typing import Iterable

## 모델 성능 측정
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

## 모델 추출
import joblib


In [41]:
def BOUN_TCP_Anon_preprocessing(packet: pd.DataFrame, window_length: float=0.5):
    '''
    ## 1. 공격 패킷 표시
        - 공격 기간(초)
            - 1. [80.22269, 102.20233]
            - 2. [180.17426, 203.08441]
            - 3. [279.97402, 301.79111]
            - 4. [380.10981, 402.35755]
        - 공격 대상은 10.50.199.86:80
    '''
    
    # 1. 공격 패킷 표시
    packet['is_attack'] = (
        (
            ((packet['Time'] >= 80.22269) & (packet['Time'] <= 102.20233))
            | ((packet['Time'] >= 180.17426) & (packet['Time'] <= 203.08441))
            | ((packet['Time'] >= 279.97402) & (packet['Time'] <= 301.79111))
            | ((packet['Time'] >= 380.10981) & (packet['Time'] <= 402.35755))
        )
        & (packet['Destination_IP'] == '10.50.199.86')
        & (packet['Destination_Port'] == 80.0)
    )



    # 2. 패킷 시간 분할
    packet['window_num'] = packet['Time'] // window_length



    # 3. 윈도우 별 특징 정보 추출
    result = {
        'Source_IP_count': [], 
        'Source_IP_maxiter': [], 
        'Frame_length_sum': [], 
        'Receive_packet_count': [], 
        'attack_packet_density': [], 
    }

    for _, window_cur in packet.groupby(packet['window_num']):
        # print(window_cur)
        result['Source_IP_count'].append(window_cur['Source_ip'].count())
        result['Source_IP_maxiter'].append(window_cur['Source_ip'].value_counts().max())
        result['Frame_length_sum'].append(window_cur['Frame_length'].sum())
        result['Receive_packet_count'].append(window_cur.shape[0])
        result['attack_packet_density'].append((len(window_cur.loc[window_cur['is_attack'] == True]) / window_cur.shape[0]) * 10)
    


    # 4. 결과 반환
    return pd.DataFrame(result)

In [42]:
before = pd.read_csv('../../Resource/DDoS_AI/BOUN_DDoS dataset/BOUN_UDP_Anon.csv')

In [43]:
before

Unnamed: 0,Time,Frame Number,Frame_length,Source_ip,Destination_IP,Source_Port,Destination_Port,SYN,ACK,RST,TTL,TCP_Protocol
0,0.000000,1,68,92.45.54.178,10.50.209.134,,,,,,116,UDP
1,0.000218,2,900,10.50.197.6,31.13.84.8,49218.0,443.0,Not set,Set,Not set,127,TCP
2,0.000233,3,171,31.13.84.8,192.168.66.111,443.0,40991.0,Not set,Set,Not set,25287,TCP
3,0.000235,4,1500,192.168.68.148,54.225.245.82,54602.0,443.0,Not set,Set,Not set,6364,TCP
4,0.000466,5,126,192.168.79.128,64.15.113.173,55251.0,443.0,Not set,Set,Not set,6364,TCP
...,...,...,...,...,...,...,...,...,...,...,...,...
8047330,484.638574,8047331,2978,79.123.178.64,167.114.91.218,3389.0,3403.0,Not set,Set,Not set,127,TCP
8047331,484.638728,8047332,2978,79.123.178.64,167.114.91.218,3389.0,3403.0,Not set,Set,Not set,127,TCP
8047332,484.638730,8047333,382,192.168.74.193,216.58.208.102,,,,,,63128,UDP
8047333,484.638731,8047334,144,78:dd:08:c4:2b:ff,Alcatel-_70:9b:c7,,,,,,63,Generic Routing Encapsulation


In [44]:
## 황경호 교수님 제출용 샘플 데이터 추출 (안쓰면 주석처리)
before.sample(100).to_csv('UDP-Flooding_before_preprocessing_sample_100.csv')

In [45]:
after = BOUN_TCP_Anon_preprocessing(
    packet=before, 
    window_length=0.5
)

In [46]:
after.describe()

Unnamed: 0,Source_IP_count,Source_IP_maxiter,Frame_length_sum,Receive_packet_count,attack_packet_density
count,969.0,969.0,969.0,969.0,969.0
mean,8304.783282,806.803922,6477842.0,8304.783282,0.0
std,1730.536662,162.881335,1689487.0,1730.536662,0.0
min,29.0,8.0,19012.0,29.0,0.0
25%,7045.0,715.0,5243672.0,7045.0,0.0
50%,8172.0,800.0,6320837.0,8172.0,0.0
75%,9425.0,867.0,7542814.0,9425.0,0.0
max,14952.0,1815.0,12180730.0,14952.0,0.0


In [47]:
train = {}
test = {}



train['feature'], test['feature'], train['target'], test['target'] = train_test_split(
    after.drop(['attack_packet_density'], axis=1), 
    after['attack_packet_density'], 
    # stratify=after['attack_packet_density'], 
    test_size=0.3, 
    random_state=42
)

In [48]:
## 모델 테스트 함수
def model_test_result(
        models: Iterable, 
        train: dict, 
        test: dict,
        save_result: bool=False, 
        save_path: str=None, 
        report_nickname: str='',
        save_force: bool=False, 
        result_precision: int=6, 
):
    # 결과 메세지 선언
    result_msg = ''



    # 테스트 환경 -> result_msg
    PYTHON_VERSION = f'{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}'
    result_msg += f'python version : {PYTHON_VERSION}\n\n\n\n'



    # 모델별 학습/예측 결과 -> result_msg
    for model_cur in models:
        # 학습시간 측정
        train_time = time.time()
        model_cur.fit(train['feature'], train['target'])
        train_time = time.time() - train_time

        # 예측시간 측정
        pred_time = time.time()
        pred_result = model_cur.predict(test['feature'])
        pred_time = time.time() - pred_time

        # 결과 생성
        result_msg_cur = ''
        result_msg_cur += f"--측정 결과--\n"
        result_msg_cur += f"class name : {model_cur.__class__.__name__}\n\n"
        result_msg_cur += f"학습 시간 : {round(train_time, result_precision)}초\n"
        result_msg_cur += f"예측 시간 : {round(pred_time, result_precision)}초\n"
        try:
            result_msg_cur += f"classification report :\n{classification_report(test['target'], pred_result)}\n\n"
            result_msg_cur += f"confusion matrix :\n{pd.DataFrame(confusion_matrix(test['target'], pred_result), index=('Leak_true', 'Norm_true'), columns=('Leak_pred', 'Norm_pred'))}\n\n\n\n"
        # except:
            result_msg_cur += f"model_cur.score() : {round(model_cur.score(test['feature'], test['target']), result_precision)}\n"
            result_msg_cur += f"r2 score : {round(r2_score(test['target'], pred_result), result_precision)}\n"
            result_msg_cur += f"MAE : {round(mean_absolute_error(test['target'], pred_result), result_precision)}\n"
            result_msg_cur += f"MSE : {round(mean_squared_error(test['target'], pred_result), result_precision)}\n\n\n\n"
        except:
            pass
        # # 결과 기록
        result_msg += result_msg_cur



    # save_result == True일 시, 테스트 결과/모델/데이터 저장
    if save_result:
        # save_force == True일 시, 경로 없을 시 생성 후 저장
        if save_force and not(os.path.isdir(save_path)):
            os.makedirs(save_path)

        # 리포트 저장 디렉토리 생성
        report_path = f'{save_path}{time.strftime("%y%m%d-%H%M%S")}_{report_nickname}/'
        os.mkdir(report_path)

        # 모델 저장
        for model_cur in models:
            model_path = f'{report_path}model_{PYTHON_VERSION}_{model_cur.__class__.__name__}.pkl'
            joblib.dump(model_cur, model_path)

        # 학습/검증 데이터 저장
        train['feature'].to_csv(f'{report_path}feature_train.csv')
        train['target'].to_csv(f'{report_path}target_train.csv')
        test['feature'].to_csv(f'{report_path}feature_test.csv')
        test['target'].to_csv(f'{report_path}target_test.csv')

        # 테스트 결과 저장
        with open(f'{report_path}test_report.txt', mode='w', encoding='utf-8') as report_file:
           report_file.write(result_msg)

        

    # 테스트 결과 반환
    return result_msg

In [49]:
models = [
    LGBMRegressor(random_state=42), 
    XGBRegressor(random_state=42), 
    LinearRegression(), 
    # LogisticRegression(random_state=42), 
]



print(
    model_test_result(
        models=models, 
        train=train, 
        test=test, 
        save_result=False
    )
)

python version : 3.11.1



--측정 결과--
class name : LGBMRegressor

학습 시간 : 0.009997초
예측 시간 : 0.000998초
classification report :
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       291

    accuracy                           1.00       291
   macro avg       1.00      1.00      1.00       291
weighted avg       1.00      1.00      1.00       291


model_cur.score() : 1.0
r2 score : 1.0
MAE : 0.0
MSE : 0.0



--측정 결과--
class name : XGBRegressor

학습 시간 : 0.023999초
예측 시간 : 0.001초
classification report :
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       291

    accuracy                           1.00       291
   macro avg       1.00      1.00      1.00       291
weighted avg       1.00      1.00      1.00       291


model_cur.score() : 1.0
r2 score : 1.0
MAE : 0.0
MSE : 0.0



--측정 결과--
class name : LinearRegression

학습 시간 : 0.004초
예측 시간 : 0.000999초
classification report :
           