In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.datasets import make_classification
from typing import Dict, List, Tuple, Any
import matplotlib.patches as mpatches
import kagglehub
import glob
import os

warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")

- T·∫°o class DoSDDoSClassifier v·ªõi ƒë·∫ßy ƒë·ªß ph∆∞∆°ng ph√°p:
- Tri·ªÉn khai 4 thu·∫≠t to√°n ML (KNN, AdaBoost, Random Forest, SVM)
- Tri·ªÉn khai 3 k·ªπ thu·∫≠t gi·∫£m chi·ªÅu d·ªØ li·ªáu (PCA, Feature Importance, Univariate Selection)
- Ti·ªÅn x·ª≠ l√Ω d·ªØ li·ªáu (l√†m s·∫°ch, chu·∫©n h√≥a MinMax)

In [2]:
class DoSDDoSClassifier:
    """
    - Thu·∫≠t to√°n 4 ml: KNN, AdaBoost, Random Forest, SVM
    - 3 K·ªπ thu·∫≠t gi·∫£m chi·ªÅu d·ªØ li·ªáu: PCA, Feature Importance, Univariate Selection
    - Ho√†n th√†nh ti·ªÅn x·ª≠ l√Ω
    - ƒê√°nh gi√° v·ªõi th·ªùi gian v√† ƒë·ªô ch√≠nh x√°c
    """
    
    def __init__(self):
        # Thu·∫≠t to√°n ML
        self.ml_algorithms = {
            'KNN': KNeighborsClassifier(n_neighbors=5),
            'AdaBoost': AdaBoostClassifier(n_estimators=50, random_state=42),
            'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
            'SVM': SVC(kernel='rbf', random_state=42)
        }
        
        # Gi·∫£m chi·ªÅu d·ªØ li·ªáu
        self.feature_selection_methods = {
            'PCA': None,  # S·∫Ω ƒë∆∞·ª£c kh·ªüi t·∫°o ƒë·ªông
            'Feature_Importance': None,  # ExtraTree + SelectFromModel
            'Univariate_Selection': None  # SelectKBest v·ªõi chi2
        }
        
        # Scaler normalisation MinMax (-1, 1)
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.label_encoder = LabelEncoder()
        
        # L∆∞u tr·ªØ k·∫øt qu·∫£ v√† d·ªØ li·ªáu ƒë√£ gi·∫£m chi·ªÅu
        self.results = {}
        self.feature_reduced_data = {}
        
    def preprocess_data(self, X, y, dataset_name="Dataset"):
        """
        Ti·ªÅn x·ª≠ l√Ω d·ªØ li·ªáu:
        1. L√†m s·∫°ch gi√° tr·ªã NaN v√† v√¥ c√πng
        2. Lo·∫°i b·ªè c√°c features b·∫±ng 0
        3. Chu·∫©n h√≥a MinMax (-1, 1)
        4. Encode nh√£n
        """
        print(f"\nTi·ªÅn x·ª≠ l√Ω d·ªØ li·ªáu {dataset_name}...")
        print(f"   K√≠ch th∆∞·ªõc ban ƒë·∫ßu: {X.shape}")
        
        # 1. L√†m s·∫°ch d·ªØ li·ªáu
        X_clean = X.copy()
        X_clean = X_clean.replace([np.inf, -np.inf], np.nan)
        X_clean = X_clean.fillna(X_clean.mean())

        # 2. Lo·∫°i b·ªè c√°c feature b·∫±ng 0
        non_zero_var_cols = X_clean.columns[(X_clean != 0).any(axis=0)]
        X_clean = X_clean[non_zero_var_cols]
        
        print(f"   Sau khi l√†m s·∫°ch: {X_clean.shape}")
        
        # 3. Chu·∫©n h√≥a MinMax (-1, 1)
        X_scaled = pd.DataFrame(
            self.scaler.fit_transform(X_clean),
            columns=X_clean.columns,
            index=X_clean.index
        )
        
        # 4. Encode nh√£n
        y_encoded = self.label_encoder.fit_transform(y)
        
        print(f"Ti·ªÅn x·ª≠ l√Ω ho√†n t·∫•t - shape cu·ªëi c√πng: {X_scaled.shape}")
        return X_scaled, y_encoded
    
    def apply_feature_selection(self, X_train, X_test, y_train, target_features=None):
        """
        √Åp d·ª•ng 3 k·ªπ thu·∫≠t gi·∫£m chi·ªÅu d·ªØ li·ªáu:
        1. PCA (gi·∫£m xu·ªëng ~50% s·ªë feature)
        2. Feature Importance (ExtraTree + SelectFromModel)
        3. Univariate Selection (SelectKBest v·ªõi chi2)
        """
        print(f"\n√Åp d·ª•ng c√°c k·ªπ thu·∫≠t gi·∫£m chi·ªÅu d·ªØ li·ªáu...")
        
        # T√≠nh s·ªë l∆∞·ª£ng features m·ª•c ti√™u (kho·∫£ng 50% theo b√†i b√°o)
        if target_features is None:
            target_features = max(X_train.shape[1] // 2, 10)
        
        results = {}
        
        # 1. PCA (gi·∫£m xu·ªëng ~50% s·ªë features)
        print(f"PCA: {X_train.shape[1]} ‚Üí {target_features} features")
        pca = PCA(n_components=target_features)
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)
        results['PCA'] = {
            'X_train': X_train_pca,
            'X_test': X_test_pca,
            'n_features': target_features,
            'method': pca
        }
        
        # 2. Feature Importance ExtraTree + SelectFromModel
        print(f"Feature Importance: t·ª± ƒë·ªông ch·ªçn c√°c features t·ªët nh·∫•t")
        extra_tree = ExtraTreesClassifier(n_estimators=100, random_state=42)
        extra_tree.fit(X_train, y_train)
        selector = SelectFromModel(extra_tree, max_features=target_features)
        X_train_fi = selector.fit_transform(X_train, y_train)
        X_test_fi = selector.transform(X_test)
        results['Feature_Importance'] = {
            'X_train': X_train_fi,
            'X_test': X_test_fi,
            'n_features': X_train_fi.shape[1],
            'method': selector
        }

        # 3. Univariate Selection v·ªõi chi2 (b√†i b√°o trang 3)
        print(f"Univariate Selection: SelectKBest vs chi2")
        # Chuy·ªÉn ƒë·ªïi th√†nh gi√° tr·ªã d∆∞∆°ng cho chi2
        X_train_pos = X_train - X_train.min() + 0.01
        X_test_pos = X_test - X_test.min() + 0.01
        
        univariate = SelectKBest(score_func=chi2, k=target_features)
        X_train_us = univariate.fit_transform(X_train_pos, y_train)
        X_test_us = univariate.transform(X_test_pos)
        results['Univariate_Selection'] = {
            'X_train': X_train_us,
            'X_test': X_test_us,
            'n_features': target_features,
            'method': univariate
        }
        
        print(f"Ho√†n th√†nh gi·∫£m chi·ªÅu d·ªØ li·ªáu")
        return results
    
    def evaluate_model_combination(self, X_train, X_test, y_train, y_test, 
                                 ml_name, feature_method_name, dataset_name):
        """
        ƒê√°nh gi√° th·ªùi gian v√† ƒë·ªô ch√≠nh x√°c
        """
        # L·∫•y m√¥ h√¨nh ML
        model = self.ml_algorithms[ml_name]
        
        # Hu·∫•n luy·ªán vs ƒëo th·ªùi gian
        start_time = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_time
        
        # D·ª± ƒëo√°n vs ƒëo th·ªùi gian
        start_time = time.time()
        y_pred = model.predict(X_test)
        predict_time = time.time() - start_time

        # T√≠nh ƒë·ªô ch√≠nh x√°c
        accuracy = accuracy_score(y_test, y_pred)

        # T·ªïng th·ªùi gian (hu·∫•n luy·ªán + d·ª± ƒëo√°n) t√≠nh b·∫±ng mili gi√¢y
        total_time_ms = (train_time + predict_time) * 1000
        
        return {
            'accuracy': accuracy * 100,  # Ph·∫ßn trƒÉm
            'total_time_ms': total_time_ms,
            'train_time': train_time,
            'predict_time': predict_time,
            'y_pred': y_pred
        }


- T·∫°o c√°c b·ªô d·ªØ li·ªáu t·ªïng h·ª£p m√¥ ph·ªèng NSL-KDD 2019, CICIDS 2017 v√† d·ªØ li·ªáu m√¥ ph·ªèng (ƒëo·∫°n n√†y b·ªè qua v√¨ d√πng lu√¥n 2 b·ªô d·ªØ li·ªáu tr√™n kaggle)

In [3]:
#b·ªè qua
def create_synthetic_datasets():
    """
    T·∫°o c√°c b·ªô d·ªØ li·ªáu t·ªïng h·ª£p m√¥ ph·ªèng ƒë·∫∑c tr∆∞ng c·ªßa 3 b·ªô d·ªØ li·ªáu trong b√†i b√°o:
    1. NSL-KDD 2019: ~150,000 m·∫´u, 42 ƒë·∫∑c tr∆∞ng ‚Üí 21 sau gi·∫£m
    2. CICIDS 2017: 68 ƒë·∫∑c tr∆∞ng ‚Üí 23 sau gi·∫£m  
    3. D·ªØ li·ªáu m√¥ ph·ªèng: 45,500 m·∫´u, 73 ƒë·∫∑c tr∆∞ng ‚Üí 20 sau gi·∫£m
    """
    
    datasets = {}
    
    print("T·∫°o c√°c b·ªô d·ªØ li·ªáu t·ªïng h·ª£p d·ª±a tr√™n b√†i b√°o...")
    
    # 1. Dataset simulant NSL-KDD 2019
    print("\nT·∫°o b·ªô d·ªØ li·ªáu NSL-KDD 2019...")
    X_nsl, y_nsl = make_classification(
        n_samples=15000,  # s·ªë l∆∞·ª£ng m·∫´u (article: 150,000)
        n_features=42,    
        n_informative=35,
        n_redundant=5,
        n_clusters_per_class=2,
        class_sep=0.8,
        random_state=42
    )
    
    # Chuy·ªÉn ƒë·ªïi sang d·ªØ li·ªáu v·ªõi t√™n c·ªôt th·ª±c t·∫ø
    nsl_features = [f'feature_{i+1}' for i in range(42)]
    X_nsl_df = pd.DataFrame(X_nsl, columns=nsl_features)
    y_nsl_labels = ['Normal' if label == 0 else 'Attack' for label in y_nsl]
    
    datasets['NSL-KDD_2019'] = {
        'X': X_nsl_df,
        'y': pd.Series(y_nsl_labels),
        'description': 'Dataset t·ªïng h·ª£p m√¥ ph·ªèng NSL-KDD 2019 (42‚Üí21 ƒë·∫∑c tr∆∞ng)'
    }
    
    # 2. Dataset simulant CICIDS 2017
    print("T·∫°o b·ªô d·ªØ li·ªáu t·ªïng h·ª£p CICIDS 2017...")
    X_cicids, y_cicids = make_classification(
        n_samples=12000,  
        n_features=68,    
        n_informative=55,
        n_redundant=8,
        n_clusters_per_class=3,
        class_sep=0.7,
        random_state=123
    )

    # M√¥ ph·ªèng c√°c lo·∫°i t·∫•n c√¥ng kh√°c nhau
    cicids_features = [f'flow_feature_{i+1}' for i in range(68)]
    X_cicids_df = pd.DataFrame(X_cicids, columns=cicids_features)

    # T·∫°o nh√£n ƒëa l·ªõp ƒë·ªÉ m√¥ ph·ªèng c√°c lo·∫°i t·∫•n c√¥ng kh√°c nhau
    attack_types = ['Normal', 'DoS', 'DDoS', 'Brute_Force']
    y_cicids_multiclass = np.random.choice(attack_types, size=len(y_cicids), p=[0.6, 0.2, 0.15, 0.05])
    
    datasets['CICIDS_2017'] = {
        'X': X_cicids_df,
        'y': pd.Series(y_cicids_multiclass),
        'description': 'Dataset t·ªïng h·ª£p m√¥ ph·ªèng CICIDS 2017 (68‚Üí23 ƒë·∫∑c tr∆∞ng)'
    }

    # 3. Dataset m√¥ ph·ªèng d·ªØ li·ªáu (Lima Filho et al., 2019)
    print("T·∫°o b·ªô d·ªØ li·ªáu m√¥ ph·ªèng...")
    X_sim, y_sim = make_classification(
        n_samples=4550,   
        n_features=73,   
        n_informative=60,
        n_redundant=10,
        n_clusters_per_class=2,
        class_sep=0.9,    # R·∫•t t√°ch bi·ªát ƒë·ªÉ m√¥ ph·ªèng m·ªôt m√¥i tr∆∞·ªùng ki·ªÉm so√°t
        random_state=456
    )
    
    sim_features = [f'network_metric_{i+1}' for i in range(73)]
    X_sim_df = pd.DataFrame(X_sim, columns=sim_features)
    y_sim_labels = ['Normal' if label == 0 else 'Attack' for label in y_sim]
    
    datasets['Simulated_Data'] = {
        'X': X_sim_df,
        'y': pd.Series(y_sim_labels),
        'description': 'Dataset t·ªïng h·ª£p m√¥ ph·ªèng d·ªØ li·ªáu Lima Filho (73‚Üí20 ƒë·∫∑c tr∆∞ng)'
    }
    
    return datasets

# T·∫°o c√°c b·ªô d·ªØ li·ªáu t·ªïng h·ª£p
synthetic_datasets = create_synthetic_datasets()

T·∫°o c√°c b·ªô d·ªØ li·ªáu t·ªïng h·ª£p d·ª±a tr√™n b√†i b√°o...

T·∫°o b·ªô d·ªØ li·ªáu NSL-KDD 2019...
T·∫°o b·ªô d·ªØ li·ªáu t·ªïng h·ª£p CICIDS 2017...
T·∫°o b·ªô d·ªØ li·ªáu m√¥ ph·ªèng...


In [4]:
# B·ªï sung m·ªôt v√†i gi√° tr·ªã c√≥ v·∫•n ƒë·ªÅ ƒë·ªÉ ki·ªÉm tra ti·ªÅn x·ª≠ l√Ω
print("B·ªï sung gi√° tr·ªã NaN v√† v√¥ c·ª±c ƒë·ªÉ ki·ªÉm tra ti·ªÅn x·ª≠ l√Ω...")

for dataset_name, dataset in synthetic_datasets.items():
    # Ti√™m 1-2% gi√° tr·ªã c√≥ v·∫•n ƒë·ªÅ
    n_samples, n_features = dataset['X'].shape
    n_nan = int(0.01 * n_samples * n_features)

    # V·ªã tr√≠ ng·∫´u nhi√™n cho NaN
    nan_positions = np.random.choice(n_samples * n_features, n_nan, replace=False)
    for pos in nan_positions:
        row, col = divmod(pos, n_features)
        dataset['X'].iloc[row, col] = np.nan

    # M·ªôt v√†i gi√° tr·ªã v√¥ c·ª±c
    n_inf = max(1, n_nan // 10)
    inf_positions = np.random.choice(n_samples * n_features, n_inf, replace=False)
    for pos in inf_positions:
        row, col = divmod(pos, n_features)
        dataset['X'].iloc[row, col] = np.inf if np.random.random() > 0.5 else -np.inf

print(f"\n‚úÖ 3 datasets t·ªïng h·ª£p ƒë∆∞·ª£c t·∫°o th√†nh c√¥ng:")
for name, dataset in synthetic_datasets.items():
    print(f"   ‚Ä¢ {name}: {dataset['X'].shape[0]} m·∫´u, {dataset['X'].shape[1]} ƒë·∫∑c tr∆∞ng")
    print(f"     Classes: {dataset['y'].value_counts().to_dict()}")
    print(f"     NaN values: {dataset['X'].isnull().sum().sum()}")
    print(f"     Inf values: {np.isinf(dataset['X']).sum().sum()}")

B·ªï sung gi√° tr·ªã NaN v√† v√¥ c·ª±c ƒë·ªÉ ki·ªÉm tra ti·ªÅn x·ª≠ l√Ω...

‚úÖ 3 datasets t·ªïng h·ª£p ƒë∆∞·ª£c t·∫°o th√†nh c√¥ng:
   ‚Ä¢ NSL-KDD_2019: 15000 m·∫´u, 42 ƒë·∫∑c tr∆∞ng
     Classes: {'Normal': 7500, 'Attack': 7500}
     NaN values: 6296
     Inf values: 630
   ‚Ä¢ CICIDS_2017: 12000 m·∫´u, 68 ƒë·∫∑c tr∆∞ng
     Classes: {'Normal': 7233, 'DoS': 2376, 'DDoS': 1791, 'Brute_Force': 600}
     NaN values: 8150
     Inf values: 816
   ‚Ä¢ Simulated_Data: 4550 m·∫´u, 73 ƒë·∫∑c tr∆∞ng
     Classes: {'Attack': 2279, 'Normal': 2271}
     NaN values: 3319
     Inf values: 332


- Load data t·ª´ b·ªô dataset NSL-KDD 2019, CICIDS 2017 tr√™n kaggle
- V·ªõi b·ªô NSL-KDD 2019: g√°n t√™n c·ªôt, encode m·ªôt s·ªë c·ªôt d·ªØ li·ªáu, g·ªôp 2 b·ªô d·ªØ li·ªáu train, test
- V·ªõi CICIDS 2017: g·ªôp t·∫•t c·∫£ c√°c b·ªô d·ªØ li·ªáu

In [5]:
def load_dataset():
  
  datasets = {}
  # L·∫•y d·ªØ li·ªáu b·ªô NSL-KDD 2019
  # Download latest version
  path_nslkdd = kagglehub.dataset_download("hassan06/nslkdd")
  train_data_KDD = pd.read_csv(f"{path_nslkdd}/KDDTrain+.txt", header=None)
  test_data_KDD = pd.read_csv(f"{path_nslkdd}/KDDTest+.txt", header=None)
  
  # Define the list of column names based on the NSL-KDD dataset description
  columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'attack', 'level'
  ]

  # Assign the column names to the dataframe
  train_data_KDD.columns = columns
  test_data_KDD.columns = columns
  
  data_KDD = pd.concat([train_data_KDD, test_data_KDD], ignore_index=True)

  X_KDD = data_KDD.drop(columns=['attack', 'level'])
  for col in X_KDD.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_KDD[col] = le.fit_transform(X_KDD[col])
  y_KDD = data_KDD['attack']

  datasets['NSL-KDD'] = {
        'X': X_KDD,
        'y': y_KDD,
        'description': 'NSL-KDD dataset t·ª´ Kaggle'
    }

  # L·∫•y d·ªØ li·ªáu b·ªô CICIDS 2017
  # Download latest version
  path_cicids = kagglehub.dataset_download("chethuhn/network-intrusion-dataset")

  # Get all CSV files in the directory
  csv_files = glob.glob(os.path.join(path_cicids, "*.csv"))
  # Read and concatenate all CSV files into a single DataFrame
  df_2017 = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)
  df_2017.columns = df_2017.columns.str.strip()
  #print(df_2017['Label'].value_counts())
  df_2017['Label'] = df_2017['Label'].str.strip()

  X_cicids = df_2017.drop(columns=['Label'])
  y_cicids = df_2017['Label']
  datasets['CICIDS_2017'] = {
        'X': X_cicids,
        'y': y_cicids,
        'description': 'CICIDS 2017 dataset t·ª´ Kaggle'
    }
  return datasets
datasets = load_dataset()    



In [6]:
print((datasets))

{'NSL-KDD': {'X':         duration  protocol_type  service  flag  src_bytes  dst_bytes  land  \
0              0              1       20     9        491          0     0   
1              0              2       44     9        146          0     0   
2              0              1       49     5          0          0     0   
3              0              1       24     9        232       8153     0   
4              0              1       24     9        199        420     0   
...          ...            ...      ...   ...        ...        ...   ...   
148512         0              1       54     9        794        333     0   
148513         0              1       24     9        317        938     0   
148514         0              1       24     9      54540       8314     0   
148515         0              2       12     9         42         42     0   
148516         0              1       57     1          0          0     0   

        wrong_fragment  urgent  hot  ...  dst

Implement systematic evaluation with time/accuracy measurement 
- Tri·ªÉn khai ƒë√°nh gi√° h·ªá th·ªëng v√° ƒëo th·ªùi gian/ƒë·ªô ch√≠nh x√°c

In [6]:
def run_complete_evaluation(classifier, datasets):
    """
    Th·ª±c hi·ªán ƒë√°nh gi√° ƒë·∫ßy ƒë·ªß theo ph∆∞∆°ng ph√°p c·ªßa b√†i vi·∫øt:
    - Ki·ªÉm tra c√°c thu·∫≠t to√°n 4 ml v·ªõi 3 k·ªπ thu·∫≠t l·ª±a ch·ªçn t√≠nh nƒÉng
    - ƒêo l∆∞·ªùng th·ªùi gian th·ª±c hi·ªán v√† ƒë·ªô ch√≠nh x√°c
    - T·∫°o b·∫£ng k·∫øt qu·∫£
    """
    
    print("B·∫Øt ƒë·∫ßu ƒë√°nh gi√°")
    print("="*80)
    
    all_results = {}
    

    for dataset_name, dataset_info in datasets.items():
        print(f"\nƒê√°nh gi√° b·ªô d·ªØ li·ªáu: {dataset_name}")
        print(f"   {dataset_info['description']}")
        print("-" * 60)
        
        X, y = dataset_info['X'], dataset_info['y']
        
        # 1. Ti·ªÅn x·ª≠ l√Ω
        X_processed, y_processed = classifier.preprocess_data(X, y, dataset_name)

        # 2. Division train/test (80/20)
        X_train, X_test, y_train, y_test = train_test_split(
            X_processed, y_processed, test_size=0.2, random_state=42, stratify=y_processed
        )
        
        print(f"\n   Division:")
        print(f"      Train: {X_train.shape[0]} m·∫´u")
        print(f"      Test: {X_test.shape[0]} m·∫´u")
        
        # 3. √Åp d·ª•ng c√°c k·ªπ thu·∫≠t l·ª±a gi·∫£m chi·ªÅu
        feature_selection_results = classifier.apply_feature_selection(
            X_train, X_test, y_train
        )
        
        # 4. ƒê√°nh gi√° k·∫øt h·ª£p ML + Feature Selection
        dataset_results = {}
        
        print(f"\n   ƒê√ÅNH GI√Å C√ÅC K·∫æT H·ª¢P ML + Feature Selection:")
        print(f"   {'Thu·∫≠t to√°n':<15} {'Ph∆∞∆°ng ph√°p gi·∫£m chi·ªÅu':<18} {'ƒê·ªô ch√≠nh x√°c (%)':<12} {'Th·ªùi gian (ms)':<10} {'S·ªë features':<8}")
        print(f"   {'-'*15} {'-'*18} {'-'*12} {'-'*10} {'-'*8}")
        
        # ƒê√°nh gi√° tr∆∞·ªõc khi gi·∫£m chi·ªÅu (baseline)
        print(f"\n   BASELINE (kh√¥ng gi·∫£m chi·ªÅu):")
        for ml_name in classifier.ml_algorithms.keys():
            result = classifier.evaluate_model_combination(
                X_train, X_test, y_train, y_test, ml_name, "No_Reduction", dataset_name
            )
            
            dataset_results[f"{ml_name}_No_Reduction"] = {
                'ml_algorithm': ml_name,
                'feature_method': 'No_Reduction',
                'accuracy': result['accuracy'],
                'time_ms': result['total_time_ms'],
                'n_features': X_train.shape[1]
            }
            
            print(f"   {ml_name:<15} {'No_Reduction':<18} {result['accuracy']:<12.2f} {result['total_time_ms']:<10.1f} {X_train.shape[1]:<8}")
        
        # ƒë√°nh gi√° khi gi·∫£m chi·ªÅu
        print(f"\n   SAU KHI GI·∫¢M CHI·ªÄU:")
        for feature_method, feature_data in feature_selection_results.items():
            print(f"\n   üîπ {feature_method}:")
            
            X_train_reduced = feature_data['X_train']
            X_test_reduced = feature_data['X_test']
            n_features = feature_data['n_features']
            
            for ml_name in classifier.ml_algorithms.keys():
                result = classifier.evaluate_model_combination(
                    X_train_reduced, X_test_reduced, y_train, y_test, 
                    ml_name, feature_method, dataset_name
                )
                
                key = f"{ml_name}_{feature_method}"
                dataset_results[key] = {
                    'ml_algorithm': ml_name,
                    'feature_method': feature_method,
                    'accuracy': result['accuracy'],
                    'time_ms': result['total_time_ms'],
                    'n_features': n_features
                }
                
                print(f"     {ml_name:<15} {feature_method:<18} {result['accuracy']:<12.2f} {result['total_time_ms']:<10.1f} {n_features:<8}")
        
        all_results[dataset_name] = dataset_results
        
        # X√°c ƒë·ªãnh s·ª± k·∫øt h·ª£p t·ªët nh·∫•t cho b·ªô d·ªØ li·ªáu n√†y
        best_combo = max(dataset_results.items(), key=lambda x: x[1]['accuracy'])
        print(f"\n   S·ª± k·∫øt h·ª£p t·ªët nh·∫•t {dataset_name}:")
        print(f"      {best_combo[1]['ml_algorithm']} + {best_combo[1]['feature_method']}")
        print(f"      ƒê·ªô ch√≠nh x√°c: {best_combo[1]['accuracy']:.2f}%")
        print(f"      Th·ªùi gian: {best_combo[1]['time_ms']:.1f} ms")
        print(f"      S·ªë ƒë·∫∑c ƒëi·ªÉm: {best_combo[1]['n_features']}")

    return all_results

# Kh·ªüi t·∫°o tr√¨nh ph√¢n lo·∫°i v√† th·ª±c hi·ªán ƒë√°nh gi√°
classifier = DoSDDoSClassifier()
#evaluation_results = run_complete_evaluation(classifier, load_dataset())
evaluation_results = run_complete_evaluation(classifier, create_synthetic_datasets())


T·∫°o c√°c b·ªô d·ªØ li·ªáu t·ªïng h·ª£p d·ª±a tr√™n b√†i b√°o...

T·∫°o b·ªô d·ªØ li·ªáu NSL-KDD 2019...
T·∫°o b·ªô d·ªØ li·ªáu t·ªïng h·ª£p CICIDS 2017...
T·∫°o b·ªô d·ªØ li·ªáu m√¥ ph·ªèng...
B·∫Øt ƒë·∫ßu ƒë√°nh gi√°

ƒê√°nh gi√° b·ªô d·ªØ li·ªáu: NSL-KDD_2019
   Dataset t·ªïng h·ª£p m√¥ ph·ªèng NSL-KDD 2019 (42‚Üí21 ƒë·∫∑c tr∆∞ng)
------------------------------------------------------------

Ti·ªÅn x·ª≠ l√Ω d·ªØ li·ªáu NSL-KDD_2019...
   K√≠ch th∆∞·ªõc ban ƒë·∫ßu: (15000, 42)
   Sau khi l√†m s·∫°ch: (15000, 42)
Ti·ªÅn x·ª≠ l√Ω ho√†n t·∫•t - shape cu·ªëi c√πng: (15000, 42)

   Division:
      Train: 12000 m·∫´u
      Test: 3000 m·∫´u

√Åp d·ª•ng c√°c k·ªπ thu·∫≠t gi·∫£m chi·ªÅu d·ªØ li·ªáu...
PCA: 42 ‚Üí 21 features
Feature Importance: t·ª± ƒë·ªông ch·ªçn c√°c features t·ªët nh·∫•t
Univariate Selection: SelectKBest vs chi2
Ho√†n th√†nh gi·∫£m chi·ªÅu d·ªØ li·ªáu

   ƒê√ÅNH GI√Å C√ÅC K·∫æT H·ª¢P ML + Feature Selection:
   Thu·∫≠t to√°n      Ph∆∞∆°ng ph√°p gi·∫£m chi·ªÅu ƒê·ªô ch√≠nh 

- T·∫°o b·∫£ng k·∫øt qu·∫£

In [7]:
def create_results_tables(evaluation_results):
    """
    t·∫°o b·∫£ng k·∫øt qu·∫£
    """
    
    print("T·∫†O C√ÅC B·∫¢NG K·∫æT QU·∫¢")
    print("="*80)
    
    for dataset_name, results in evaluation_results.items():
        print(f"\nB·∫¢NG K·∫æT QU·∫¢ - {dataset_name}")
        print("-" * 70)

        # t·∫°o 1 DataFrame ƒë·ªÉ hi·ªÉn th·ªã b·∫£ng
        table_data = []
        
        for combo_name, combo_results in results.items():
            ml_algo = combo_results['ml_algorithm']
            feature_method = combo_results['feature_method']
            accuracy = combo_results['accuracy']
            time_ms = combo_results['time_ms']
            n_features = combo_results['n_features']
            
            table_data.append({
                'Thu·∫≠t to√°n ML': ml_algo,
                'Ph∆∞∆°ng ph√°p gi·∫£m chi·ªÅu': feature_method,
                'ƒê·ªô ch√≠nh x√°c (%)': f"{accuracy:.2f}",
                'Th·ªùi gian (ms)': f"{time_ms:.1f}",
                'S·ªë features': n_features
            })
        
        df_results = pd.DataFrame(table_data)
        
        # S·∫Øp x·∫øp b·∫±ng c√°ch gi·∫£m ƒë·ªô ch√≠nh x√°c
        df_results = df_results.sort_values('ƒê·ªô ch√≠nh x√°c (%)', ascending=False)

        print(df_results.to_string(index=False))

        # Th·ªëng k√™ b·ªï sung
        best_accuracy = df_results.iloc[0]
        print(f"\n  K·∫æT QU·∫¢ T·ªêT NH·∫§T:")
        print(f"      {best_accuracy['Thu·∫≠t to√°n ML']} + {best_accuracy['Ph∆∞∆°ng ph√°p gi·∫£m chi·ªÅu']}")
        print(f"      ƒê·ªô ch√≠nh x√°c: {best_accuracy['ƒê·ªô ch√≠nh x√°c (%)']}%")
        print(f"      Th·ªùi gian: {best_accuracy['Th·ªùi gian (ms)']} ms")
        print(f"      S·ªë features: {best_accuracy['S·ªë features']}")

        # Ph√¢n t√≠ch c·∫£i thi·ªán th·ªùi gian so v·ªõi ƒë·ªô ch√≠nh x√°c
        baseline_results = df_results[df_results['Ph∆∞∆°ng ph√°p gi·∫£m chi·ªÅu'] == 'No_Reduction']
        reduced_results = df_results[df_results['Ph∆∞∆°ng ph√°p gi·∫£m chi·ªÅu'] != 'No_Reduction']

        if not baseline_results.empty and not reduced_results.empty:
            avg_baseline_time = baseline_results['Th·ªùi gian (ms)'].str.replace(' ms', '').astype(float).mean()
            avg_reduced_time = reduced_results['Th·ªùi gian (ms)'].str.replace(' ms', '').astype(float).mean()

            time_improvement = ((avg_baseline_time - avg_reduced_time) / avg_baseline_time) * 100

            print(f"\n  PH√ÇN T√çCH C·∫¢I THI·ªÜN:")
            print(f"      Gi·∫£m th·ªùi gian trung b√¨nh: {time_improvement:.1f}%")
            print(f"      Th·ªùi gian trung b√¨nh baseline: {avg_baseline_time:.1f} ms")
            print(f"      Th·ªùi gian trung b√¨nh v·ªõi gi·∫£m chi·ªÅu: {avg_reduced_time:.1f} ms")

# T·∫°o b·∫£ng k·∫øt qu·∫£
create_results_tables(evaluation_results)

T·∫†O C√ÅC B·∫¢NG K·∫æT QU·∫¢

B·∫¢NG K·∫æT QU·∫¢ - NSL-KDD_2019
----------------------------------------------------------------------
Thu·∫≠t to√°n ML Ph∆∞∆°ng ph√°p gi·∫£m chi·ªÅu ƒê·ªô ch√≠nh x√°c (%) Th·ªùi gian (ms)  S·ªë features
          SVM           No_Reduction            98.57         1098.3           42
          KNN           No_Reduction            97.00          335.4           42
 RandomForest           No_Reduction            93.60         4682.3           42
          SVM                    PCA            93.03         1396.5           21
          SVM   Univariate_Selection            90.70         1265.9           21
          SVM     Feature_Importance            90.40         1450.7           15
          KNN                    PCA            89.40           33.0           21
          KNN   Univariate_Selection            88.80           79.5           21
 RandomForest                    PCA            87.47         3285.2           21
          KNN     Feature

Create performance visualizations and comparisons
- T·∫°o bi·ªÉu ƒë·ªì so s√°nh hi·ªáu nƒÉng