In [23]:
# SMOTE-Tomek sampling retaining 5% of original data
import os
import pandas as pd
from imblearn.combine import SMOTETomek  
from sklearn.preprocessing import MinMaxScaler

folder_path = os.path.join(os.getcwd(), 'CIC')
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

chunksize = 500000
target_col = 'label'
label_mapping = {}

all_data_chunks = []
fraction_to_keep = 0.05  # Keep 5% of each chunk

for csv_file in csv_files:
    csv_path = os.path.join(folder_path, csv_file)
    print(f'Processing file: {csv_file}')

    for chunk in pd.read_csv(csv_path, chunksize=chunksize):
        # Convert label 
        if target_col in chunk.columns:
            # Build label mapping if empty
            if not label_mapping:
                unique_labels = chunk[target_col].unique()
                label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
            chunk[target_col] = chunk[target_col].map(label_mapping)

        # # Feature engineering
        # if 'Tot sum' in chunk.columns and 'Duration' in chunk.columns:
        #     chunk['Packets_Duration_Ratio'] = chunk['Tot sum'] / (chunk['Duration'] + 1e-6)

        # Randomly keep only "fraction_to_keep" of the current chunk
        chunk_sampled = chunk.sample(frac=fraction_to_keep, random_state=42)
        all_data_chunks.append(chunk_sampled)

# Concatenate all sampled chunks
subset_data = pd.concat(all_data_chunks, ignore_index=True)
print(f"\nSubset data shape (before SMOTE-Tomek): {subset_data.shape}")
print("This should be roughly 20% of the original dataset.")

# Summary before resampling
print("\nSummary statistics for data BEFORE SMOTE-Tomek:\n")
before_stats = subset_data.describe()
print(before_stats)

# Class distribution before
if target_col in subset_data.columns:
    class_dist_before = subset_data[target_col].value_counts(dropna=False)
    print("\nClass distribution (BEFORE SMOTE-Tomek):\n", class_dist_before)

# Apply SMOTE-Tomek
if target_col in subset_data.columns:
    feature_cols = [col for col in subset_data.columns if col != target_col]
    X = subset_data[feature_cols]
    y = subset_data[target_col]

    # Normalize features
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    print("\nApplying SMOTE-Tomek for class imbalance handling...")
    smote_tomek = SMOTETomek(random_state=5)
    X_resampled, y_resampled = smote_tomek.fit_resample(X_scaled, y)

    # Rebuild into DataFrame
    resampled_data = pd.DataFrame(X_resampled, columns=feature_cols)
    resampled_data[target_col] = y_resampled

    print(f"Resampled data shape (after SMOTE-Tomek): {resampled_data.shape}")

    # Summary after
    print("\nSummary statistics for data AFTER SMOTE-Tomek:\n")
    after_stats = resampled_data.describe()
    print(after_stats)

    # Class distribution after
    class_dist_after = resampled_data[target_col].value_counts(dropna=False)
    print("\nClass distribution (AFTER SMOTE-Tomek):\n", class_dist_after)

    # Differences in summary stats
    common_stats_index = before_stats.index.intersection(after_stats.index)
    common_columns = before_stats.columns.intersection(after_stats.columns)

    diff_stats = after_stats.loc[common_stats_index, common_columns] - \
                 before_stats.loc[common_stats_index, common_columns]

    print("\nDifference in summary statistics (AFTER - BEFORE):\n")
    print(diff_stats)

    # Save to CSV
    before_stats.to_csv("before_smote_tomek_stats.csv")
    after_stats.to_csv("after_smote_tomek_stats.csv")
    diff_stats.to_csv("difference_in_stats_smote_tomek.csv", float_format="%.6f")
    subset_data.to_csv("subset_data.csv", index=False)
    resampled_data.to_csv("smote_tomek_data.csv", index=False)

else:
    print("\nNo 'label' column found in subset_data. SMOTE-Tomek skipped.")


Processing file: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00013-363d1ba3-8

In [24]:
#smote ENN
import os
import pandas as pd
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
from sklearn.neighbors import NearestNeighbors
from imblearn.over_sampling import SMOTE
# Setup
folder_path = os.path.join(os.getcwd(), 'CIC')
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
chunksize = 500000
target_col = 'label'
fraction_to_keep = 0.05
label_mapping = {}
all_data_chunks = []

# Read and sample
for csv_file in csv_files:
    csv_path = os.path.join(folder_path, csv_file)
    print(f'Processing file: {csv_file}')

    for chunk in pd.read_csv(csv_path, chunksize=chunksize):
        if target_col in chunk.columns:
            if not label_mapping:
                unique_labels = chunk[target_col].unique()
                label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
            chunk[target_col] = chunk[target_col].map(label_mapping)

        chunk_sampled = chunk.sample(frac=fraction_to_keep, random_state=42)
        all_data_chunks.append(chunk_sampled)

subset_data = pd.concat(all_data_chunks, ignore_index=True)
print(f"\nSubset data shape (before SMOTE-ENN): {subset_data.shape}")
print(subset_data.describe())

if target_col in subset_data.columns:
    class_dist_before = subset_data[target_col].value_counts(dropna=False)
    print("\nClass distribution BEFORE SMOTE-ENN:\n", class_dist_before)

feature_cols = [col for col in subset_data.columns if col != target_col]
X = subset_data[feature_cols]
y = subset_data[target_col]

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Check minority class size
class_counts = Counter(y)
minority_count = min(class_counts.values())

if minority_count < 6:
    print("\n Not enough minority samples for SMOTE. Skipping resampling.")
    resampled_data = subset_data.copy()
else:
    print("\n Applying SMOTE-ENN...")
    
    # Create SMOTE object with custom KNN
    knn = NearestNeighbors(n_neighbors=2, n_jobs=-1)
    smote = SMOTE(random_state=5, k_neighbors=knn)
    
    # Pass custom SMOTE to SMOTEENN
    smote_enn = SMOTEENN(random_state=5, smote=smote)
    
    X_resampled, y_resampled = smote_enn.fit_resample(X_scaled, y)
    resampled_data = pd.DataFrame(X_resampled, columns=feature_cols)
    resampled_data[target_col] = y_resampled


    print(f"\nResampled data shape: {resampled_data.shape}")
    print("\nSummary statistics AFTER SMOTE-ENN:\n")
    print(resampled_data.describe())

    class_dist_after = resampled_data[target_col].value_counts(dropna=False)
    print("\nClass distribution AFTER SMOTE-ENN:\n", class_dist_after)

    diff_stats = resampled_data.describe() - subset_data.describe()
    print("\nDifference in summary statistics (AFTER - BEFORE):\n")
    print(diff_stats)
    diff_stats.to_csv("difference_in_stats_smote_enn.csv", float_format="%.6f")

# Save
subset_data.to_csv("subset_data.csv", index=False)
resampled_data.to_csv("smote_enn_data.csv", index=False)
subset_data.describe().to_csv("before_smote_enn_stats.csv")
resampled_data.describe().to_csv("after_smote_enn_stats.csv")


Processing file: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing file: part-00013-363d1ba3-8

In [1]:
# XGBoost smote_enn - FINAL fixed version

import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

#  Setup paths
folder_path = os.path.join(os.getcwd(), 'CIC')
original_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
sampled_file = 'smote_enn_data.csv'

# Feature engineering function
def feature_engineering(df):
    if 'Tot sum' in df.columns and 'Duration' in df.columns:
        df['Packets_Duration_Ratio'] = df['Tot sum'] / (df['Duration'] + 1e-6)
    return df

#  Load sampled dataset 
print("Reading sampled file")
sampled_df = dd.read_csv(sampled_file, assume_missing=True)
sampled_df = feature_engineering(sampled_df)
sampled_df['is_sampled'] = 1

#  Load original files and sample 
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    ddf = dd.read_csv(os.path.join(folder_path, f), assume_missing=True)
    ddf = feature_engineering(ddf)
    ddf['is_sampled'] = 0
    sampled = ddf.sample(frac=0.05, random_state=42)  # 5% sample
    original_dfs.append(sampled)

# Combine data 
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
combined_df = dd.concat([original_df, sampled_df]).dropna()

#  Bring Dask to Pandas 
print("Converting to Pandas")
with ProgressBar():
    combined_pd = combined_df.compute()

#  Prepare X and y 
print("Preparing X and y")
X = combined_pd.drop(['is_sampled', 'label'], axis=1, errors='ignore')
y = combined_pd['label'].astype(str)

#  Check and fix weird float labels 
print("Checking labels...")
weird_numeric_labels = sorted(label for label in y.unique() if label.replace('.0', '').isdigit())
print(f"Numeric-like labels found: {weird_numeric_labels}")

#  Mapping float-like labels to real attack names 
label_mapping = {
    '0.0': 'BenignTraffic',
    '1.0': 'DDoS-UDP_Flood',
    '2.0': 'DDoS-TCP_Flood',
    '3.0': 'DDoS-ICMP_Flood',
    '4.0': 'DDoS-ICMP_Fragmentation',
    '5.0': 'DDoS-PSHACK_Flood',
    '6.0': 'DDoS-RSTFINFlood',
    '7.0': 'DDoS-SYN_Flood',
    '8.0': 'DDoS-SlowLoris',
    '9.0': 'DDoS-SynonymousIP_Flood',
    '10.0': 'DDoS-TCP_Flood',
    '11.0': 'DDoS-UDP_Flood',
    '12.0': 'DDoS-UDP_Fragmentation',
    '13.0': 'DNS_Spoofing',
    '14.0': 'DictionaryBruteForce',
    '15.0': 'DoS-HTTP_Flood',
    '16.0': 'DoS-SYN_Flood',
    '17.0': 'DoS-TCP_Flood',
    '18.0': 'DoS-UDP_Flood',
    '19.0': 'MITM-ArpSpoofing',
    '20.0': 'Mirai-greeth_flood',
    '21.0': 'Mirai-greip_flood',
    '22.0': 'Mirai-udpplain',
    '23.0': 'Recon-HostDiscovery',
    '24.0': 'Recon-OSScan',
    '25.0': 'Recon-PingSweep',
    '26.0': 'Recon-PortScan',
    '27.0': 'SqlInjection',
    '28.0': 'Uploading_Attack',
    '29.0': 'VulnerabilityScan',
    '30.0': 'XSS',
    '31.0': 'Backdoor_Malware',
    '32.0': 'BrowserHijacking',
    '33.0': 'CommandInjection',
}

#  Replace the bad float-string labels 
y = y.replace(label_mapping)

print("Unique labels after cleaning:", y.unique())

#  Encode labels into integers 
print("Encoding labels")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

#  Train/test split 
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

#  Train XGBoost 
print("Training XGBoost")
clf = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',  
    n_jobs=-1
)
clf.fit(X_train, y_train)

#  Evaluate 
print("Evaluating")
predictions = clf.predict(X_test)

# Decode predictions back to original string labels
predicted_labels = label_encoder.inverse_transform(predictions)
true_labels = label_encoder.inverse_transform(y_test)

# Accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy of XGBoost: {accuracy:.4f}\n")

# Classification Report
print("Classification Report:")
print(classification_report(true_labels, predicted_labels))


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating

Accuracy of XGBoost: 0.9902

Classification Report:
                         precision    recall  f1-score   support

       Backdoor_Malware       1.00      1.00      1.00    108323
          BenignTraffic       0.97      1.00      0.98    124758
       BrowserHijacking       1.00      1.00      1.00    108363
       CommandInjection       1.00      1.00      1.00    108349
 DDoS-ACK_Fragmentation       1.00      0.99      0.99      4250
        DDoS-HTTP_Flood       0.99      0.98      0.98       411
        DDoS-ICMP_Flood       1.00      1.00      1.00    210543
DDoS-ICMP_Fragmentation       1.00      1.00      1.00    112782
      DDoS-PSHACK_Flood       1.00      1.00      1.00    169640
       DDoS-RSTFINFlood       1.00      1.00      1.00    167269
         DDoS-SYN_Flood       1.00      1.00      1.00    169152
         DDoS-SlowLoris       1.00      1.00      1.00    104782
DDoS-SynonymousIP_Flood       1.00      1.00      1.00    162159
         DDoS-TCP_Flood  

In [5]:
# XGBoost smote_tomek - FINAL fixed version

import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

#  Setup paths 
folder_path = os.path.join(os.getcwd(), 'CIC')
original_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
sampled_file = 'smote_tomek_data.csv'

#  Feature engineering function 
def feature_engineering(df):
    if 'Tot sum' in df.columns and 'Duration' in df.columns:
        df['Packets_Duration_Ratio'] = df['Tot sum'] / (df['Duration'] + 1e-6)
    return df

#  Load sampled dataset 
print("Reading sampled file")
sampled_df = dd.read_csv(sampled_file, assume_missing=True)
sampled_df = feature_engineering(sampled_df)
sampled_df['is_sampled'] = 1

#  Load original files and sample 
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    ddf = dd.read_csv(os.path.join(folder_path, f), assume_missing=True)
    ddf = feature_engineering(ddf)
    ddf['is_sampled'] = 0
    sampled = ddf.sample(frac=0.05, random_state=42)  # 5% sample
    original_dfs.append(sampled)

#  Combine data 
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
combined_df = dd.concat([original_df, sampled_df]).dropna()

#  Bring Dask to Pandas 
print("Converting to Pandas")
with ProgressBar():
    combined_pd = combined_df.compute()

#  Prepare X and y 
print("Preparing X and y")
X = combined_pd.drop(['is_sampled', 'label'], axis=1, errors='ignore')
y = combined_pd['label'].astype(str)

#  Check and fix weird float labels 
print("Checking labels...")
weird_numeric_labels = sorted(label for label in y.unique() if label.replace('.0', '').isdigit())
print(f"Numeric-like labels found: {weird_numeric_labels}")

#  Mapping float-like labels to real attack names 
label_mapping = {
    '0.0': 'BenignTraffic',
    '1.0': 'DDoS-UDP_Flood',
    '2.0': 'DDoS-TCP_Flood',
    '3.0': 'DDoS-ICMP_Flood',
    '4.0': 'DDoS-ICMP_Fragmentation',
    '5.0': 'DDoS-PSHACK_Flood',
    '6.0': 'DDoS-RSTFINFlood',
    '7.0': 'DDoS-SYN_Flood',
    '8.0': 'DDoS-SlowLoris',
    '9.0': 'DDoS-SynonymousIP_Flood',
    '10.0': 'DDoS-TCP_Flood',
    '11.0': 'DDoS-UDP_Flood',
    '12.0': 'DDoS-UDP_Fragmentation',
    '13.0': 'DNS_Spoofing',
    '14.0': 'DictionaryBruteForce',
    '15.0': 'DoS-HTTP_Flood',
    '16.0': 'DoS-SYN_Flood',
    '17.0': 'DoS-TCP_Flood',
    '18.0': 'DoS-UDP_Flood',
    '19.0': 'MITM-ArpSpoofing',
    '20.0': 'Mirai-greeth_flood',
    '21.0': 'Mirai-greip_flood',
    '22.0': 'Mirai-udpplain',
    '23.0': 'Recon-HostDiscovery',
    '24.0': 'Recon-OSScan',
    '25.0': 'Recon-PingSweep',
    '26.0': 'Recon-PortScan',
    '27.0': 'SqlInjection',
    '28.0': 'Uploading_Attack',
    '29.0': 'VulnerabilityScan',
    '30.0': 'XSS',
    '31.0': 'Backdoor_Malware',
    '32.0': 'BrowserHijacking',
    '33.0': 'CommandInjection',
}

#  Replace the bad float-string labels 
y = y.replace(label_mapping)

print("Unique labels after cleaning:", y.unique())

#  Encode labels into integers 
print("Encoding labels")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

#  Train/test split 
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

#  Train XGBoost 
print("Training XGBoost")
clf = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
    n_jobs=-1
)
clf.fit(X_train, y_train)

#  Evaluate 
print("Evaluating")
predictions = clf.predict(X_test)

# Decode predictions back to original string labels
predicted_labels = label_encoder.inverse_transform(predictions)
true_labels = label_encoder.inverse_transform(y_test)

# Accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy of XGBoost: {accuracy:.4f}\n")

# Classification Report
print("Classification Report:")
print(classification_report(true_labels, predicted_labels))


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating

Accuracy of XGBoost: 0.9835

Classification Report:
                         precision    recall  f1-score   support

       Backdoor_Malware       0.97      1.00      0.99    108323
          BenignTraffic       0.97      1.00      0.98    124836
       BrowserHijacking       0.99      1.00      1.00    108364
       CommandInjection       1.00      1.00      1.00    108349
 DDoS-ACK_Fragmentation       1.00      1.00      1.00      4250
        DDoS-HTTP_Flood       0.99      0.96      0.98       411
        DDoS-ICMP_Flood       1.00      1.00      1.00    215749
DDoS-ICMP_Fragmentation       1.00      1.00      1.00    114872
      DDoS-PSHACK_Flood       1.00      1.00      1.00    169815
       DDoS-RSTFINFlood       1.00      1.00      1.00    168799
         DDoS-SYN_Flood       1.00      1.00      1.00    169169
         DDoS-SlowLoris       1.00      1.00      1.00    108402
DDoS-SynonymousIP_Flood       1.00      1.00      1.00    162249
         DDoS-TCP_Flood  

In [9]:
#Random Forest Smote enn
import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Setup
folder_path = os.path.join(os.getcwd(), 'CIC')
original_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
sampled_file = 'smote_enn_data.csv'

# Feature engineering
def feature_engineering(df):
    if 'Tot sum' in df.columns and 'Duration' in df.columns:
        df['Packets_Duration_Ratio'] = df['Tot sum'] / (df['Duration'] + 1e-6)
    return df

# Read sampled data
print("Reading sampled file")
sampled_df = dd.read_csv(sampled_file, assume_missing=True)
sampled_df = feature_engineering(sampled_df)
sampled_df['is_sampled'] = 1

# Read and sample original data
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    ddf = dd.read_csv(os.path.join(folder_path, f), assume_missing=True)
    ddf = feature_engineering(ddf)
    ddf['is_sampled'] = 0
    sampled = ddf.sample(frac=0.05, random_state=42)
    original_dfs.append(sampled)

# Combine
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
combined_df = dd.concat([original_df, sampled_df]).dropna()

# Convert to Pandas
print("Computing Dask to Pandas")
with ProgressBar():
    combined_pd = combined_df.compute()

# Prepare X and y
print("Preparing X and y")
X = combined_pd.drop(['is_sampled', 'label'], axis=1, errors='ignore')
y = combined_pd['label'].astype(str)

# Fix float-style labels
label_mapping = {
    '0.0': 'BenignTraffic',
    '1.0': 'DDoS-UDP_Flood',
    '2.0': 'DDoS-TCP_Flood',
    '3.0': 'DDoS-ICMP_Flood',
    '4.0': 'DDoS-ICMP_Fragmentation',
    '5.0': 'DDoS-PSHACK_Flood',
    '6.0': 'DDoS-RSTFINFlood',
    '7.0': 'DDoS-SYN_Flood',
    '8.0': 'DDoS-SlowLoris',
    '9.0': 'DDoS-SynonymousIP_Flood',
    '10.0': 'DDoS-TCP_Flood',
    '11.0': 'DDoS-UDP_Flood',
    '12.0': 'DDoS-UDP_Fragmentation',
    '13.0': 'DNS_Spoofing',
    '14.0': 'DictionaryBruteForce',
    '15.0': 'DoS-HTTP_Flood',
    '16.0': 'DoS-SYN_Flood',
    '17.0': 'DoS-TCP_Flood',
    '18.0': 'DoS-UDP_Flood',
    '19.0': 'MITM-ArpSpoofing',
    '20.0': 'Mirai-greeth_flood',
    '21.0': 'Mirai-greip_flood',
    '22.0': 'Mirai-udpplain',
    '23.0': 'Recon-HostDiscovery',
    '24.0': 'Recon-OSScan',
    '25.0': 'Recon-PingSweep',
    '26.0': 'Recon-PortScan',
    '27.0': 'SqlInjection',
    '28.0': 'Uploading_Attack',
    '29.0': 'VulnerabilityScan',
    '30.0': 'XSS',
    '31.0': 'Backdoor_Malware',
    '32.0': 'BrowserHijacking',
    '33.0': 'CommandInjection',
}
y = y.replace(label_mapping)

print("Encoding labels")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

# Train Random Forest
print("Training Random Forest")
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)
clf.fit(X_train, y_train)

# Predict and Evaluate
print("Evaluating")
predictions = clf.predict(X_test)

# Decode labels
predicted_labels = label_encoder.inverse_transform(predictions)
true_labels = label_encoder.inverse_transform(y_test)

accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy of Random Forest: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(true_labels, predicted_labels))


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

In [7]:
#Random Forest Smote tomek
import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Setup
folder_path = os.path.join(os.getcwd(), 'CIC')
original_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
sampled_file = 'smote_tomek_data.csv'

# Feature engineering
def feature_engineering(df):
    if 'Tot sum' in df.columns and 'Duration' in df.columns:
        df['Packets_Duration_Ratio'] = df['Tot sum'] / (df['Duration'] + 1e-6)
    return df

# Read sampled data
print("Reading sampled file")
sampled_df = dd.read_csv(sampled_file, assume_missing=True)
sampled_df = feature_engineering(sampled_df)
sampled_df['is_sampled'] = 1

# Read and sample original data
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    ddf = dd.read_csv(os.path.join(folder_path, f), assume_missing=True)
    ddf = feature_engineering(ddf)
    ddf['is_sampled'] = 0
    sampled = ddf.sample(frac=0.05, random_state=42)
    original_dfs.append(sampled)

# Combine
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
combined_df = dd.concat([original_df, sampled_df]).dropna()

# Convert to Pandas
print("Computing Dask to Pandas")
with ProgressBar():
    combined_pd = combined_df.compute()

# Prepare X and y
print("Preparing X and y")
X = combined_pd.drop(['is_sampled', 'label'], axis=1, errors='ignore')
y = combined_pd['label'].astype(str)

# Fix float-style labels
label_mapping = {
    '0.0': 'BenignTraffic',
    '1.0': 'DDoS-UDP_Flood',
    '2.0': 'DDoS-TCP_Flood',
    '3.0': 'DDoS-ICMP_Flood',
    '4.0': 'DDoS-ICMP_Fragmentation',
    '5.0': 'DDoS-PSHACK_Flood',
    '6.0': 'DDoS-RSTFINFlood',
    '7.0': 'DDoS-SYN_Flood',
    '8.0': 'DDoS-SlowLoris',
    '9.0': 'DDoS-SynonymousIP_Flood',
    '10.0': 'DDoS-TCP_Flood',
    '11.0': 'DDoS-UDP_Flood',
    '12.0': 'DDoS-UDP_Fragmentation',
    '13.0': 'DNS_Spoofing',
    '14.0': 'DictionaryBruteForce',
    '15.0': 'DoS-HTTP_Flood',
    '16.0': 'DoS-SYN_Flood',
    '17.0': 'DoS-TCP_Flood',
    '18.0': 'DoS-UDP_Flood',
    '19.0': 'MITM-ArpSpoofing',
    '20.0': 'Mirai-greeth_flood',
    '21.0': 'Mirai-greip_flood',
    '22.0': 'Mirai-udpplain',
    '23.0': 'Recon-HostDiscovery',
    '24.0': 'Recon-OSScan',
    '25.0': 'Recon-PingSweep',
    '26.0': 'Recon-PortScan',
    '27.0': 'SqlInjection',
    '28.0': 'Uploading_Attack',
    '29.0': 'VulnerabilityScan',
    '30.0': 'XSS',
    '31.0': 'Backdoor_Malware',
    '32.0': 'BrowserHijacking',
    '33.0': 'CommandInjection',
}
y = y.replace(label_mapping)

print("Encoding labels")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

# Train Random Forest
print("Training Random Forest")
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42
)
clf.fit(X_train, y_train)

# Predict and Evaluate
print("Evaluating")
predictions = clf.predict(X_test)

# Decode labels
predicted_labels = label_encoder.inverse_transform(predictions)
true_labels = label_encoder.inverse_transform(y_test)

accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy of Random Forest: {accuracy:.4f}\n")

print("Classification Report:")
print(classification_report(true_labels, predicted_labels))


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9