In [3]:
!pip install dask[complete]




In [3]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   - -------------------------------------- 5.5/150.0 MB 33.4 MB/s eta 0:00:05
   ---- ----------------------------------- 18.6/150.0 MB 51.0 MB/s eta 0:00:03
   --------- ------------------------------ 36.4/150.0 MB 64.3 MB/s eta 0:00:02
   -------------- ------------------------- 55.1/150.0 MB 71.5 MB/s eta 0:00:02
   ------------------- -------------------- 74.4/150.0 MB 75.3 MB/s eta 0:00:02
   ------------------------- -------------- 93.8/150.0 MB 78.8 MB/s eta 0:00:01
   ----------------------------- --------- 113.8/150.0 MB 81.6 MB/s eta 0:00:01
   ---------------------------------- ---- 133.2/150.0 MB 83.4 MB/s eta 0:00:01
   --------------------------------------  149.9/150.0 MB 84.8 MB/s eta 0:00:01
   --------------------------------------  149.9/150.0 MB 84.8

In [15]:
# XGBoost stratified - FINAL fixed version

import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

#  Setup paths
# Folder with the original CSV partitions, resolved against the working directory.
folder_path = os.path.join(os.getcwd(), 'CIC')
# os.listdir() returns entries in arbitrary order; sorting keeps the order in
# which partitions are concatenated (and therefore the seeded train/test
# split) the same on every run and machine.
original_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
# Pre-built sampled dataset that is merged with the originals in this cell.
sampled_file = 'stratified_sample.csv'

#  Feature engineering function 
def feature_engineering(df):
    """Add a ``Packets_Duration_Ratio`` column when its inputs are present.

    The ratio is ``df['Tot sum'] / (df['Duration'] + 1e-6)``; the small offset
    avoids a division by zero when ``Duration`` is 0.

    Args:
        df: Frame-like object exposing ``columns`` and column assignment.

    Returns:
        The same ``df`` object. The new column is assigned onto it only when
        both 'Tot sum' and 'Duration' exist; otherwise it is left untouched.
    """
    needed = ('Tot sum', 'Duration')
    if not all(name in df.columns for name in needed):
        return df
    denominator = df['Duration'] + 1e-6
    df['Packets_Duration_Ratio'] = df['Tot sum'] / denominator
    return df

#  Load sampled dataset (lazy Dask frame), add the engineered feature and tag its rows with 1
print("Reading sampled file")
sampled_df = feature_engineering(dd.read_csv(sampled_file, assume_missing=True))
sampled_df['is_sampled'] = 1

#  Load original files and sample
# Every partition gets the same treatment: read, engineer, tag with 0, then
# keep a seeded 5% sample.
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    csv_path = os.path.join(folder_path, f)
    ddf = feature_engineering(dd.read_csv(csv_path, assume_missing=True))
    ddf['is_sampled'] = 0
    sampled = ddf.sample(frac=0.05, random_state=42)  # 5% sample
    original_dfs.append(sampled)

#  Combine data: stack the originals, append the sampled rows, then drop rows with missing values
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
combined_df = dd.concat([original_df, sampled_df])
combined_df = combined_df.dropna()

#  Bring Dask to Pandas (this is where the lazy graph is actually executed)
print("Converting to Pandas")
with ProgressBar():
    combined_pd = combined_df.compute()

#  Prepare X and y
print("Preparing X and y")
X = combined_pd.drop(['is_sampled', 'label'], axis=1, errors='ignore')
# Labels are handled as strings. Integer-coded labels appear here as '0.0',
# '1.0', ... — presumably because the CSVs are read with assume_missing=True,
# which loads integer columns as floats (TODO confirm).
y = combined_pd['label'].astype(str)


def _is_numeric_like(label):
    """Return True for decimal-digit strings with an optional trailing '.0' ('7', '7.0')."""
    # Only a trailing '.0' is stripped, so a value such as '1.05' is not
    # mistaken for an integer code.
    stem = label[:-2] if label.endswith('.0') else label
    return stem.isdecimal()


#  Check and fix weird float labels
print("Checking labels...")
# Sorted by numeric value so that '10.0' is listed after '2.0'.
weird_numeric_labels = sorted((label for label in y.unique() if _is_numeric_like(label)), key=float)
print(f"Numeric-like labels found: {weird_numeric_labels}")

#  Mapping float-like labels to real attack names
# NOTE(review): '1.0' and '11.0' both map to 'DDoS-UDP_Flood', and '2.0' and
# '10.0' both map to 'DDoS-TCP_Flood', so distinct codes are merged into one
# class, while 'DDoS-ACK_Fragmentation' and 'DDoS-HTTP_Flood' (both present in
# the classification report) have no code at all. Verify this table against the
# encoder that produced the sampled file.
label_mapping = {
    '0.0': 'BenignTraffic',
    '1.0': 'DDoS-UDP_Flood',
    '2.0': 'DDoS-TCP_Flood',
    '3.0': 'DDoS-ICMP_Flood',
    '4.0': 'DDoS-ICMP_Fragmentation',
    '5.0': 'DDoS-PSHACK_Flood',
    '6.0': 'DDoS-RSTFINFlood',
    '7.0': 'DDoS-SYN_Flood',
    '8.0': 'DDoS-SlowLoris',
    '9.0': 'DDoS-SynonymousIP_Flood',
    '10.0': 'DDoS-TCP_Flood',
    '11.0': 'DDoS-UDP_Flood',
    '12.0': 'DDoS-UDP_Fragmentation',
    '13.0': 'DNS_Spoofing',
    '14.0': 'DictionaryBruteForce',
    '15.0': 'DoS-HTTP_Flood',
    '16.0': 'DoS-SYN_Flood',
    '17.0': 'DoS-TCP_Flood',
    '18.0': 'DoS-UDP_Flood',
    '19.0': 'MITM-ArpSpoofing',
    '20.0': 'Mirai-greeth_flood',
    '21.0': 'Mirai-greip_flood',
    '22.0': 'Mirai-udpplain',
    '23.0': 'Recon-HostDiscovery',
    '24.0': 'Recon-OSScan',
    '25.0': 'Recon-PingSweep',
    '26.0': 'Recon-PortScan',
    '27.0': 'SqlInjection',
    '28.0': 'Uploading_Attack',
    '29.0': 'VulnerabilityScan',
    '30.0': 'XSS',
    '31.0': 'Backdoor_Malware',
    '32.0': 'BrowserHijacking',
    '33.0': 'CommandInjection',
}

#  Replace the bad float-string labels
y = y.replace(label_mapping)

# Codes without a mapping entry (e.g. '34.0', or '3' without the '.0' suffix)
# would otherwise survive silently as classes of their own, so report them.
unmapped_numeric_labels = sorted((label for label in y.unique() if _is_numeric_like(label)), key=float)
if unmapped_numeric_labels:
    print(f"WARNING: numeric-like labels with no mapping entry: {unmapped_numeric_labels}")

print("Unique labels after cleaning:", y.unique())

#  Encode labels into integers
# XGBoost is trained on integer class ids; the encoder is kept so predictions
# can be decoded back to class names later.
print("Encoding labels")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

#  Train/test split
# NOTE(review): the split runs over the combined frame, so rows that came from
# the sampled file (is_sampled == 1) land in X_test as well as X_train; the
# reported metrics are therefore not measured on original data only. Confirm
# this is intended.
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

#  Train XGBoost
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# only prints 'Parameters: { "use_label_encoder" } are not used.' during fit().
print("Training XGBoost")
clf = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
    eval_metric='mlogloss',
    n_jobs=-1
)
clf.fit(X_train, y_train)

#  Evaluate on the test split
print("Evaluating")
predictions = clf.predict(X_test)

# Turn the integer class ids back into their string names so the report is readable
decode = label_encoder.inverse_transform
true_labels = decode(y_test)
predicted_labels = decode(predictions)

# Overall accuracy, shown with four decimals
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy of XGBoost: {accuracy:.4f}\n")

# Per-class precision / recall / f1-score / support table
print("Classification Report:")
report_text = classification_report(true_labels, predicted_labels)
print(report_text)


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating

Accuracy of XGBoost: 0.8621

Classification Report:
                         precision    recall  f1-score   support

       Backdoor_Malware       0.75      0.04      0.08        68
          BenignTraffic       0.73      0.56      0.64     28704
       BrowserHijacking       0.42      0.05      0.09       100
       CommandInjection       0.44      0.13      0.21        82
 DDoS-ACK_Fragmentation       0.83      1.00      0.91      4250
        DDoS-HTTP_Flood       0.81      0.93      0.86       411
        DDoS-ICMP_Flood       0.83      0.91      0.87    117877
DDoS-ICMP_Fragmentation       0.83      0.53      0.65     12850
      DDoS-PSHACK_Flood       0.83      0.95      0.89     64540
       DDoS-RSTFINFlood       0.83      0.85      0.84     71375
         DDoS-SYN_Flood       0.83      0.96      0.89     63559
         DDoS-SlowLoris       0.72      0.03      0.05     12531
DDoS-SynonymousIP_Flood       0.83      0.81      0.82     66265
         DDoS-TCP_Flood  

In [17]:
# XGBoost undersampled - FINAL fixed version

import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

#  Setup paths
# Folder with the original CSV partitions, resolved against the working directory.
folder_path = os.path.join(os.getcwd(), 'CIC')
# os.listdir() returns entries in arbitrary order; sorting keeps the order in
# which partitions are concatenated (and therefore the seeded train/test
# split) the same on every run and machine.
original_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
# Pre-built sampled dataset that is merged with the originals in this cell.
sampled_file = 'undersampled_data.csv'

#  Feature engineering function 
def feature_engineering(df):
    """Add a ``Packets_Duration_Ratio`` column when its inputs are present.

    The ratio is ``df['Tot sum'] / (df['Duration'] + 1e-6)``; the small offset
    avoids a division by zero when ``Duration`` is 0.

    Args:
        df: Frame-like object exposing ``columns`` and column assignment.

    Returns:
        The same ``df`` object. The new column is assigned onto it only when
        both 'Tot sum' and 'Duration' exist; otherwise it is left untouched.
    """
    needed = ('Tot sum', 'Duration')
    if not all(name in df.columns for name in needed):
        return df
    denominator = df['Duration'] + 1e-6
    df['Packets_Duration_Ratio'] = df['Tot sum'] / denominator
    return df

#  Load sampled dataset (lazy Dask frame), add the engineered feature and tag its rows with 1
print("Reading sampled file")
sampled_df = feature_engineering(dd.read_csv(sampled_file, assume_missing=True))
sampled_df['is_sampled'] = 1

#  Load original files and sample
# Every partition gets the same treatment: read, engineer, tag with 0, then
# keep a seeded 5% sample.
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    csv_path = os.path.join(folder_path, f)
    ddf = feature_engineering(dd.read_csv(csv_path, assume_missing=True))
    ddf['is_sampled'] = 0
    sampled = ddf.sample(frac=0.05, random_state=42)  # 5% sample
    original_dfs.append(sampled)

#  Combine data: stack the originals, append the sampled rows, then drop rows with missing values
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
combined_df = dd.concat([original_df, sampled_df])
combined_df = combined_df.dropna()

#  Bring Dask to Pandas (this is where the lazy graph is actually executed)
print("Converting to Pandas")
with ProgressBar():
    combined_pd = combined_df.compute()

#  Prepare X and y
print("Preparing X and y")
X = combined_pd.drop(['is_sampled', 'label'], axis=1, errors='ignore')
# Labels are handled as strings. Integer-coded labels appear here as '0.0',
# '1.0', ... — presumably because the CSVs are read with assume_missing=True,
# which loads integer columns as floats (TODO confirm).
y = combined_pd['label'].astype(str)


def _is_numeric_like(label):
    """Return True for decimal-digit strings with an optional trailing '.0' ('7', '7.0')."""
    # Only a trailing '.0' is stripped, so a value such as '1.05' is not
    # mistaken for an integer code.
    stem = label[:-2] if label.endswith('.0') else label
    return stem.isdecimal()


#  Check and fix weird float labels
print("Checking labels...")
# Sorted by numeric value so that '10.0' is listed after '2.0'.
weird_numeric_labels = sorted((label for label in y.unique() if _is_numeric_like(label)), key=float)
print(f"Numeric-like labels found: {weird_numeric_labels}")

#  Mapping float-like labels to real attack names
# NOTE(review): '1.0' and '11.0' both map to 'DDoS-UDP_Flood', and '2.0' and
# '10.0' both map to 'DDoS-TCP_Flood', so distinct codes are merged into one
# class, while 'DDoS-ACK_Fragmentation' and 'DDoS-HTTP_Flood' (both present in
# the classification report) have no code at all. Verify this table against the
# encoder that produced the sampled file.
label_mapping = {
    '0.0': 'BenignTraffic',
    '1.0': 'DDoS-UDP_Flood',
    '2.0': 'DDoS-TCP_Flood',
    '3.0': 'DDoS-ICMP_Flood',
    '4.0': 'DDoS-ICMP_Fragmentation',
    '5.0': 'DDoS-PSHACK_Flood',
    '6.0': 'DDoS-RSTFINFlood',
    '7.0': 'DDoS-SYN_Flood',
    '8.0': 'DDoS-SlowLoris',
    '9.0': 'DDoS-SynonymousIP_Flood',
    '10.0': 'DDoS-TCP_Flood',
    '11.0': 'DDoS-UDP_Flood',
    '12.0': 'DDoS-UDP_Fragmentation',
    '13.0': 'DNS_Spoofing',
    '14.0': 'DictionaryBruteForce',
    '15.0': 'DoS-HTTP_Flood',
    '16.0': 'DoS-SYN_Flood',
    '17.0': 'DoS-TCP_Flood',
    '18.0': 'DoS-UDP_Flood',
    '19.0': 'MITM-ArpSpoofing',
    '20.0': 'Mirai-greeth_flood',
    '21.0': 'Mirai-greip_flood',
    '22.0': 'Mirai-udpplain',
    '23.0': 'Recon-HostDiscovery',
    '24.0': 'Recon-OSScan',
    '25.0': 'Recon-PingSweep',
    '26.0': 'Recon-PortScan',
    '27.0': 'SqlInjection',
    '28.0': 'Uploading_Attack',
    '29.0': 'VulnerabilityScan',
    '30.0': 'XSS',
    '31.0': 'Backdoor_Malware',
    '32.0': 'BrowserHijacking',
    '33.0': 'CommandInjection',
}

#  Replace the bad float-string labels
y = y.replace(label_mapping)

# Codes without a mapping entry (e.g. '34.0', or '3' without the '.0' suffix)
# would otherwise survive silently as classes of their own, so report them.
unmapped_numeric_labels = sorted((label for label in y.unique() if _is_numeric_like(label)), key=float)
if unmapped_numeric_labels:
    print(f"WARNING: numeric-like labels with no mapping entry: {unmapped_numeric_labels}")

print("Unique labels after cleaning:", y.unique())

#  Encode labels into integers
# XGBoost is trained on integer class ids; the encoder is kept so predictions
# can be decoded back to class names later.
print("Encoding labels")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

#  Train/test split
# NOTE(review): the split runs over the combined frame, so rows that came from
# the sampled file (is_sampled == 1) land in X_test as well as X_train; the
# reported metrics are therefore not measured on original data only. Confirm
# this is intended.
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

#  Train XGBoost
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# only prints 'Parameters: { "use_label_encoder" } are not used.' during fit().
print("Training XGBoost")
clf = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
    eval_metric='mlogloss',
    n_jobs=-1
)
clf.fit(X_train, y_train)

#  Evaluate on the test split
print("Evaluating")
predictions = clf.predict(X_test)

# Turn the integer class ids back into their string names so the report is readable
decode = label_encoder.inverse_transform
true_labels = decode(y_test)
predicted_labels = decode(predictions)

# Overall accuracy, shown with four decimals
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy of XGBoost: {accuracy:.4f}\n")

# Per-class precision / recall / f1-score / support table
print("Classification Report:")
report_text = classification_report(true_labels, predicted_labels)
print(report_text)


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating

Accuracy of XGBoost: 0.9739

Classification Report:
                         precision    recall  f1-score   support

       Backdoor_Malware       0.00      0.00      0.00        53
          BenignTraffic       0.80      0.96      0.88     17044
       BrowserHijacking       0.00      0.00      0.00        94
       CommandInjection       0.00      0.00      0.00        79
 DDoS-ACK_Fragmentation       0.90      1.00      0.94      4250
        DDoS-HTTP_Flood       0.45      0.40      0.43       411
        DDoS-ICMP_Flood       1.00      1.00      1.00    108401
DDoS-ICMP_Fragmentation       0.92      0.95      0.94      7251
      DDoS-PSHACK_Flood       0.99      1.00      0.99     62036
       DDoS-RSTFINFlood       0.99      1.00      0.99     61090
         DDoS-SYN_Flood       1.00      1.00      1.00     61375
         DDoS-SlowLoris       0.31      0.22      0.26       820
DDoS-SynonymousIP_Flood       0.99      0.99      0.99     54455
         DDoS-TCP_Flood  

In [2]:
# XGBoost smote - FINAL fixed version

import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

#  Setup paths
# Folder with the original CSV partitions, resolved against the working directory.
folder_path = os.path.join(os.getcwd(), 'CIC')
# os.listdir() returns entries in arbitrary order; sorting keeps the order in
# which partitions are concatenated (and therefore the seeded train/test
# split) the same on every run and machine.
original_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
# Pre-built sampled dataset that is merged with the originals in this cell.
sampled_file = 'smote_data.csv'

#  Feature engineering function 
def feature_engineering(df):
    """Add a ``Packets_Duration_Ratio`` column when its inputs are present.

    The ratio is ``df['Tot sum'] / (df['Duration'] + 1e-6)``; the small offset
    avoids a division by zero when ``Duration`` is 0.

    Args:
        df: Frame-like object exposing ``columns`` and column assignment.

    Returns:
        The same ``df`` object. The new column is assigned onto it only when
        both 'Tot sum' and 'Duration' exist; otherwise it is left untouched.
    """
    needed = ('Tot sum', 'Duration')
    if not all(name in df.columns for name in needed):
        return df
    denominator = df['Duration'] + 1e-6
    df['Packets_Duration_Ratio'] = df['Tot sum'] / denominator
    return df

#  Load sampled dataset (lazy Dask frame), add the engineered feature and tag its rows with 1
print("Reading sampled file")
sampled_df = feature_engineering(dd.read_csv(sampled_file, assume_missing=True))
sampled_df['is_sampled'] = 1

#  Load original files and sample
# Every partition gets the same treatment: read, engineer, tag with 0, then
# keep a seeded 5% sample.
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    csv_path = os.path.join(folder_path, f)
    ddf = feature_engineering(dd.read_csv(csv_path, assume_missing=True))
    ddf['is_sampled'] = 0
    sampled = ddf.sample(frac=0.05, random_state=42)  # 5% sample
    original_dfs.append(sampled)

#  Combine data: stack the originals, append the sampled rows, then drop rows with missing values
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
combined_df = dd.concat([original_df, sampled_df])
combined_df = combined_df.dropna()

#  Bring Dask to Pandas (this is where the lazy graph is actually executed)
print("Converting to Pandas")
with ProgressBar():
    combined_pd = combined_df.compute()

#  Prepare X and y
print("Preparing X and y")
X = combined_pd.drop(['is_sampled', 'label'], axis=1, errors='ignore')
# Labels are handled as strings. Integer-coded labels appear here as '0.0',
# '1.0', ... — presumably because the CSVs are read with assume_missing=True,
# which loads integer columns as floats (TODO confirm).
y = combined_pd['label'].astype(str)


def _is_numeric_like(label):
    """Return True for decimal-digit strings with an optional trailing '.0' ('7', '7.0')."""
    # Only a trailing '.0' is stripped, so a value such as '1.05' is not
    # mistaken for an integer code.
    stem = label[:-2] if label.endswith('.0') else label
    return stem.isdecimal()


#  Check and fix weird float labels
print("Checking labels...")
# Sorted by numeric value so that '10.0' is listed after '2.0'.
weird_numeric_labels = sorted((label for label in y.unique() if _is_numeric_like(label)), key=float)
print(f"Numeric-like labels found: {weird_numeric_labels}")

#  Mapping float-like labels to real attack names
# NOTE(review): '1.0' and '11.0' both map to 'DDoS-UDP_Flood', and '2.0' and
# '10.0' both map to 'DDoS-TCP_Flood', so distinct codes are merged into one
# class, while 'DDoS-ACK_Fragmentation' and 'DDoS-HTTP_Flood' (both present in
# the classification report) have no code at all. Verify this table against the
# encoder that produced the sampled file.
label_mapping = {
    '0.0': 'BenignTraffic',
    '1.0': 'DDoS-UDP_Flood',
    '2.0': 'DDoS-TCP_Flood',
    '3.0': 'DDoS-ICMP_Flood',
    '4.0': 'DDoS-ICMP_Fragmentation',
    '5.0': 'DDoS-PSHACK_Flood',
    '6.0': 'DDoS-RSTFINFlood',
    '7.0': 'DDoS-SYN_Flood',
    '8.0': 'DDoS-SlowLoris',
    '9.0': 'DDoS-SynonymousIP_Flood',
    '10.0': 'DDoS-TCP_Flood',
    '11.0': 'DDoS-UDP_Flood',
    '12.0': 'DDoS-UDP_Fragmentation',
    '13.0': 'DNS_Spoofing',
    '14.0': 'DictionaryBruteForce',
    '15.0': 'DoS-HTTP_Flood',
    '16.0': 'DoS-SYN_Flood',
    '17.0': 'DoS-TCP_Flood',
    '18.0': 'DoS-UDP_Flood',
    '19.0': 'MITM-ArpSpoofing',
    '20.0': 'Mirai-greeth_flood',
    '21.0': 'Mirai-greip_flood',
    '22.0': 'Mirai-udpplain',
    '23.0': 'Recon-HostDiscovery',
    '24.0': 'Recon-OSScan',
    '25.0': 'Recon-PingSweep',
    '26.0': 'Recon-PortScan',
    '27.0': 'SqlInjection',
    '28.0': 'Uploading_Attack',
    '29.0': 'VulnerabilityScan',
    '30.0': 'XSS',
    '31.0': 'Backdoor_Malware',
    '32.0': 'BrowserHijacking',
    '33.0': 'CommandInjection',
}

#  Replace the bad float-string labels
y = y.replace(label_mapping)

# Codes without a mapping entry (e.g. '34.0', or '3' without the '.0' suffix)
# would otherwise survive silently as classes of their own, so report them.
unmapped_numeric_labels = sorted((label for label in y.unique() if _is_numeric_like(label)), key=float)
if unmapped_numeric_labels:
    print(f"WARNING: numeric-like labels with no mapping entry: {unmapped_numeric_labels}")

print("Unique labels after cleaning:", y.unique())

#  Encode labels into integers
# XGBoost is trained on integer class ids; the encoder is kept so predictions
# can be decoded back to class names later.
print("Encoding labels")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

#  Train/test split
# NOTE(review): the split runs over the combined frame, so rows that came from
# the sampled file (is_sampled == 1) land in X_test as well as X_train; the
# reported metrics are therefore not measured on original data only. Confirm
# this is intended.
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

#  Train XGBoost
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# only prints 'Parameters: { "use_label_encoder" } are not used.' during fit().
print("Training XGBoost")
clf = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
    eval_metric='mlogloss',
    n_jobs=-1
)
clf.fit(X_train, y_train)

#  Evaluate on the test split
print("Evaluating")
predictions = clf.predict(X_test)

# Turn the integer class ids back into their string names so the report is readable
decode = label_encoder.inverse_transform
true_labels = decode(y_test)
predicted_labels = decode(predictions)

# Overall accuracy, shown with four decimals
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy of XGBoost: {accuracy:.4f}\n")

# Per-class precision / recall / f1-score / support table
print("Classification Report:")
report_text = classification_report(true_labels, predicted_labels)
print(report_text)


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating

Accuracy of XGBoost: 0.8910

Classification Report:
                         precision    recall  f1-score   support

       Backdoor_Malware       0.50      0.07      0.12       130
          BenignTraffic       0.97      0.99      0.98    124841
       BrowserHijacking       0.58      0.09      0.15       126
       CommandInjection       0.67      0.10      0.18        97
 DDoS-ACK_Fragmentation       1.00      1.00      1.00      4250
        DDoS-HTTP_Flood       0.97      0.99      0.98       411
        DDoS-ICMP_Flood       0.90      0.98      0.94    216199
DDoS-ICMP_Fragmentation       0.78      0.72      0.75    115048
      DDoS-PSHACK_Flood       0.83      0.81      0.82    169833
       DDoS-RSTFINFlood       0.96      0.85      0.90    168887
         DDoS-SYN_Flood       1.00      1.00      1.00    169172
         DDoS-SlowLoris       0.71      0.89      0.79    108617
DDoS-SynonymousIP_Flood       1.00      1.00      1.00    162253
         DDoS-TCP_Flood  

In [4]:
# XGBoost data_diffusion - FINAL fixed version

import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

#  Setup paths
# Folder with the original CSV partitions, resolved against the working directory.
folder_path = os.path.join(os.getcwd(), 'CIC')
# os.listdir() returns entries in arbitrary order; sorting keeps the order in
# which partitions are concatenated (and therefore the seeded train/test
# split) the same on every run and machine.
original_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
# Pre-built sampled dataset that is merged with the originals in this cell.
sampled_file = 'data_diffusion_data.csv'

#  Feature engineering function
def feature_engineering(df):
    """Add a ``Packets_Duration_Ratio`` column when its inputs are present.

    The ratio is ``df['Tot sum'] / (df['Duration'] + 1e-6)``; the small offset
    avoids a division by zero when ``Duration`` is 0.

    Args:
        df: Frame-like object exposing ``columns`` and column assignment.

    Returns:
        The same ``df`` object. The new column is assigned onto it only when
        both 'Tot sum' and 'Duration' exist; otherwise it is left untouched.
    """
    needed = ('Tot sum', 'Duration')
    if not all(name in df.columns for name in needed):
        return df
    denominator = df['Duration'] + 1e-6
    df['Packets_Duration_Ratio'] = df['Tot sum'] / denominator
    return df

#  Load sampled dataset (lazy Dask frame), add the engineered feature and tag its rows with 1
print("Reading sampled file")
sampled_df = feature_engineering(dd.read_csv(sampled_file, assume_missing=True))
sampled_df['is_sampled'] = 1

#  Load original files and sample
# Every partition gets the same treatment: read, engineer, tag with 0, then
# keep a seeded 5% sample.
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    csv_path = os.path.join(folder_path, f)
    ddf = feature_engineering(dd.read_csv(csv_path, assume_missing=True))
    ddf['is_sampled'] = 0
    sampled = ddf.sample(frac=0.05, random_state=42)  # 5% sample
    original_dfs.append(sampled)

#  Combine data: stack the originals, append the sampled rows, then drop rows with missing values
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
combined_df = dd.concat([original_df, sampled_df])
combined_df = combined_df.dropna()

#  Bring Dask to Pandas (this is where the lazy graph is actually executed)
print("Converting to Pandas")
with ProgressBar():
    combined_pd = combined_df.compute()

#  Prepare X and y
print("Preparing X and y")
X = combined_pd.drop(['is_sampled', 'label'], axis=1, errors='ignore')
# Labels are handled as strings. Integer-coded labels appear here as '0.0',
# '1.0', ... — presumably because the CSVs are read with assume_missing=True,
# which loads integer columns as floats (TODO confirm).
y = combined_pd['label'].astype(str)


def _is_numeric_like(label):
    """Return True for decimal-digit strings with an optional trailing '.0' ('7', '7.0')."""
    # Only a trailing '.0' is stripped, so a value such as '1.05' is not
    # mistaken for an integer code.
    stem = label[:-2] if label.endswith('.0') else label
    return stem.isdecimal()


#  Check and fix weird float labels
print("Checking labels...")
# Sorted by numeric value so that '10.0' is listed after '2.0'.
weird_numeric_labels = sorted((label for label in y.unique() if _is_numeric_like(label)), key=float)
print(f"Numeric-like labels found: {weird_numeric_labels}")

#  Mapping float-like labels to real attack names
# NOTE(review): '1.0' and '11.0' both map to 'DDoS-UDP_Flood', and '2.0' and
# '10.0' both map to 'DDoS-TCP_Flood', so distinct codes are merged into one
# class, while 'DDoS-ACK_Fragmentation' and 'DDoS-HTTP_Flood' (both present in
# the classification report) have no code at all. Verify this table against the
# encoder that produced the sampled file.
label_mapping = {
    '0.0': 'BenignTraffic',
    '1.0': 'DDoS-UDP_Flood',
    '2.0': 'DDoS-TCP_Flood',
    '3.0': 'DDoS-ICMP_Flood',
    '4.0': 'DDoS-ICMP_Fragmentation',
    '5.0': 'DDoS-PSHACK_Flood',
    '6.0': 'DDoS-RSTFINFlood',
    '7.0': 'DDoS-SYN_Flood',
    '8.0': 'DDoS-SlowLoris',
    '9.0': 'DDoS-SynonymousIP_Flood',
    '10.0': 'DDoS-TCP_Flood',
    '11.0': 'DDoS-UDP_Flood',
    '12.0': 'DDoS-UDP_Fragmentation',
    '13.0': 'DNS_Spoofing',
    '14.0': 'DictionaryBruteForce',
    '15.0': 'DoS-HTTP_Flood',
    '16.0': 'DoS-SYN_Flood',
    '17.0': 'DoS-TCP_Flood',
    '18.0': 'DoS-UDP_Flood',
    '19.0': 'MITM-ArpSpoofing',
    '20.0': 'Mirai-greeth_flood',
    '21.0': 'Mirai-greip_flood',
    '22.0': 'Mirai-udpplain',
    '23.0': 'Recon-HostDiscovery',
    '24.0': 'Recon-OSScan',
    '25.0': 'Recon-PingSweep',
    '26.0': 'Recon-PortScan',
    '27.0': 'SqlInjection',
    '28.0': 'Uploading_Attack',
    '29.0': 'VulnerabilityScan',
    '30.0': 'XSS',
    '31.0': 'Backdoor_Malware',
    '32.0': 'BrowserHijacking',
    '33.0': 'CommandInjection',
}

#  Replace the bad float-string labels
y = y.replace(label_mapping)

# Codes without a mapping entry (e.g. '34.0', or '3' without the '.0' suffix)
# would otherwise survive silently as classes of their own, so report them.
unmapped_numeric_labels = sorted((label for label in y.unique() if _is_numeric_like(label)), key=float)
if unmapped_numeric_labels:
    print(f"WARNING: numeric-like labels with no mapping entry: {unmapped_numeric_labels}")

print("Unique labels after cleaning:", y.unique())

#  Encode labels into integers
# XGBoost is trained on integer class ids; the encoder is kept so predictions
# can be decoded back to class names later.
print("Encoding labels")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

#  Train/test split
# NOTE(review): the split runs over the combined frame, so rows that came from
# the sampled file (is_sampled == 1) land in X_test as well as X_train; the
# reported metrics are therefore not measured on original data only. Confirm
# this is intended.
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

#  Train XGBoost
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# only prints 'Parameters: { "use_label_encoder" } are not used.' during fit().
print("Training XGBoost")
clf = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
    eval_metric='mlogloss',
    n_jobs=-1
)
clf.fit(X_train, y_train)

#  Evaluate on the test split
print("Evaluating")
predictions = clf.predict(X_test)

# Turn the integer class ids back into their string names so the report is readable
decode = label_encoder.inverse_transform
true_labels = decode(y_test)
predicted_labels = decode(predictions)

# Overall accuracy, shown with four decimals
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"\nAccuracy of XGBoost: {accuracy:.4f}\n")

# Per-class precision / recall / f1-score / support table
print("Classification Report:")
report_text = classification_report(true_labels, predicted_labels)
print(report_text)


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating

Accuracy of XGBoost: 0.9957

Classification Report:
                         precision    recall  f1-score   support

       Backdoor_Malware       1.00      1.00      1.00    108323
          BenignTraffic       0.97      1.00      0.98    124841
       BrowserHijacking       1.00      1.00      1.00    108364
       CommandInjection       1.00      1.00      1.00    108349
 DDoS-ACK_Fragmentation       0.99      0.99      0.99      4250
        DDoS-HTTP_Flood       0.97      0.97      0.97       411
        DDoS-ICMP_Flood       1.00      1.00      1.00    216199
DDoS-ICMP_Fragmentation       1.00      1.00      1.00    115048
      DDoS-PSHACK_Flood       1.00      1.00      1.00    169833
       DDoS-RSTFINFlood       1.00      1.00      1.00    168887
         DDoS-SYN_Flood       1.00      1.00      1.00    169172
         DDoS-SlowLoris       1.00      1.00      1.00    108617
DDoS-SynonymousIP_Flood       1.00      1.00      1.00    162253
         DDoS-TCP_Flood  

In [7]:
#AUC SMOTE
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Folder with the original CSV partitions and the sampled file compared against them.
folder_path = os.path.join(os.getcwd(), 'CIC')
sampled_file = 'smote_data.csv'
chunksize = 500_000  # Tune based on your RAM

original_sampled_chunks = []
fraction_to_keep = 0.02  # Just keep 2% from each original chunk

# os.listdir() order is arbitrary; sorting keeps the row order of original_df
# (and therefore the seeded split further down) stable across runs.
for fname in sorted(os.listdir(folder_path)):
    if not fname.endswith('.csv'):
        continue
    fpath = os.path.join(folder_path, fname)
    try:
        print("working on " + fpath)
        # The context manager closes the chunked reader even if a chunk fails
        # part-way through the file.
        with pd.read_csv(fpath, chunksize=chunksize) as reader:
            for chunk in reader:
                sampled_chunk = chunk.sample(frac=fraction_to_keep, random_state=42)
                sampled_chunk['is_synthetic'] = 0  # 0 marks rows from the original files
                # The class label is not a feature for the real-vs-sampled classifier.
                if 'label' in sampled_chunk.columns:
                    sampled_chunk = sampled_chunk.drop(columns=['label'])
                original_sampled_chunks.append(sampled_chunk)
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f" Error reading {fname}: {e}")

# pd.concat([]) fails with a generic "No objects to concatenate"; state the
# actual cause instead (the exception type, ValueError, is unchanged).
if not original_sampled_chunks:
    raise ValueError(f"No rows could be loaded from any CSV file in {folder_path}")
original_df = pd.concat(original_sampled_chunks, ignore_index=True)

# Load synthetic/sampled data
print("Loading synthetic data...")
sampled_df = pd.read_csv(sampled_file)
sampled_df['is_synthetic'] = 1  # 1 marks rows from the sampled file
if 'label' in sampled_df.columns:
    sampled_df = sampled_df.drop(columns=['label'])

# Align columns and merge
# Keep the column order of original_df: going through a set made the feature
# order depend on string hashing, i.e. vary between interpreter sessions.
common_cols = [col for col in original_df.columns if col in sampled_df.columns]
original_df = original_df[common_cols]
sampled_df = sampled_df[common_cols]

combined_df = pd.concat([original_df, sampled_df], ignore_index=True)
# axis=1: every COLUMN containing at least one NaN is removed; rows are kept.
combined_df = combined_df.dropna(axis=1, how='any')

#  Prepare for adversarial model
# Features are every remaining column; the target is the origin flag
# (0 = row from the original CSVs, 1 = row from the sampled file).
X = combined_df.drop(columns=['is_synthetic'])
y = combined_df['is_synthetic']

# NOTE(review): the scaler is fitted on all rows before the split, so test rows
# influence the min/max applied to the training data — confirm this is acceptable.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, stratify=y, test_size=0.3, random_state=42
)

#  Train adversarial classifier
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# only prints 'Parameters: { "use_label_encoder" } are not used.' during fit().
clf = XGBClassifier(n_estimators=100, eval_metric='logloss', random_state=42)
clf.fit(X_train, y_train)
# Column 1 of predict_proba is the predicted probability of is_synthetic == 1.
y_pred_proba = clf.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)


# Report the AUC and a verdict: above 0.9, above 0.7, or anything else.
print(f"\nAdversarial AUC (Synthetic vs Original): {auc_score:.4f}")
verdict = (
    " Synthetic data is very easy to detect." if auc_score > 0.9
    else " Some detectable differences exist. " if auc_score > 0.7
    else "Synthetic data appears realistic. "
)
print(verdict)


working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
w

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Adversarial AUC (Synthetic vs Original): 1.0000
 Synthetic data is very easy to detect.


In [9]:
#AUC Stratified
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Folder with the original CSV partitions and the sampled file compared against them.
folder_path = os.path.join(os.getcwd(), 'CIC')
sampled_file = 'stratified_sample.csv'
chunksize = 500_000  # Tune based on your RAM

original_sampled_chunks = []
fraction_to_keep = 0.02  # Just keep 2% from each original chunk

# os.listdir() order is arbitrary; sorting keeps the row order of original_df
# (and therefore the seeded split further down) stable across runs.
for fname in sorted(os.listdir(folder_path)):
    if not fname.endswith('.csv'):
        continue
    fpath = os.path.join(folder_path, fname)
    try:
        print("working on " + fpath)
        # The context manager closes the chunked reader even if a chunk fails
        # part-way through the file.
        with pd.read_csv(fpath, chunksize=chunksize) as reader:
            for chunk in reader:
                sampled_chunk = chunk.sample(frac=fraction_to_keep, random_state=42)
                sampled_chunk['is_synthetic'] = 0  # 0 marks rows from the original files
                # The class label is not a feature for the real-vs-sampled classifier.
                if 'label' in sampled_chunk.columns:
                    sampled_chunk = sampled_chunk.drop(columns=['label'])
                original_sampled_chunks.append(sampled_chunk)
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        print(f" Error reading {fname}: {e}")

# pd.concat([]) fails with a generic "No objects to concatenate"; state the
# actual cause instead (the exception type, ValueError, is unchanged).
if not original_sampled_chunks:
    raise ValueError(f"No rows could be loaded from any CSV file in {folder_path}")
original_df = pd.concat(original_sampled_chunks, ignore_index=True)

# Load synthetic/sampled data
print("Loading synthetic data...")
sampled_df = pd.read_csv(sampled_file)
sampled_df['is_synthetic'] = 1  # 1 marks rows from the sampled file
if 'label' in sampled_df.columns:
    sampled_df = sampled_df.drop(columns=['label'])

# Align columns and merge
# Keep the column order of original_df: going through a set made the feature
# order depend on string hashing, i.e. vary between interpreter sessions.
common_cols = [col for col in original_df.columns if col in sampled_df.columns]
original_df = original_df[common_cols]
sampled_df = sampled_df[common_cols]

combined_df = pd.concat([original_df, sampled_df], ignore_index=True)
# axis=1: every COLUMN containing at least one NaN is removed; rows are kept.
combined_df = combined_df.dropna(axis=1, how='any')

#  Prepare for adversarial model
# Features are every remaining column; the target is the origin flag
# (0 = row from the original CSVs, 1 = row from the sampled file).
X = combined_df.drop(columns=['is_synthetic'])
y = combined_df['is_synthetic']

# NOTE(review): the scaler is fitted on all rows before the split, so test rows
# influence the min/max applied to the training data — confirm this is acceptable.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, stratify=y, test_size=0.3, random_state=42
)

#  Train adversarial classifier
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# only prints 'Parameters: { "use_label_encoder" } are not used.' during fit().
clf = XGBClassifier(n_estimators=100, eval_metric='logloss', random_state=42)
clf.fit(X_train, y_train)
# Column 1 of predict_proba is the predicted probability of is_synthetic == 1.
y_pred_proba = clf.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)


# Report the AUC and a verdict: above 0.9, above 0.7, or anything else.
print(f"\nAdversarial AUC (Synthetic vs Original): {auc_score:.4f}")
verdict = (
    " Synthetic data is very easy to detect." if auc_score > 0.9
    else " Some detectable differences exist. " if auc_score > 0.7
    else "Synthetic data appears realistic. "
)
print(verdict)


working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
w

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Adversarial AUC (Synthetic vs Original): 0.4884
Synthetic data appears realistic. 


In [11]:
# Adversarial AUC: data-diffusion dataset (data_diffusion_data.csv) vs. a 2% sample of the original CIC files
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# --- Configuration -----------------------------------------------------------
folder_path = os.path.join(os.getcwd(), 'CIC')  # directory with the original CSV parts
sampled_file = 'data_diffusion_data.csv'        # dataset to compare against the originals
chunksize = 500_000  # Tune based on your RAM
fraction_to_keep = 0.02  # Just keep 2% from each original chunk

# --- Sample the original files chunk by chunk --------------------------------
# Each file is streamed in `chunksize`-row pieces and only a small random
# fraction of every piece is kept, so a whole file is never held in memory.
original_sampled_chunks = []

# os.listdir() makes no ordering guarantee; sorting keeps the row order (and
# therefore the seeded train/test split further down) stable between runs.
for fname in sorted(os.listdir(folder_path)):
    if not fname.endswith('.csv'):
        continue
    fpath = os.path.join(folder_path, fname)
    try:
        print("working on " + fpath)
        for chunk in pd.read_csv(fpath, chunksize=chunksize):
            sampled_chunk = chunk.sample(frac=fraction_to_keep, random_state=42)
            sampled_chunk['is_synthetic'] = 0  # 0 marks rows from the original data
            # The class label is not a feature for this comparison.
            sampled_chunk = sampled_chunk.drop(columns=['label'], errors='ignore')
            original_sampled_chunks.append(sampled_chunk)
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        # Chunks already sampled from this file before the failure are kept.
        print(f" Error reading {fname}: {e}")

# pd.concat() on an empty list fails with an opaque "No objects to concatenate";
# raise the same exception type with a message that names the actual problem.
if not original_sampled_chunks:
    raise ValueError(f"No rows could be sampled from any .csv file in {folder_path!r}")

original_df = pd.concat(original_sampled_chunks, ignore_index=True)

# Load synthetic/sampled data
print("Loading synthetic data...")
sampled_df = pd.read_csv(sampled_file)
sampled_df['is_synthetic'] = 1  # 1 marks rows from the dataset under test
sampled_df = sampled_df.drop(columns=['label'], errors='ignore')

# Align columns and merge
# Keep only columns present in both frames, in the original frame's order.
# (A bare set intersection yields an arbitrary order that can change between
# kernel sessions, which makes the feature matrix layout non-reproducible.)
sampled_cols = set(sampled_df.columns)
common_cols = [col for col in original_df.columns if col in sampled_cols]
original_df = original_df[common_cols]
sampled_df = sampled_df[common_cols]

combined_df = pd.concat([original_df, sampled_df], ignore_index=True)
# NOTE: axis=1 removes every COLUMN that contains at least one NaN, not rows.
combined_df = combined_df.dropna(axis=1, how='any')

#  Prepare for adversarial model
X = combined_df.drop(columns=['is_synthetic'])
y = combined_df['is_synthetic']

# Split BEFORE scaling so the held-out rows cannot influence the scaler's
# min/max statistics (fitting on all rows leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

#  Train adversarial classifier
# `use_label_encoder` was dropped from the call: the recorded output shows
# XGBoost reporting it as an unused parameter.
clf = XGBClassifier(n_estimators=100, eval_metric='logloss', random_state=42)
clf.fit(X_train, y_train)

# Probability of the positive class (is_synthetic == 1) for each held-out row.
y_pred_proba = clf.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)


# An AUC near 0.5 means the classifier cannot tell the two sources apart;
# an AUC near 1.0 means they are trivially separable.
print(f"\nAdversarial AUC (Synthetic vs Original): {auc_score:.4f}")
if auc_score > 0.9:
    print(" Synthetic data is very easy to detect.")
elif auc_score > 0.7:
    print(" Some detectable differences exist. ")
else:
    print("Synthetic data appears realistic. ")


working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
w

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Adversarial AUC (Synthetic vs Original): 1.0000
 Synthetic data is very easy to detect.


In [13]:
# Adversarial AUC: undersampled dataset (undersampled_data.csv) vs. a 2% sample of the original CIC files
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# --- Configuration -----------------------------------------------------------
folder_path = os.path.join(os.getcwd(), 'CIC')  # directory with the original CSV parts
sampled_file = 'undersampled_data.csv'          # dataset to compare against the originals
chunksize = 500_000  # Tune based on your RAM
fraction_to_keep = 0.02  # Just keep 2% from each original chunk

# --- Sample the original files chunk by chunk --------------------------------
# Each file is streamed in `chunksize`-row pieces and only a small random
# fraction of every piece is kept, so a whole file is never held in memory.
original_sampled_chunks = []

# os.listdir() makes no ordering guarantee; sorting keeps the row order (and
# therefore the seeded train/test split further down) stable between runs.
for fname in sorted(os.listdir(folder_path)):
    if not fname.endswith('.csv'):
        continue
    fpath = os.path.join(folder_path, fname)
    try:
        print("working on " + fpath)
        for chunk in pd.read_csv(fpath, chunksize=chunksize):
            sampled_chunk = chunk.sample(frac=fraction_to_keep, random_state=42)
            sampled_chunk['is_synthetic'] = 0  # 0 marks rows from the original data
            # The class label is not a feature for this comparison.
            sampled_chunk = sampled_chunk.drop(columns=['label'], errors='ignore')
            original_sampled_chunks.append(sampled_chunk)
    except Exception as e:
        # Best effort: report the unreadable file and continue with the rest.
        # Chunks already sampled from this file before the failure are kept.
        print(f" Error reading {fname}: {e}")

# pd.concat() on an empty list fails with an opaque "No objects to concatenate";
# raise the same exception type with a message that names the actual problem.
if not original_sampled_chunks:
    raise ValueError(f"No rows could be sampled from any .csv file in {folder_path!r}")

original_df = pd.concat(original_sampled_chunks, ignore_index=True)

# Load synthetic/sampled data
print("Loading synthetic data...")
sampled_df = pd.read_csv(sampled_file)
sampled_df['is_synthetic'] = 1  # 1 marks rows from the dataset under test
sampled_df = sampled_df.drop(columns=['label'], errors='ignore')

# Align columns and merge
# Keep only columns present in both frames, in the original frame's order.
# (A bare set intersection yields an arbitrary order that can change between
# kernel sessions, which makes the feature matrix layout non-reproducible.)
sampled_cols = set(sampled_df.columns)
common_cols = [col for col in original_df.columns if col in sampled_cols]
original_df = original_df[common_cols]
sampled_df = sampled_df[common_cols]

combined_df = pd.concat([original_df, sampled_df], ignore_index=True)
# NOTE: axis=1 removes every COLUMN that contains at least one NaN, not rows.
combined_df = combined_df.dropna(axis=1, how='any')

#  Prepare for adversarial model
X = combined_df.drop(columns=['is_synthetic'])
y = combined_df['is_synthetic']

# Split BEFORE scaling so the held-out rows cannot influence the scaler's
# min/max statistics (fitting on all rows leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

#  Train adversarial classifier
# `use_label_encoder` was dropped from the call: the recorded output shows
# XGBoost reporting it as an unused parameter.
clf = XGBClassifier(n_estimators=100, eval_metric='logloss', random_state=42)
clf.fit(X_train, y_train)

# Probability of the positive class (is_synthetic == 1) for each held-out row.
y_pred_proba = clf.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)


# An AUC near 0.5 means the classifier cannot tell the two sources apart;
# an AUC near 1.0 means they are trivially separable.
print(f"\nAdversarial AUC (Synthetic vs Original): {auc_score:.4f}")
if auc_score > 0.9:
    print(" Synthetic data is very easy to detect.")
elif auc_score > 0.7:
    print(" Some detectable differences exist. ")
else:
    print("Synthetic data appears realistic. ")


working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
working on C:\Users\vlad.serban\Home\CIC analysis\CIC\part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
w

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Adversarial AUC (Synthetic vs Original): 0.8220
 Some detectable differences exist. 
