In [3]:
!pip install dask[complete]




In [3]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   - -------------------------------------- 5.5/150.0 MB 33.4 MB/s eta 0:00:05
   ---- ----------------------------------- 18.6/150.0 MB 51.0 MB/s eta 0:00:03
   --------- ------------------------------ 36.4/150.0 MB 64.3 MB/s eta 0:00:02
   -------------- ------------------------- 55.1/150.0 MB 71.5 MB/s eta 0:00:02
   ------------------- -------------------- 74.4/150.0 MB 75.3 MB/s eta 0:00:02
   ------------------------- -------------- 93.8/150.0 MB 78.8 MB/s eta 0:00:01
   ----------------------------- --------- 113.8/150.0 MB 81.6 MB/s eta 0:00:01
   ---------------------------------- ---- 133.2/150.0 MB 83.4 MB/s eta 0:00:01
   --------------------------------------  149.9/150.0 MB 84.8 MB/s eta 0:00:01
   --------------------------------------  149.9/150.0 MB 84.8

In [9]:
import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

folder_path = os.path.join(os.getcwd(), 'CIC')
original_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
sampled_file = 'stratified_sample.csv'

# Feature engineering function
def feature_engineering(df):
    if 'Tot sum' in df.columns and 'Duration' in df.columns:
        df['Packets_Duration_Ratio'] = df['Tot sum'] / (df['Duration'] + 1e-6)
    return df

# Read sampled dataset using Dask and label it
print("Reading sampled file")
sampled_df = dd.read_csv(sampled_file, assume_missing=True)
sampled_df = feature_engineering(sampled_df)
sampled_df['is_sampled'] = 1

# Read and sample original data from multiple files
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    ddf = dd.read_csv(os.path.join(folder_path, f), assume_missing=True)
    ddf = feature_engineering(ddf)
    ddf['is_sampled'] = 0
    sampled = ddf.sample(frac=0.01, random_state=42)  # only 1% to avoid overload
    original_dfs.append(sampled)

# Concatenate all Dask DataFrames
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
combined_df = dd.concat([original_df, sampled_df]).dropna()

# Compute to bring into memory for model training
print("Converting to Pandas")
with ProgressBar():
    combined_pd = combined_df.compute()

# Prepare training data
X = combined_pd.drop(['is_sampled', 'label'], axis=1, errors='ignore')
y = combined_pd['is_sampled']

# Train/test split
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train XGBoost
print("Training XGBoost")
clf = XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1, random_state=42,
                    use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
clf.fit(X_train, y_train)

# Evaluate
print("Evaluating")
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"\nAccuracy of XGBoost: {accuracy:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, predictions, target_names=['Original', 'Sampled']))


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating

Accuracy of XGBoost: 0.5006

Classification Report:
              precision    recall  f1-score   support

    Original       0.50      0.51      0.51    140060
     Sampled       0.50      0.49      0.50    140060

    accuracy                           0.50    280120
   macro avg       0.50      0.50      0.50    280120
weighted avg       0.50      0.50      0.50    280120



In [5]:
import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

folder_path = os.path.join(os.getcwd(), 'CIC')
original_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
sampled_file = 'undersampled_data.csv'

# Feature engineering function
def feature_engineering(df):
    if 'Tot sum' in df.columns and 'Duration' in df.columns:
        df['Packets_Duration_Ratio'] = df['Tot sum'] / (df['Duration'] + 1e-6)
    return df

# Read sampled dataset using Dask and label it
print("Reading sampled file")
sampled_df = dd.read_csv(sampled_file, assume_missing=True)
sampled_df = feature_engineering(sampled_df)
sampled_df['is_sampled'] = 1

# Read and sample original data from multiple files
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    ddf = dd.read_csv(os.path.join(folder_path, f), assume_missing=True)
    ddf = feature_engineering(ddf)
    ddf['is_sampled'] = 0
    sampled = ddf.sample(frac=0.01, random_state=42)  # only 1% to avoid overload
    original_dfs.append(sampled)

# Concatenate all Dask DataFrames
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
combined_df = dd.concat([original_df, sampled_df]).dropna()

# Compute to bring into memory for model training
print("Converting to Pandas")
with ProgressBar():
    combined_pd = combined_df.compute()

# Prepare training data
X = combined_pd.drop(['is_sampled', 'label'], axis=1, errors='ignore')
y = combined_pd['is_sampled']

# Train/test split
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train XGBoost
print("Training XGBoost")
clf = XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1, random_state=42,
                    use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
clf.fit(X_train, y_train)

# Evaluate
print("Evaluating")
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"\nAccuracy of XGBoost: {accuracy:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, predictions, target_names=['Original', 'Sampled']))


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating

Accuracy of XGBoost: 0.9955

Classification Report:
              precision    recall  f1-score   support

    Original       1.00      1.00      1.00    140061
     Sampled       0.33      0.01      0.02       632

    accuracy                           1.00    140693
   macro avg       0.66      0.50      0.51    140693
weighted avg       0.99      1.00      0.99    140693



In [13]:
import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

folder_path = os.path.join(os.getcwd(), 'CIC')
original_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
sampled_file = 'smote_data.csv'

# Feature engineering function
def feature_engineering(df):
    if 'Tot sum' in df.columns and 'Duration' in df.columns:
        df['Packets_Duration_Ratio'] = df['Tot sum'] / (df['Duration'] + 1e-6)
    return df

# Read sampled dataset using Dask and label it
print("Reading sampled file")
sampled_df = dd.read_csv(sampled_file, assume_missing=True)
sampled_df = feature_engineering(sampled_df)
sampled_df['is_sampled'] = 1

# Read and sample original data from multiple files
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    ddf = dd.read_csv(os.path.join(folder_path, f), assume_missing=True)
    ddf = feature_engineering(ddf)
    ddf['is_sampled'] = 0
    sampled = ddf.sample(frac=0.01, random_state=42)  # only 1% to avoid overload
    original_dfs.append(sampled)

# Concatenate all Dask DataFrames
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
combined_df = dd.concat([original_df, sampled_df]).dropna()

# Compute to bring into memory for model training
print("Converting to Pandas")
with ProgressBar():
    combined_pd = combined_df.compute()

# Prepare training data
X = combined_pd.drop(['is_sampled', 'label'], axis=1, errors='ignore')
y = combined_pd['is_sampled']

# Train/test split
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train XGBoost
print("Training XGBoost")
clf = XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1, random_state=42,
                    use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
clf.fit(X_train, y_train)

# Evaluate
print("Evaluating")
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"\nAccuracy of XGBoost: {accuracy:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, predictions, target_names=['Original', 'Sampled']))


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating

Accuracy of XGBoost: 1.0000

Classification Report:
              precision    recall  f1-score   support

    Original       1.00      1.00      1.00    140060
     Sampled       1.00      1.00      1.00   3067010

    accuracy                           1.00   3207070
   macro avg       1.00      1.00      1.00   3207070
weighted avg       1.00      1.00      1.00   3207070



In [15]:
import os
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

folder_path = os.path.join(os.getcwd(), 'CIC')
original_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
sampled_file = 'data_diffusion_data.csv'

# Feature engineering function
def feature_engineering(df):
    if 'Tot sum' in df.columns and 'Duration' in df.columns:
        df['Packets_Duration_Ratio'] = df['Tot sum'] / (df['Duration'] + 1e-6)
    return df

# Read sampled dataset using Dask and label it
print("Reading sampled file")
sampled_df = dd.read_csv(sampled_file, assume_missing=True)
sampled_df = feature_engineering(sampled_df)
sampled_df['is_sampled'] = 1

# Read and sample original data from multiple files
print("Reading original files")
original_dfs = []
for f in original_files:
    print(f"Processing: {f}")
    ddf = dd.read_csv(os.path.join(folder_path, f), assume_missing=True)
    ddf = feature_engineering(ddf)
    ddf['is_sampled'] = 0
    sampled = ddf.sample(frac=0.01, random_state=42)  # only 1% to avoid overload
    original_dfs.append(sampled)

# Concatenate all Dask DataFrames
print("Combining sampled and original")
original_df = dd.concat(original_dfs)
combined_df = dd.concat([original_df, sampled_df]).dropna()

# Compute to bring into memory for model training
print("Converting to Pandas")
with ProgressBar():
    combined_pd = combined_df.compute()

# Prepare training data
X = combined_pd.drop(['is_sampled', 'label'], axis=1, errors='ignore')
y = combined_pd['is_sampled']

# Train/test split
print("Splitting dataset")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train XGBoost
print("Training XGBoost")
clf = XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1, random_state=42,
                    use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
clf.fit(X_train, y_train)

# Evaluate
print("Evaluating")
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"\nAccuracy of XGBoost: {accuracy:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, predictions, target_names=['Original', 'Sampled']))


Reading sampled file
Reading original files
Processing: part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv
Processing: part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Evaluating

Accuracy of XGBoost: 1.0000

Classification Report:
              precision    recall  f1-score   support

    Original       1.00      1.00      1.00    140060
     Sampled       1.00      1.00      1.00   3681191

    accuracy                           1.00   3821251
   macro avg       1.00      1.00      1.00   3821251
weighted avg       1.00      1.00      1.00   3821251

