# CIC-IDS-2017 Classification using XGBoost

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Natural Language Processing Libraries
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# PyTorch Libraries
import torch
from torch.utils.data import Dataset, DataLoader

# Evaluation Metrics
from sklearn.metrics import classification_report, accuracy_score


### Loading the Data

In [None]:
# Load the CIC-IDS-2017 datasets
# Every capture lives in DATA_DIR and follows the "<name>-no-metadata.parquet"
# naming scheme, so the paths are derived from the capture names instead of
# being written out by hand (the hand-written DoS-Wednesday path contained a
# stray double slash).
DATA_DIR = '/content'
DATASET_NAMES = [
    'Benign-Monday',
    'Botnet-Friday',
    'Bruteforce-Tuesday',
    'DDoS-Friday',
    'DoS-Wednesday',
    'Infiltration-Thursday',
    'Portscan-Friday',
    'WebAttacks-Thursday',
]

# Capture name -> path of the parquet file that holds it (insertion order kept).
datasets = {
    name: f'{DATA_DIR}/{name}-no-metadata.parquet' for name in DATASET_NAMES
}

# Read the datasets into DataFrames
df_data = {key: pd.read_parquet(path) for key, path in datasets.items()}


In [None]:
# import os
# import pandas as pd

# # Ensure the output directory exists
# csv_folder = "/content/cicids2017_csv"
# os.makedirs(csv_folder, exist_ok=True)

# # Concatenate all DataFrames in df_data into a single DataFrame
# df_all_combined = pd.concat(df_data.values(), ignore_index=True)

# # Define the path for the combined CSV file
# combined_csv_file_path = os.path.join(csv_folder, "all_cic_ids_2017_data.csv")

# # Save the combined DataFrame to a CSV file
# df_all_combined.to_csv(combined_csv_file_path, index=False)
# print(f"✅ Successfully combined all DataFrames into: {combined_csv_file_path}")

✅ Successfully combined all DataFrames into: /content/cicids2017_csv/all_cic_ids_2017_data.csv


In [None]:
# df=pd.read_csv('/content/cicids2017_csv/all_cic_ids_2017_data.csv')

In [None]:
# df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2313810 entries, 0 to 2313809
Data columns (total 78 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   Protocol                  int64  
 1   Flow Duration             int64  
 2   Total Fwd Packets         int64  
 3   Total Backward Packets    int64  
 4   Fwd Packets Length Total  int64  
 5   Bwd Packets Length Total  int64  
 6   Fwd Packet Length Max     int64  
 7   Fwd Packet Length Min     int64  
 8   Fwd Packet Length Mean    float64
 9   Fwd Packet Length Std     float64
 10  Bwd Packet Length Max     int64  
 11  Bwd Packet Length Min     int64  
 12  Bwd Packet Length Mean    float64
 13  Bwd Packet Length Std     float64
 14  Flow Bytes/s              float64
 15  Flow Packets/s            float64
 16  Flow IAT Mean             float64
 17  Flow IAT Std              float64
 18  Flow IAT Max              int64  
 19  Flow IAT Min              int64  
 20  Fwd IAT Total           

In [None]:
# df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
Benign,1977318
DoS Hulk,172846
DDoS,128014
DoS GoldenEye,10286
FTP-Patator,5931
DoS slowloris,5385
DoS Slowhttptest,5228
SSH-Patator,3219
PortScan,1956
Web Attack � Brute Force,1470


In [None]:
# Stack the per-capture frames into one table with a fresh 0..n-1 row index.
capture_frames = list(df_data.values())
df_all = pd.concat(capture_frames, ignore_index=True)

In [None]:
# Count missing values per column, then discard every row that has any.
null_counts = df_all.isnull().sum()
rows_without_nulls = df_all.dropna()

# Flag repeated rows once and reuse the mask for both the count and the filter;
# like drop_duplicates(), duplicated() keeps the first occurrence of each row.
duplicate_mask = rows_without_nulls.duplicated()
duplicate_count = duplicate_mask.sum()
df_all = rows_without_nulls.loc[~duplicate_mask].reset_index(drop=True)

# The intermediate frame is as large as the data itself; release it right away.
del rows_without_nulls, duplicate_mask

# Report the clean-up (these figures used to be computed but never shown).
print(f"Missing values found: {null_counts.sum()}")
print(f"Duplicate rows removed: {duplicate_count}")

categorical_columns = df_all.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical columns: {categorical_columns}\n")
df_all.head()


Categorical columns: ['Label']



Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packets Length Total,Bwd Packets Length Total,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,6,4,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
1,6,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
2,6,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
3,6,1,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,6,609,7,4,484,414,233,0,69.14286,111.967896,...,20,0.0,0.0,0,0,0.0,0.0,0,0,Benign


In [None]:
# From here on the class column is called 'Attack_type'.
df_all = df_all.rename(columns={'Label': 'Attack_type'})

In [None]:
# Define the mapping for Attack_type column values.
# Keys are the labels as they appear in the dataset; values are the more
# descriptive names used in the rest of the notebook.
rename_mapping = {
    'Benign': 'Benign',
    'DoS Hulk': 'DoS Hulk Attack',
    'DDoS': 'DDoS Attack',
    'DoS GoldenEye': 'DoS GoldenEye Attack',
    'FTP-Patator': 'FTP Brute Force Attack',
    'DoS slowloris': 'DoS Slowloris Attack',
    'DoS Slowhttptest': 'DoS Slowhttptest Attack',
    'SSH-Patator': 'SSH Brute Force Attack',
    'PortScan': 'Port Scan',
    # The "Web Attack" keys contain the replacement character "�" because that
    # is how the label is stored in the data (see the label counts printed
    # earlier for 'Web Attack � Brute Force'). The keys must match the stored
    # label character-for-character, so do not "repair" them to a dash.
    'Web Attack � Brute Force': 'Web Brute Force Attack',
    'Bot': 'Botnet Activity',
    'Web Attack � XSS': 'Web XSS Attack',
    'Infiltration': 'Infiltration Attack',
    'Web Attack � Sql Injection': 'Web SQL Injection Attack',
    'Heartbleed': 'Heartbleed Exploit'
}

# Apply the renaming to the 'Attack_type' column.
# replace() leaves any label that has no entry in the mapping unchanged.
df_all['Attack_type'] = df_all['Attack_type'].replace(rename_mapping)


In [None]:
# @title
# Feature columns removed before modelling, listed by family.
cols_to_drop = [
    # Packet-length / packet-size aggregates
    "Fwd Packet Length Mean", "Bwd Packet Length Mean", "Packet Length Variance",
    "Avg Packet Size", "Avg Fwd Segment Size", "Avg Bwd Segment Size",
    # Backward PSH/URG flags and bulk-transfer statistics
    "Bwd PSH Flags", "Bwd URG Flags",
    "Fwd Avg Bytes/Bulk", "Fwd Avg Packets/Bulk", "Fwd Avg Bulk Rate",
    "Bwd Avg Bytes/Bulk", "Bwd Avg Packets/Bulk", "Bwd Avg Bulk Rate",
    # Subflow counters
    "Subflow Fwd Packets", "Subflow Bwd Packets",
    "Subflow Fwd Bytes", "Subflow Bwd Bytes",
    # Inter-arrival-time statistics
    "Fwd IAT Total", "Bwd IAT Total", "Fwd IAT Mean", "Bwd IAT Mean",
    "Fwd IAT Min", "Bwd IAT Min", "Flow IAT Min",
    # Flow length
    "Flow Duration",
]

# errors='ignore' skips names that are not present instead of raising.
df_all = df_all.drop(cols_to_drop, axis=1, errors='ignore')

In [None]:
# Define mapping from raw labels to grouped labels
Attack_mapping = {
    # -------------------- DoS / DDoS --------------------
    "DoS Hulk Attack": "DoS",
    "DDoS Attack": "DoS",
    "DoS GoldenEye Attack": "DoS",
    "DoS Slowloris Attack": "DoS",
    "DoS Slowhttptest Attack": "DoS",
    "Heartbleed Exploit": "DoS",   # TLS heartbeat exploit behaves like DoS

    # -------------------- Brute Force --------------------
    "FTP Brute Force Attack": "BruteForce",
    "SSH Brute Force Attack": "BruteForce",
    "Web Brute Force Attack": "BruteForce",

    # -------------------- Scanning / Recon --------------------
    "Port Scan": "Scan",

    # -------------------- Web Attacks --------------------
    "Web XSS Attack": "WebAttack",
    "Web SQL Injection Attack": "WebAttack",

    # -------------------- Malware / Infiltration --------------------
    "Botnet Activity": "Malware",
    "Infiltration Attack": "Malware",

    # -------------------- Benign --------------------
    "Benign": "Benign"
}

# Keep the labels as they were before grouping: Series.map() turns every label
# without an entry into NaN, so the offending names can only be reported from
# this copy (printing the mapped column would just show "nan").
raw_labels = df_all["Attack_type"].copy()
grouped_labels = raw_labels.map(Attack_mapping)

# Labels that are already grouped (for example when this cell is run a second
# time) are kept as they are instead of being turned into NaN.
already_grouped = raw_labels.where(raw_labels.isin(set(Attack_mapping.values())))
grouped_labels = grouped_labels.fillna(already_grouped)

df_all["Attack_type"] = grouped_labels

unmapped = df_all[df_all["Attack_type"].isna()]
if len(unmapped) > 0:
    print("Unmapped labels found:")
    # Show the original names of the labels that have no group.
    print(raw_labels[grouped_labels.isna()].unique())
else:
    print("All attack types successfully mapped!")


All attack types successfully mapped!


In [None]:
# Include NaN in the tally so labels the grouping step could not map stay visible
# (value_counts() drops them by default).
df_all['Attack_type'].value_counts(dropna=False)

Unnamed: 0_level_0,count
Attack_type,Unnamed: 1_level_1
Benign,1895314
DoS,321770
BruteForce,10620
Scan,1956
Malware,1473
WebAttack,673


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the grouped attack names as consecutive integers in 'Attack_encode'.
label_encoder = LabelEncoder()
df_all['Attack_encode'] = label_encoder.fit_transform(df_all['Attack_type'])

# Show which integer stands for which attack group: the code of a class is its
# position in label_encoder.classes_.
print("Mapping of Attack_type to Attack_encode Values:")
class_names = label_encoder.classes_
for code in range(len(class_names)):
    print(f"{class_names[code]}: {code}")

Mapping of Attack_type to Attack_encode Values:
Benign: 0
BruteForce: 1
DoS: 2
Malware: 3
Scan: 4
WebAttack: 5


In [None]:
# Features are every column except the two label columns; the target is the
# integer-encoded attack group.
label_columns = ['Attack_type', 'Attack_encode']
X = df_all.drop(label_columns, axis=1)
y = df_all['Attack_encode']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training Set: {len(X_train)} samples")
print(f"Test Set: {len(X_test)} samples")

Training Set: 1785444 samples
Test Set: 446362 samples


In [None]:
from sklearn.preprocessing import RobustScaler
import pandas as pd

def robust_scale_data(X_train, X_test):
    """Scale both splits with a RobustScaler fitted on the training split only.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames to scale. The scaler is fitted on ``X_train`` and then
        applied to ``X_test``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_scaled, X_test_scaled)``. Each frame keeps the column names
        AND the row index of its input, so it still lines up with the matching
        target Series in index-aligned operations such as ``pd.concat(axis=1)``.
    """
    scaler = RobustScaler()
    # The scaler returns bare arrays; re-attach both the columns and the index.
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )
    return X_train_scaled, X_test_scaled

# Scale both splits, then bundle them with the (unscaled) targets for the next step.
scaled_frames = robust_scale_data(X_train, X_test)
X_train_scaled, X_test_scaled = scaled_frames
scaled_data = [*scaled_frames, y_train, y_test]

In [None]:
# From here on X_train / X_test refer to the scaled frames.
[X_train, X_test, y_train, y_test] = scaled_data

# Import SMOTE for oversampling (if not already imported globally)
from imblearn.over_sampling import SMOTE

# Desired number of training rows per encoded class after oversampling.
# Class 0 (Benign) is not listed and is therefore left untouched.
target_counts = {
    1: 15000,   # BruteForce
    2: 260000,  # DoS
    3: 5000,    # Malware
    4: 5000,    # Scan
    5: 5000,    # WebAttack
}

# SMOTE only adds rows: a target below the current class size is rejected with
# an error. Raise any too-small target to the current size so the cell keeps
# working when the data or the split changes.
current_counts = y_train.value_counts()
sampling_strategy = {
    cls: max(target, int(current_counts.get(cls, 0)))
    for cls, target in target_counts.items()
}

# Apply SMOTE to the training data only; the test split stays as it is.
sm = SMOTE(sampling_strategy=sampling_strategy, k_neighbors=3, random_state=42)

X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)

print("Shape of X_train before SMOTE:", X_train.shape)
print("Shape of y_train before SMOTE:", y_train.shape)
print("Shape of X_train_bal after SMOTE:", X_train_bal.shape)
print("Shape of y_train_bal after SMOTE:", y_train_bal.shape)
print("\nValue counts of y_train_bal after SMOTE:")
display(y_train_bal.value_counts())

Shape of X_train before SMOTE: (1785444, 51)
Shape of y_train before SMOTE: (1785444,)
Shape of X_train_bal after SMOTE: (1806251, 51)
Shape of y_train_bal after SMOTE: (1806251,)

Value counts of y_train_bal after SMOTE:


Unnamed: 0_level_0,count
Attack_encode,Unnamed: 1_level_1
0,1516251
2,260000
1,15000
5,5000
3,5000
4,5000


## Classifier, Training, Model Inspection and Evaluation



In [None]:
import xgboost as xgb
import numpy as np
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Set random seed for reproducibility
seed = 314159
np.random.seed(seed)

# === Prepare Data ===
# X_train_bal, y_train_bal are already balanced and scaled from previous steps
# X_test_scaled, y_test are also ready for evaluation
# Touch every name this cell relies on so a missing one fails here, with a hint,
# rather than somewhere in the middle of training.
try:
    _ = X_train_bal, y_train_bal, X_test_scaled, y_test, label_encoder # Ensure label_encoder is also available
except NameError:
    raise NameError("X_train_bal, y_train_bal, X_test_scaled, y_test, or label_encoder is not defined. Ensure SMOTE, scaling steps, and label encoding are run.")

# Convert data to DMatrix format for xgb.train
dtrain = xgb.DMatrix(X_train_bal, label=y_train_bal)
dtest = xgb.DMatrix(X_test_scaled, label=y_test)

# === Define model parameters for multi-class classification ===
# Determine the number of classes from y_train_bal
num_classes = len(np.unique(y_train_bal))

xgb_params = {
    "objective": "multi:softmax", # Multi-class classification; predict() returns class ids
    "num_class": num_classes,     # Number of unique classes
    "eval_metric": "mlogloss",    # Multi-class logloss
    "max_depth": 6,
    # Learning rate. "learning_rate" is an alias of "eta", so it is set once
    # here instead of under both names.
    "eta": 0.01,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.1,
    "reg_lambda": 1.0,
    "random_state": seed,
    # "n_estimators": 1000, # This is for XGBClassifier, use num_boost_round for xgb.train
}

# === Train model using xgb.train ===
# NOTE(review): early stopping watches the last entry of `evals`, which is the
# same test set that is scored below, so the number of rounds is chosen on test
# data and the reported metrics are optimistic. A separate validation split
# carved out of the training data would avoid this.
print("Starting XGBoost training...")
xgb_clf = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=1000, # Max number of boosting rounds
    evals=[(dtrain, "train"), (dtest, "eval")], # Evaluation sets
    early_stopping_rounds=10, # Stop if no improvement for 10 rounds
    verbose_eval=10 # Print evaluation results every 10 rounds
)
print("XGBoost training finished.")

# === Predict & evaluate ===
# Make predictions on the test set using the best iteration.
# iteration_range is half-open [begin, end), so the end must be
# best_iteration + 1 for the best round itself to be included.
val_preds = xgb_clf.predict(dtest, iteration_range=(0, xgb_clf.best_iteration + 1))

# multi:softmax returns the class ids as floats; convert them to integer labels
val_preds = val_preds.astype(int)

# Compute metrics, specifically per-class recall using classification_report
print("\nClassification Report on Test Set:")
# Use label_encoder.classes_ for target names to match the original labels
print(classification_report(y_test, val_preds, target_names=label_encoder.classes_))

# Calculate overall metrics (weighted average for multi-class)
val_accuracy = accuracy_score(y_test, val_preds)
val_precision = precision_score(y_test, val_preds, average='weighted', zero_division=0)
val_recall = recall_score(y_test, val_preds, average='weighted', zero_division=0)
val_f1 = f1_score(y_test, val_preds, average='weighted', zero_division=0)

print("\nOverall Validation Metrics (Weighted Average):")
print(f"Accuracy:  {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall:    {val_recall:.4f}")
print(f"F1-Score:  {val_f1:.4f}")

# === Save model ===
# Persist the booster together with the column order it was trained on.
artifacts = {
    "xgb_classifier.pkl": xgb_clf,
    "feature_names.pkl": X_train_bal.columns.tolist(),  # Use columns from X_train_bal
}
for filename, payload in artifacts.items():
    joblib.dump(payload, filename)
print("✅ Model and feature names saved.")

Starting XGBoost training...
[0]	train-mlogloss:1.27037	eval-mlogloss:1.26225
[10]	train-mlogloss:1.11549	eval-mlogloss:1.10796
[20]	train-mlogloss:0.98730	eval-mlogloss:0.98015
[30]	train-mlogloss:0.87863	eval-mlogloss:0.87172
[40]	train-mlogloss:0.78521	eval-mlogloss:0.77848
[50]	train-mlogloss:0.70400	eval-mlogloss:0.69735
[60]	train-mlogloss:0.63282	eval-mlogloss:0.62626
[70]	train-mlogloss:0.57009	eval-mlogloss:0.56358
[80]	train-mlogloss:0.51437	eval-mlogloss:0.50796
[90]	train-mlogloss:0.46500	eval-mlogloss:0.45863
[100]	train-mlogloss:0.42100	eval-mlogloss:0.41467
[110]	train-mlogloss:0.38169	eval-mlogloss:0.37541
[120]	train-mlogloss:0.34644	eval-mlogloss:0.34020
[130]	train-mlogloss:0.31473	eval-mlogloss:0.30856
[140]	train-mlogloss:0.28624	eval-mlogloss:0.28010
[150]	train-mlogloss:0.26061	eval-mlogloss:0.25453
[160]	train-mlogloss:0.23757	eval-mlogloss:0.23155
[170]	train-mlogloss:0.21686	eval-mlogloss:0.21089
[180]	train-mlogloss:0.19812	eval-mlogloss:0.19219
[190]	train-m

In [None]:
import pandas as pd
import os

# Ensure the output directory exists
output_dir = "/content/"
os.makedirs(output_dir, exist_ok=True)

# Features and targets are paired by row position, but pd.concat(axis=1) aligns
# on index labels, and the feature frames and the target Series do not
# necessarily share an index. Resetting both sides makes the pairing positional;
# without it, mismatched labels produce extra, NaN-padded rows.
assert len(X_train_bal) == len(y_train_bal), "train features/targets differ in length"
assert len(X_test_scaled) == len(y_test), "test features/targets differ in length"

# Combine X_train_bal and y_train_bal into a single DataFrame for train.csv
train_df = pd.concat(
    [X_train_bal.reset_index(drop=True), y_train_bal.reset_index(drop=True)], axis=1
)
# Combine X_test_scaled and y_test into a single DataFrame for test.csv
test_df = pd.concat(
    [X_test_scaled.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1
)

# Define file paths
train_csv_path = os.path.join(output_dir, "train.csv")
test_csv_path = os.path.join(output_dir, "test.csv")

# Save to CSV
train_df.to_csv(train_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

print(f"✅ 'train.csv' saved to: {train_csv_path}")
print(f"✅ 'test.csv' saved to: {test_csv_path}")
print("You can now download these files from your Colab environment's file browser.")

✅ 'train.csv' saved to: /content/train.csv
✅ 'test.csv' saved to: /content/test.csv
You can now download these files from your Colab environment's file browser.


In [None]:
import shap
import joblib
import pandas as pd
import numpy as np # Import numpy for np.abs and np.mean

# Load the trained XGBoost model and feature names
# Both files are written by the training cell. joblib.load unpickles them, so
# only artifacts produced by this notebook should be loaded here.
try:
    xgb_clf = joblib.load('xgb_classifier.pkl')
    feature_names = joblib.load('feature_names.pkl')
except FileNotFoundError:
    print("Error: Model or feature names file not found. Please ensure 'xgb_classifier.pkl' and 'feature_names.pkl' are saved.")
    raise

# The scaled test frame must already exist in the notebook namespace.
if not ('X_test_scaled' in locals()):
    print("Error: X_test_scaled not found. Please ensure data splitting and scaling steps are run.")
    raise NameError("X_test_scaled is not defined.")
# Label the test columns with the saved feature names (raises if the counts differ).
X_test_scaled.columns = feature_names

# Initialize SHAP TreeExplainer with the trained XGBoost model
explainer = shap.TreeExplainer(xgb_clf)

# Explain only a random subset of the test rows to keep the computation short;
# the fixed random_state makes the subset repeatable.
sample_size = 10000
rows_to_explain = min(sample_size, len(X_test_scaled))
X_test_sample = X_test_scaled.sample(n=rows_to_explain, random_state=42)
shap_values = explainer.shap_values(X_test_sample)

# Collapse the SHAP output to one importance score per feature: the mean
# absolute contribution over samples (and over classes when there are several).
# Anything that is neither a list nor an ndarray is rejected up front.
if not isinstance(shap_values, (list, np.ndarray)):
    raise TypeError(f"Unexpected type for SHAP values: {type(shap_values)}")

if isinstance(shap_values, list):
    # List form: one array per class, assumed to be (num_samples, num_features)
    # each, so stacking gives (num_classes, num_samples, num_features).
    abs_shap_values_stacked = np.abs(np.array(shap_values))
    # First average away the class axis, then the sample axis.
    mean_abs_shap_per_sample_per_feature = np.mean(abs_shap_values_stacked, axis=0)
    mean_abs_shap = np.mean(mean_abs_shap_per_sample_per_feature, axis=0)
else:
    abs_shap_values = np.abs(shap_values)
    if abs_shap_values.ndim == 2:
        # Treated as (num_samples, num_features): average over samples.
        mean_abs_shap = np.mean(abs_shap_values, axis=0)
    elif abs_shap_values.ndim == 3:
        # Treated as (num_samples, num_features, num_classes): average over
        # samples and classes in one step.
        mean_abs_shap = np.mean(abs_shap_values, axis=(0, 2))
    else:
        raise ValueError(f"Unexpected number of dimensions for SHAP values: {abs_shap_values.ndim}")


# Rank the features by mean |SHAP|, most important first.
feature_importance_df = pd.DataFrame(
    {'Feature': X_test_sample.columns, 'Mean_Abs_SHAP': mean_abs_shap}
).sort_values(by='Mean_Abs_SHAP', ascending=False)

# Express every score as a share of the summed importance.
total_shap_importance = feature_importance_df['Mean_Abs_SHAP'].sum()
feature_importance_df['Percentage'] = (
    feature_importance_df['Mean_Abs_SHAP'] / total_shap_importance * 100
)

print("\nFeature Importance (SHAP) with Percentage Contribution:")
display(feature_importance_df)


Feature Importance (SHAP) with Percentage Contribution:


Unnamed: 0,Feature,Mean_Abs_SHAP,Percentage
39,Init Fwd Win Bytes,0.4175,11.592921
40,Init Bwd Win Bytes,0.406836,11.296825
10,Bwd Packet Length Std,0.328993,9.135311
42,Fwd Seg Size Min,0.276106,7.666761
28,Packet Length Mean,0.212652,5.904826
4,Bwd Packets Length Total,0.175648,4.877303
25,Bwd Packets/s,0.174358,4.841498
11,Flow Bytes/s,0.137577,3.820168
5,Fwd Packet Length Max,0.096862,2.689607
3,Fwd Packets Length Total,0.086386,2.39873
