## XGBOOSTClassifier model with RandomizedSearchCV to the best paramater

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import uniform, randint

# Load the dataset
file_path = "cyberdata_clean.csv"
cyberdata = pd.read_csv(file_path)

if "Attack Type" in cyberdata.columns:

    # Define column lists by type
    nominal_cols = [
        "Protocol", "Packet Type", "Traffic Type", "Attack Signature",
        "Action Taken", "Network Segment", "Log Source", "City", "Region", "Browser", "Operating System"
    ]

    ordinal_cols = ["Severity Level", "Anomaly Score Category", "Packet Length Category"]
    numeric_cols = ["Source IP FirstOctet", "Destination IP FirstOctet"]

    # Define feature columns
    feature_cols = nominal_cols + ordinal_cols + numeric_cols

    # Create ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('nom', OneHotEncoder(handle_unknown='ignore'), nominal_cols),
            ('ord', OrdinalEncoder(), ordinal_cols),
            ('num', StandardScaler(), numeric_cols)
        ]
    )

    # Pipeline setup
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('xgb', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
    ])

    # Features and target
    target = cyberdata["Attack Type"]
    features = cyberdata[feature_cols]

    # Encode labels
    label_encoder = LabelEncoder()
    target_encoded = label_encoder.fit_transform(target)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        features, target_encoded, test_size=0.2, random_state=42, stratify=target_encoded
    )

    # Hyperparameter distribution
    param_dist = {
        'xgb__n_estimators': randint(50, 200),
        'xgb__max_depth': randint(3, 10),
        'xgb__learning_rate': uniform(0.01, 0.3),
        'xgb__subsample': uniform(0.6, 0.4),
        'xgb__colsample_bytree': uniform(0.6, 0.4),
        'xgb__gamma': uniform(0, 0.3),
        'xgb__reg_alpha': uniform(0, 0.5),
        'xgb__reg_lambda': uniform(0.5, 1.5)
    }

    # RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_dist,
        n_iter=20,
        cv=5,
        scoring='accuracy',
        random_state=42,
        n_jobs=-1
    )

    # Fit model
    random_search.fit(X_train, y_train)

    print("Best hyperparameters:")
    print(random_search.best_params_)

    # Evaluate
    best_pipeline = random_search.best_estimator_
    y_pred = best_pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

    print("Accuracy Score:", accuracy)
    print("Classification Report:\n", classification_rep)

else:
    print("Error: 'Attack Type' does not exist in the dataset")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best hyperparameters:
{'xgb__colsample_bytree': np.float64(0.8347004662655393), 'xgb__gamma': np.float64(0.28957659217924137), 'xgb__learning_rate': np.float64(0.1921102743060054), 'xgb__max_depth': 7, 'xgb__n_estimators': 90, 'xgb__reg_alpha': np.float64(0.1481367528520412), 'xgb__reg_lambda': np.float64(0.7479004085945038), 'xgb__subsample': np.float64(0.6062545626964776)}
Accuracy Score: 0.34175
Classification Report:
               precision    recall  f1-score   support

        DDoS       0.34      0.35      0.34      2686
   Intrusion       0.34      0.33      0.33      2653
     Malware       0.35      0.35      0.35      2661

    accuracy                           0.34      8000
   macro avg       0.34      0.34      0.34      8000
weighted avg       0.34      0.34      0.34      8000



Persisting the model for deployment 


In [15]:
import joblib
# Save the entire pipeline for deployment ( we use pipeline to conserve the same scaler when a new data is entered)
joblib.dump(best_pipeline, "cyberAttackXgboost.joblib")


['cyberAttackXgboost.joblib']

In [17]:
import joblib
best_pipeline = joblib.load("cyberAttackXgboost.joblib")

Predicting

In [18]:
target_encoded

array([2, 2, 0, ..., 0, 2, 1])

In [19]:
print(target) 

0          Malware
1          Malware
2             DDoS
3          Malware
4             DDoS
           ...    
39995         DDoS
39996         DDoS
39997         DDoS
39998      Malware
39999    Intrusion
Name: Attack Type, Length: 40000, dtype: object


{Malware:2
DDos:0
Intrusion:1}

In [22]:
index = X_test.index[1] ## second element in x_test matrix (from previous model testing)
test_index =  X_test.loc[[index]]
print(test_index) 
predictions = best_pipeline.predict(test_index)

print("attack Stype:", predictions)


      Protocol Packet Type Traffic Type Attack Signature Action Taken  \
17942     ICMP     Control         HTTP  Known Pattern B      Blocked   

      Network Segment Log Source     City      Region  Browser  \
17942       Segment C     Server  Barasat  Tamil Nadu  Mozilla   

      Operating System Severity Level Anomaly Score Category  \
17942          Windows           High                   25-0   

      Packet Length Category  Source IP FirstOctet  Destination IP FirstOctet  
17942                  short                     7                        118  
attack type type: [2]
