## XGBClassifier model with RandomizedSearchCV to find the best hyperparameters

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import uniform, randint
from sklearn.feature_selection import SelectFromModel

# Load the cleaned dataset, then train and tune a multi-class XGBClassifier
# that predicts "Attack Type" from categorical / numeric traffic features.
file_path = "cyberdata_clean.csv"
cyberdata = pd.read_csv(file_path)

if "Attack Type" in cyberdata.columns:

    # Define column types
    nominal_cols = [
        "Protocol", "Packet Type", "Traffic Type", "Attack Signature",
        "Action Taken", "Network Segment", "Log Source", "City", "Region", "Browser", "Operating System"
    ]

    ordinal_cols = ["Severity Level", "Anomaly Score Category", "Packet Length Category"]
    numeric_cols = ["Source IP FirstOctet", "Destination IP FirstOctet"]

    feature_cols = nominal_cols + ordinal_cols + numeric_cols

    # Preprocessing pipeline.
    # Both categorical encoders tolerate categories that were not seen during
    # fit: one-hot emits an all-zero row, ordinal emits -1. Without this the
    # ordinal branch raises ValueError on any unseen value at transform time,
    # which would break scoring of new data.
    # NOTE(review): no explicit `categories=` is given, so OrdinalEncoder
    # orders categories by sorted value rather than by their intended rank --
    # confirm the real ordering of these columns and pass it explicitly.
    preprocessor = ColumnTransformer(
        transformers=[
            ('nom', OneHotEncoder(handle_unknown='ignore'), nominal_cols),
            ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_cols),
            ('num', StandardScaler(), numeric_cols)
        ]
    )

    # Features and target
    target = cyberdata["Attack Type"]
    features = cyberdata[feature_cols]

    # Encode the class labels as integers; label_encoder.classes_ maps the
    # integer codes back to the original label strings.
    label_encoder = LabelEncoder()
    target_encoded = label_encoder.fit_transform(target)

    # Stratified 80/20 train-test split so class proportions are preserved.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target_encoded, test_size=0.2, random_state=42, stratify=target_encoded
    )

    # Fit the preprocessor on the training split only, then apply it to both.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Feature selection: keep the features an XGBClassifier fitted on the
    # training split ranks above SelectFromModel's default threshold.
    # NOTE: preprocessing and selection are fitted on the whole training split
    # before cross-validation, so the CV folds below are not fully independent
    # of them; the held-out test split is unaffected.
    selector = SelectFromModel(XGBClassifier(random_state=42, eval_metric='mlogloss'))
    X_train_selected = selector.fit_transform(X_train_processed, y_train)
    X_test_selected = selector.transform(X_test_processed)

    # XGBClassifier with proper multi-class objective
    xgb_model = XGBClassifier(
        objective='multi:softprob',
        random_state=42,
        eval_metric='mlogloss'
    )

    # Hyperparameter search space.
    # scipy's uniform(loc, scale) samples from [loc, loc + scale], and
    # randint(low, high) samples integers from [low, high).
    param_dist = {
        'n_estimators': randint(150, 500),
        'max_depth': randint(5, 15),
        'learning_rate': uniform(0.01, 0.1),     # [0.01, 0.11]
        'subsample': uniform(0.7, 0.3),          # [0.7, 1.0]
        'colsample_bytree': uniform(0.7, 0.3),   # [0.7, 1.0]
        'gamma': uniform(0, 0.5),                # [0, 0.5]
        'reg_alpha': uniform(0, 1),              # [0, 1]
        'reg_lambda': uniform(0, 1.5)            # [0, 1.5]
    }

    # 50 random candidates, each scored by 5-fold CV accuracy.
    random_search = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_dist,
        n_iter=50,
        cv=5,
        scoring='accuracy',
        random_state=42,
        n_jobs=-1
    )

    # Fit RandomizedSearchCV
    random_search.fit(X_train_selected, y_train)

    # Results
    print("Best hyperparameters:")
    print(random_search.best_params_)

    # Evaluate the refitted best model on the held-out test split.
    y_pred = random_search.predict(X_test_selected)

    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(
        y_test, y_pred, target_names=label_encoder.classes_
    )

    print("Accuracy Score:", accuracy)
    print("Classification Report:\n", classification_rep)

else:
    print("Error: 'Attack Type' does not exist in the dataset")


Best hyperparameters:
{'colsample_bytree': np.float64(0.9006523757990821), 'gamma': np.float64(0.33296117830874833), 'learning_rate': np.float64(0.06912977877077271), 'max_depth': 10, 'n_estimators': 184, 'reg_alpha': np.float64(0.4722149251619493), 'reg_lambda': np.float64(0.17939136890745255), 'subsample': np.float64(0.9139734361668984)}
Accuracy Score: 0.33125
Classification Report:
               precision    recall  f1-score   support

        DDoS       0.33      0.35      0.34      2686
   Intrusion       0.33      0.31      0.32      2653
     Malware       0.33      0.33      0.33      2661

    accuracy                           0.33      8000
   macro avg       0.33      0.33      0.33      8000
weighted avg       0.33      0.33      0.33      8000



Persisting the model for deployment 


In [9]:
import joblib
from sklearn.pipeline import Pipeline

# Save the entire inference pipeline for deployment, not just the classifier.
# The tuned XGBClassifier was trained on preprocessed, feature-selected data,
# so the already-fitted preprocessor (encoders + scaler) and selector are
# chained in front of it. The saved object can then score raw feature rows
# with exactly the same transformations that were used during training.
# Predictions are the integer codes produced by `label_encoder`; use
# `label_encoder.inverse_transform` to recover the original class labels.
deployment_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('selector', selector),
    ('model', random_search.best_estimator_),
])
joblib.dump(deployment_pipeline, 'xgb_model.pkl')


['xgb_model.pkl']