### +++++++++++++++++++++++++++++++++++++++++++++++ **Isolation Forest** +++++++++++++++++++++++++++++++++++++++++++++++

In [273]:
def create_isolation_forest(n_estimators, max_samples, contamination, max_features, bootstrap, random_state):
    """
    Build an (unfitted) Isolation Forest from a set of hyperparameters.

    The integer-valued settings are passed through ``int()`` so callers may
    hand in floats (e.g. values produced by a continuous optimizer); the
    remaining settings are forwarded untouched.

    Args:
        n_estimators (int | float): Number of base estimators (truncated to int)
        max_samples (int | float): Samples drawn to train each base estimator (truncated to int)
        contamination (float): Proportion of outliers in the dataset
        max_features (float): Proportion of features to consider
        bootstrap (bool): Whether samples are drawn with replacement
        random_state (int | float): Random seed (truncated to int)

    Returns:
        IsolationForest: Configured Isolation Forest model
    """
    forest_kwargs = {
        "n_estimators": int(n_estimators),
        "max_samples": int(max_samples),
        "contamination": contamination,
        "max_features": max_features,
        "bootstrap": bootstrap,
        "random_state": int(random_state),
        "n_jobs": -1,  # Use all available cores
    }
    return IsolationForest(**forest_kwargs)

In [274]:

def objective_function(solution, X_train, X_val, y_val, dataset_name):
    """
    Objective function for optimization - evaluates Isolation Forest performance.

    Fits an Isolation Forest on ``X_train`` with the candidate hyperparameters,
    scores ``X_val``, flags every sample whose anomaly score lies above a
    dataset-specific percentile of the validation scores, and returns
    ``1 - F1`` of those predictions against the binarized validation labels.

    Args:
        solution (sequence): Parameters to optimize, in the order
            [n_estimators, max_samples, contamination, max_features, bootstrap, random_state]
        X_train (array): Training features (the model is fitted without labels)
        X_val (array): Validation features
        y_val (array): Validation labels; any value > 0 is treated as an anomaly
        dataset_name (str): 'kdd99' or 'unsw'; selects the score percentile
            used as decision threshold

    Returns:
        float: Fitness value (1 - F1 score, to be minimized); 1.0 if the F1
        score cannot be computed

    Raises:
        ValueError: If dataset_name is not 'kdd99' or 'unsw'
    """
    # Percentile of the validation scores above which a sample is flagged.
    threshold_percentiles = {"kdd99": 60, "unsw": 40}
    if dataset_name not in threshold_percentiles:
        # Fail with a clear message instead of an unbound `threshold` later on.
        raise ValueError("Dataset not supported. Choose 'kdd99' or 'unsw'.")

    # Convert solution to parameters
    n_estimators, max_samples, contamination, max_features, bootstrap, random_state = solution
    bootstrap = bool(round(bootstrap))  # continuous gene in [0, 1] -> flag

    # Collapse labels to binary so F1 is computed on normal-vs-anomaly.
    y_val_binary = np.where(y_val > 0, 1, 0)

    # Create and train model
    model = create_isolation_forest(
        n_estimators=n_estimators,
        max_samples=max_samples,
        contamination=contamination,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=random_state
    )
    model.fit(X_train)

    # Negate so that a higher score means "more anomalous".
    scores = -model.decision_function(X_val)

    # NOTE(review): the threshold is a percentile of these same scores, so a
    # constant shift of the scores would not change y_pred; `contamination`
    # presumably only shifts decision_function by an offset and therefore may
    # have no effect on this objective - confirm against scikit-learn docs.
    threshold = np.percentile(scores, threshold_percentiles[dataset_name])
    y_pred = (scores > threshold).astype(int)

    # Use F1 score as optimization target
    try:
        f1 = f1_score(y_val_binary, y_pred)
    except ValueError:
        return 1.  # Worst possible score if calculation fails
    return 1 - f1  # Minimize (1 - F1)

In [275]:

def optimize_isolation_forest(dataset_name, X_train, X_val, y_val, optimizer_type, epochs=50, pop_size=20):
    """
    Optimize Isolation Forest parameters using specified optimizer.

    Searches a six-dimensional continuous space and minimizes the value
    returned by ``objective_function`` (1 - F1 on the validation split).

    Args:
        dataset_name (str): 'kdd99' or 'unsw'; forwarded to the objective function
        X_train (array): Training features
        X_val (array): Validation features
        y_val (array): Validation labels
        optimizer_type (str): 'WOA' or 'DE'
        epochs (int): Number of optimization iterations
        pop_size (int): Population size

    Returns:
        Best solution found, ordered as
        [n_estimators, max_samples, contamination, max_features, bootstrap, random_state].
        All entries are continuous values; integer/boolean parameters still
        have to be converted by the caller.

    Raises:
        ValueError: If optimizer_type is neither 'WOA' nor 'DE'
    """
    # Fail fast on a bad optimizer name before building anything else.
    if optimizer_type not in ("WOA", "DE"):
        raise ValueError("Optimizer not supported. Choose 'WOA' or 'DE'.")

    # Define the search space (same order as the objective function unpacks it)
    lower_bounds = [
        50,     # n_estimators (50-500)
        100,    # max_samples (100-1000)
        0.01,   # contamination (0.01-0.5)
        0.1,    # max_features (0.1-1.0)
        0,      # bootstrap (0=False, 1=True)
        0,      # random_state (0-100)
    ]
    upper_bounds = [500, 1000, 0.5, 1.0, 1, 100]

    problem_dict = {
        "bounds": FloatVar(lb=lower_bounds, ub=upper_bounds, name="delta"),
        "minmax": "min",  # We're minimizing (1 - F1)
        "obj_func": lambda solution: objective_function(solution, X_train, X_val, y_val, dataset_name),
    }

    # Choose Optimizer
    if optimizer_type == "WOA":
        model = WOA.HI_WOA(epoch=epochs, pop_size=pop_size, feedback_max=10)
    else:  # "DE" - guaranteed by the check above
        model = DE.JADE(epoch=epochs, pop_size=pop_size, miu_f=0.5, miu_cr=0.5, pt=0.1, ap=0.1)

    # Run Optimization
    g_best = model.solve(problem_dict)
    return g_best.solution


In [276]:

def evaluate_model(model, X_test, y_test, dataset_name):
    """
    Evaluate Isolation Forest model performance.

    Samples whose anomaly score exceeds a dataset-specific percentile of the
    test-set scores are predicted as anomalies; threshold-based metrics are
    computed on those predictions and AUC on the raw scores.

    Args:
        model: Trained Isolation Forest model
        X_test (array): Test features
        y_test (array): Test labels (0=normal, anything > 0 = anomaly)
        dataset_name (str): 'kdd99' or 'unsw'; selects the score percentile
            used as decision threshold

    Returns:
        tuple: Evaluation metrics (accuracy, precision, recall, f1, auc, confusion_matrix)

    Raises:
        ValueError: If dataset_name is not 'kdd99' or 'unsw'
    """
    # Percentile of the test scores above which a sample is flagged.
    threshold_percentiles = {"kdd99": 60, "unsw": 40}
    if dataset_name not in threshold_percentiles:
        # Fail with a clear message instead of an unbound `threshold` later on.
        raise ValueError("Dataset not supported. Choose 'kdd99' or 'unsw'.")

    # Collapse labels to binary; this is a no-op for labels that are already 0/1.
    y_true = np.where(y_test > 0, 1, 0)

    # Get anomaly scores
    scores = -model.decision_function(X_test)  # Negated so that higher is more anomalous

    # Convert scores to binary predictions using the fixed per-dataset percentile
    threshold = np.percentile(scores, threshold_percentiles[dataset_name])
    y_pred = (scores > threshold).astype(int)

    # Compute Metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, scores)  # threshold-free, uses the raw scores
    conf_matrix = confusion_matrix(y_true, y_pred)

    return accuracy, precision, recall, f1, auc, conf_matrix


In [277]:
def main_pipeline(dataset_name, optimizer_type):
    """
    Tune, train and report an Isolation Forest for one dataset/optimizer pair.

    Selects the train/validation/test splits from the module-level dataframes,
    runs the hyperparameter search, refits a model with the best parameters on
    the training split, evaluates it on the test split and prints the results.

    Args:
        dataset_name (str): 'kdd99' or 'unsw'
        optimizer_type (str): 'WOA' or 'DE'

    Raises:
        ValueError: If dataset_name is neither 'kdd99' nor 'unsw'
    """
    # Guard clause: reject unknown datasets before touching any data.
    if dataset_name != "kdd99" and dataset_name != "unsw":
        raise ValueError("Dataset not supported. Choose 'kdd99' or 'unsw'.")

    # Only the dataframes of the requested dataset are looked up.
    if dataset_name == "kdd99":
        X_train, X_val, y_val = df_kdd99_train, df_kdd99_validation, df_kdd99_validation_label
        X_test, y_test = df_kdd99_testing, df_kdd99_testing_label
    else:
        X_train, X_val, y_val = df_unsw_train, df_unsw_validation, df_unsw_validation_label
        X_test, y_test = df_unsw_testing, df_unsw_testing_label

    # Hyperparameter search on the train/validation splits
    tuned = optimize_isolation_forest(dataset_name, X_train, X_val, y_val, optimizer_type)
    n_estimators, max_samples, contamination, max_features, bootstrap, random_state = tuned

    # The optimizer works on floats; turn the bootstrap gene into a real flag
    bootstrap = bool(round(bootstrap))

    # Refit a fresh model with the winning configuration
    forest = create_isolation_forest(
        n_estimators=int(n_estimators),
        max_samples=int(max_samples),
        contamination=contamination,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=int(random_state),
    )
    forest.fit(X_train)

    # Held-out evaluation
    test_metrics = evaluate_model(forest, X_test, y_test, dataset_name)
    accuracy, precision, recall, f1, auc, conf_matrix = test_metrics

    # Report
    print(f"\n=== For Dataset {dataset_name}, {optimizer_type} optimizer ===")

    print("\n=== Best Parameters ===")
    print(f"n_estimators: {int(n_estimators)}")
    print(f"max_samples: {int(max_samples)}")
    print(f"contamination: {contamination:.4f}")
    print(f"max_features: {max_features:.4f}")
    print(f"bootstrap: {bootstrap}")
    print(f"random_state: {int(random_state)}")

    print("\n=== Evaluation Metrics ===")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC: {auc:.4f}")
    print(f"Confusion Matrix:\n{conf_matrix}")



In [278]:
# Run the full tune/train/evaluate pipeline: KDD99 data, WOA optimizer.
main_pipeline(dataset_name='kdd99', optimizer_type='WOA')

2025/03/26 08:53:14 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: Solving single objective optimization problem.
2025/03/26 08:54:30 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 1, Current best: 0.32554530562979567, Global best: 0.32554530562979567, Runtime: 32.07965 seconds
2025/03/26 08:54:57 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 2, Current best: 0.32554530562979567, Global best: 0.32554530562979567, Runtime: 27.32851 seconds
2025/03/26 08:55:29 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 3, Current best: 0.32554530562979567, Global best: 0.32554530562979567, Runtime: 31.31265 seconds
2025/03/26 08:56:04 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 4, Current best: 0.318130873351151, Global best: 0.318130873351151, Runtime: 34.90811 seconds
2025/03/26 08:56:23 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 5, Current best: 0.318130873351151, Global best: 0.318130873351151, Runtime: 19.63727 sec


=== For Dataset kdd99, WOA optimizer ===

=== Best Parameters ===
n_estimators: 53
max_samples: 142
contamination: 0.1650
max_features: 0.1174
bootstrap: False
random_state: 49

=== Evaluation Metrics ===
Accuracy: 0.9177
Precision: 0.9248
Recall: 0.8761
F1 Score: 0.8998
AUC: 0.9729
Confusion Matrix:
[[53354  2925]
 [ 5087 35959]]


In [279]:
# Run the full tune/train/evaluate pipeline: KDD99 data, DE optimizer.
main_pipeline(dataset_name='kdd99', optimizer_type='DE')

2025/03/26 09:06:30 PM, INFO, mealpy.evolutionary_based.DE.JADE: Solving single objective optimization problem.
2025/03/26 09:07:49 PM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 1, Current best: 0.36632468316234157, Global best: 0.36632468316234157, Runtime: 41.17496 seconds
2025/03/26 09:08:35 PM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 2, Current best: 0.3646003965859126, Global best: 0.3646003965859126, Runtime: 45.82095 seconds
2025/03/26 09:09:30 PM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 3, Current best: 0.36356582464005516, Global best: 0.36356582464005516, Runtime: 55.13234 seconds
2025/03/26 09:10:26 PM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 4, Current best: 0.3437365290111216, Global best: 0.3437365290111216, Runtime: 55.47235 seconds
2025/03/26 09:11:09 PM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 5, Current best: 0.3437365290111216, Global best: 0.3437365290111


=== For Dataset kdd99, DE optimizer ===

=== Best Parameters ===
n_estimators: 50
max_samples: 656
contamination: 0.5000
max_features: 0.1000
bootstrap: True
random_state: 74

=== Evaluation Metrics ===
Accuracy: 0.9713
Precision: 0.9914
Recall: 0.9402
F1 Score: 0.9651
AUC: 0.9937
Confusion Matrix:
[[55943   336]
 [ 2455 38591]]


In [280]:
# Run the full tune/train/evaluate pipeline: UNSW data, WOA optimizer.
main_pipeline(dataset_name='unsw', optimizer_type='WOA')


2025/03/26 09:24:29 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: Solving single objective optimization problem.
2025/03/26 09:25:50 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 1, Current best: 0.43402360276107776, Global best: 0.43402360276107776, Runtime: 31.59609 seconds
2025/03/26 09:26:16 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 2, Current best: 0.43402360276107776, Global best: 0.43402360276107776, Runtime: 26.15176 seconds
2025/03/26 09:26:37 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 3, Current best: 0.394477844578045, Global best: 0.394477844578045, Runtime: 20.87440 seconds
2025/03/26 09:26:54 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 4, Current best: 0.394477844578045, Global best: 0.394477844578045, Runtime: 17.24975 seconds
2025/03/26 09:27:14 PM, INFO, mealpy.swarm_based.WOA.HI_WOA: >>>Problem: P, Epoch: 5, Current best: 0.394477844578045, Global best: 0.394477844578045, Runtime: 20.21273 seconds


=== For Dataset unsw, WOA optimizer ===

=== Best Parameters ===
n_estimators: 52
max_samples: 693
contamination: 0.1629
max_features: 0.1040
bootstrap: False
random_state: 2

=== Evaluation Metrics ===
Accuracy: 0.4321
Precision: 0.4856
Recall: 0.5292
F1 Score: 0.5065
AUC: 0.4289
Confusion Matrix:
[[11590 25410]
 [21343 23989]]


In [281]:
# Run the full tune/train/evaluate pipeline: UNSW data, DE optimizer.
main_pipeline(dataset_name='unsw', optimizer_type='DE')

2025/03/26 09:38:40 PM, INFO, mealpy.evolutionary_based.DE.JADE: Solving single objective optimization problem.
2025/03/26 09:40:38 PM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 1, Current best: 0.4315742596303719, Global best: 0.4315742596303719, Runtime: 61.34091 seconds
2025/03/26 09:41:32 PM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 2, Current best: 0.4286795813849922, Global best: 0.4286795813849922, Runtime: 54.75517 seconds
2025/03/26 09:42:26 PM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 3, Current best: 0.4286795813849922, Global best: 0.4286795813849922, Runtime: 53.14702 seconds
2025/03/26 09:43:13 PM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 4, Current best: 0.4286795813849922, Global best: 0.4286795813849922, Runtime: 47.75386 seconds
2025/03/26 09:44:01 PM, INFO, mealpy.evolutionary_based.DE.JADE: >>>Problem: P, Epoch: 5, Current best: 0.4286795813849922, Global best: 0.4286795813849922,


=== For Dataset unsw, DE optimizer ===

=== Best Parameters ===
n_estimators: 64
max_samples: 997
contamination: 0.0224
max_features: 0.1201
bootstrap: True
random_state: 100

=== Evaluation Metrics ===
Accuracy: 0.4159
Precision: 0.4721
Recall: 0.5144
F1 Score: 0.4924
AUC: 0.4341
Confusion Matrix:
[[10924 26076]
 [22011 23321]]
