In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

class GazelleOptimizationAlgorithm:
    """
    Implementation of Gazelle Optimization Algorithm (GOA) for hyperparameter optimization
    """
    
    def __init__(self, population_size=30, max_iterations=50, dim=10):
        self.population_size = population_size
        self.max_iterations = max_iterations
        self.dim = dim
        self.best_position = None
        self.best_fitness = float('inf')
        
    def initialize_population(self, bounds):
        """Initialize gazelle population within bounds"""
        population = []
        for _ in range(self.population_size):
            gazelle = []
            for i in range(self.dim):
                lower, upper = bounds[i]
                gazelle.append(np.random.uniform(lower, upper))
            population.append(gazelle)
        return np.array(population)
    
    def fitness_function(self, position, X_train, y_train, X_val, y_val):
        """
        Fitness function for hyperparameter optimization
        Returns validation accuracy of hybrid model
        """
        try:
            # Decode hyperparameters from position
            rf_n_estimators = int(position[0])
            rf_max_depth = int(position[1]) if position[1] > 0 else None
            rf_min_samples_split = int(position[2])
            rf_min_samples_leaf = int(position[3])
            
            xgb_n_estimators = int(position[4])
            xgb_max_depth = int(position[5])
            xgb_learning_rate = position[6]
            xgb_subsample = position[7]
            xgb_colsample_bytree = position[8]
            
            # Weight for ensemble
            rf_weight = position[9]
            xgb_weight = 1 - rf_weight
            
            # Train Random Forest
            rf_model = RandomForestClassifier(
                n_estimators=rf_n_estimators,
                max_depth=rf_max_depth,
                min_samples_split=rf_min_samples_split,
                min_samples_leaf=rf_min_samples_leaf,
                random_state=42,
                n_jobs=-1
            )
            rf_model.fit(X_train, y_train)
            rf_pred_proba = rf_model.predict_proba(X_val)
            
            # Train XGBoost
            xgb_model = xgb.XGBClassifier(
                n_estimators=xgb_n_estimators,
                max_depth=xgb_max_depth,
                learning_rate=xgb_learning_rate,
                subsample=xgb_subsample,
                colsample_bytree=xgb_colsample_bytree,
                random_state=42,
                n_jobs=-1,
                eval_metric='logloss'
            )
            xgb_model.fit(X_train, y_train)
            xgb_pred_proba = xgb_model.predict_proba(X_val)
            
            # Ensemble prediction
            ensemble_pred_proba = rf_weight * rf_pred_proba + xgb_weight * xgb_pred_proba
            ensemble_pred = np.argmax(ensemble_pred_proba, axis=1)
            
            # Calculate accuracy
            accuracy = accuracy_score(y_val, ensemble_pred)
            
            # Return negative accuracy for minimization
            return 1 - accuracy
            
        except Exception as e:
            print(f"Error in fitness function: {e}")
            return float('inf')
    
    def optimize(self, X_train, y_train, X_val, y_val, bounds):
        """
        Main optimization loop for GOA
        """
        print("Starting Gazelle Optimization Algorithm...")
        
        # Initialize population
        population = self.initialize_population(bounds)
        fitness_values = []
        
        # Evaluate initial population
        for i, gazelle in enumerate(population):
            fitness = self.fitness_function(gazelle, X_train, y_train, X_val, y_val)
            fitness_values.append(fitness)
            
            if fitness < self.best_fitness:
                self.best_fitness = fitness
                self.best_position = gazelle.copy()
        
        fitness_values = np.array(fitness_values)
        
        # Main optimization loop
        for iteration in range(self.max_iterations):
            print(f"GOA Iteration {iteration + 1}/{self.max_iterations}, Best Fitness: {self.best_fitness:.4f}")
            
            # Update positions based on GOA algorithm
            for i in range(self.population_size):
                # Select random gazelles
                r1, r2 = np.random.choice(self.population_size, 2, replace=False)
                while r1 == i or r2 == i:
                    r1, r2 = np.random.choice(self.population_size, 2, replace=False)
                
                # Calculate step size
                step_size = 2 * np.random.random() - 1
                
                # Update position
                for j in range(self.dim):
                    # Gazelle movement equation
                    if np.random.random() < 0.5:
                        # Exploration
                        population[i][j] = population[i][j] + step_size * (population[r1][j] - population[r2][j])
                    else:
                        # Exploitation
                        population[i][j] = population[i][j] + step_size * (self.best_position[j] - population[i][j])
                    
                    # Boundary handling
                    lower, upper = bounds[j]
                    population[i][j] = np.clip(population[i][j], lower, upper)
                
                # Evaluate new position
                new_fitness = self.fitness_function(population[i], X_train, y_train, X_val, y_val)
                
                # Update if better
                if new_fitness < fitness_values[i]:
                    fitness_values[i] = new_fitness
                    
                    if new_fitness < self.best_fitness:
                        self.best_fitness = new_fitness
                        self.best_position = population[i].copy()
        
        print(f"GOA optimization completed. Best fitness: {self.best_fitness:.4f}")
        return self.best_position

class CICIoT2023MLAlgorithm:
    """
    Implementation of ML algorithms for CICIoT2023 dataset with GOA and hybrid RF-XGB
    """
    
    def __init__(self):
        """Create the preprocessing helpers and empty slots for trained state."""
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()

        # Filled in by train_hybrid_model
        self.rf_model = None
        self.xgb_model = None
        # Ensemble weights; declared here so the attributes exist before training
        self.rf_weight = None
        self.xgb_weight = None
        self.best_params = None

        # Filled in by evaluate_model
        self.results = {}
        
    def load_and_preprocess_data(self, file_path):
        """
        Load and preprocess the CICIoT2023 dataset
        """
        print("Loading dataset...")
        try:
            df = pd.read_csv(file_path)
            print(f"Dataset loaded successfully. Shape: {df.shape}")
            
            # Display basic info about the dataset
            print(f"Columns: {list(df.columns)}")
            
            # Check for target column (assume it's the last column or named 'label')
            if 'label' in df.columns:
                target_col = 'label'
            else:
                target_col = df.columns[-1]
                print(f"Assuming '{target_col}' is the target column")
            
            # Move target column to the end if it's not already there
            if target_col != df.columns[-1]:
                cols = [col for col in df.columns if col != target_col] + [target_col]
                df = df[cols]
            
            print(f"Target column: {target_col}")
            print(f"Target distribution: {df[target_col].value_counts()}")
            
        except Exception as e:
            print(f"Error loading dataset: {e}")
            raise e
        
        # Handle missing values and outliers
        df = self.handle_missing_values(df)
        df = self.handle_outliers(df)
        
        return df
    
    def handle_missing_values(self, df):
        """
        Handle missing value with improved imputation strategies
        """
        print("Handling missing values and duplicates...")
        
        # Remove exact duplicates first
        initial_shape = df.shape
        df = df.drop_duplicates()
        print(f"Removed {initial_shape[0] - df.shape[0]} duplicate rows")
        
        # Handle missing values
        target_col = df.columns[-1]
        
        for column in df.columns:
            if df[column].isnull().sum() > 0:
                if column == target_col:
                    # For target column, drop rows with missing values
                    df = df.dropna(subset=[column])
                elif df[column].dtype in ['float64', 'int64']:
                    # For numerical columns, use median imputation
                    median_val = df[column].median()
                    df[column].fillna(median_val, inplace=True)
                else:
                    # For categorical columns, use mode imputation
                    mode_val = df[column].mode()
                    if len(mode_val) > 0:
                        df[column].fillna(mode_val[0], inplace=True)
                    else:
                        df[column].fillna('Unknown', inplace=True)
        
        print(f"Final dataset shape after cleaning: {df.shape}")
        return df
    
    def handle_outliers(self, df):
        """
        Handle outliers using IQR method
        """
        print("Handling outliers...")
        
        target_col = df.columns[-1]
        numerical_columns = df.select_dtypes(include=[np.number]).columns
        numerical_columns = [col for col in numerical_columns if col != target_col]
        
        for column in numerical_columns:
            Q1 = df[column].quantile(0.25)
            Q3 = df[column].quantile(0.75)
            IQR = Q3 - Q1
            
            if IQR > 0:
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                
                # Cap outliers instead of removing them
                df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
                df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
        
        return df
    
    def create_class_representation(self, df, representation_type='2-class'):
        """
        Create different class representations as specified
        """
        print(f"Creating {representation_type} representation...")
        
        target_col = df.columns[-1]
        
        # First, let's examine the actual class labels in the dataset
        unique_classes = df[target_col].unique()
        print(f"Unique classes in dataset: {unique_classes}")
        print(f"Class counts: {df[target_col].value_counts()}")
        
        if representation_type == '2-class':
            # 2-Class: Normal vs Attack (all malicious classes as 'Attack')
            df_copy = df.copy()
            
            # Identify normal/benign traffic - check for common variations
            normal_labels = ['Normal', 'BENIGN', 'Benign', 'normal', 'benign']
            normal_class = None
            
            for label in normal_labels:
                if label in unique_classes:
                    normal_class = label
                    break
            
            if normal_class is None:
                # If no clear normal class, treat the most frequent class as normal
                # or use a heuristic based on class names
                class_counts = df[target_col].value_counts()
                potential_normal = class_counts.index[0]  # Most frequent class
                
                # Check if any class name suggests it's normal/benign
                for class_name in class_counts.index:
                    if any(keyword in str(class_name).lower() for keyword in ['normal', 'benign', 'legitimate']):
                        potential_normal = class_name
                        break
                
                normal_class = potential_normal
                print(f"No explicit 'Normal' class found. Using '{normal_class}' as normal class.")
            
            # Create binary classification
            df_copy[target_col] = df_copy[target_col].apply(
                lambda x: 'Normal' if x == normal_class else 'Attack'
            )
            
            # Random sampling to create balanced dataset with 8450 samples per class
            normal_samples = df_copy[df_copy[target_col] == 'Normal']
            attack_samples = df_copy[df_copy[target_col] == 'Attack']
            
            print(f"Normal samples: {len(normal_samples)}, Attack samples: {len(attack_samples)}")
            
            # Check if we have enough samples
            if len(normal_samples) == 0:
                raise ValueError("No normal/benign samples found in the dataset!")
            if len(attack_samples) == 0:
                raise ValueError("No attack samples found in the dataset!")
            
            # Sample 8450 from each class
            target_samples = min(8450, len(normal_samples), len(attack_samples))
            if target_samples < 8450:
                print(f"Warning: Insufficient samples. Using {target_samples} samples per class instead of 8450")
            
            if len(normal_samples) >= target_samples:
                normal_samples = normal_samples.sample(n=target_samples, random_state=42)
            else:
                normal_samples = resample(normal_samples, n_samples=target_samples, random_state=42, replace=True)
            
            if len(attack_samples) >= target_samples:
                attack_samples = attack_samples.sample(n=target_samples, random_state=42)
            else:
                attack_samples = resample(attack_samples, n_samples=target_samples, random_state=42, replace=True)
            
            df_balanced = pd.concat([normal_samples, attack_samples], ignore_index=True)
            
        elif representation_type == '8-class':
            # 8-Class: Group attacks into 8 categories
            df_copy = df.copy()
            
            # Create attack mapping based on actual classes in the dataset
            attack_mapping = {}
            
            # Find normal class
            normal_labels = ['Normal', 'BENIGN', 'Benign', 'normal', 'benign']
            normal_class = None
            for label in normal_labels:
                if label in unique_classes:
                    normal_class = label
                    break
            
            if normal_class:
                attack_mapping['Normal'] = [normal_class]
            
            # Group other classes based on keywords
            remaining_classes = [c for c in unique_classes if c != normal_class]
            
            # Initialize attack categories
            attack_categories = {
                'DDoS': [],
                'DoS': [],
                'Recon': [],
                'Web': [],
                'BruteForce': [],
                'Spoofing': [],
                'Mirai': []
            }
            
            # Classify attacks based on class names
            for class_name in remaining_classes:
                class_name_lower = str(class_name).lower()
                
                if 'ddos' in class_name_lower:
                    attack_categories['DDoS'].append(class_name)
                elif 'dos' in class_name_lower:
                    attack_categories['DoS'].append(class_name)
                elif any(keyword in class_name_lower for keyword in ['recon', 'scan', 'ping', 'port']):
                    attack_categories['Recon'].append(class_name)
                elif any(keyword in class_name_lower for keyword in ['web', 'sql', 'xss', 'injection']):
                    attack_categories['Web'].append(class_name)
                elif 'brute' in class_name_lower or 'force' in class_name_lower:
                    attack_categories['BruteForce'].append(class_name)
                elif any(keyword in class_name_lower for keyword in ['spoof', 'mitm', 'arp']):
                    attack_categories['Spoofing'].append(class_name)
                elif 'mirai' in class_name_lower:
                    attack_categories['Mirai'].append(class_name)
                else:
                    # Put remaining attacks in the most appropriate category or create "Other"
                    attack_categories['DDoS'].append(class_name)  # Default to DDoS
            
            # Apply mapping
            for new_class, old_classes in attack_categories.items():
                if old_classes:  # Only apply if there are classes to map
                    df_copy[target_col] = df_copy[target_col].replace(old_classes, new_class)
            
            # Add normal class mapping
            if normal_class:
                df_copy[target_col] = df_copy[target_col].replace(normal_class, 'Normal')
            
            # Remove empty categories
            remaining_classes = df_copy[target_col].unique()
            target_samples = min(33800, len(df_copy) // len(remaining_classes))
            
            print(f"8-class categories: {remaining_classes}")
            print(f"Target samples per class: {target_samples}")
            
            # Balance dataset using SMOTE and undersampling
            df_balanced = self.balance_multiclass_dataset(df_copy, target_samples=target_samples, use_smote=True)
            
        elif representation_type == '34-class':
            # 34-Class: Keep all original classes
            df_copy = df.copy()
            
            # Calculate target samples based on dataset size and number of classes
            total_classes = len(unique_classes)
            target_samples = min(84500, len(df_copy) // total_classes)
            
            if target_samples < 100:
                target_samples = min(1000, len(df_copy) // total_classes)
            
            print(f"34-class: {total_classes} classes, target samples per class: {target_samples}")
            
            # Balance dataset using SMOTE and undersampling
            df_balanced = self.balance_multiclass_dataset(df_copy, target_samples=target_samples, use_smote=True)
        
        print(f"Final {representation_type} dataset shape: {df_balanced.shape}")
        print(f"Class distribution:")
        print(df_balanced[target_col].value_counts())
        
        return df_balanced
    
    def balance_multiclass_dataset(self, df, target_samples, use_smote=True):
        """
        Balance multiclass dataset using undersampling and SMOTE
        """
        target_col = df.columns[-1]
        
        # Check current class distribution
        class_counts = df[target_col].value_counts()
        print(f"Original class distribution: {class_counts}")
        
        # Ensure target_samples is reasonable
        min_class_size = class_counts.min()
        max_class_size = class_counts.max()
        
        if target_samples > max_class_size:
            target_samples = max_class_size
            print(f"Adjusted target samples to {target_samples} (max available)")
        
        # Separate features and target
        X = df.drop(target_col, axis=1)
        y = df[target_col]
        
        # Apply undersampling first for majority classes
        df_balanced_list = []
        for class_label in y.unique():
            class_data = df[df[target_col] == class_label]
            
            if len(class_data) > target_samples:
                # Undersample majority class
                class_data = class_data.sample(n=target_samples, random_state=42)
            elif len(class_data) < target_samples and use_smote:
                # Keep minority classes as-is for now, SMOTE will handle them
                pass
            
            df_balanced_list.append(class_data)
        
        df_undersampled = pd.concat(df_balanced_list, ignore_index=True)
        
        if use_smote:
            # Apply SMOTE for minority classes
            X_under = df_undersampled.drop(target_col, axis=1)
            y_under = df_undersampled[target_col]
            
            # Encode labels for SMOTE
            le = LabelEncoder()
            y_encoded = le.fit_transform(y_under)
            
            try:
                # Create sampling strategy
                unique_classes, class_counts = np.unique(y_encoded, return_counts=True)
                sampling_strategy = {}
                
                for class_idx, count in zip(unique_classes, class_counts):
                    if count < target_samples:
                        # Only upsample if we have enough neighbors
                        min_neighbors = min(5, count - 1) if count > 1 else 1
                        if count >= min_neighbors:
                            sampling_strategy[class_idx] = min(target_samples, count * 2)
                
                if sampling_strategy:
                    # Adjust k_neighbors based on smallest class size
                    min_samples = min([np.sum(y_encoded == cls) for cls in sampling_strategy.keys()])
                    k_neighbors = min(3, max(1, min_samples - 1))
                    
                    smote = SMOTE(random_state=42, k_neighbors=k_neighbors, sampling_strategy=sampling_strategy)
                    X_smote, y_smote = smote.fit_resample(X_under, y_encoded)
                    
                    # Decode labels back
                    y_smote_decoded = le.inverse_transform(y_smote)
                    
                    # Combine back to dataframe
                    df_balanced = pd.concat([
                        pd.DataFrame(X_smote, columns=X_under.columns),
                        pd.Series(y_smote_decoded, name=target_col)
                    ], axis=1)
                else:
                    df_balanced = df_undersampled
                    
            except Exception as e:
                print(f"SMOTE failed: {e}, using undersampled data")
                df_balanced = df_undersampled
        else:
            df_balanced = df_undersampled
        
        return df_balanced
    
    def normalize_features(self, X_train, X_test):
        """
        Normalize features using StandardScaler
        """
        print("Normalizing features...")
        
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)
        
        return X_train_scaled, X_test_scaled
    
    def train_hybrid_model(self, X_train, y_train, X_val, y_val):
        """
        Tune the RF + XGBoost ensemble with GOA and fit the final models.

        Labels are integer-encoded with ``self.label_encoder`` (fitted on
        ``y_train``). GOA searches ten values: four Random Forest settings,
        five XGBoost settings and the forest's ensemble weight. The winning
        vector is stored in ``self.best_params``, the weights in
        ``self.rf_weight`` / ``self.xgb_weight``, and both final models are
        fitted on ``X_train`` only.
        """
        print("Training hybrid Random Forest + XGBoost model with GOA optimization...")

        # Integer class codes for both splits
        encoder = self.label_encoder
        y_train_encoded = encoder.fit_transform(y_train)
        y_val_encoded = encoder.transform(y_val)

        # Search range per slot of the position vector
        search_space = [
            (50, 200),    # RF n_estimators
            (3, 20),      # RF max_depth
            (2, 10),      # RF min_samples_split
            (1, 5),       # RF min_samples_leaf
            (50, 200),    # XGB n_estimators
            (3, 10),      # XGB max_depth
            (0.01, 0.3),  # XGB learning_rate
            (0.6, 1.0),   # XGB subsample
            (0.6, 1.0),   # XGB colsample_bytree
            (0.3, 0.7),   # RF weight in ensemble
        ]

        optimizer = GazelleOptimizationAlgorithm(population_size=20, max_iterations=30, dim=10)
        self.best_params = optimizer.optimize(
            X_train, y_train_encoded, X_val, y_val_encoded, search_space
        )
        best = self.best_params

        # Decode the winning vector the same way the fitness function does
        rf_params = dict(
            n_estimators=int(best[0]),
            max_depth=int(best[1]) if best[1] > 0 else None,
            min_samples_split=int(best[2]),
            min_samples_leaf=int(best[3]),
            random_state=42,
            n_jobs=-1,
        )
        xgb_params = dict(
            n_estimators=int(best[4]),
            max_depth=int(best[5]),
            learning_rate=best[6],
            subsample=best[7],
            colsample_bytree=best[8],
            random_state=42,
            n_jobs=-1,
            eval_metric='logloss',
        )

        self.rf_weight = best[9]
        self.xgb_weight = 1 - self.rf_weight

        print(f"Best RF parameters: {rf_params}")
        print(f"Best XGB parameters: {xgb_params}")
        print(f"Ensemble weights - RF: {self.rf_weight:.3f}, XGB: {self.xgb_weight:.3f}")

        # Build both final models, then fit them in turn
        self.rf_model = RandomForestClassifier(**rf_params)
        self.xgb_model = xgb.XGBClassifier(**xgb_params)
        for model in (self.rf_model, self.xgb_model):
            model.fit(X_train, y_train_encoded)

        print("Hybrid model training completed!")
    
    def predict_hybrid(self, X_test):
        """
        Make predictions using hybrid model
        """
        rf_pred_proba = self.rf_model.predict_proba(X_test)
        xgb_pred_proba = self.xgb_model.predict_proba(X_test)
        
        # Ensemble prediction
        ensemble_pred_proba = self.rf_weight * rf_pred_proba + self.xgb_weight * xgb_pred_proba
        ensemble_pred = np.argmax(ensemble_pred_proba, axis=1)
        
        # Decode labels back
        ensemble_pred_decoded = self.label_encoder.inverse_transform(ensemble_pred)
        
        return ensemble_pred_decoded
    
    def evaluate_model(self, X_test, y_test):
        """
        Score the hybrid model on a labelled set.

        Predictions come from ``predict_hybrid``. Accuracy plus weighted
        precision, recall and F1 (with ``zero_division=0``) are stored in
        ``self.results`` and returned.
        """
        print("Evaluating hybrid model...")

        y_pred = self.predict_hybrid(X_test)

        # Shared settings for the three per-class-averaged metrics
        weighted = {'average': 'weighted', 'zero_division': 0}

        self.results = {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, **weighted),
            'Recall': recall_score(y_test, y_pred, **weighted),
            'F1-Score': f1_score(y_test, y_pred, **weighted),
        }

        return self.results
    
    def run_algorithm(self, file_path, representation_type='2-class'):
        """
        Run the full pipeline for one class representation.

        Steps: load and clean the CSV, build the balanced class
        representation, split 80/20 into train/test and the train part again
        80/20 into optimisation-train/validation (all stratified, seed 42),
        scale the features, tune and train the hybrid model, evaluate on the
        test split and print the results.

        Returns:
            The metrics dict produced by ``evaluate_model``.
        """
        print(f"Starting CICIoT2023 ML Algorithm with {representation_type} representation...")

        # Steps 1-2: cleaned data, then the balanced representation
        # NOTE(review): balancing (which may include SMOTE) runs before the
        # split below, so resampled rows can land in the test split - confirm
        # this is intended.
        raw_frame = self.load_and_preprocess_data(file_path)
        df_balanced = self.create_class_representation(raw_frame, representation_type)

        # Step 3: the last column is the target, everything before it a feature
        features = df_balanced.iloc[:, :-1].values
        labels = df_balanced.iloc[:, -1].values

        # Step 4: hold out the test split, then carve a validation split for GOA
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.2, random_state=42, stratify=labels
        )
        X_train_opt, X_val, y_train_opt, y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
        )

        for split_name, split in (
            ("Training", X_train_opt),
            ("Validation", X_val),
            ("Test", X_test),
        ):
            print(f"{split_name} set size: {split.shape[0]}")

        # Step 5: the scaler is fitted on the optimisation-train split only
        X_train_scaled, X_test_scaled = self.normalize_features(X_train_opt, X_test)
        X_val_scaled = self.scaler.transform(X_val)

        # Step 6: GOA tuning and final training
        self.train_hybrid_model(X_train_scaled, y_train_opt, X_val_scaled, y_val)

        # Step 7: evaluation and report
        results = self.evaluate_model(X_test_scaled, y_test)
        self.display_results(representation_type)

        return results
    
    def display_results(self, representation_type):
        """
        Display evaluation results
        """
        print("\n" + "="*80)
        print(f"EVALUATION RESULTS - {representation_type.upper()} REPRESENTATION")
        print("="*80)
        
        print(f"Hybrid Random Forest + XGBoost Model (GOA Optimized):")
        print(f"  Accuracy:  {self.results['Accuracy']:.4f}")
        print(f"  Precision: {self.results['Precision']:.4f}")
        print(f"  Recall:    {self.results['Recall']:.4f}")
        print(f"  F1-Score:  {self.results['F1-Score']:.4f}")
        
        print(f"\nOptimized Hyperparameters:")
        print(f"  RF n_estimators: {int(self.best_params[0])}")
        print(f"  RF max_depth: {int(self.best_params[1]) if self.best_params[1] > 0 else None}")
        print(f"  XGB n_estimators: {int(self.best_params[4])}")
        print(f"  XGB learning_rate: {self.best_params[6]:.3f}")
        print(f"  Ensemble weights - RF: {self.rf_weight:.3f}, XGB: {self.xgb_weight:.3f}")

# Example usage
if __name__ == "__main__":
    # Initialize the algorithm
    ml_algorithm = CICIoT2023MLAlgorithm()

    # Usage banner
    banner = (
        "CICIoT2023 Dataset ML Algorithm with GOA and Hybrid RF-XGB",
        "This implementation includes:",
        "- Gazelle Optimization Algorithm (GOA) for hyperparameter optimization",
        "- Hybrid Random Forest + XGBoost ensemble model",
        "- Three class representations: 2-class, 8-class, and 34-class",
        "\nTo use this algorithm:",
        "1. Ensure you have the CICIoT2023 dataset in CSV format",
        "2. Install required packages: pip install pandas scikit-learn imbalanced-learn xgboost",
        "3. Run for different representations:",
        "   - 2-class: ml_algorithm.run_algorithm('path_to_dataset.csv', '2-class')",
        "   - 8-class: ml_algorithm.run_algorithm('path_to_dataset.csv', '8-class')",
        "   - 34-class: ml_algorithm.run_algorithm('path_to_dataset.csv', '34-class')",
    )
    for banner_line in banner:
        print(banner_line)

    # Dataset shard used for all three runs; adjust the path for your machine.
    dataset_path = r'C:\ProgramData\anaconda3\Lib\site-packages\pandas\io\parsers\capstone\part-00112-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv'

    results_2class = ml_algorithm.run_algorithm(dataset_path, '2-class')
    results_8class = ml_algorithm.run_algorithm(dataset_path, '8-class')
    results_34class = ml_algorithm.run_algorithm(dataset_path, '34-class')

CICIoT2023 Dataset ML Algorithm with GOA and Hybrid RF-XGB
This implementation includes:
- Gazelle Optimization Algorithm (GOA) for hyperparameter optimization
- Hybrid Random Forest + XGBoost ensemble model
- Three class representations: 2-class, 8-class, and 34-class

To use this algorithm:
1. Ensure you have the CICIoT2023 dataset in CSV format
2. Install required packages: pip install pandas scikit-learn imbalanced-learn xgboost
3. Run for different representations:
   - 2-class: ml_algorithm.run_algorithm('path_to_dataset.csv', '2-class')
   - 8-class: ml_algorithm.run_algorithm('path_to_dataset.csv', '8-class')
   - 34-class: ml_algorithm.run_algorithm('path_to_dataset.csv', '34-class')
Starting CICIoT2023 ML Algorithm with 2-class representation...
Loading dataset...
Dataset loaded successfully. Shape: (253575, 47)
Columns: ['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

class GazelleOptimizationAlgorithm:
    """
    Implementation of Gazelle Optimization Algorithm (GOA) for hyperparameter optimization
    """
    
    def __init__(self, population_size=30, max_iterations=50, dim=10):
        self.population_size = population_size
        self.max_iterations = max_iterations
        self.dim = dim
        self.best_position = None
        self.best_fitness = float('inf')
        
    def initialize_population(self, bounds):
        """Initialize gazelle population within bounds"""
        population = []
        for _ in range(self.population_size):
            gazelle = []
            for i in range(self.dim):
                lower, upper = bounds[i]
                gazelle.append(np.random.uniform(lower, upper))
            population.append(gazelle)
        return np.array(population)
    
    def fitness_function(self, position, X_train, y_train, X_val, y_val):
        """
        Fitness function for hyperparameter optimization
        Returns validation accuracy of hybrid model
        """
        try:
            # Decode hyperparameters from position
            rf_n_estimators = int(position[0])
            rf_max_depth = int(position[1]) if position[1] > 0 else None
            rf_min_samples_split = int(position[2])
            rf_min_samples_leaf = int(position[3])
            
            xgb_n_estimators = int(position[4])
            xgb_max_depth = int(position[5])
            xgb_learning_rate = position[6]
            xgb_subsample = position[7]
            xgb_colsample_bytree = position[8]
            
            # Weight for ensemble
            rf_weight = position[9]
            xgb_weight = 1 - rf_weight
            
            # Train Random Forest
            rf_model = RandomForestClassifier(
                n_estimators=rf_n_estimators,
                max_depth=rf_max_depth,
                min_samples_split=rf_min_samples_split,
                min_samples_leaf=rf_min_samples_leaf,
                random_state=42,
                n_jobs=-1
            )
            rf_model.fit(X_train, y_train)
            rf_pred_proba = rf_model.predict_proba(X_val)
            
            # Train XGBoost
            xgb_model = xgb.XGBClassifier(
                n_estimators=xgb_n_estimators,
                max_depth=xgb_max_depth,
                learning_rate=xgb_learning_rate,
                subsample=xgb_subsample,
                colsample_bytree=xgb_colsample_bytree,
                random_state=42,
                n_jobs=-1,
                eval_metric='logloss'
            )
            xgb_model.fit(X_train, y_train)
            xgb_pred_proba = xgb_model.predict_proba(X_val)
            
            # Ensemble prediction
            ensemble_pred_proba = rf_weight * rf_pred_proba + xgb_weight * xgb_pred_proba
            ensemble_pred = np.argmax(ensemble_pred_proba, axis=1)
            
            # Calculate accuracy
            accuracy = accuracy_score(y_val, ensemble_pred)
            
            # Return negative accuracy for minimization
            return 1 - accuracy
            
        except Exception as e:
            print(f"Error in fitness function: {e}")
            return float('inf')
    
    def optimize(self, X_train, y_train, X_val, y_val, bounds):
        """
        Main optimization loop for GOA
        """
        print("Starting Gazelle Optimization Algorithm...")
        
        # Initialize population
        population = self.initialize_population(bounds)
        fitness_values = []
        
        # Evaluate initial population
        for i, gazelle in enumerate(population):
            fitness = self.fitness_function(gazelle, X_train, y_train, X_val, y_val)
            fitness_values.append(fitness)
            
            if fitness < self.best_fitness:
                self.best_fitness = fitness
                self.best_position = gazelle.copy()
        
        fitness_values = np.array(fitness_values)
        
        # Main optimization loop
        for iteration in range(self.max_iterations):
            print(f"GOA Iteration {iteration + 1}/{self.max_iterations}, Best Fitness: {self.best_fitness:.4f}")
            
            # Update positions based on GOA algorithm
            for i in range(self.population_size):
                # Select random gazelles
                r1, r2 = np.random.choice(self.population_size, 2, replace=False)
                while r1 == i or r2 == i:
                    r1, r2 = np.random.choice(self.population_size, 2, replace=False)
                
                # Calculate step size
                step_size = 2 * np.random.random() - 1
                
                # Update position
                for j in range(self.dim):
                    # Gazelle movement equation
                    if np.random.random() < 0.5:
                        # Exploration
                        population[i][j] = population[i][j] + step_size * (population[r1][j] - population[r2][j])
                    else:
                        # Exploitation
                        population[i][j] = population[i][j] + step_size * (self.best_position[j] - population[i][j])
                    
                    # Boundary handling
                    lower, upper = bounds[j]
                    population[i][j] = np.clip(population[i][j], lower, upper)
                
                # Evaluate new position
                new_fitness = self.fitness_function(population[i], X_train, y_train, X_val, y_val)
                
                # Update if better
                if new_fitness < fitness_values[i]:
                    fitness_values[i] = new_fitness
                    
                    if new_fitness < self.best_fitness:
                        self.best_fitness = new_fitness
                        self.best_position = population[i].copy()
        
        print(f"GOA optimization completed. Best fitness: {self.best_fitness:.4f}")
        return self.best_position

class CICIoT2023MLAlgorithm:
    """
    Implementation of ML algorithms for CICIoT2023 dataset with GOA and hybrid RF-XGB
    """
    
    def __init__(self):
        """
        Create an untrained pipeline.

        Every attribute that later stages populate is declared here, so an
        instance has the same set of attributes before and after training.
        """
        # Fitted on the training split in normalize_features
        self.scaler = StandardScaler()
        # Fitted on the training labels in train_hybrid_model
        self.label_encoder = LabelEncoder()
        # Final fitted models, set by train_hybrid_model
        self.rf_model = None
        self.xgb_model = None
        # Best hyperparameter vector returned by the optimizer
        self.best_params = None
        # Blend weights of the two models, set by train_hybrid_model
        self.rf_weight = None
        self.xgb_weight = None
        # Metrics of the most recent evaluate_model call
        self.results = {}
        
    def load_and_preprocess_data(self, file_path):
        """
        Load and preprocess the CICIoT2023 dataset
        """
        print("Loading dataset...")
        try:
            df = pd.read_csv(file_path)
            print(f"Dataset loaded successfully. Shape: {df.shape}")
            
            # Display basic info about the dataset
            print(f"Columns: {list(df.columns)}")
            
            # Check for target column (assume it's the last column or named 'label')
            if 'label' in df.columns:
                target_col = 'label'
            else:
                target_col = df.columns[-1]
                print(f"Assuming '{target_col}' is the target column")
            
            # Move target column to the end if it's not already there
            if target_col != df.columns[-1]:
                cols = [col for col in df.columns if col != target_col] + [target_col]
                df = df[cols]
            
            print(f"Target column: {target_col}")
            print(f"Target distribution: {df[target_col].value_counts()}")
            
        except Exception as e:
            print(f"Error loading dataset: {e}")
            raise e
        
        # Handle missing values and outliers
        df = self.handle_missing_values(df)
        df = self.handle_outliers(df)
        
        return df
    
    def handle_missing_values(self, df):
        """
        Handle missing value with improved imputation strategies
        """
        print("Handling missing values and duplicates...")
        
        # Remove exact duplicates first
        initial_shape = df.shape
        df = df.drop_duplicates()
        print(f"Removed {initial_shape[0] - df.shape[0]} duplicate rows")
        
        # Handle missing values
        target_col = df.columns[-1]
        
        for column in df.columns:
            if df[column].isnull().sum() > 0:
                if column == target_col:
                    # For target column, drop rows with missing values
                    df = df.dropna(subset=[column])
                elif df[column].dtype in ['float64', 'int64']:
                    # For numerical columns, use median imputation
                    median_val = df[column].median()
                    df[column].fillna(median_val, inplace=True)
                else:
                    # For categorical columns, use mode imputation
                    mode_val = df[column].mode()
                    if len(mode_val) > 0:
                        df[column].fillna(mode_val[0], inplace=True)
                    else:
                        df[column].fillna('Unknown', inplace=True)
        
        print(f"Final dataset shape after cleaning: {df.shape}")
        return df
    
    def handle_outliers(self, df):
        """
        Handle outliers using IQR method
        """
        print("Handling outliers...")
        
        target_col = df.columns[-1]
        numerical_columns = df.select_dtypes(include=[np.number]).columns
        numerical_columns = [col for col in numerical_columns if col != target_col]
        
        for column in numerical_columns:
            Q1 = df[column].quantile(0.25)
            Q3 = df[column].quantile(0.75)
            IQR = Q3 - Q1
            
            if IQR > 0:
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                
                # Cap outliers instead of removing them
                df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
                df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
        
        return df
    
    def create_class_representation(self, df, representation_type='2-class'):
        """
        Create different class representations as specified
        """
        print(f"Creating {representation_type} representation...")
        
        target_col = df.columns[-1]
        
        # First, let's examine the actual class labels in the dataset
        unique_classes = df[target_col].unique()
        print(f"Unique classes in dataset: {unique_classes}")
        print(f"Class counts: {df[target_col].value_counts()}")
        
        if representation_type == '2-class':
            # 2-Class: Normal vs Attack (all malicious classes as 'Attack')
            df_copy = df.copy()
            
            # Identify normal/benign traffic - check for common variations
            normal_labels = ['Normal', 'BENIGN', 'Benign', 'normal', 'benign']
            normal_class = None
            
            for label in normal_labels:
                if label in unique_classes:
                    normal_class = label
                    break
            
            if normal_class is None:
                # If no clear normal class, treat the most frequent class as normal
                # or use a heuristic based on class names
                class_counts = df[target_col].value_counts()
                potential_normal = class_counts.index[0]  # Most frequent class
                
                # Check if any class name suggests it's normal/benign
                for class_name in class_counts.index:
                    if any(keyword in str(class_name).lower() for keyword in ['normal', 'benign', 'legitimate']):
                        potential_normal = class_name
                        break
                
                normal_class = potential_normal
                print(f"No explicit 'Normal' class found. Using '{normal_class}' as normal class.")
            
            # Create binary classification
            df_copy[target_col] = df_copy[target_col].apply(
                lambda x: 'Normal' if x == normal_class else 'Attack'
            )
            
            # Random sampling to create balanced dataset with 8450 samples per class
            normal_samples = df_copy[df_copy[target_col] == 'Normal']
            attack_samples = df_copy[df_copy[target_col] == 'Attack']
            
            print(f"Normal samples: {len(normal_samples)}, Attack samples: {len(attack_samples)}")
            
            # Check if we have enough samples
            if len(normal_samples) == 0:
                raise ValueError("No normal/benign samples found in the dataset!")
            if len(attack_samples) == 0:
                raise ValueError("No attack samples found in the dataset!")
            
            # Sample 8450 from each class
            target_samples = min(8450, len(normal_samples), len(attack_samples))
            if target_samples < 8450:
                print(f"Warning: Insufficient samples. Using {target_samples} samples per class instead of 8450")
            
            if len(normal_samples) >= target_samples:
                normal_samples = normal_samples.sample(n=target_samples, random_state=42)
            else:
                normal_samples = resample(normal_samples, n_samples=target_samples, random_state=42, replace=True)
            
            if len(attack_samples) >= target_samples:
                attack_samples = attack_samples.sample(n=target_samples, random_state=42)
            else:
                attack_samples = resample(attack_samples, n_samples=target_samples, random_state=42, replace=True)
            
            df_balanced = pd.concat([normal_samples, attack_samples], ignore_index=True)
            
        elif representation_type == '8-class':
            # 8-Class: Group attacks into 8 categories
            df_copy = df.copy()
            
            # Create attack mapping based on actual classes in the dataset
            attack_mapping = {}
            
            # Find normal class
            normal_labels = ['Normal', 'BENIGN', 'Benign', 'normal', 'benign']
            normal_class = None
            for label in normal_labels:
                if label in unique_classes:
                    normal_class = label
                    break
            
            if normal_class:
                attack_mapping['Normal'] = [normal_class]
            
            # Group other classes based on keywords
            remaining_classes = [c for c in unique_classes if c != normal_class]
            
            # Initialize attack categories
            attack_categories = {
                'DDoS': [],
                'DoS': [],
                'Recon': [],
                'Web': [],
                'BruteForce': [],
                'Spoofing': [],
                'Mirai': []
            }
            
            # Classify attacks based on class names
            for class_name in remaining_classes:
                class_name_lower = str(class_name).lower()
                
                if 'ddos' in class_name_lower:
                    attack_categories['DDoS'].append(class_name)
                elif 'dos' in class_name_lower:
                    attack_categories['DoS'].append(class_name)
                elif any(keyword in class_name_lower for keyword in ['recon', 'scan', 'ping', 'port']):
                    attack_categories['Recon'].append(class_name)
                elif any(keyword in class_name_lower for keyword in ['web', 'sql', 'xss', 'injection']):
                    attack_categories['Web'].append(class_name)
                elif 'brute' in class_name_lower or 'force' in class_name_lower:
                    attack_categories['BruteForce'].append(class_name)
                elif any(keyword in class_name_lower for keyword in ['spoof', 'mitm', 'arp']):
                    attack_categories['Spoofing'].append(class_name)
                elif 'mirai' in class_name_lower:
                    attack_categories['Mirai'].append(class_name)
                else:
                    # Put remaining attacks in the most appropriate category or create "Other"
                    attack_categories['DDoS'].append(class_name)  # Default to DDoS
            
            # Apply mapping
            for new_class, old_classes in attack_categories.items():
                if old_classes:  # Only apply if there are classes to map
                    df_copy[target_col] = df_copy[target_col].replace(old_classes, new_class)
            
            # Add normal class mapping
            if normal_class:
                df_copy[target_col] = df_copy[target_col].replace(normal_class, 'Normal')
            
            # Remove empty categories
            remaining_classes = df_copy[target_col].unique()
            target_samples = min(33800, len(df_copy) // len(remaining_classes))
            
            print(f"8-class categories: {remaining_classes}")
            print(f"Target samples per class: {target_samples}")
            
            # Balance dataset using SMOTE and undersampling
            df_balanced = self.balance_multiclass_dataset(df_copy, target_samples=target_samples, use_smote=True)
            
        elif representation_type == '34-class':
            # 34-Class: Keep all original classes
            df_copy = df.copy()
            
            # Calculate target samples based on dataset size and number of classes
            total_classes = len(unique_classes)
            target_samples = min(84500, len(df_copy) // total_classes)
            
            if target_samples < 100:
                target_samples = min(1000, len(df_copy) // total_classes)
            
            print(f"34-class: {total_classes} classes, target samples per class: {target_samples}")
            
            # Balance dataset using SMOTE and undersampling
            df_balanced = self.balance_multiclass_dataset(df_copy, target_samples=target_samples, use_smote=True)
        
        print(f"Final {representation_type} dataset shape: {df_balanced.shape}")
        print(f"Class distribution:")
        print(df_balanced[target_col].value_counts())
        
        return df_balanced
    
    def balance_multiclass_dataset(self, df, target_samples, use_smote=True):
        """
        Balance multiclass dataset using undersampling and SMOTE
        """
        target_col = df.columns[-1]
        
        # Check current class distribution
        class_counts = df[target_col].value_counts()
        print(f"Original class distribution: {class_counts}")
        
        # Ensure target_samples is reasonable
        min_class_size = class_counts.min()
        max_class_size = class_counts.max()
        
        if target_samples > max_class_size:
            target_samples = max_class_size
            print(f"Adjusted target samples to {target_samples} (max available)")
        
        # Separate features and target
        X = df.drop(target_col, axis=1)
        y = df[target_col]
        
        # Apply undersampling first for majority classes
        df_balanced_list = []
        for class_label in y.unique():
            class_data = df[df[target_col] == class_label]
            
            if len(class_data) > target_samples:
                # Undersample majority class
                class_data = class_data.sample(n=target_samples, random_state=42)
            elif len(class_data) < target_samples and use_smote:
                # Keep minority classes as-is for now, SMOTE will handle them
                pass
            
            df_balanced_list.append(class_data)
        
        df_undersampled = pd.concat(df_balanced_list, ignore_index=True)
        
        if use_smote:
            # Apply SMOTE for minority classes
            X_under = df_undersampled.drop(target_col, axis=1)
            y_under = df_undersampled[target_col]
            
            # Encode labels for SMOTE
            le = LabelEncoder()
            y_encoded = le.fit_transform(y_under)
            
            try:
                # Create sampling strategy
                unique_classes, class_counts = np.unique(y_encoded, return_counts=True)
                sampling_strategy = {}
                
                for class_idx, count in zip(unique_classes, class_counts):
                    if count < target_samples:
                        # Only upsample if we have enough neighbors
                        min_neighbors = min(5, count - 1) if count > 1 else 1
                        if count >= min_neighbors:
                            sampling_strategy[class_idx] = min(target_samples, count * 2)
                
                if sampling_strategy:
                    # Adjust k_neighbors based on smallest class size
                    min_samples = min([np.sum(y_encoded == cls) for cls in sampling_strategy.keys()])
                    k_neighbors = min(3, max(1, min_samples - 1))
                    
                    smote = SMOTE(random_state=42, k_neighbors=k_neighbors, sampling_strategy=sampling_strategy)
                    X_smote, y_smote = smote.fit_resample(X_under, y_encoded)
                    
                    # Decode labels back
                    y_smote_decoded = le.inverse_transform(y_smote)
                    
                    # Combine back to dataframe
                    df_balanced = pd.concat([
                        pd.DataFrame(X_smote, columns=X_under.columns),
                        pd.Series(y_smote_decoded, name=target_col)
                    ], axis=1)
                else:
                    df_balanced = df_undersampled
                    
            except Exception as e:
                print(f"SMOTE failed: {e}, using undersampled data")
                df_balanced = df_undersampled
        else:
            df_balanced = df_undersampled
        
        return df_balanced
    
    def normalize_features(self, X_train, X_test):
        """
        Normalize features using StandardScaler
        """
        print("Normalizing features...")
        
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)
        
        return X_train_scaled, X_test_scaled
    
    def train_hybrid_model(self, X_train, y_train, X_val, y_val):
        """
        Tune and fit the Random Forest + XGBoost blend.

        Labels are integer-encoded with self.label_encoder (fitted on
        y_train), the Gazelle optimizer searches a 10-dimensional
        hyperparameter box and is given the validation split for scoring, and
        both final models are then fitted on the training split with the best
        vector found.

        Sets self.best_params, self.rf_weight, self.xgb_weight, self.rf_model
        and self.xgb_model; returns nothing.
        """
        print("Training hybrid Random Forest + XGBoost model with GOA optimization...")

        # Integer class ids for both splits; the encoder learns from y_train only
        train_ids = self.label_encoder.fit_transform(y_train)
        val_ids = self.label_encoder.transform(y_val)

        # Search box: one (low, high) pair per optimizer dimension
        search_box = [
            (50, 200),    # 0: RF n_estimators
            (3, 20),      # 1: RF max_depth
            (2, 10),      # 2: RF min_samples_split
            (1, 5),       # 3: RF min_samples_leaf
            (50, 200),    # 4: XGB n_estimators
            (3, 10),      # 5: XGB max_depth
            (0.01, 0.3),  # 6: XGB learning_rate
            (0.6, 1.0),   # 7: XGB subsample
            (0.6, 1.0),   # 8: XGB colsample_bytree
            (0.3, 0.7)    # 9: RF weight in ensemble
        ]

        optimizer = GazelleOptimizationAlgorithm(population_size=20, max_iterations=30, dim=10)
        best = optimizer.optimize(X_train, train_ids, X_val, val_ids, search_box)
        self.best_params = best

        # Decode the winning vector; integer-valued settings are truncated
        rf_params = dict(
            n_estimators=int(best[0]),
            max_depth=int(best[1]) if best[1] > 0 else None,
            min_samples_split=int(best[2]),
            min_samples_leaf=int(best[3]),
            random_state=42,
            n_jobs=-1,
        )
        xgb_params = dict(
            n_estimators=int(best[4]),
            max_depth=int(best[5]),
            learning_rate=best[6],
            subsample=best[7],
            colsample_bytree=best[8],
            random_state=42,
            n_jobs=-1,
            eval_metric='logloss',
        )

        self.rf_weight = best[9]
        self.xgb_weight = 1 - self.rf_weight

        print(f"Best RF parameters: {rf_params}")
        print(f"Best XGB parameters: {xgb_params}")
        print(f"Ensemble weights - RF: {self.rf_weight:.3f}, XGB: {self.xgb_weight:.3f}")

        # Build both final models, then fit them on the training split
        self.rf_model = RandomForestClassifier(**rf_params)
        self.xgb_model = xgb.XGBClassifier(**xgb_params)
        for model in (self.rf_model, self.xgb_model):
            model.fit(X_train, train_ids)

        print("Hybrid model training completed!")
    
    def predict_hybrid(self, X_test):
        """
        Make predictions using hybrid model
        """
        rf_pred_proba = self.rf_model.predict_proba(X_test)
        xgb_pred_proba = self.xgb_model.predict_proba(X_test)
        
        # Ensemble prediction
        ensemble_pred_proba = self.rf_weight * rf_pred_proba + self.xgb_weight * xgb_pred_proba
        ensemble_pred = np.argmax(ensemble_pred_proba, axis=1)
        
        # Decode labels back
        ensemble_pred_decoded = self.label_encoder.inverse_transform(ensemble_pred)
        
        return ensemble_pred_decoded
    
    def evaluate_model(self, X_test, y_test):
        """
        Score the hybrid model on a held-out split.

        Predictions come from predict_hybrid. Precision, recall and F1 are
        computed with average='weighted' and zero_division=0. The metrics are
        stored in self.results and returned.
        """
        print("Evaluating hybrid model...")

        predicted = self.predict_hybrid(X_test)

        # Shared settings of the three averaged metrics
        averaging = {'average': 'weighted', 'zero_division': 0}

        self.results = {
            'Accuracy': accuracy_score(y_test, predicted),
            'Precision': precision_score(y_test, predicted, **averaging),
            'Recall': recall_score(y_test, predicted, **averaging),
            'F1-Score': f1_score(y_test, predicted, **averaging),
        }
        return self.results
    
    def run_algorithm(self, file_path, representation_type='2-class'):
        """
        Execute the full pipeline for one class representation.

        Stages: load and clean the CSV, build the requested class
        representation, split 80/20 into train/test (stratified), split the
        training part 80/20 again into optimisation-train/validation
        (stratified), scale with a scaler fitted on the optimisation-train
        part, tune and fit the hybrid model, evaluate on the test part and
        print the results.

        Returns the metrics dict produced by evaluate_model.
        """
        print(f"Starting CICIoT2023 ML Algorithm with {representation_type} representation...")

        # Steps 1-2: cleaned data -> requested class representation
        cleaned = self.load_and_preprocess_data(file_path)
        balanced = self.create_class_representation(cleaned, representation_type)

        # Step 3: the last column is the label, everything before it a feature
        features = balanced.iloc[:, :-1].values
        labels = balanced.iloc[:, -1].values

        # Step 4: hold out a test split, then carve a validation split (used
        # by GOA) out of the remaining training data
        split_kwargs = {'test_size': 0.2, 'random_state': 42}
        X_fit, X_test, y_fit, y_test = train_test_split(
            features, labels, stratify=labels, **split_kwargs
        )
        X_train_opt, X_val, y_train_opt, y_val = train_test_split(
            X_fit, y_fit, stratify=y_fit, **split_kwargs
        )

        print(f"Training set size: {X_train_opt.shape[0]}")
        print(f"Validation set size: {X_val.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")

        # Step 5: fit the scaler on the optimisation-train part, apply everywhere
        X_train_scaled, X_test_scaled = self.normalize_features(X_train_opt, X_test)
        X_val_scaled = self.scaler.transform(X_val)

        # Steps 6-7: tune + fit, then score on the untouched test split
        self.train_hybrid_model(X_train_scaled, y_train_opt, X_val_scaled, y_val)
        results = self.evaluate_model(X_test_scaled, y_test)

        self.display_results(representation_type)
        return results
    
    def display_results(self, representation_type):
        """
        Display evaluation results
        """
        print("\n" + "="*80)
        print(f"EVALUATION RESULTS - {representation_type.upper()} REPRESENTATION")
        print("="*80)
        
        print(f"Hybrid Random Forest + XGBoost Model (GOA Optimized):")
        print(f"  Accuracy:  {self.results['Accuracy']:.4f}")
        print(f"  Precision: {self.results['Precision']:.4f}")
        print(f"  Recall:    {self.results['Recall']:.4f}")
        print(f"  F1-Score:  {self.results['F1-Score']:.4f}")
        
        print(f"\nOptimized Hyperparameters:")
        print(f"  RF n_estimators: {int(self.best_params[0])}")
        print(f"  RF max_depth: {int(self.best_params[1]) if self.best_params[1] > 0 else None}")
        print(f"  XGB n_estimators: {int(self.best_params[4])}")
        print(f"  XGB learning_rate: {self.best_params[6]:.3f}")
        print(f"  Ensemble weights - RF: {self.rf_weight:.3f}, XGB: {self.xgb_weight:.3f}")

# Example usage
if __name__ == "__main__":
    # One pipeline object drives every run below
    ml_algorithm = CICIoT2023MLAlgorithm()

    # Banner and usage notes, printed one line at a time
    for banner_line in (
        "CICIoT2023 Dataset ML Algorithm with GOA and Hybrid RF-XGB",
        "This implementation includes:",
        "- Gazelle Optimization Algorithm (GOA) for hyperparameter optimization",
        "- Hybrid Random Forest + XGBoost ensemble model",
        "- Three class representations: 2-class, 8-class, and 34-class",
        "\nTo use this algorithm:",
        "1. Ensure you have the CICIoT2023 dataset in CSV format",
        "2. Install required packages: pip install pandas scikit-learn imbalanced-learn xgboost",
        "3. Run for different representations:",
        "   - 2-class: ml_algorithm.run_algorithm('path_to_dataset.csv', '2-class')",
        "   - 8-class: ml_algorithm.run_algorithm('path_to_dataset.csv', '8-class')",
        "   - 34-class: ml_algorithm.run_algorithm('path_to_dataset.csv', '34-class')",
    ):
        print(banner_line)

    # Dataset CSV used by the runs below; adjust to the local copy
    dataset_path = 'C:\\ProgramData\\anaconda3\\Lib\\site-packages\\pandas\\io\\parsers\\capstone\\part-00112-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv'

    # Only the 34-class run is enabled. The 2-class and 8-class runs are kept
    # here disabled and take the same dataset path:
    # results_2class = ml_algorithm.run_algorithm(dataset_path, '2-class')
    # results_8class = ml_algorithm.run_algorithm(dataset_path, '8-class')
    results_34class = ml_algorithm.run_algorithm(dataset_path, '34-class')

CICIoT2023 Dataset ML Algorithm with GOA and Hybrid RF-XGB
This implementation includes:
- Gazelle Optimization Algorithm (GOA) for hyperparameter optimization
- Hybrid Random Forest + XGBoost ensemble model
- Three class representations: 2-class, 8-class, and 34-class

To use this algorithm:
1. Ensure you have the CICIoT2023 dataset in CSV format
2. Install required packages: pip install pandas scikit-learn imbalanced-learn xgboost
3. Run for different representations:
   - 2-class: ml_algorithm.run_algorithm('path_to_dataset.csv', '2-class')
   - 8-class: ml_algorithm.run_algorithm('path_to_dataset.csv', '8-class')
   - 34-class: ml_algorithm.run_algorithm('path_to_dataset.csv', '34-class')
Starting CICIoT2023 ML Algorithm with 34-class representation...
Loading dataset...
Dataset loaded successfully. Shape: (253575, 47)
Columns: ['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag

In [None]:
import math
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import ADASYN, SMOTE, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
from sklearn.utils import resample

warnings.filterwarnings('ignore')

class AdvancedGazelleOptimizationAlgorithm:
    """
    Enhanced Gazelle Optimization Algorithm with adaptive mechanisms.

    Population-based minimiser. Every gazelle is a vector of ``dim`` real
    numbers which ``fitness_function`` decodes into hyperparameters for a
    Random Forest + XGBoost + LightGBM soft-voting ensemble.

    Attributes:
        population_size: Number of gazelles in the population.
        max_iterations: Number of optimisation sweeps over the population.
        dim: Length of each position vector.
        best_position: Best position found so far (None until ``optimize`` runs).
        best_fitness: Fitness of ``best_position``; lower is better.
        fitness_history: ``best_fitness`` recorded after every iteration.
        diversity_threshold: Mean per-dimension standard deviation below
            which ``maintain_diversity`` re-seeds part of the population.
    """

    def __init__(self, population_size=40, max_iterations=100, dim=12):
        self.population_size = population_size
        self.max_iterations = max_iterations
        self.dim = dim
        self.best_position = None
        self.best_fitness = float('inf')
        self.fitness_history = []
        self.diversity_threshold = 0.1

    def initialize_population(self, bounds):
        """
        Initialize gazelle population with better diversity.

        Each coordinate is drawn, with equal probability, either uniformly
        from its bounds or from a normal distribution centred on the middle
        of the range (standard deviation = range / 6) and clipped to the bounds.

        Args:
            bounds: Sequence of ``(lower, upper)`` pairs, one per dimension.

        Returns:
            Float array of shape ``(population_size, dim)``.
        """
        population = np.empty((self.population_size, self.dim), dtype=float)
        for row in range(self.population_size):
            for col in range(self.dim):
                lower, upper = bounds[col]
                if np.random.random() < 0.5:
                    value = np.random.uniform(lower, upper)
                else:
                    # Bias towards middle values for some parameters
                    value = np.random.normal((lower + upper) / 2, (upper - lower) / 6)
                    value = np.clip(value, lower, upper)
                population[row, col] = value
        return population

    @staticmethod
    def _binary_scale_pos_weight(labels):
        """Return negatives/positives for a two-class label array, else 1."""
        classes, counts = np.unique(labels, return_counts=True)
        if len(classes) != 2:
            return 1
        # np.unique sorts ascending, so counts[0] belongs to the smaller
        # (negative) label and counts[1] to the larger (positive) one.
        return float(counts[0]) / float(counts[1])

    def fitness_function(self, position, X_train, y_train, X_val, y_val):
        """
        Enhanced fitness function with cross-validation.

        Decodes ``position`` as:
            [0..3]  RF n_estimators, max_depth, min_samples_split, min_samples_leaf
            [4..8]  XGB n_estimators, max_depth, learning_rate, subsample, colsample_bytree
            [9..10] LightGBM n_estimators, learning_rate
            [11]    ensemble weight parameter
        and scores the resulting ensemble with 3-fold stratified
        cross-validation on the training data.

        Args:
            position: Candidate hyperparameter vector (length >= 12).
            X_train: Training features, indexable by integer arrays.
            y_train: Integer-encoded training labels.
            X_val: Unused; kept so the signature matches the caller.
            y_val: Unused; kept so the signature matches the caller.

        Returns:
            ``1 - mean weighted F1`` over the folds (lower is better), or
            ``inf`` when decoding or training raises.
        """
        try:
            # Fold selection below uses integer-array indexing.
            X_train = np.asarray(X_train)
            y_train = np.asarray(y_train)

            # Decode hyperparameters
            rf_n_estimators = int(position[0])
            rf_max_depth = int(position[1]) if position[1] > 0 else None
            rf_min_samples_split = int(position[2])
            rf_min_samples_leaf = int(position[3])

            xgb_n_estimators = int(position[4])
            xgb_max_depth = int(position[5])
            xgb_learning_rate = position[6]
            xgb_subsample = position[7]
            xgb_colsample_bytree = position[8]

            lgb_n_estimators = int(position[9])
            lgb_learning_rate = position[10]

            # NOTE(review): one value drives both the RF and the XGB weight, and
            # for position[11] > 1.5 the LightGBM weight becomes negative. Left
            # unchanged because the final ensemble in
            # EnhancedCICIoT2023MLAlgorithm derives its weights the same way.
            rf_weight = position[11] / 3
            xgb_weight = position[11] / 3
            lgb_weight = 1 - rf_weight - xgb_weight

            # Use 3-fold cross-validation for more robust evaluation
            skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
            cv_scores = []

            for train_idx, val_idx in skf.split(X_train, y_train):
                X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
                y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

                rf_model = RandomForestClassifier(
                    n_estimators=rf_n_estimators,
                    max_depth=rf_max_depth,
                    min_samples_split=rf_min_samples_split,
                    min_samples_leaf=rf_min_samples_leaf,
                    random_state=42,
                    n_jobs=-1,
                    class_weight='balanced'
                )
                rf_model.fit(X_fold_train, y_fold_train)
                rf_pred_proba = rf_model.predict_proba(X_fold_val)

                xgb_model = xgb.XGBClassifier(
                    n_estimators=xgb_n_estimators,
                    max_depth=xgb_max_depth,
                    learning_rate=xgb_learning_rate,
                    subsample=xgb_subsample,
                    colsample_bytree=xgb_colsample_bytree,
                    random_state=42,
                    n_jobs=-1,
                    eval_metric='logloss',
                    # negatives / positives; previously depended on whichever
                    # label happened to be first in the fold.
                    scale_pos_weight=self._binary_scale_pos_weight(y_fold_train)
                )
                xgb_model.fit(X_fold_train, y_fold_train)
                xgb_pred_proba = xgb_model.predict_proba(X_fold_val)

                lgb_model = lgb.LGBMClassifier(
                    n_estimators=lgb_n_estimators,
                    learning_rate=lgb_learning_rate,
                    random_state=42,
                    n_jobs=-1,
                    class_weight='balanced',
                    verbosity=-1
                )
                lgb_model.fit(X_fold_train, y_fold_train)
                lgb_pred_proba = lgb_model.predict_proba(X_fold_val)

                # Weighted soft vote; argmax yields a column index, which is
                # compared against the integer-encoded labels.
                ensemble_pred_proba = (rf_weight * rf_pred_proba
                                       + xgb_weight * xgb_pred_proba
                                       + lgb_weight * lgb_pred_proba)
                ensemble_pred = np.argmax(ensemble_pred_proba, axis=1)

                # F1-score (more robust than accuracy for imbalanced data)
                cv_scores.append(f1_score(y_fold_val, ensemble_pred, average='weighted'))

            # Turn the score into a loss so the optimiser can minimise it
            return 1 - np.mean(cv_scores)

        except Exception as e:
            # Best effort: an invalid candidate is simply the worst possible one.
            print(f"Error in fitness function: {e}")
            return float('inf')

    def optimize(self, X_train, y_train, X_val, y_val, bounds):
        """
        Enhanced optimization with adaptive mechanisms.

        Each iteration builds a trial position for every gazelle and keeps it
        only if it improves that gazelle's fitness, so ``population`` and the
        per-gazelle fitness values always describe the same positions.

        Args:
            X_train, y_train, X_val, y_val: Forwarded unchanged to
                ``fitness_function``.
            bounds: Sequence of ``(lower, upper)`` pairs, one per dimension.

        Returns:
            The best position found (array of length ``dim``).

        Raises:
            ValueError: If iterations are requested with fewer than three
                gazelles (two distinct partners are needed per move).
        """
        print("Starting Advanced Gazelle Optimization Algorithm...")

        if self.max_iterations > 0 and self.population_size < 3:
            raise ValueError("population_size must be at least 3 to run optimisation iterations")

        population = self.initialize_population(bounds)

        # Evaluate initial population
        fitness_values = np.array(
            [self.fitness_function(gazelle, X_train, y_train, X_val, y_val)
             for gazelle in population],
            dtype=float
        )
        for gazelle, fitness in zip(population, fitness_values):
            if fitness < self.best_fitness:
                self.best_fitness = fitness
                self.best_position = gazelle.copy()

        if self.best_position is None and len(population) > 0:
            # No candidate beat the initial best_fitness (e.g. every evaluation
            # returned inf). The exploitation step needs a reference point, so
            # fall back to the lowest-scoring individual.
            self.best_position = population[int(np.argmin(fitness_values))].copy()

        # Main optimization loop
        for iteration in range(self.max_iterations):
            print(f"GOA Iteration {iteration + 1}/{self.max_iterations}, Best Fitness: {self.best_fitness:.4f}")

            # Step sizes shrink linearly from 1 towards 0 over the run
            exploration_factor = 1 - (iteration / self.max_iterations)

            for i in range(self.population_size):
                # Two distinct partners, neither of them gazelle i
                candidates = np.delete(np.arange(self.population_size), i)
                r1, r2 = np.random.choice(candidates, 2, replace=False)

                step_size = (2 * np.random.random() - 1) * exploration_factor

                # Work on a copy so a rejected move leaves the gazelle untouched
                trial = population[i].copy()
                for j in range(self.dim):
                    if np.random.random() < 0.5:
                        # Exploration: move along the difference of two partners
                        trial[j] = trial[j] + step_size * (population[r1][j] - population[r2][j])
                    else:
                        # Exploitation: move relative to the best position, plus a Levy step
                        levy_step = self.levy_flight()
                        trial[j] = trial[j] + step_size * (self.best_position[j] - trial[j]) + 0.01 * levy_step

                    # Boundary handling
                    lower, upper = bounds[j]
                    trial[j] = np.clip(trial[j], lower, upper)

                new_fitness = self.fitness_function(trial, X_train, y_train, X_val, y_val)

                # Greedy selection: accept the move only if it is better
                if new_fitness < fitness_values[i]:
                    population[i] = trial
                    fitness_values[i] = new_fitness

                    if new_fitness < self.best_fitness:
                        self.best_fitness = new_fitness
                        self.best_position = trial.copy()

            self.fitness_history.append(self.best_fitness)

            # Diversity maintenance
            if iteration % 10 == 0:
                reinitialised = self.maintain_diversity(population, bounds)
                if reinitialised is not None and len(reinitialised) > 0:
                    # Re-seeded positions were never evaluated; mark them as
                    # unknown so their next trial move is always accepted.
                    fitness_values[reinitialised] = np.inf

        print(f"Advanced GOA optimization completed. Best fitness: {self.best_fitness:.4f}")
        return self.best_position

    def levy_flight(self, beta=1.5):
        """
        Generate Levy flight random walk.

        Draws one step as ``u / |v| ** (1 / beta)`` with ``u ~ N(0, sigma)``
        and ``v ~ N(0, 1)``.

        Args:
            beta: Exponent used in the sigma formula and the final ratio.

        Returns:
            A single random step (float).
        """
        # math.gamma replaces np.math.gamma: the np.math alias is not available
        # in current NumPy releases.
        sigma = (
            math.gamma(1 + beta) * math.sin(math.pi * beta / 2)
            / (math.gamma((1 + beta) / 2) * beta * (2 ** ((beta - 1) / 2)))
        ) ** (1 / beta)
        u = np.random.normal(0, sigma)
        v = np.random.normal(0, 1)
        return u / (abs(v) ** (1 / beta))

    def maintain_diversity(self, population, bounds):
        """
        Maintain population diversity.

        When the mean per-dimension standard deviation of ``population``
        drops below ``diversity_threshold``, a quarter of the individuals
        (chosen at random) are re-drawn uniformly within ``bounds``.
        ``population`` is modified in place.

        Args:
            population: Array of shape ``(population_size, dim)``.
            bounds: Sequence of ``(lower, upper)`` pairs, one per dimension.

        Returns:
            Integer array with the indices that were re-seeded (empty when
            nothing changed).
        """
        # NOTE(review): the spread is measured on raw values, so dimensions
        # with wide bounds dominate the average - confirm that is intended.
        avg_diversity = np.mean(np.std(population, axis=0))

        reinitialised = np.array([], dtype=int)
        if avg_diversity < self.diversity_threshold:
            num_reinit = self.population_size // 4
            reinitialised = np.random.choice(self.population_size, num_reinit, replace=False)

            for idx in reinitialised:
                for j in range(self.dim):
                    lower, upper = bounds[j]
                    population[idx][j] = np.random.uniform(lower, upper)

        return reinitialised

class EnhancedCICIoT2023MLAlgorithm:
    """
    Enhanced ML algorithms with improved data balancing and feature engineering.

    End-to-end pipeline: load a CSV, preprocess it, balance the classes for
    a 2-, 8- or 34-class representation, tune a Random Forest + XGBoost +
    LightGBM ensemble with the Gazelle optimiser, then evaluate it.

    Throughout the class the target is expected to be the LAST column of
    the working DataFrame.
    """

    # Substrings that mark a label as benign traffic
    NORMAL_KEYWORDS = ('normal', 'benign', 'legitimate', 'clean')

    # (group name, substrings) checked in this order after the benign test;
    # 'ddos' must come before 'dos' because every DDoS label also contains 'dos'.
    ATTACK_KEYWORDS = (
        ('DDoS', ('ddos',)),
        ('DoS', ('dos',)),
        ('Reconnaissance', ('recon', 'scan', 'probe', 'ping')),
        ('Web_Attack', ('web', 'sql', 'xss', 'injection')),
        ('Brute_Force', ('brute', 'force', 'dictionary')),
        ('Spoofing', ('spoof', 'mitm', 'arp')),
        ('Botnet', ('bot', 'mirai', 'trojan')),
    )

    def __init__(self):
        self.scaler = RobustScaler()  # More robust to outliers
        self.label_encoder = LabelEncoder()
        self.rf_model = None
        self.xgb_model = None
        self.lgb_model = None
        self.best_params = None
        self.results = {}
        self.feature_importance = {}
        # Ensemble weights; filled in by train_enhanced_hybrid_model
        self.rf_weight = None
        self.xgb_weight = None
        self.lgb_weight = None

    def load_and_preprocess_data(self, file_path):
        """
        Enhanced data loading and preprocessing.

        Reads the CSV, picks the target column ('label', then 'Label', else
        the last column), moves it to the end and runs
        ``advanced_preprocessing``.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            The preprocessed DataFrame with the target as last column.

        Raises:
            Exception: Whatever ``pd.read_csv`` raises, after printing it.
        """
        print("Loading dataset...")
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading dataset: {e}")
            raise
        print(f"Dataset loaded successfully. Shape: {df.shape}")

        # Identify target column
        if 'label' in df.columns:
            target_col = 'label'
        elif 'Label' in df.columns:
            target_col = 'Label'
        else:
            target_col = df.columns[-1]

        print(f"Target column: {target_col}")
        print(f"Target distribution: {df[target_col].value_counts()}")

        # Move target column to the end
        if target_col != df.columns[-1]:
            df = df[[col for col in df.columns if col != target_col] + [target_col]]

        return self.advanced_preprocessing(df)

    def advanced_preprocessing(self, df):
        """
        Advanced preprocessing with feature engineering.

        Steps, in order: drop exact duplicate rows; fill missing feature
        values (median for numeric columns, mode otherwise) and drop rows
        with a missing target; cap numeric features at 1.5 * IQR beyond the
        quartiles; add product and ratio features for up to five feature
        pairs whose absolute correlation lies strictly between 0.3 and 0.9.

        Args:
            df: DataFrame whose last column is the target.

        Returns:
            A new DataFrame; the target is still the last column.
        """
        print("Advanced preprocessing...")

        target_col = df.columns[-1]

        # Remove exact duplicates (copy so later column assignments are safe)
        initial_rows = df.shape[0]
        df = df.drop_duplicates().copy()
        print(f"Removed {initial_rows - df.shape[0]} duplicate rows")

        # Handle missing values. Assign the filled column back instead of
        # calling fillna(inplace=True) on df[column]: that form acts on a
        # temporary object and may leave df unchanged.
        for column in df.columns:
            if column == target_col or not df[column].isnull().any():
                continue
            if pd.api.types.is_numeric_dtype(df[column]):
                df[column] = df[column].fillna(df[column].median())
            else:
                mode_val = df[column].mode()
                if len(mode_val) > 0:
                    df[column] = df[column].fillna(mode_val.iloc[0])
        # Rows without a label cannot be used at all
        df = df.dropna(subset=[target_col]).copy()

        # Outlier handling using the IQR method
        numerical_columns = [
            col for col in df.select_dtypes(include=[np.number]).columns
            if col != target_col
        ]
        for column in numerical_columns:
            q1 = df[column].quantile(0.25)
            q3 = df[column].quantile(0.75)
            iqr = q3 - q1
            if iqr > 0:
                # Cap extreme outliers
                df[column] = df[column].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)

        # Feature engineering - create new features
        if len(numerical_columns) > 1:
            feature_corr = df[numerical_columns].corr().abs()
            names = list(feature_corr.columns)
            corr_values = feature_corr.to_numpy()

            # Moderately correlated pairs, in column order (perfectly or nearly
            # perfectly correlated pairs are skipped)
            high_corr_pairs = [
                (names[i], names[j])
                for i in range(len(names))
                for j in range(i + 1, len(names))
                if 0.3 < corr_values[i, j] < 0.9
            ]

            # Interaction features for the first 5 qualifying pairs
            for i, (col1, col2) in enumerate(high_corr_pairs[:5]):
                df[f'interaction_{i}'] = df[col1] * df[col2]
                df[f'ratio_{i}'] = df[col1] / (df[col2] + 1e-8)  # epsilon avoids division by zero

        # New columns were appended after the target; every later step reads
        # the target from the last column, so move it back to the end.
        if df.columns[-1] != target_col:
            df = df[[col for col in df.columns if col != target_col] + [target_col]]

        print(f"Final dataset shape after advanced preprocessing: {df.shape}")
        return df

    def enhanced_class_balancing(self, df, representation_type='2-class'):
        """
        Enhanced class balancing with proper handling of all classes.

        Args:
            df: DataFrame whose last column is the target.
            representation_type: '2-class' (Normal vs Attack, 20000 rows
                each), '8-class' (grouped via ``create_8_class_mapping``,
                15000 rows each) or '34-class' (original labels, 5000 rows each).

        Returns:
            The balanced DataFrame.

        Raises:
            ValueError: If ``representation_type`` is not one of the three
                supported values.
        """
        if representation_type not in ('2-class', '8-class', '34-class'):
            raise ValueError(
                f"Unknown representation_type {representation_type!r}; "
                "expected '2-class', '8-class' or '34-class'"
            )

        print(f"Enhanced class balancing for {representation_type}...")

        target_col = df.columns[-1]
        unique_classes = df[target_col].unique()
        df_copy = df.copy()

        if representation_type == '2-class':
            # First label that looks benign; otherwise the most frequent label
            normal_class = next(
                (name for name in unique_classes
                 if any(keyword in str(name).lower() for keyword in self.NORMAL_KEYWORDS)),
                None
            )
            if normal_class is None:
                normal_class = df[target_col].value_counts().index[0]

            # Create binary classification
            df_copy[target_col] = np.where(df_copy[target_col] == normal_class, 'Normal', 'Attack')

            df_balanced = self.balance_binary_classes(df_copy, 20000)

        elif representation_type == '8-class':
            class_mapping = self.create_8_class_mapping(unique_classes)

            # Relabel in one pass through a flat lookup so a group name can
            # never be re-mapped by a later replacement.
            old_to_new = {
                old_class: new_class
                for new_class, old_classes in class_mapping.items()
                for old_class in old_classes
            }
            df_copy[target_col] = df_copy[target_col].map(
                lambda label: old_to_new.get(label, label)
            )

            df_balanced = self.balance_multiclass_advanced(df_copy, 15000)

        else:
            # '34-class': keep all original classes, only balance them
            df_balanced = self.balance_multiclass_advanced(df_copy, 5000)

        print(f"Final balanced dataset shape: {df_balanced.shape}")
        print(f"Class distribution after balancing:")
        print(df_balanced[target_col].value_counts().sort_index())

        return df_balanced

    def create_8_class_mapping(self, unique_classes):
        """
        Create intelligent 8-class mapping.

        Assigns every label to the first group whose keyword occurs in the
        lower-cased label (benign keywords are tested first). Labels that
        match nothing go to 'DDoS' when it currently holds fewer labels
        than 'DoS', otherwise to 'DoS'.

        Args:
            unique_classes: Iterable of original class labels.

        Returns:
            Dict mapping each of the eight group names to the list of
            original labels assigned to it.
        """
        class_mapping = {'Normal': []}
        for group, _ in self.ATTACK_KEYWORDS:
            class_mapping[group] = []

        for class_name in unique_classes:
            class_lower = str(class_name).lower()

            if any(keyword in class_lower for keyword in self.NORMAL_KEYWORDS):
                class_mapping['Normal'].append(class_name)
                continue

            matched_group = next(
                (group for group, keywords in self.ATTACK_KEYWORDS
                 if any(keyword in class_lower for keyword in keywords)),
                None
            )
            if matched_group is None:
                # Distribute remaining classes between the two flood groups
                if len(class_mapping['DDoS']) < len(class_mapping['DoS']):
                    matched_group = 'DDoS'
                else:
                    matched_group = 'DoS'
            class_mapping[matched_group].append(class_name)

        return class_mapping

    def balance_binary_classes(self, df, target_samples):
        """
        Balance binary classes with advanced techniques.

        Classes with fewer than ``target_samples`` rows are upsampled with
        replacement; others are randomly downsampled.

        Args:
            df: DataFrame whose last column holds 'Normal' / 'Attack'.
            target_samples: Rows to keep per class.

        Returns:
            DataFrame with ``target_samples`` 'Normal' rows followed by
            ``target_samples`` 'Attack' rows.

        Raises:
            ValueError: If a class has no rows to sample from.
        """
        target_col = df.columns[-1]

        balanced_parts = []
        for label in ('Normal', 'Attack'):
            subset = df[df[target_col] == label]
            if subset.empty and target_samples > 0:
                raise ValueError(f"Cannot balance classes: no '{label}' samples found")

            if len(subset) < target_samples:
                subset = resample(subset, n_samples=target_samples,
                                  random_state=42, replace=True)
            else:
                subset = subset.sample(n=target_samples, random_state=42)
            balanced_parts.append(subset)

        return pd.concat(balanced_parts, ignore_index=True)

    def balance_multiclass_advanced(self, df, target_samples):
        """
        Advanced multiclass balancing ensuring all classes are properly balanced.

        Every class is resampled to exactly ``target_samples`` rows
        (upsampling with replacement or random downsampling), then SMOTE is
        applied; if SMOTE raises, the resampled frame is returned as is.

        Args:
            df: DataFrame whose last column is the target.
            target_samples: Rows wanted per class.

        Returns:
            The balanced DataFrame with the target as last column.
        """
        target_col = df.columns[-1]
        unique_classes = df[target_col].unique()

        print(f"Balancing {len(unique_classes)} classes to {target_samples} samples each...")

        balanced_data = []

        for class_label in unique_classes:
            class_data = df[df[target_col] == class_label]
            current_samples = len(class_data)

            print(f"Class '{class_label}': {current_samples} -> {target_samples} samples")

            if current_samples == 0:
                print(f"Warning: No samples found for class '{class_label}', skipping...")
                continue

            if current_samples < target_samples:
                # Upsample minority class
                class_data = resample(class_data, n_samples=target_samples,
                                      random_state=42, replace=True)
            elif current_samples > target_samples:
                # Downsample majority class
                class_data = class_data.sample(n=target_samples, random_state=42)
            balanced_data.append(class_data)

        df_balanced = pd.concat(balanced_data, ignore_index=True)

        # NOTE(review): every class already has target_samples rows here, so
        # SMOTE presumably has nothing left to synthesise - confirm whether
        # this step is still needed.
        X = df_balanced.drop(target_col, axis=1)
        y = df_balanced[target_col]

        try:
            smote = SMOTE(random_state=42, k_neighbors=min(3, target_samples - 1))
            X_resampled, y_resampled = smote.fit_resample(X, y)

            # Combine back to dataframe, target last
            return pd.concat([
                pd.DataFrame(X_resampled, columns=X.columns),
                pd.Series(y_resampled, name=target_col)
            ], axis=1)

        except Exception as e:
            # Best effort: fall back to the plain resampled frame
            print(f"SMOTE failed: {e}, using basic balancing")
            return df_balanced

    def train_enhanced_hybrid_model(self, X_train, y_train, X_val, y_val,
                                    population_size=50, max_iterations=150):
        """
        Train enhanced hybrid model with 3 algorithms.

        Encodes the labels, runs the Gazelle optimiser over twelve
        hyperparameters, then fits Random Forest, XGBoost and LightGBM on
        the full training set with the best parameters found.

        Args:
            X_train: Training features.
            y_train: Training labels (any hashable values).
            X_val: Validation features, forwarded to the optimiser.
            y_val: Validation labels, forwarded to the optimiser.
            population_size: Number of gazelles used by the optimiser.
            max_iterations: Number of optimiser iterations.

        Raises:
            RuntimeError: If the optimiser returns no candidate.
        """
        print("Training enhanced hybrid model (RF + XGB + LightGBM)...")

        # Encode labels
        y_train_encoded = self.label_encoder.fit_transform(y_train)
        y_val_encoded = self.label_encoder.transform(y_val)

        # Enhanced hyperparameter bounds
        bounds = [
            (100, 300),   # RF n_estimators
            (5, 25),      # RF max_depth
            (2, 10),      # RF min_samples_split
            (1, 5),       # RF min_samples_leaf
            (100, 500),   # XGB n_estimators
            (3, 15),      # XGB max_depth
            (0.01, 0.3),  # XGB learning_rate
            (0.6, 1.0),   # XGB subsample
            (0.6, 1.0),   # XGB colsample_bytree
            (50, 300),    # LGB n_estimators
            (0.01, 0.3),  # LGB learning_rate
            (0.0, 3.0)    # Weight parameter (RF and XGB each get a third of it)
        ]

        # Run enhanced GOA
        goa = AdvancedGazelleOptimizationAlgorithm(
            population_size=population_size,
            max_iterations=max_iterations,
            dim=len(bounds)
        )
        self.best_params = goa.optimize(X_train, y_train_encoded, X_val, y_val_encoded, bounds)
        if self.best_params is None:
            raise RuntimeError("GOA optimization did not return any candidate parameters")
        best = self.best_params

        # Extract optimized parameters
        rf_params = {
            'n_estimators': int(best[0]),
            # Same decoding as the fitness function: non-positive means unlimited
            'max_depth': int(best[1]) if best[1] > 0 else None,
            'min_samples_split': int(best[2]),
            'min_samples_leaf': int(best[3]),
            'random_state': 42,
            'n_jobs': -1,
            'class_weight': 'balanced'
        }

        xgb_params = {
            'n_estimators': int(best[4]),
            'max_depth': int(best[5]),
            'learning_rate': best[6],
            'subsample': best[7],
            'colsample_bytree': best[8],
            'random_state': 42,
            'n_jobs': -1,
            'eval_metric': 'logloss'
        }

        lgb_params = {
            'n_estimators': int(best[9]),
            'learning_rate': best[10],
            'random_state': 42,
            'n_jobs': -1,
            'class_weight': 'balanced',
            'verbosity': -1
        }

        # Ensemble weights, derived exactly as in the optimiser's fitness
        # function so the final model matches what was scored.
        total_weight = best[11]
        self.rf_weight = total_weight / 3
        self.xgb_weight = total_weight / 3
        self.lgb_weight = 1 - self.rf_weight - self.xgb_weight

        print(f"Optimized parameters:")
        print(f"RF: {rf_params}")
        print(f"XGB: {xgb_params}")
        print(f"LGB: {lgb_params}")
        print(f"Weights - RF: {self.rf_weight:.3f}, XGB: {self.xgb_weight:.3f}, LGB: {self.lgb_weight:.3f}")

        # Train models
        self.rf_model = RandomForestClassifier(**rf_params)
        self.xgb_model = xgb.XGBClassifier(**xgb_params)
        self.lgb_model = lgb.LGBMClassifier(**lgb_params)

        for model in (self.rf_model, self.xgb_model, self.lgb_model):
            model.fit(X_train, y_train_encoded)

        # Store feature importance
        self.feature_importance = {
            'rf': self.rf_model.feature_importances_,
            'xgb': self.xgb_model.feature_importances_,
            'lgb': self.lgb_model.feature_importances_
        }

        print("Enhanced hybrid model training completed!")

    def predict_enhanced_hybrid(self, X_test):
        """
        Make predictions using enhanced hybrid model.

        Args:
            X_test: Features, scaled the same way as the training data.

        Returns:
            Array of predicted labels in their original (decoded) form.
        """
        # Weighted soft vote over the three models' class probabilities
        ensemble_pred_proba = (
            self.rf_weight * self.rf_model.predict_proba(X_test)
            + self.xgb_weight * self.xgb_model.predict_proba(X_test)
            + self.lgb_weight * self.lgb_model.predict_proba(X_test)
        )
        ensemble_pred = np.argmax(ensemble_pred_proba, axis=1)

        return self.label_encoder.inverse_transform(ensemble_pred)

    def evaluate_enhanced_model(self, X_test, y_test):
        """
        Enhanced model evaluation.

        Args:
            X_test: Scaled test features.
            y_test: True labels in their original form.

        Returns:
            Dict with 'Accuracy', weighted 'Precision', 'Recall' and
            'F1-Score', plus 'Per_Class_Metrics' (classification report as
            a dict). The same dict is stored in ``self.results``.
        """
        print("Evaluating enhanced hybrid model...")

        y_pred = self.predict_enhanced_hybrid(X_test)

        self.results = {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
            'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
            'F1-Score': f1_score(y_test, y_pred, average='weighted', zero_division=0),
            # zero_division=0 keeps this consistent with the metrics above
            'Per_Class_Metrics': classification_report(
                y_test, y_pred, output_dict=True, zero_division=0
            )
        }

        return self.results

    def run_enhanced_algorithm(self, file_path, representation_type='2-class'):
        """
        Run the enhanced algorithm.

        Loads and preprocesses the data, balances the classes, splits into
        train / validation / test (64% / 16% / 20%, stratified), scales the
        features, trains the hybrid model and evaluates it on the test set.

        Args:
            file_path: Path of the dataset CSV.
            representation_type: '2-class', '8-class' or '34-class'.

        Returns:
            The results dict produced by ``evaluate_enhanced_model``.
        """
        print(f"Starting Enhanced CICIoT2023 ML Algorithm with {representation_type}...")

        # Load and preprocess data
        df = self.load_and_preprocess_data(file_path)

        # Enhanced class balancing
        df_balanced = self.enhanced_class_balancing(df, representation_type)

        # Prepare features and target (target is the last column)
        X = df_balanced.iloc[:, :-1].values
        y = df_balanced.iloc[:, -1].values

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        X_train_opt, X_val, y_train_opt, y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
        )

        print(f"Training set size: {X_train_opt.shape[0]}")
        print(f"Validation set size: {X_val.shape[0]}")
        print(f"Test set size: {X_test.shape[0]}")

        # Normalize features: fit on the training split only, reuse elsewhere
        X_train_scaled = self.scaler.fit_transform(X_train_opt)
        X_val_scaled = self.scaler.transform(X_val)
        X_test_scaled = self.scaler.transform(X_test)

        # Train hybrid model with GOA optimization
        self.train_enhanced_hybrid_model(X_train_scaled, y_train_opt, X_val_scaled, y_val)

        # Evaluate model
        results = self.evaluate_enhanced_model(X_test_scaled, y_test)

        # Display results
        self.display_results(representation_type)

        return results

    def display_results(self, representation_type):
        """
        Display evaluation results.

        Prints the metrics stored in ``self.results`` together with a
        selection of the optimised hyperparameters and the ensemble weights.

        Args:
            representation_type: Label used in the report header.
        """
        print("\n" + "="*80)
        print(f"EVALUATION RESULTS - {representation_type.upper()} REPRESENTATION")
        print("="*80)

        print(f"Hybrid Random Forest + XGBoost + LightGBM Model (GOA Optimized):")
        print(f"  Accuracy:  {self.results['Accuracy']:.4f}")
        print(f"  Precision: {self.results['Precision']:.4f}")
        print(f"  Recall:    {self.results['Recall']:.4f}")
        print(f"  F1-Score:  {self.results['F1-Score']:.4f}")

        print(f"\nOptimized Hyperparameters:")
        print(f"  RF n_estimators: {int(self.best_params[0])}")
        print(f"  RF max_depth: {int(self.best_params[1]) if self.best_params[1] > 0 else None}")
        print(f"  XGB n_estimators: {int(self.best_params[4])}")
        print(f"  XGB learning_rate: {self.best_params[6]:.3f}")
        print(f"  Ensemble weights - RF: {self.rf_weight:.3f}, XGB: {self.xgb_weight:.3f}, LGB: {self.lgb_weight:.3f}")

# Example usage
if __name__ == "__main__":
    # Initialize the end-to-end pipeline. run_enhanced_algorithm is defined on
    # EnhancedCICIoT2023MLAlgorithm, not on the optimizer class.
    ml_algorithm = EnhancedCICIoT2023MLAlgorithm()

    print("CICIoT2023 Dataset ML Algorithm with GOA and Hybrid RF-XGB-LightGBM")
    print("This implementation includes:")
    print("- Gazelle Optimization Algorithm (GOA) for hyperparameter optimization")
    print("- Hybrid Random Forest + XGBoost + LightGBM ensemble model")
    print("- Three class representations: 2-class, 8-class, and 34-class")
    print("\nTo use this algorithm:")
    print("1. Ensure you have the CICIoT2023 dataset in CSV format")
    print("2. Install required packages: pip install pandas scikit-learn imbalanced-learn xgboost lightgbm")
    print("3. Run for different representations:")
    print("   - 2-class: ml_algorithm.run_enhanced_algorithm('path_to_dataset.csv', '2-class')")
    print("   - 8-class: ml_algorithm.run_enhanced_algorithm('path_to_dataset.csv', '8-class')")
    print("   - 34-class: ml_algorithm.run_enhanced_algorithm('path_to_dataset.csv', '34-class')")

    # Adjust this path to the local copy of the dataset before running
    dataset_path = 'C:\\ProgramData\\anaconda3\\Lib\\site-packages\\pandas\\io\\parsers\\capstone\\part-00112-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv'

    results_2class = ml_algorithm.run_enhanced_algorithm(dataset_path, '2-class')
    results_8class = ml_algorithm.run_enhanced_algorithm(dataset_path, '8-class')
    # Uncomment to also run the 34-class representation
    # results_34class = ml_algorithm.run_enhanced_algorithm(dataset_path, '34-class')