In [None]:
# This cell trains and evaluates the model using a random (non-stratified) train/test split.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, roc_curve, f1_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
import xgboost as xgb
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

def calculate_detailed_metrics(y_true, y_pred, model_name):
    """
    Calculate detailed metrics for model evaluation and print them as one table row.

    Args:
        y_true: True labels
        y_pred: Predicted labels
        model_name: Name of the model for display

    Returns:
        Dictionary with the keys 'Accuracy', 'F1 Score' (weighted average),
        'False Positives', 'False Negatives' and 'Detection'.
    """
    # Basic metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')

    # Confusion matrix: rows are actual classes, columns are predicted classes
    cm = confusion_matrix(y_true, y_pred)
    correct_per_class = np.diag(cm)

    # One-vs-rest totals summed over every class:
    #   false positives of class i = column i minus the diagonal entry
    #   false negatives of class i = row i minus the diagonal entry
    # Both totals count every off-diagonal cell once, so they are always equal.
    false_positives = int(np.sum(cm.sum(axis=0) - correct_per_class))
    false_negatives = int(np.sum(cm.sum(axis=1) - correct_per_class))

    # Detection = correctly classified samples / all samples. This is the
    # micro-averaged recall, which for single-label multiclass data is the
    # same number as the overall accuracy (it is NOT a macro average).
    detection = np.sum(correct_per_class) / np.sum(cm)

    # Print results in tabular format. Every numeric column is padded to 12
    # characters so the values stay separated and follow the header widths.
    print(
        f"{model_name:<20}{accuracy:<12.8f}{f1:<12.8f}"
        f"{false_positives:<12}{false_negatives:<12}{detection:<12.8f}"
    )

    # Return metrics dict
    return {
        'Accuracy': accuracy,
        'F1 Score': f1,
        'False Positives': false_positives,
        'False Negatives': false_negatives,
        'Detection': detection
    }

def _is_unsplit(df, delimiter):
    """Return True when df has a single column whose header still contains delimiter."""
    return len(df.columns) == 1 and delimiter in str(df.columns[0])


def _parse_delimited_text(file_path):
    """Parse a delimited text file by hand, skipping rows whose field count differs from the header."""
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # Pick whichever of tab/comma occurs more often in the header line;
    # ties (including "neither present") default to comma.
    first_line = lines[0].strip()
    delimiter = '\t' if first_line.count('\t') > first_line.count(',') else ','

    print(f"Using manual parsing with delimiter: '{delimiter}'")

    headers = first_line.split(delimiter)
    data = []

    # The header is line 1, so data rows start at line 2.
    for line_number, raw_line in enumerate(lines[1:], start=2):
        stripped = raw_line.strip()
        if not stripped:  # Skip empty lines
            continue
        row = stripped.split(delimiter)
        if len(row) == len(headers):
            data.append(row)
        else:
            print(f"Warning: Line {line_number} has {len(row)} fields, expected {len(headers)}")

    df = pd.DataFrame(data, columns=headers)
    print(f"Manually loaded dataset with shape: {df.shape}")
    return df


def load_data(file_path):
    """
    Load network flow data from file path, attempting multiple delimiters.

    The file is read as comma-delimited first, then as tab-delimited, and
    finally parsed line by line. A read that "succeeds" but leaves the whole
    header in a single column (because the file uses the other delimiter) is
    treated as a miss, so tab-delimited files are actually split.

    Args:
        file_path: Path of the delimited text file to load

    Returns:
        DataFrame with the file contents. The manual fallback yields string
        columns and drops rows whose field count differs from the header.

    Raises:
        Exception: whatever the manual fallback raises (for example
            FileNotFoundError or IndexError on an empty file) when all three
            strategies fail.
    """
    # First try comma delimiter (most common for CICIDS2017)
    try:
        df = pd.read_csv(file_path, delimiter=',', low_memory=False)
    except Exception as comma_error:
        print(f"Error with comma delimiter: {comma_error}")
    else:
        if not _is_unsplit(df, '\t'):
            print(f"Loaded dataset with comma delimiter. Shape: {df.shape}")
            return df
        # A tab-delimited file read with ',' does not raise; it just yields
        # one column whose header still holds the tabs.
        print("Data loaded as a single column. Trying tab delimiter...")

    # Try with tab delimiter
    try:
        df = pd.read_csv(file_path, delimiter='\t', low_memory=False)
    except Exception as tab_error:
        print(f"Error with tab delimiter: {tab_error}")
    else:
        if not _is_unsplit(df, ','):
            print(f"Loaded dataset with tab delimiter. Shape: {df.shape}")
            return df
        # The comma read already failed above, so go straight to manual parsing.
        print("Data loaded as a single column. Falling back to manual parsing...")

    # Try a manual approach
    try:
        return _parse_delimited_text(file_path)
    except Exception as manual_error:
        print(f"Error with manual parsing: {manual_error}")
        raise

def preprocess_data(df, scaler=None, label_encoder=None, fit_scaler=False, fit_encoder=False):
    """
    Preprocess the network flow data for multiclass classification:
    - Encode attack labels
    - Remove flow identifiers
    - Coerce features to numeric, replace +/-inf and fill NaN with column means
    - Apply min-max normalization

    Args:
        df: The dataframe to preprocess
        scaler: An optional pre-fitted scaler (for test data)
        label_encoder: An optional pre-fitted label encoder (for test data)
        fit_scaler: Whether to fit the scaler on this data (for train data)
        fit_encoder: Whether to fit the label encoder on this data (for train data)

    Returns:
        (scaled_df, scaler, label_encoder) when fit_scaler or fit_encoder is
        True; otherwise only scaled_df. scaled_df holds the scaled feature
        columns plus 'Label_Encoded' and 'Label_Original'.

    Raises:
        ValueError: if no label column is found, if the encoder/scaler
            arguments are inconsistent with the fit flags, or if no numeric
            feature columns remain.
    """
    print("\nPreprocessing data:")
    print(f"Initial columns: {df.columns.tolist()[:5]}... (total: {len(df.columns)})")

    # Make a copy to avoid modifying the original
    df_processed = df.copy()

    # Identify the label column (usually 'Label' in CICIDS2017)
    label_col = None
    for possible_label in ['Label', 'label', 'CLASS', 'class', 'Attack_Type']:
        if possible_label in df_processed.columns:
            label_col = possible_label
            break

    if label_col is None:
        raise ValueError("Could not find a label column in the dataset")

    # Check label distribution
    label_counts = df_processed[label_col].value_counts()
    print(f"Label distribution: {label_counts.to_dict()}")

    # Keep the original label
    original_label = df_processed[label_col].copy()

    # Label encode the attack types
    if fit_encoder:
        if label_encoder is not None:
            raise ValueError("Cannot fit new encoder when fit_encoder=True but encoder is provided")
        label_encoder = LabelEncoder()
        encoded_labels = label_encoder.fit_transform(original_label)
    else:
        if label_encoder is None:
            raise ValueError("Either provide a fitted label encoder or set fit_encoder=True")
        try:
            encoded_labels = label_encoder.transform(original_label)
        except ValueError:
            # transform() raises ValueError for labels the encoder was not
            # fitted on. Only that case is handled here; anything else
            # (including KeyboardInterrupt) now propagates to the caller.
            print("Warning: Found unknown labels in test data. Mapping to 'BENIGN'")
            unknown_labels = set(original_label) - set(label_encoder.classes_)
            print(f"Unknown labels: {unknown_labels}")

            # Re-encode a copy in which every unknown label is replaced by
            # 'BENIGN'. If the encoder does not know 'BENIGN' either, this
            # second transform raises and the error propagates.
            temp_labels = original_label.copy()
            temp_labels[temp_labels.isin(unknown_labels)] = 'BENIGN'
            encoded_labels = label_encoder.transform(temp_labels)

    # Add encoded labels column
    df_processed['Label_Encoded'] = encoded_labels

    # Display mapping
    if fit_encoder:
        print("\nLabel encoding mapping:")
        for i, label in enumerate(label_encoder.classes_):
            print(f"  {i}: {label}")

    # Remove flow identifiers and other non-feature columns
    columns_to_drop = [
        'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
        'Timestamp', 'Label', label_col, 'id'
    ]

    # Only drop columns that exist in the dataframe (set() removes the
    # duplicate that appears when label_col is 'Label')
    columns_to_drop = list(set([col for col in columns_to_drop if col in df_processed.columns]))
    df_cleaned = df_processed.drop(columns=columns_to_drop, errors='ignore')

    # Convert all feature columns to numeric, coercing unparseable values to NaN
    numeric_cols = []
    for col in df_cleaned.columns:
        if col in ('Label_Encoded', 'Label_Original'):
            continue
        try:
            df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
            numeric_cols.append(col)
        except Exception as e:
            print(f"Error converting column {col} to numeric: {e}")
            df_cleaned = df_cleaned.drop(columns=[col])

    # Display info about numeric columns
    print(f"Number of numeric columns after conversion: {len(numeric_cols)}")
    if len(numeric_cols) == 0:
        raise ValueError("No numeric columns available after preprocessing")

    # Handle NaN values (this count is taken before infinities become NaN)
    print(f"NaN values before handling: {df_cleaned[numeric_cols].isna().sum().sum()}")

    # Replace infinity values with NaN
    df_cleaned = df_cleaned.replace([np.inf, -np.inf], np.nan)

    # Drop columns with all NaN values: they have no mean to fill with
    null_cols = [col for col in numeric_cols if df_cleaned[col].isna().all()]
    if null_cols:
        print(f"Dropping columns with all NaN values: {null_cols}")
        df_cleaned = df_cleaned.drop(columns=null_cols)
        numeric_cols = [col for col in numeric_cols if col not in null_cols]

    # Fill remaining NaN values with column means (computed on this dataframe)
    for col in numeric_cols:
        if df_cleaned[col].isna().any():
            col_mean = df_cleaned[col].mean()
            df_cleaned[col] = df_cleaned[col].fillna(col_mean)

    print(f"NaN values after handling: {df_cleaned[numeric_cols].isna().sum().sum()}")

    # Verify we still have data to work with after dropping all-NaN columns
    if len(numeric_cols) == 0:
        raise ValueError("No numeric columns available after preprocessing")

    # Apply min-max scaling to all numeric columns
    features = df_cleaned[numeric_cols]
    labels = df_cleaned['Label_Encoded']

    # A provided scaler is only applied; a new one is fitted only when none
    # was passed and fit_scaler is set.
    if scaler is None and fit_scaler:
        scaler = MinMaxScaler()
        scaled_features = scaler.fit_transform(features)
    elif scaler is not None:
        scaled_features = scaler.transform(features)
    else:
        raise ValueError("Either provide a fitted scaler or set fit_scaler=True")

    # Create a new dataframe with scaled features (fresh 0..n-1 index)
    scaled_df = pd.DataFrame(scaled_features, columns=features.columns)

    # Add back the label columns; .values avoids index alignment with the
    # original (possibly shuffled) index
    scaled_df['Label_Encoded'] = labels.values
    scaled_df['Label_Original'] = original_label.values

    print(f"Preprocessed dataset shape: {scaled_df.shape}")
    print(f"Number of unique classes: {scaled_df['Label_Encoded'].nunique()}")

    if fit_scaler or fit_encoder:
        return scaled_df, scaler, label_encoder
    else:
        return scaled_df

def handle_class_imbalance(X, y, sampling_strategy='auto', random_state=42):
    """
    Oversample minority classes with SMOTE, falling back to the input data.

    The class distribution is printed before and (on success) after
    resampling. The original X and y are returned untouched when the smallest
    class has fewer than 6 samples or when SMOTE raises.

    Args:
        X: Feature matrix
        y: Target labels
        sampling_strategy: Strategy passed to SMOTE
        random_state: Random seed passed to SMOTE

    Returns:
        Tuple (X, y): the resampled data, or the inputs themselves on fallback
    """
    print("\nHandling class imbalance with SMOTE:")

    # Distribution before resampling
    counts_before = pd.Series(y).value_counts().sort_index()
    print("Class distribution before SMOTE:")
    for label, n_samples in counts_before.items():
        share = n_samples/len(y)*100
        print(f"  Class {label}: {n_samples} samples ({share:.2f}%)")

    # Guard: bail out when the rarest class is too small
    smallest_class = counts_before.min()
    if smallest_class < 6:
        print(f"Warning: Minimum samples per class ({smallest_class}) is too small for SMOTE.")
        print("Using original imbalanced data")
        return X, y

    try:
        sampler = SMOTE(sampling_strategy=sampling_strategy, random_state=random_state)
        X_balanced, y_balanced = sampler.fit_resample(X, y)

        # Distribution after resampling
        counts_after = pd.Series(y_balanced).value_counts().sort_index()
        print("\nClass distribution after SMOTE:")
        for label, n_samples in counts_after.items():
            share = n_samples/len(y_balanced)*100
            print(f"  Class {label}: {n_samples} samples ({share:.2f}%)")

        print(f"SMOTE successfully applied. New sample count: {len(y_balanced)} (was {len(y)})")
        return X_balanced, y_balanced

    except Exception as e:
        # Best effort: any failure keeps the pipeline going on the raw data
        print(f"Error applying SMOTE: {e}")
        print("Using original imbalanced data")
        return X, y

def train_model(X_train, y_train, X_val=None, y_val=None, params=None):
    """
    Fit an XGBoost classifier for multiclass classification.

    Args:
        X_train: Training features
        y_train: Training labels
        X_val: Validation features (optional)
        y_val: Validation labels (optional)
        params: Keyword arguments for xgb.XGBClassifier; a default
            multi:softprob configuration is used when None

    Returns:
        Trained XGBoost model
    """
    print("\nTraining XGBoost model for multiclass classification...")

    # The class count is taken from the distinct training labels
    num_classes = len(np.unique(y_train))
    print(f"Number of classes: {num_classes}")

    # Fall back to the default configuration when the caller supplies none
    if params is None:
        params = dict(
            objective='multi:softprob',
            num_class=num_classes,
            eta=0.1,
            max_depth=6,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            tree_method='hist',  # For faster training
            eval_metric='mlogloss',
            use_label_encoder=False,
        )

    model = xgb.XGBClassifier(**params)

    # Without a complete validation pair, do a plain fit
    has_validation = X_val is not None and y_val is not None
    if not has_validation:
        print("Training without validation set...")
        model.fit(X_train, y_train)
        return model

    # With validation data: monitor both sets and stop early after 10 stale rounds
    # NOTE(review): newer xgboost releases may expect early_stopping_rounds on
    # the constructor rather than fit() - confirm against the installed version.
    print("Training with validation set...")
    monitored_sets = [(X_train, y_train), (X_val, y_val)]
    model.fit(
        X_train, y_train,
        eval_set=monitored_sets,
        early_stopping_rounds=10,
        verbose=True
    )
    return model

def evaluate_multiclass_model(model, X_test, y_test, label_encoder=None):
    """
    Evaluate the multiclass model, print a report and save diagnostic plots.

    Writes 'confusion_matrix_multiclass.png' and
    'feature_importance_multiclass.png' to the current working directory.

    Args:
        model: Trained XGBoost model
        X_test: Test features
        y_test: Test labels (encoded)
        label_encoder: Label encoder to convert indices to original labels

    Returns:
        Tuple (metrics, cm): the dictionary of performance metrics and the
        confusion matrix restricted to the classes present in y_test.
    """
    print("\nEvaluating multiclass model...")

    # Print metrics header first
    print(f"\n{'Model':<20}{'Accuracy':<12}{'F1 Score':<12}{'False Postive':<12}{'False Negative':<12}{'Detection':<12}")

    # Measure prediction time. Only predict() is timed: the probabilities are
    # not used in this function, and computing them inside the timed section
    # would inflate the per-sample figure.
    start_time = time.time()
    y_pred = model.predict(X_test)
    end_time = time.time()

    # Calculate detailed metrics (also prints the table row under the header)
    detailed_metrics = calculate_detailed_metrics(y_test, y_pred, "XGBoost, Dataset A")

    # Calculate prediction time per sample in microseconds
    prediction_time = (end_time - start_time) * 1000000 / len(X_test)
    print(f"Prediction time: {prediction_time:.2f} μs/sample")

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)

    # Calculate macro and weighted F1 scores
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    weighted_f1 = f1_score(y_test, y_pred, average='weighted')

    # Get unique classes present in the test set
    unique_test_classes = np.unique(y_test)

    # Resolve class names only for the classes present in the test set
    if label_encoder is not None:
        class_names = [label_encoder.classes_[i] for i in unique_test_classes]
    else:
        class_names = None

    # Print classification report with only the classes present in the test set
    print("\nClassification Report:")
    if class_names:
        print(classification_report(y_test, y_pred,
                                   labels=unique_test_classes,
                                   target_names=class_names))
    else:
        print(classification_report(y_test, y_pred))

    # Confusion matrix (rows/columns limited to classes present in y_test)
    cm = confusion_matrix(y_test, y_pred, labels=unique_test_classes)

    # Create a dictionary of metrics
    metrics = {
        'Accuracy': accuracy,
        'Macro F1 Score': macro_f1,
        'Weighted F1 Score': weighted_f1,
        'Prediction Time (μs/sample)': prediction_time,
        'False Positives': detailed_metrics['False Positives'],
        'False Negatives': detailed_metrics['False Negatives'],
        'Detection': detailed_metrics['Detection']
    }

    # Print metrics
    print("\nPerformance Metrics:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.6f}")

    # Create confusion matrix plot
    plt.figure(figsize=(16, 14))

    # Use class names if available, otherwise use class indices
    tick_labels = class_names if class_names else unique_test_classes

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=tick_labels,
                yticklabels=tick_labels)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig('confusion_matrix_multiclass.png')
    plt.close()
    print("Confusion matrix saved as 'confusion_matrix_multiclass.png'")

    # Create feature importance plot. The axes are passed explicitly:
    # without ax=, plot_importance draws on a figure of its own, so the
    # 12x10 figure stays empty and is never closed.
    fig, ax = plt.subplots(figsize=(12, 10))
    xgb.plot_importance(model, max_num_features=20, ax=ax)
    ax.set_title('Feature Importance')
    fig.tight_layout()
    fig.savefig('feature_importance_multiclass.png')
    plt.close(fig)
    print("Feature importance plot saved as 'feature_importance_multiclass.png'")

    return metrics, cm

def plot_multiclass_roc_curves(model, X_test, y_test, label_encoder=None):
    """
    Plot ROC curves for each class in a one-vs-rest fashion

    A class with no positive or no negative samples in y_test has no defined
    ROC curve, so it is skipped with a message. The macro average is taken
    over the classes that were actually plotted. The figure is saved as
    'roc_curves_multiclass.png' in the current working directory.

    Args:
        model: Trained XGBoost model
        X_test: Test features
        y_test: Test labels (encoded)
        label_encoder: Label encoder to convert indices to original labels
    """
    # Get predictions: one probability column per class
    y_score = model.predict_proba(X_test)

    # Get number of classes
    n_classes = y_score.shape[1]

    # Get class names
    class_names = label_encoder.classes_ if label_encoder is not None else [f"Class {i}" for i in range(n_classes)]

    # Plain array so the comparison below behaves the same for Series and lists
    y_true = np.asarray(y_test)

    plt.figure(figsize=(12, 10))

    # Common FPR grid on which every class curve is interpolated for the
    # macro average
    all_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.zeros_like(all_fpr)
    n_plotted = 0

    # Plot ROC curves for each class
    for i in range(n_classes):
        # Convert to one-vs-rest
        y_true_bin = (y_true == i).astype(int)
        n_positive = int(y_true_bin.sum())

        # With only one label value present, TPR or FPR is undefined (NaN)
        # and would turn the whole macro average into NaN.
        if n_positive == 0 or n_positive == y_true_bin.size:
            print(f"Skipping ROC curve for {class_names[i]}: test set needs both positive and negative samples")
            continue

        y_score_bin = y_score[:, i]

        # Calculate ROC curve
        fpr, tpr, _ = roc_curve(y_true_bin, y_score_bin)
        roc_auc = auc(fpr, tpr)

        # Plot class ROC curve
        plt.plot(fpr, tpr, lw=2,
                 label=f'{class_names[i]} (AUC = {roc_auc:.2f})')

        # Interpolate tpr values for macro-average
        mean_tpr += np.interp(all_fpr, fpr, tpr)
        n_plotted += 1

    # Finish macro-average ROC curve (only when at least one class was plotted)
    if n_plotted:
        mean_tpr /= n_plotted
        mean_auc = auc(all_fpr, mean_tpr)
        plt.plot(all_fpr, mean_tpr, 'k--',
                 label=f'Macro-average (AUC = {mean_auc:.2f})',
                 lw=3)

    # Plot baseline
    plt.plot([0, 1], [0, 1], 'r--', label='Random Classifier')

    # Set plot details
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multiclass ROC Curves (One-vs-Rest)')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.tight_layout()

    # Save plot
    plt.savefig('roc_curves_multiclass.png')
    plt.close()
    print("Multiclass ROC curves saved as 'roc_curves_multiclass.png'")

def main():
    """
    Main function to run multiclass classification on the combined CICIDS2017 dataset

    Steps: mount Google Drive when available, locate and load the combined
    CSV, make a random 70/30 split, preprocess both parts, balance the
    training part with SMOTE, train XGBoost, evaluate, plot ROC curves and
    save the model as 'xgboost_multiclass_model.json'. Returns None; every
    unrecoverable problem is reported with print() followed by an early return.
    """
    import os
    from sklearn.model_selection import train_test_split

    # First, make sure Google Drive is mounted
    try:
        from google.colab import drive
        print("Mounting Google Drive...")
        drive.mount('/content/drive')
        print("Google Drive mounted successfully.")
    except Exception:
        # Best effort: outside Colab the import fails and the run continues
        print("Not running in Colab or Drive already mounted.")

    # Default location of the combined dataset. It is only replaced when the
    # expected directory is missing and the search (or the user) supplies
    # another path.
    cicids_dir = "/content/drive/My Drive/CICIDS2017_improved"
    combined_dataset_path = "/content/drive/My Drive/CICIDS2017_improved/combined_dataset.csv"

    # List files in the expected directory to check path
    if os.path.exists(cicids_dir):
        print(f"\nDirectory exists: {cicids_dir}")
        print("Files in directory:")
        for file in os.listdir(cicids_dir):
            print(f"  - {file}")
    else:
        print(f"\nDirectory not found: {cicids_dir}")
        print("Searching for the dataset...")

        # Try to find the combined dataset by searching common locations
        possible_paths = [
            "/content/drive/MyDrive/CICIDS2017_improved/combined_dataset.csv",
            "/content/drive/My Drive/CICIDS2017/combined_dataset.csv",
            "/content/drive/MyDrive/CICIDS2017/combined_dataset.csv",
            "/content/combined_dataset.csv"
        ]

        for path in possible_paths:
            if os.path.exists(path):
                print(f"Found dataset at: {path}")
                combined_dataset_path = path
                break
        else:
            # Let the user specify the path
            print("\nCannot find the combined dataset automatically.")
            combined_dataset_path = input("Please enter the full path to combined_dataset.csv: ")

    try:
        print(f"Loading combined dataset from {combined_dataset_path}...")
        combined_df = load_data(combined_dataset_path)
    except Exception as e:
        print(f"Error loading combined dataset: {e}")
        # Ask for user input and retry once
        combined_dataset_path = input("Please enter the correct path to the combined dataset CSV file: ")
        try:
            print(f"Trying to load from {combined_dataset_path}...")
            combined_df = load_data(combined_dataset_path)
        except Exception as e:
            print(f"Still unable to load the dataset: {e}")
            return

    # Split the dataset into training and testing sets without stratification
    print("\nSplitting dataset into training (70%) and testing (30%) sets...")

    # Find the label column (same candidates as preprocess_data)
    label_col = None
    for possible_label in ['Label', 'label', 'CLASS', 'class', 'Attack_Type']:
        if possible_label in combined_df.columns:
            label_col = possible_label
            break

    if label_col is None:
        print("Could not find label column in the combined dataset")
        return

    # Print class distribution before split
    print("\nFull dataset class distribution:")
    class_counts = combined_df[label_col].value_counts()
    for cls, count in class_counts.items():
        print(f"  {cls}: {count} ({count/len(combined_df)*100:.2f}%)")

    # Simple random split with no stratification
    train_df, test_df = train_test_split(
        combined_df,
        test_size=0.3,
        random_state=42
    )

    print(f"Training set: {len(train_df)} samples")
    print(f"Test set: {len(test_df)} samples")

    # Display class distribution in splits
    print("\nTraining set class distribution:")
    train_class_dist = train_df[label_col].value_counts()
    for cls, count in train_class_dist.items():
        print(f"  {cls}: {count} ({count/len(train_df)*100:.2f}%)")

    print("\nTest set class distribution:")
    test_class_dist = test_df[label_col].value_counts()
    for cls, count in test_class_dist.items():
        print(f"  {cls}: {count} ({count/len(test_df)*100:.2f}%)")

    # Preprocess the training data (fits the scaler and the label encoder)
    try:
        print("\nPreprocessing training data...")
        train_processed, scaler, label_encoder = preprocess_data(
            train_df,
            fit_scaler=True,
            fit_encoder=True
        )
    except Exception as e:
        print(f"Error preprocessing training data: {e}")
        return

    # Preprocess the test data using the same scaler and encoder
    try:
        print("\nPreprocessing test data...")
        test_processed = preprocess_data(
            test_df,
            scaler=scaler,
            label_encoder=label_encoder,
            fit_scaler=False,
            fit_encoder=False
        )
    except Exception as e:
        print(f"Error preprocessing test data: {e}")
        return

    # Split features and target for training data
    X_train = train_processed.drop(['Label_Encoded', 'Label_Original'], axis=1)
    y_train = train_processed['Label_Encoded'].astype(int)

    # Split features and target for test data
    X_test = test_processed.drop(['Label_Encoded', 'Label_Original'], axis=1)
    y_test = test_processed['Label_Encoded'].astype(int)

    print(f"\nTraining features shape: {X_train.shape}")
    print(f"Training target shape: {y_train.shape}")
    print(f"Test features shape: {X_test.shape}")
    print(f"Test target shape: {y_test.shape}")

    # Handle class imbalance with SMOTE (training data only)
    X_train_balanced, y_train_balanced = handle_class_imbalance(X_train, y_train)

    # Train XGBoost multiclass model
    model = train_model(X_train_balanced, y_train_balanced)

    # Evaluate model on test set
    metrics, confusion_mat = evaluate_multiclass_model(model, X_test, y_test, label_encoder)

    # Create a subset of 10k samples for additional evaluation
    if len(X_test) > 10000:
        print("\nCreating 10k sample subset for additional evaluation...")
        idx = np.random.choice(len(X_test), 10000, replace=False)
        X_subset = X_test.iloc[idx]
        y_subset = y_test.iloc[idx]

        print("\nEvaluating on 10k subset:")
        print(f"{'Model':<20}{'Accuracy':<12}{'F1 Score':<12}{'False Postive':<12}{'False Negative':<12}{'Detection':<12}")
        # Called for its printed table row; the returned dict is not needed here
        calculate_detailed_metrics(y_subset, model.predict(X_subset), "XGBoost, Dataset A, 10k subsample")

    # Plot ROC curves for multiclass
    plot_multiclass_roc_curves(model, X_test, y_test, label_encoder)

    # Save model for future use
    model.save_model('xgboost_multiclass_model.json')
    print("\nModel saved as 'xgboost_multiclass_model.json'")

    # Print comparison with baseline (relative improvement over a 0.70 F1)
    print("\nPerformance Comparison:")
    print(f"Baseline F1 Score (from paper): ~70%")
    print(f"Our Model's Weighted F1 Score: {metrics['Weighted F1 Score']:.2%}")
    improvement = (metrics['Weighted F1 Score'] - 0.7) / 0.7 * 100
    print(f"Improvement: {improvement:.2f}%")

    print("\nAnalysis complete!")

if __name__ == "__main__":
    main()

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.

Directory exists: /content/drive/My Drive/CICIDS2017_improved
Files in directory:
  - monday.csv
  - tuesday.csv
  - wednesday.csv
  - thursday.csv
  - friday.csv
  - combined_dataset.csv
  - balanced_multiclass_10k.csv
  - train.csv
  - metrics_by_class.png
  - class_metrics.csv
  - prediction_results.csv
  - confusion_matrix.csv
  - confusion_metrics_by_class.png
  - fixed_train.csv
  - fixed_test.csv
  - enhanced_train.csv
  - plots_multiclass
  - multiclass_metrics.csv
Loading combined dataset from /content/drive/My Drive/CICIDS2017_improved/combined_dataset.csv...
Loaded dataset with comma delimiter. Shape: (250000, 91)

Splitting dataset into training (70%) and testing (30%) sets...

Full dataset class distribution:
  BENIGN: 197771 (79.11%)
  DoS Hulk: 15917 (6.37%)
  Portscan: 14170 (5.67%)
  Infiltration - Portscan: 9886 (3.95%)
  DDoS: 8585 (3.43%)
  DoS GoldenEye: 733 (0.29%)
  FTP-Patator

<Figure size 1200x1000 with 0 Axes>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import xgboost as xgb
import warnings
import os
warnings.filterwarnings('ignore')

def calculate_detailed_metrics(y_true, y_pred, model_name):
    """
    Calculate detailed metrics for model evaluation and print them as one table row.

    Args:
        y_true: True labels
        y_pred: Predicted labels
        model_name: Name of the model for display

    Returns:
        Dictionary with the keys 'Accuracy', 'F1 Score' (weighted average),
        'False Positives', 'False Negatives' and 'Detection'.
    """
    # Basic metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')

    # Confusion matrix: rows are actual classes, columns are predicted classes
    cm = confusion_matrix(y_true, y_pred)
    correct_per_class = np.diag(cm)

    # One-vs-rest totals summed over every class:
    #   false positives of class i = column i minus the diagonal entry
    #   false negatives of class i = row i minus the diagonal entry
    # Both totals count every off-diagonal cell once, so they are always equal.
    false_positives = int(np.sum(cm.sum(axis=0) - correct_per_class))
    false_negatives = int(np.sum(cm.sum(axis=1) - correct_per_class))

    # Detection = correctly classified samples / all samples. This is the
    # micro-averaged recall, which for single-label multiclass data is the
    # same number as the overall accuracy (it is NOT a macro average).
    detection = np.sum(correct_per_class) / np.sum(cm)

    # Print results in tabular format. Every numeric column is padded to 12
    # characters so the values stay separated and follow the header widths.
    print(
        f"{model_name:<20}{accuracy:<12.8f}{f1:<12.8f}"
        f"{false_positives:<12}{false_negatives:<12}{detection:<12.8f}"
    )

    # Return metrics dict
    return {
        'Accuracy': accuracy,
        'F1 Score': f1,
        'False Positives': false_positives,
        'False Negatives': false_negatives,
        'Detection': detection
    }

def load_data_by_days(base_dir):
    """
    Build a day-based train/test split from per-day CSV files in base_dir.

    Monday to Wednesday form the training set; Thursday and Friday form the
    test set. A missing file only produces a warning.

    Args:
        base_dir: Directory that holds the per-day CSV files

    Returns:
        Tuple (train_df, test_df) of concatenated dataframes with a fresh index
    """
    def read_available(file_names):
        # Read every listed file that exists; warn about the ones that do not.
        frames = []
        for file_name in file_names:
            csv_path = os.path.join(base_dir, file_name)
            if not os.path.exists(csv_path):
                print(f"Warning: {file_name} not found")
                continue
            frames.append(pd.read_csv(csv_path, delimiter=',', low_memory=False))
        return frames

    train_frames = read_available(("monday.csv", "tuesday.csv", "wednesday.csv"))
    test_frames = read_available(("thursday.csv", "friday.csv"))

    # pd.concat raises if a group ended up with no files at all
    train_df = pd.concat(train_frames, ignore_index=True)
    test_df = pd.concat(test_frames, ignore_index=True)

    print(f"Training dataset shape: {train_df.shape}")
    print(f"Testing dataset shape: {test_df.shape}")

    return train_df, test_df

def preprocess_data(df, scaler=None, label_encoder=None, fit_scaler=False, fit_encoder=False):
    """
    Preprocess the dataset by cleaning, encoding labels, and normalizing features.

    Handles unseen labels in the test set by mapping them to 'UNKNOWN_ATTACK'.

    Args:
        df: Raw DataFrame containing a label column named 'Label', 'label',
            'CLASS' or 'class'. It is not modified.
        scaler: Already-fitted scaler exposing ``transform``; required unless
            ``fit_scaler`` is True.
        label_encoder: Already-fitted encoder exposing ``classes_`` and
            ``transform``; required unless ``fit_encoder`` is True. When it is
            reused, 'UNKNOWN_ATTACK' is appended to its ``classes_`` in place.
        fit_scaler: Fit a new MinMaxScaler on this data instead of reusing ``scaler``.
        fit_encoder: Fit a new LabelEncoder on this data instead of reusing
            ``label_encoder``.

    Returns:
        Tuple ``(scaled_df, scaler, label_encoder)`` where ``scaled_df`` holds the
        scaled feature columns plus 'Label_Encoded' and 'Label_Original'.

    Raises:
        ValueError: If no label column is found, if a required ``scaler`` or
            ``label_encoder`` is missing, or if no feature column remains.
    """
    # Fail early with a clear message instead of an AttributeError on None later.
    if not fit_encoder and label_encoder is None:
        raise ValueError("label_encoder must be provided when fit_encoder is False")
    if not fit_scaler and scaler is None:
        raise ValueError("scaler must be provided when fit_scaler is False")

    print("\nPreprocessing data:")
    print(f"Initial columns: {df.columns.tolist()[:5]}... (total: {len(df.columns)})")

    df_processed = df.copy()
    label_col = next(
        (name for name in ['Label', 'label', 'CLASS', 'class'] if name in df_processed.columns),
        None,
    )
    if label_col is None:
        raise ValueError("Label column not found")

    # Keep the untouched labels; they are returned as 'Label_Original'.
    original_label = df_processed[label_col].copy()

    # Check label distribution
    label_counts = df_processed[label_col].value_counts()
    print(f"Label distribution: {label_counts.to_dict()}")

    if fit_encoder:
        label_encoder = LabelEncoder()
        df_processed['Label_Encoded'] = label_encoder.fit_transform(original_label)

        # Display mapping
        print("\nLabel encoding mapping:")
        for i, label in enumerate(label_encoder.classes_):
            print(f"  {i}: {label}")
    else:
        # Vectorized membership test: one pass instead of a Python lambda per row.
        known_mask = df_processed[label_col].isin(label_encoder.classes_)
        if not known_mask.all():
            unknown_labels = set(df_processed.loc[~known_mask, label_col])
            print(f"⚠️ Warning: Found unseen labels in test data: {unknown_labels}")
            df_processed[label_col] = df_processed[label_col].where(known_mask, 'UNKNOWN_ATTACK')

        # The placeholder class is appended after the known classes, so every
        # previously assigned code keeps its meaning.
        if 'UNKNOWN_ATTACK' not in label_encoder.classes_:
            all_classes = list(label_encoder.classes_) + ['UNKNOWN_ATTACK']
            label_encoder.classes_ = np.array(all_classes)

        df_processed['Label_Encoded'] = label_encoder.transform(df_processed[label_col])

    # Identifier / bookkeeping columns that must not be used as features.
    drop_cols = ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp', label_col, 'id']
    columns_to_drop = [col for col in drop_cols if col in df_processed.columns]
    df_cleaned = df_processed.drop(columns=columns_to_drop, errors='ignore')

    # Convert all columns to numeric, coercing errors to NaN
    numeric_cols = []
    for col in df_cleaned.columns:
        if col != 'Label_Encoded' and col != 'Label_Original':
            try:
                df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
                numeric_cols.append(col)
            except Exception as e:
                print(f"Error converting column {col} to numeric: {e}")
                df_cleaned = df_cleaned.drop(columns=[col])

    # Display info about numeric columns
    print(f"Number of numeric columns after conversion: {len(numeric_cols)}")
    if len(numeric_cols) == 0:
        raise ValueError("No numeric columns available after preprocessing")

    # Handle NaN values
    print(f"NaN values before handling: {df_cleaned[numeric_cols].isna().sum().sum()}")

    # Infinite values become NaN so they are handled by the mean fill below.
    df_cleaned = df_cleaned.replace([np.inf, -np.inf], np.nan)

    # Handle outliers by clipping to the 1st..99th percentile of this dataset
    for col in numeric_cols:
        upper_limit = df_cleaned[col].quantile(0.99)
        lower_limit = df_cleaned[col].quantile(0.01)
        df_cleaned[col] = np.clip(df_cleaned[col], lower_limit, upper_limit)

    # Fill remaining NaN values with column means
    for col in numeric_cols:
        if df_cleaned[col].isna().any():
            col_mean = df_cleaned[col].mean()
            df_cleaned[col] = df_cleaned[col].fillna(col_mean)

    print(f"NaN values after handling: {df_cleaned[numeric_cols].isna().sum().sum()}")

    features = df_cleaned[numeric_cols]
    labels = df_cleaned['Label_Encoded']

    if fit_scaler:
        scaler = MinMaxScaler()
        scaled_features = scaler.fit_transform(features)
    else:
        scaled_features = scaler.transform(features)

    # Rebuild a frame with a fresh index; labels are attached positionally.
    scaled_df = pd.DataFrame(scaled_features, columns=features.columns)
    scaled_df['Label_Encoded'] = labels.values
    scaled_df['Label_Original'] = original_label.values

    print(f"Preprocessed dataset shape: {scaled_df.shape}")
    print(f"Number of unique classes: {scaled_df['Label_Encoded'].nunique()}")

    return scaled_df, scaler, label_encoder

def train_model(X_train, y_train, X_val=None, y_val=None, params=None):
    """
    Train an XGBoost model for multiclass classification.

    Args:
        X_train: Training features
        y_train: Training labels
        X_val: Validation features (optional)
        y_val: Validation labels (optional)
        params: Keyword arguments for ``xgb.XGBClassifier``; a default
            multiclass configuration is used when omitted.

    Returns:
        Trained XGBoost model. When both ``X_val`` and ``y_val`` are given the
        model is trained with early stopping (10 rounds unless ``params``
        specifies another value).
    """
    print("\nTraining XGBoost model for multiclass classification...")

    # Count number of classes
    num_classes = len(np.unique(y_train))
    print(f"Number of classes: {num_classes}")

    # Set default parameters if not provided
    if params is None:
        params = {
            'objective': 'multi:softprob',
            'num_class': num_classes,
            'eta': 0.1,
            'max_depth': 6,
            'min_child_weight': 1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'tree_method': 'hist',  # For faster training
            'eval_metric': 'mlogloss',
            # NOTE(review): recent XGBoost releases appear to ignore this flag; kept
            # for older installations - confirm against the pinned version.
            'use_label_encoder': False
        }

    # Create XGBoost model
    model = xgb.XGBClassifier(**params)

    # Train with validation if provided
    if X_val is not None and y_val is not None:
        print("Training with validation set...")
        # Passing early_stopping_rounds to fit() was deprecated in XGBoost 1.6 and
        # later removed, so it is configured on the estimator instead
        # (assumes XGBoost >= 1.6). A value supplied via ``params`` wins.
        if model.get_params().get('early_stopping_rounds') is None:
            model.set_params(early_stopping_rounds=10)

        # The validation set is listed last on purpose.
        # NOTE(review): early stopping presumably monitors the last eval_set
        # entry - confirm against the XGBoost docs.
        eval_set = [(X_train, y_train), (X_val, y_val)]
        model.fit(
            X_train, y_train,
            eval_set=eval_set,
            verbose=True
        )
    else:
        print("Training without validation set...")
        model.fit(X_train, y_train)

    return model

def evaluate_model(model, X_test, y_test, label_encoder):
    """
    Evaluate the trained model on the test dataset.

    Prints a metrics table row, timing, summary scores and a classification
    report, and writes 'confusion_matrix_dataset_b.png' and
    'feature_importance_dataset_b.png' to the working directory.

    Args:
        model: Trained classifier exposing ``predict``.
        X_test: Test features.
        y_test: Encoded test labels.
        label_encoder: Encoder whose ``classes_`` maps encoded labels to names.

    Returns:
        Metrics dictionary produced by ``calculate_detailed_metrics``.
    """
    print("\nEvaluating multiclass model...")

    # Print metrics header
    print(f"\n{'Model':<20}{'Accuracy':<12}{'F1 Score':<12}{'False Postive':<12}{'False Negative':<12}{'Detection':<12}")

    # Predict and time
    start_time = time.time()
    y_pred = model.predict(X_test)
    end_time = time.time()

    # Calculate prediction time per sample in microseconds
    prediction_time = (end_time - start_time) * 1000000 / len(X_test)

    # Calculate detailed metrics (also prints the table row under the header)
    detailed_metrics = calculate_detailed_metrics(y_test, y_pred, "XGBoost, Dataset B")

    # Accuracy and weighted F1 were already computed above; only macro F1 is new.
    accuracy = detailed_metrics['Accuracy']
    weighted_f1 = detailed_metrics['F1 Score']
    macro_f1 = f1_score(y_test, y_pred, average='macro')

    print(f"\nPrediction time: {prediction_time:.2f} μs/sample")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Macro F1 Score: {macro_f1:.4f}")
    print(f"Weighted F1 Score: {weighted_f1:.4f}")

    # The report is restricted to classes that actually occur in the test labels.
    unique_test_classes = np.unique(y_test)
    filtered_class_names = [label_encoder.classes_[i] for i in unique_test_classes]

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=unique_test_classes, target_names=filtered_class_names))

    # The confusion matrix spans every label seen in y_test OR y_pred, so its tick
    # names must be derived from that same union; using only the y_test classes
    # mislabels the axes whenever the model predicts a class absent from y_test.
    cm_labels = np.union1d(np.asarray(y_test), np.asarray(y_pred))
    cm_class_names = [label_encoder.classes_[i] for i in cm_labels]
    cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

    # Create confusion matrix plot
    cm_fig = plt.figure(figsize=(16, 14))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=cm_class_names,
                yticklabels=cm_class_names)
    plt.title('Confusion Matrix - Dataset B (Temporal Split)')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig('confusion_matrix_dataset_b.png')
    plt.close(cm_fig)
    print("Confusion matrix saved as 'confusion_matrix_dataset_b.png'")

    # Create feature importance plot. plot_importance opens its own figure unless
    # an axes is supplied, which used to leave an empty 12x10 figure open.
    importance_fig, importance_ax = plt.subplots(figsize=(12, 10))
    xgb.plot_importance(model, max_num_features=20, ax=importance_ax)
    importance_ax.set_title('Feature Importance - Dataset B')
    importance_fig.tight_layout()
    importance_fig.savefig('feature_importance_dataset_b.png')
    plt.close(importance_fig)
    print("Feature importance plot saved as 'feature_importance_dataset_b.png'")

    return detailed_metrics

def main():
    """
    Run the temporal-split experiment end to end.

    Mounts Google Drive when available, loads the day-wise CSV files, fits the
    preprocessing on the training days, trains the XGBoost model, evaluates it on
    the test days (plus a random 10k subset) and saves the model as
    'xgboost_dataset_b_model.json'.
    """
    # First, make sure Google Drive is mounted. This is best effort: outside Colab
    # the import fails and the script continues with whatever path is available.
    # `except Exception` (not a bare except) lets Ctrl-C / SystemExit propagate.
    try:
        from google.colab import drive
        print("Mounting Google Drive...")
        drive.mount('/content/drive')
        print("Google Drive mounted successfully.")
    except Exception:
        print("Not running in Colab or Drive already mounted.")

    base_dir = "/content/drive/My Drive/CICIDS2017_improved"

    # Check if directory exists
    if os.path.exists(base_dir):
        print(f"\nDirectory exists: {base_dir}")
        print("Files in directory:")
        for file in os.listdir(base_dir):
            print(f"  - {file}")
    else:
        print(f"\nDirectory not found: {base_dir}")
        return

    # Load data by days (temporal split)
    train_df, test_df = load_data_by_days(base_dir)

    # Preprocess training data (fits the scaler and the label encoder)
    train_processed, scaler, label_encoder = preprocess_data(train_df, fit_scaler=True, fit_encoder=True)

    # Preprocess test data with the objects fitted on the training days
    test_processed, _, _ = preprocess_data(test_df, scaler=scaler, label_encoder=label_encoder)

    # Split features and target for training data
    X_train = train_processed.drop(columns=['Label_Encoded', 'Label_Original'])
    y_train = train_processed['Label_Encoded'].astype(int)

    # Split features and target for test data
    X_test = test_processed.drop(columns=['Label_Encoded', 'Label_Original'])
    y_test = test_processed['Label_Encoded'].astype(int)

    print(f"\nTraining features shape: {X_train.shape}")
    print(f"Training target shape: {y_train.shape}")
    print(f"Test features shape: {X_test.shape}")
    print(f"Test target shape: {y_test.shape}")

    # Train XGBoost model
    model = train_model(X_train, y_train)

    # Evaluate model (prints the report and writes the plots)
    evaluate_model(model, X_test, y_test, label_encoder)

    # Create a subset of 10k samples for additional evaluation
    if len(X_test) > 10000:
        print("\nCreating 10k sample subset for additional evaluation...")
        idx = np.random.choice(len(X_test), 10000, replace=False)
        X_subset = X_test.iloc[idx]
        y_subset = y_test.iloc[idx]

        print("\nEvaluating on 10k subset:")
        print(f"{'Model':<20}{'Accuracy':<12}{'F1 Score':<12}{'False Postive':<12}{'False Negative':<12}{'Detection':<12}")
        calculate_detailed_metrics(y_subset, model.predict(X_subset), "XGBoost, Dataset B, 10k subsample")

    # Save model
    model.save_model('xgboost_dataset_b_model.json')
    print("\nModel saved as 'xgboost_dataset_b_model.json'")

    print("\nAnalysis complete!")

# Run the temporal-split experiment only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.

Directory exists: /content/drive/My Drive/CICIDS2017_improved
Files in directory:
  - monday.csv
  - tuesday.csv
  - wednesday.csv
  - thursday.csv
  - friday.csv
  - combined_dataset.csv
  - balanced_multiclass_10k.csv
  - train.csv
  - metrics_by_class.png
  - class_metrics.csv
  - prediction_results.csv
  - confusion_matrix.csv
  - confusion_metrics_by_class.png
  - fixed_train.csv
  - fixed_test.csv
  - enhanced_train.csv
  - plots_multiclass
  - multiclass_metrics.csv
Training dataset shape: (1190343, 91)
Testing dataset shape: (909633, 91)

Preprocessing data:
Initial columns: ['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP']... (total: 91)
Label distribution: {'BENIGN': 1005850, 'DoS Hulk': 158468, 'DoS GoldenEye': 7567, 'FTP-Patator': 3972, 'DoS Slowloris': 3859, 'DoS Slowhttptest - Attempte

<Figure size 1200x1000 with 0 Axes>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
import warnings
import os
warnings.filterwarnings('ignore')

def calculate_detailed_metrics(y_true, y_pred, model_name):
    """
    Calculate detailed metrics for model evaluation.

    Prints one table row (model name, accuracy, weighted F1, false positives,
    false negatives, detection) in 12-character columns and returns the same
    values.

    Args:
        y_true: True labels
        y_pred: Predicted labels
        model_name: Name of the model for display

    Returns:
        Dictionary with the keys 'Accuracy', 'F1 Score' (weighted average),
        'False Positives', 'False Negatives' and 'Detection'.
    """
    # Basic metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')

    # Confusion matrix to get FP and FN
    cm = confusion_matrix(y_true, y_pred)
    true_positives = np.diag(cm)

    # Per class, false positives are the column total minus the diagonal and
    # false negatives the row total minus the diagonal; both are summed over
    # all classes.
    false_positives = int((cm.sum(axis=0) - true_positives).sum())
    false_negatives = int((cm.sum(axis=1) - true_positives).sum())

    # Detection rate = TP / (TP + FN) pooled over all classes, i.e. the
    # micro-averaged recall (correct predictions / all samples).
    detection = true_positives.sum() / cm.sum()

    # Print results in tabular format; the numeric columns carry an explicit
    # width so they line up with the 12-character header columns instead of
    # running together.
    print(f"{model_name:<20}{accuracy:<12.8f}{f1:<12.8f}{false_positives:<12}{false_negatives:<12}{detection:.8f}")

    # Return metrics dict
    return {
        'Accuracy': accuracy,
        'F1 Score': f1,
        'False Positives': false_positives,
        'False Negatives': false_negatives,
        'Detection': detection
    }

def preprocess_data(df, scaler=None, label_encoder=None, fit_scaler=False, fit_encoder=False):
    """
    Clean a raw flow table, encode its 'Label' column and min-max scale the features.

    Args:
        df: Raw DataFrame with a 'Label' column; it is copied, not modified.
        scaler: Fitted scaler used when ``fit_scaler`` is False.
        label_encoder: Fitted encoder used when ``fit_encoder`` is False.
        fit_scaler: Fit a new MinMaxScaler on this data.
        fit_encoder: Fit a new LabelEncoder on this data.

    Returns:
        ``(scaled_df, scaler, label_encoder)`` when either fit flag is set,
        otherwise only ``scaled_df``. ``scaled_df`` holds the scaled feature
        columns followed by 'Label_Encoded'.

    Raises:
        ValueError: If the 'Label' column is missing.
    """
    print("\nPreprocessing data:")
    print(f"Initial shape: {df.shape}")

    # Work on a private copy so the caller's frame stays untouched
    working = df.copy()

    label_col = 'Label'
    if label_col not in working.columns:
        raise ValueError(f"Label column '{label_col}' not found in the dataset")

    print(f"Label distribution: {working[label_col].value_counts().to_dict()}")

    raw_labels = working[label_col].copy()

    # Turn attack names into integer codes, fitting a new encoder on request
    if not fit_encoder:
        encoded_labels = label_encoder.transform(raw_labels)
    else:
        label_encoder = LabelEncoder()
        encoded_labels = label_encoder.fit_transform(raw_labels)

        print("\nLabel encoding mapping:")
        for code, name in enumerate(label_encoder.classes_):
            print(f"  {code}: {name}")

    working['Label_Encoded'] = encoded_labels

    # Identifier and bookkeeping columns are not features
    non_feature_cols = (
        'id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port',
        'Protocol', 'Timestamp', 'Label',
    )
    present = [name for name in non_feature_cols if name in working.columns]
    cleaned = working.drop(columns=present, errors='ignore')

    # Coerce every remaining feature column to a numeric dtype
    numeric_cols = []
    for col in cleaned.columns:
        if col == 'Label_Encoded':
            continue
        try:
            cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')
        except Exception as e:
            print(f"Error converting column {col} to numeric: {e}")
            cleaned = cleaned.drop(columns=[col])
        else:
            numeric_cols.append(col)

    print(f"Number of numeric columns after conversion: {len(numeric_cols)}")
    print(f"NaN values before handling: {cleaned[numeric_cols].isna().sum().sum()}")

    # Infinite values are treated as missing from here on
    cleaned = cleaned.replace([np.inf, -np.inf], np.nan)

    # Per column: clip to the 1st..99th percentile, then mean-fill the gaps
    for col in numeric_cols:
        hi = cleaned[col].quantile(0.99)
        lo = cleaned[col].quantile(0.01)
        cleaned[col] = np.clip(cleaned[col], lo, hi)

        if cleaned[col].isna().any():
            cleaned[col] = cleaned[col].fillna(cleaned[col].mean())

    print(f"NaN values after handling: {cleaned[numeric_cols].isna().sum().sum()}")

    feature_frame = cleaned[numeric_cols]
    label_values = cleaned['Label_Encoded'].values

    # Min-max scaling, fitted here or reused from the caller
    if fit_scaler:
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(feature_frame)
    else:
        scaled = scaler.transform(feature_frame)

    # Rebuild a frame with a fresh index and re-attach the labels by position
    scaled_df = pd.DataFrame(scaled, columns=feature_frame.columns)
    scaled_df['Label_Encoded'] = label_values

    print(f"Preprocessed dataset shape: {scaled_df.shape}")
    print(f"Number of unique classes: {scaled_df['Label_Encoded'].nunique()}")

    # The fitted objects are only handed back when something was fitted
    if not (fit_scaler or fit_encoder):
        return scaled_df
    return scaled_df, scaler, label_encoder

def train_and_evaluate_model(X_train, y_train, X_test, y_test, label_encoder):
    """
    Train an XGBoost model and evaluate its performance.

    Prints a metrics table, timing and a classification report, and writes
    'confusion_matrix_balanced_dataset_b.png' and
    'feature_importance_balanced_dataset_b.png' to the working directory.

    Args:
        X_train: Training features.
        y_train: Encoded training labels.
        X_test: Test features.
        y_test: Encoded test labels.
        label_encoder: Encoder whose ``classes_`` maps encoded labels to names.

    Returns:
        Tuple ``(model, detailed_metrics)`` with the fitted classifier and the
        dictionary produced by ``calculate_detailed_metrics``.
    """
    print("\nTraining XGBoost model...")

    # Count number of classes
    num_classes = len(np.unique(y_train))
    print(f"Number of classes: {num_classes}")

    # Set parameters
    params = {
        'objective': 'multi:softprob',
        'num_class': num_classes,
        'eta': 0.1,
        'max_depth': 6,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'tree_method': 'hist',
        'eval_metric': 'mlogloss',
        'use_label_encoder': False
    }

    # Create and train the model
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Evaluate the model
    print("\nEvaluating model...")

    # Predict and time
    start_time = time.time()
    y_pred = model.predict(X_test)
    end_time = time.time()

    # Calculate prediction time per sample (microseconds)
    prediction_time = (end_time - start_time) * 1000000 / len(X_test)

    # Header and metrics row are printed back to back so the table stays intact;
    # the timing line follows the table instead of splitting it.
    print(f"\n{'Model':<20}{'Accuracy':<12}{'F1 Score':<12}{'False Postive':<12}{'False Negative':<12}{'Detection':<12}")
    detailed_metrics = calculate_detailed_metrics(y_test, y_pred, "XGBoost, Dataset B")
    print(f"Prediction time: {prediction_time:.2f} μs/sample")

    # Generate classification report for the classes present in the test labels
    unique_test_classes = np.unique(y_test)
    class_names = [label_encoder.classes_[i] for i in unique_test_classes]

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=unique_test_classes, target_names=class_names))

    # The confusion matrix spans every label seen in y_test OR y_pred, so its tick
    # names must come from that same union; otherwise the axes are mislabelled
    # whenever the model predicts a class that is absent from y_test.
    cm_labels = np.union1d(np.asarray(y_test), np.asarray(y_pred))
    cm_class_names = [label_encoder.classes_[i] for i in cm_labels]
    cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

    # Create confusion matrix visualization
    cm_fig = plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=cm_class_names,
                yticklabels=cm_class_names)
    plt.title('Confusion Matrix - Dataset B')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig('confusion_matrix_balanced_dataset_b.png')
    plt.close(cm_fig)
    print("Confusion matrix saved as 'confusion_matrix_balanced_dataset_b.png'")

    # Create feature importance plot. plot_importance opens its own figure unless
    # an axes is supplied, which used to leave an empty 12x10 figure open.
    importance_fig, importance_ax = plt.subplots(figsize=(12, 10))
    xgb.plot_importance(model, max_num_features=20, ax=importance_ax)
    importance_ax.set_title('Feature Importance - Dataset B')
    importance_fig.tight_layout()
    importance_fig.savefig('feature_importance_balanced_dataset_b.png')
    plt.close(importance_fig)
    print("Feature importance plot saved as 'feature_importance_balanced_dataset_b.png'")

    return model, detailed_metrics

def main():
    """
    Run the balanced-10k experiment end to end.

    Mounts Google Drive when available, locates a CSV whose name contains '10k'
    (prompting for a path when none is found), splits it 70/30, preprocesses both
    parts, trains and evaluates the XGBoost model and saves it as
    'xgboost_balanced_dataset_b.json'.
    """
    # Mount Google Drive if running in Colab. Best effort: outside Colab the
    # import fails and the script simply continues. `except Exception` (not a
    # bare except) lets Ctrl-C / SystemExit propagate.
    try:
        from google.colab import drive
        print("Mounting Google Drive...")
        drive.mount('/content/drive')
        print("Google Drive mounted successfully.")
    except Exception:
        print("Not running in Colab or Drive already mounted.")

    # Set base directory
    base_dir = "/content/drive/My Drive/CICIDS2017_improved"

    # Search for the file in the same directory
    print(f"\nSearching for balanced_10k_test.csv in {base_dir}...")

    if os.path.exists(base_dir):
        # List the directory once and reuse the listing for display and search
        dir_entries = os.listdir(base_dir)
        print("Files in directory:")
        for file in dir_entries:
            print(f"  - {file}")

        # Look for files containing "10k" in the name
        balanced_files = [f for f in dir_entries if "10k" in f.lower()]
        if balanced_files:
            print("\nPossible balanced dataset files found:")
            for file in balanced_files:
                print(f"  - {file}")

            # Use the first matching file (in directory listing order)
            balanced_filename = balanced_files[0]
            balanced_test_path = os.path.join(base_dir, balanced_filename)
            print(f"\nUsing file: {balanced_filename}")
        else:
            # If no files with "10k" found, prompt for path
            print("\nNo files with '10k' in the name found.")
            balanced_test_path = input("Please enter the full path to balanced_10k_test.csv: ")
    else:
        print(f"Directory not found: {base_dir}")
        balanced_test_path = input("Please enter the full path to balanced_10k_test.csv: ")

    # Load the dataset
    print(f"Loading balanced 10k test dataset from {balanced_test_path}...")
    try:
        df = pd.read_csv(balanced_test_path)
        print(f"Dataset loaded successfully. Shape: {df.shape}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    # Split data into training and testing sets (fixed seed for repeatability)
    print("\nSplitting dataset into 70% training and 30% testing...")
    train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
    print(f"Training set: {len(train_df)} samples")
    print(f"Test set: {len(test_df)} samples")

    # Preprocess training data (fits the scaler and the label encoder)
    train_processed, scaler, label_encoder = preprocess_data(
        train_df, fit_scaler=True, fit_encoder=True
    )

    # Preprocess test data; with both fit flags off only the frame is returned
    test_processed = preprocess_data(
        test_df, scaler=scaler, label_encoder=label_encoder,
        fit_scaler=False, fit_encoder=False
    )

    # Prepare features and labels
    X_train = train_processed.drop('Label_Encoded', axis=1)
    y_train = train_processed['Label_Encoded']
    X_test = test_processed.drop('Label_Encoded', axis=1)
    y_test = test_processed['Label_Encoded']

    print(f"\nTraining features shape: {X_train.shape}")
    print(f"Training target shape: {y_train.shape}")
    print(f"Test features shape: {X_test.shape}")
    print(f"Test target shape: {y_test.shape}")

    # Train and evaluate model (the metrics are printed by the callee)
    model, _ = train_and_evaluate_model(X_train, y_train, X_test, y_test, label_encoder)

    # Save model
    model.save_model('xgboost_balanced_dataset_b.json')
    print("\nModel saved as 'xgboost_balanced_dataset_b.json'")

    print("\nAnalysis complete!")

# Run the balanced-10k experiment only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.

Searching for balanced_10k_test.csv in /content/drive/My Drive/CICIDS2017_improved...
Files in directory:
  - monday.csv
  - tuesday.csv
  - wednesday.csv
  - thursday.csv
  - friday.csv
  - combined_dataset.csv
  - balanced_multiclass_10k.csv
  - train.csv
  - metrics_by_class.png
  - class_metrics.csv
  - prediction_results.csv
  - confusion_matrix.csv
  - confusion_metrics_by_class.png
  - fixed_train.csv
  - fixed_test.csv
  - enhanced_train.csv
  - plots_multiclass
  - multiclass_metrics.csv

Possible balanced dataset files found:
  - balanced_multiclass_10k.csv

Using file: balanced_multiclass_10k.csv
Loading balanced 10k test dataset from /content/drive/My Drive/CICIDS2017_improved/balanced_multiclass_10k.csv...
Dataset loaded successfully. Shape: (10000, 91)

Splitting dataset into 70% train

<Figure size 1200x1000 with 0 Axes>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb
import warnings
import os
warnings.filterwarnings('ignore')

def calculate_detailed_metrics(y_true, y_pred, model_name):
    """
    Calculate detailed metrics for model evaluation.

    Prints one table row (model name, accuracy, weighted F1, false positives,
    false negatives, detection) in 12-character columns and returns the same
    values.

    Args:
        y_true: True labels
        y_pred: Predicted labels
        model_name: Name of the model for display

    Returns:
        Dictionary with the keys 'Accuracy', 'F1 Score' (weighted average),
        'False Positives', 'False Negatives' and 'Detection'.
    """
    # Basic metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')

    # Confusion matrix to get FP and FN
    cm = confusion_matrix(y_true, y_pred)
    true_positives = np.diag(cm)

    # Per class, false positives are the column total minus the diagonal and
    # false negatives the row total minus the diagonal; both are summed over
    # all classes.
    false_positives = int((cm.sum(axis=0) - true_positives).sum())
    false_negatives = int((cm.sum(axis=1) - true_positives).sum())

    # Detection rate = TP / (TP + FN) pooled over all classes, i.e. the
    # micro-averaged recall (correct predictions / all samples).
    detection = true_positives.sum() / cm.sum()

    # Print results in tabular format; the numeric columns carry an explicit
    # width so they line up with the 12-character header columns instead of
    # running together.
    print(f"{model_name:<20}{accuracy:<12.8f}{f1:<12.8f}{false_positives:<12}{false_negatives:<12}{detection:.8f}")

    # Return metrics dict
    return {
        'Accuracy': accuracy,
        'F1 Score': f1,
        'False Positives': false_positives,
        'False Negatives': false_negatives,
        'Detection': detection
    }

def preprocess_data(df, scaler=None, label_encoder=None, fit_scaler=False, fit_encoder=False):
    """
    Clean a raw flow table, encode its 'Label' column and min-max scale the features.

    Args:
        df: Raw DataFrame with a 'Label' column; it is copied, not modified.
        scaler: Fitted scaler used when ``fit_scaler`` is False.
        label_encoder: Fitted encoder used when ``fit_encoder`` is False.
        fit_scaler: Fit a new MinMaxScaler on this data.
        fit_encoder: Fit a new LabelEncoder on this data.

    Returns:
        ``(scaled_df, scaler, label_encoder)`` when either fit flag is set,
        otherwise only ``scaled_df``. ``scaled_df`` holds the scaled feature
        columns followed by 'Label_Encoded'.

    Raises:
        ValueError: If the 'Label' column is missing.
    """
    print("\nPreprocessing data:")
    print(f"Initial shape: {df.shape}")

    # Work on a private copy so the caller's frame stays untouched
    working = df.copy()

    label_col = 'Label'
    if label_col not in working.columns:
        raise ValueError(f"Label column '{label_col}' not found in the dataset")

    print(f"Label distribution: {working[label_col].value_counts().to_dict()}")

    raw_labels = working[label_col].copy()

    # Turn attack names into integer codes, fitting a new encoder on request
    if not fit_encoder:
        encoded_labels = label_encoder.transform(raw_labels)
    else:
        label_encoder = LabelEncoder()
        encoded_labels = label_encoder.fit_transform(raw_labels)

        print("\nLabel encoding mapping:")
        for code, name in enumerate(label_encoder.classes_):
            print(f"  {code}: {name}")

    working['Label_Encoded'] = encoded_labels

    # Identifier and bookkeeping columns are not features
    non_feature_cols = (
        'id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port',
        'Protocol', 'Timestamp', 'Label',
    )
    present = [name for name in non_feature_cols if name in working.columns]
    cleaned = working.drop(columns=present, errors='ignore')

    # Coerce every remaining feature column to a numeric dtype
    numeric_cols = []
    for col in cleaned.columns:
        if col == 'Label_Encoded':
            continue
        try:
            cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')
        except Exception as e:
            print(f"Error converting column {col} to numeric: {e}")
            cleaned = cleaned.drop(columns=[col])
        else:
            numeric_cols.append(col)

    print(f"Number of numeric columns after conversion: {len(numeric_cols)}")
    print(f"NaN values before handling: {cleaned[numeric_cols].isna().sum().sum()}")

    # Infinite values are treated as missing from here on
    cleaned = cleaned.replace([np.inf, -np.inf], np.nan)

    # Per column: clip to the 1st..99th percentile, then mean-fill the gaps
    for col in numeric_cols:
        hi = cleaned[col].quantile(0.99)
        lo = cleaned[col].quantile(0.01)
        cleaned[col] = np.clip(cleaned[col], lo, hi)

        if cleaned[col].isna().any():
            cleaned[col] = cleaned[col].fillna(cleaned[col].mean())

    print(f"NaN values after handling: {cleaned[numeric_cols].isna().sum().sum()}")

    feature_frame = cleaned[numeric_cols]
    label_values = cleaned['Label_Encoded'].values

    # Min-max scaling, fitted here or reused from the caller
    if fit_scaler:
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(feature_frame)
    else:
        scaled = scaler.transform(feature_frame)

    # Rebuild a frame with a fresh index and re-attach the labels by position
    scaled_df = pd.DataFrame(scaled, columns=feature_frame.columns)
    scaled_df['Label_Encoded'] = label_values

    print(f"Preprocessed dataset shape: {scaled_df.shape}")
    print(f"Number of unique classes: {scaled_df['Label_Encoded'].nunique()}")

    # The fitted objects are only handed back when something was fitted
    if not (fit_scaler or fit_encoder):
        return scaled_df
    return scaled_df, scaler, label_encoder

def train_and_evaluate_model(X_train, y_train, X_test, y_test, label_encoder):
    """
    Train an XGBoost model and evaluate its performance.

    Prints a metrics table, timing and a classification report, and writes
    'confusion_matrix_dataset_a_10k.png' and
    'feature_importance_dataset_a_10k.png' to the working directory.

    Args:
        X_train: Training features.
        y_train: Encoded training labels.
        X_test: Test features.
        y_test: Encoded test labels.
        label_encoder: Encoder whose ``classes_`` maps encoded labels to names.

    Returns:
        Tuple ``(model, detailed_metrics)`` with the fitted classifier and the
        dictionary produced by ``calculate_detailed_metrics``.
    """
    print("\nTraining XGBoost model...")

    # Count number of classes
    num_classes = len(np.unique(y_train))
    print(f"Number of classes: {num_classes}")

    # Set parameters
    params = {
        'objective': 'multi:softprob',
        'num_class': num_classes,
        'eta': 0.1,
        'max_depth': 6,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'tree_method': 'hist',
        'eval_metric': 'mlogloss',
        'use_label_encoder': False
    }

    # Create and train the model
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Evaluate the model
    print("\nEvaluating model...")

    # Predict and time
    start_time = time.time()
    y_pred = model.predict(X_test)
    end_time = time.time()

    # Calculate prediction time per sample (microseconds)
    prediction_time = (end_time - start_time) * 1000000 / len(X_test)

    # Header and metrics row are printed back to back so the table stays intact;
    # the timing line follows the table instead of splitting it.
    print(f"\n{'Model':<20}{'Accuracy':<12}{'F1 Score':<12}{'False Postive':<12}{'False Negative':<12}{'Detection':<12}")
    detailed_metrics = calculate_detailed_metrics(y_test, y_pred, "XGBoost, Dataset A, 10k")
    print(f"Prediction time: {prediction_time:.2f} μs/sample")

    # Generate classification report for the classes present in the test labels
    unique_test_classes = np.unique(y_test)
    class_names = [label_encoder.classes_[i] for i in unique_test_classes]

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=unique_test_classes, target_names=class_names))

    # The confusion matrix spans every label seen in y_test OR y_pred, so its tick
    # names must come from that same union; otherwise the axes are mislabelled
    # whenever the model predicts a class that is absent from y_test.
    cm_labels = np.union1d(np.asarray(y_test), np.asarray(y_pred))
    cm_class_names = [label_encoder.classes_[i] for i in cm_labels]
    cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

    # Create confusion matrix visualization
    cm_fig = plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=cm_class_names,
                yticklabels=cm_class_names)
    plt.title('Confusion Matrix - Dataset A, 10k')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig('confusion_matrix_dataset_a_10k.png')
    plt.close(cm_fig)
    print("Confusion matrix saved as 'confusion_matrix_dataset_a_10k.png'")

    # Create feature importance plot. plot_importance opens its own figure unless
    # an axes is supplied, which used to leave an empty 12x10 figure open.
    importance_fig, importance_ax = plt.subplots(figsize=(12, 10))
    xgb.plot_importance(model, max_num_features=20, ax=importance_ax)
    importance_ax.set_title('Feature Importance - Dataset A, 10k')
    importance_fig.tight_layout()
    importance_fig.savefig('feature_importance_dataset_a_10k.png')
    plt.close(importance_fig)
    print("Feature importance plot saved as 'feature_importance_dataset_a_10k.png'")

    return model, detailed_metrics

def main():
    """
    Run the random-split experiment on Dataset A (10k samples).

    Steps: optionally mount Google Drive, locate and load the CSV, split it
    70/30 with a fixed random seed, preprocess both partitions (scaler and
    label encoder are fitted on the training partition only), train and
    evaluate the model, then save it as 'xgboost_dataset_a_10k.json'.

    Returns early (None) if the dataset cannot be read.
    """
    # Local imports keep this entry point self-contained; they are harmless
    # if the same names are already imported at module level.
    import os
    from sklearn.model_selection import train_test_split

    # Mount Google Drive if running in Colab
    try:
        from google.colab import drive
        print("Mounting Google Drive...")
        drive.mount('/content/drive')
        print("Google Drive mounted successfully.")
    except Exception:
        # Best effort only. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        print("Not running in Colab or Drive already mounted.")

    # Set base directory
    base_dir = "/content/drive/My Drive/CICIDS2017_improved"

    # Specify the dataset file
    dataset_name = "dataset_A_10k.csv"
    dataset_path = os.path.join(base_dir, dataset_name)

    # If the file is missing, list what is there and ask for a path instead
    if not os.path.exists(dataset_path):
        print(f"File not found: {dataset_path}")
        print("Files in directory:")
        if os.path.exists(base_dir):
            for file in os.listdir(base_dir):
                print(f"  - {file}")
        dataset_path = input(f"Please enter the full path to {dataset_name}: ")

    # Load the dataset
    print(f"Loading dataset from {dataset_path}...")
    try:
        df = pd.read_csv(dataset_path)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return
    print(f"Dataset loaded successfully. Shape: {df.shape}")

    # Split data into training and testing sets
    print("\nSplitting dataset into 70% training and 30% testing...")
    train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
    print(f"Training set: {len(train_df)} samples")
    print(f"Test set: {len(test_df)} samples")

    # Preprocess training data (fits the scaler and the label encoder)
    train_processed, scaler, label_encoder = preprocess_data(
        train_df, fit_scaler=True, fit_encoder=True
    )

    # Preprocess test data with the transformers fitted on the training data
    test_processed = preprocess_data(
        test_df, scaler=scaler, label_encoder=label_encoder,
        fit_scaler=False, fit_encoder=False
    )

    # Prepare features and labels
    X_train = train_processed.drop('Label_Encoded', axis=1)
    y_train = train_processed['Label_Encoded']
    X_test = test_processed.drop('Label_Encoded', axis=1)
    y_test = test_processed['Label_Encoded']

    print(f"\nTraining features shape: {X_train.shape}")
    print(f"Training target shape: {y_train.shape}")
    print(f"Test features shape: {X_test.shape}")
    print(f"Test target shape: {y_test.shape}")

    # Train and evaluate model
    model, metrics = train_and_evaluate_model(X_train, y_train, X_test, y_test, label_encoder)

    # Save model
    model.save_model('xgboost_dataset_a_10k.json')
    print("\nModel saved as 'xgboost_dataset_a_10k.json'")

    print("\nAnalysis complete!")

# Run the experiment only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.
Loading dataset from /content/drive/My Drive/CICIDS2017_improved/dataset_A_10k.csv...
Dataset loaded successfully. Shape: (10000, 91)

Splitting dataset into 70% training and 30% testing...
Training set: 7000 samples
Test set: 3000 samples

Preprocessing data:
Initial shape: (7000, 91)
Label distribution: {'BENIGN': 3461, 'Portscan': 1116, 'DoS Hulk': 1067, 'DDoS': 654, 'Infiltration - Portscan': 492, 'DoS GoldenEye': 55, 'DoS Slowloris': 28, 'Botnet - Attempted': 26, 'FTP-Patator': 25, 'SSH-Patator': 19, 'DoS Slowhttptest - Attempted': 18, 'DoS Slowhttptest': 12, 'Web Attack - Brute Force - Attempted': 10, 'DoS Slowloris - Attempted': 5, 'Web Attack - XSS - Attempted': 5, 'Web Attack - Brute Force': 3, 'Botnet': 2, 'DoS Hulk - Attempted': 1, 'Infiltration': 1}

Label encoding mapping:
  0: BENIGN
  1: Botnet
  2: Botnet - Attempted
  3: DDoS
  4: DoS GoldenEye
  5: DoS Hulk
  6: DoS Hulk - Attempted


<Figure size 1200x1000 with 0 Axes>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import xgboost as xgb
import warnings
import os
warnings.filterwarnings('ignore')

def calculate_detailed_metrics(y_true, y_pred, model_name):
    """
    Calculate summary metrics for a multi-class prediction and print one table row.

    Args:
        y_true: True labels
        y_pred: Predicted labels
        model_name: Name printed in the first column of the row

    Returns:
        Dictionary with the keys 'Accuracy', 'F1 Score', 'False Positives',
        'False Negatives' and 'Detection'.
    """
    # Basic metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')

    # Confusion matrix (rows: actual class, columns: predicted class)
    cm = confusion_matrix(y_true, y_pred)
    correct = np.diag(cm)

    # Per class: FP = column total - diagonal, FN = row total - diagonal.
    # Both are summed over all classes, so each total counts every
    # misclassified sample exactly once.
    false_positives = (cm.sum(axis=0) - correct).sum()
    false_negatives = (cm.sum(axis=1) - correct).sum()

    # "Detection" is computed as correctly classified samples / all samples.
    detection = correct.sum() / cm.sum()

    # Print results as one table row; every column carries an explicit width
    # so consecutive values no longer run into each other.
    print(
        f"{model_name:<35}{accuracy:<12.8f}{f1:<12.8f}"
        f"{false_positives:<12}{false_negatives:<12}{detection:<12.8f}"
    )

    return {
        'Accuracy': accuracy,
        'F1 Score': f1,
        'False Positives': false_positives,
        'False Negatives': false_negatives,
        'Detection': detection
    }

def load_data(file_path):
    """
    Read a CSV file of network flow data into a DataFrame.

    Args:
        file_path: Path of the CSV file to read

    Returns:
        The loaded DataFrame, or None if reading failed (the error is printed).
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        print(f"Error loading dataset: {err}")
        return None
    else:
        print(f"Loaded dataset with shape: {frame.shape}")
        return frame

def load_combined_data(base_dir, days=None):
    """
    Load and combine data from multiple day files.

    Args:
        base_dir: Base directory for data files
        days: List of day files to load (e.g., ['monday.csv', 'tuesday.csv'])
              If None, load all 5 days

    Returns:
        Combined DataFrame with a fresh 0..n-1 index, or None when none of the
        requested files exists (files that are missing are reported and skipped).
    """
    if days is None:
        days = ["monday.csv", "tuesday.csv", "wednesday.csv", "thursday.csv", "friday.csv"]

    print(f"Loading data from days: {days}")

    combined_df_list = []
    for day in days:
        file_path = os.path.join(base_dir, day)
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            print(f"  - {day}: {df.shape[0]} samples")
            combined_df_list.append(df)
        else:
            print(f"  - Warning: {day} not found")

    # pd.concat raises ValueError on an empty list; report the problem and
    # return None instead so callers can test the result like load_data()'s.
    if not combined_df_list:
        print(f"Error loading dataset: none of the requested files were found in {base_dir}")
        return None

    # Combine all dataframes
    combined_df = pd.concat(combined_df_list, ignore_index=True)
    print(f"Combined dataset shape: {combined_df.shape}")

    return combined_df

def preprocess_data(df, scaler=None, label_encoder=None, fit_scaler=False, fit_encoder=False, handle_unknown=False):
    """
    Preprocess the dataset by cleaning, encoding labels, and normalizing features.

    Args:
        df: Input DataFrame; must contain a 'Label' column. It is not modified.
        scaler: Fitted scaler to apply; required when fit_scaler is False
        label_encoder: Fitted LabelEncoder to apply; required when fit_encoder is False
        fit_scaler: If True, fit a new MinMaxScaler on this data
        fit_encoder: If True, fit a new LabelEncoder on this data
        handle_unknown: Only used when fit_encoder is False. Labels the encoder
            has not seen are mapped to 'UNKNOWN'; in that case 'UNKNOWN' is
            appended to the passed encoder's classes_ (the encoder object is
            mutated in place).

    Returns:
        (scaled_df, scaler, label_encoder) if fit_scaler or fit_encoder is True,
        otherwise only scaled_df. scaled_df holds the scaled feature columns
        plus a 'Label_Encoded' column.

    Raises:
        ValueError: If the 'Label' column is missing, or if a scaler / label
            encoder is needed but was not supplied.
    """
    # Fail early with a clear message instead of an AttributeError on None later
    if not fit_encoder and label_encoder is None:
        raise ValueError("label_encoder must be provided when fit_encoder is False")
    if not fit_scaler and scaler is None:
        raise ValueError("scaler must be provided when fit_scaler is False")

    print(f"\nPreprocessing dataset with shape: {df.shape}")

    # Make a copy to avoid modifying the original
    df_processed = df.copy()

    # Identify the label column
    label_col = 'Label'
    if label_col not in df_processed.columns:
        raise ValueError(f"Label column '{label_col}' not found in the dataset")

    # Check label distribution
    label_counts = df_processed[label_col].value_counts()
    print(f"Label distribution: {len(label_counts)} unique classes")
    print(f"Top 5 classes: {dict(label_counts.head(5))}")

    # Keep the original label
    original_label = df_processed[label_col].copy()

    # Label encode the attack types
    if fit_encoder:
        label_encoder = LabelEncoder()
        encoded_labels = label_encoder.fit_transform(original_label)

        # Display mapping
        print("\nLabel encoding mapping:")
        for i, label in enumerate(label_encoder.classes_):
            print(f"  {i}: {label}")
    else:
        if handle_unknown:
            # Handle labels the encoder was not fitted on
            known_labels = set(label_encoder.classes_)
            unknown_labels = set(original_label) - known_labels
            if unknown_labels:
                print(f"Found {len(unknown_labels)} unknown labels in test data")
                # Map unknown labels to 'UNKNOWN' (set lookup: one hash probe
                # per row instead of scanning the classes array)
                df_processed[label_col] = df_processed[label_col].apply(
                    lambda x: x if x in known_labels else 'UNKNOWN'
                )

                # Update original_label
                original_label = df_processed[label_col].copy()

                # Add 'UNKNOWN' to classes if not already present
                if 'UNKNOWN' not in known_labels:
                    label_encoder.classes_ = np.append(label_encoder.classes_, 'UNKNOWN')

        try:
            encoded_labels = label_encoder.transform(original_label)
        except Exception as e:
            print(f"Error transforming labels: {e}")
            raise

    # Add encoded labels column
    df_processed['Label_Encoded'] = encoded_labels

    # Remove identifiers and other non-feature columns
    columns_to_drop = [
        'id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port',
        'Protocol', 'Timestamp', 'Label'
    ]

    # Only drop columns that exist in the dataframe
    columns_to_drop = [col for col in columns_to_drop if col in df_processed.columns]
    df_cleaned = df_processed.drop(columns=columns_to_drop, errors='ignore')

    # Convert all columns to numeric except Label_Encoded
    # (values that cannot be parsed become NaN)
    numeric_cols = []
    for col in df_cleaned.columns:
        if col != 'Label_Encoded':
            try:
                df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
                numeric_cols.append(col)
            except Exception as e:
                print(f"Error converting column {col} to numeric: {e}")
                df_cleaned = df_cleaned.drop(columns=[col])

    # Handle NaN values (this count is taken before infinities are converted)
    print(f"NaN values before handling: {df_cleaned[numeric_cols].isna().sum().sum()}")

    # Replace infinity values with NaN
    df_cleaned = df_cleaned.replace([np.inf, -np.inf], np.nan)

    # Fill NaN values with column means; the mean is taken from the frame
    # being processed, not from the data the scaler was fitted on
    for col in numeric_cols:
        if df_cleaned[col].isna().any():
            col_mean = df_cleaned[col].mean()
            df_cleaned[col] = df_cleaned[col].fillna(col_mean)

    print(f"NaN values after handling: {df_cleaned[numeric_cols].isna().sum().sum()}")

    # Apply scaling
    features = df_cleaned[numeric_cols]
    labels = df_cleaned['Label_Encoded']

    if fit_scaler:
        scaler = MinMaxScaler()
        scaled_features = scaler.fit_transform(features)
    else:
        scaled_features = scaler.transform(features)

    # Create a new dataframe with scaled features (index restarts at 0)
    scaled_df = pd.DataFrame(scaled_features, columns=features.columns)

    # Add back the encoded label column; .values aligns by position, not index
    scaled_df['Label_Encoded'] = labels.values

    print(f"Preprocessed dataset shape: {scaled_df.shape}")
    print(f"Number of unique classes: {scaled_df['Label_Encoded'].nunique()}")

    if fit_scaler or fit_encoder:
        return scaled_df, scaler, label_encoder
    else:
        return scaled_df

def train_model(X_train, y_train, num_classes):
    """
    Fit an XGBoost multi-class classifier on the training data.

    Args:
        X_train: Training features
        y_train: Training labels
        num_classes: Value passed to the classifier as 'num_class'

    Returns:
        The fitted xgb.XGBClassifier
    """
    print("\nTraining XGBoost model...")
    print(f"Training data shape: {X_train.shape}")
    print(f"Number of classes: {num_classes}")

    # Hyper-parameters are passed directly as keyword arguments
    classifier = xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=num_classes,
        eta=0.1,
        max_depth=6,
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method='hist',
        eval_metric='mlogloss',
        use_label_encoder=False,
    )
    classifier.fit(X_train, y_train)

    return classifier

def evaluate_model(model, X_test, y_test, label_encoder, model_name):
    """
    Evaluate model performance on test data.

    Prints the per-sample prediction time, one metrics row and a
    classification report, and saves a confusion-matrix heatmap as
    'confusion_matrix_<sanitised model name>.png'.

    Args:
        model: Fitted classifier exposing predict()
        X_test: Test features
        y_test: Encoded true labels
        label_encoder: Encoder whose classes_ maps encoded labels to names
        model_name: Name used in printed output, plot title and file name

    Returns:
        Metrics dictionary produced by calculate_detailed_metrics
    """
    print(f"\nEvaluating model: {model_name}")
    print(f"Test data shape: {X_test.shape}")

    # Measure prediction time
    start_time = time.time()
    y_pred = model.predict(X_test)
    end_time = time.time()

    # Calculate prediction time (microseconds per sample)
    prediction_time = (end_time - start_time) * 1000000 / len(X_test)
    print(f"Prediction time: {prediction_time:.2f} μs/sample")

    # Calculate detailed metrics
    metrics = calculate_detailed_metrics(y_test, y_pred, model_name)

    # Classification report, restricted to the classes present in y_test
    unique_classes = np.unique(y_test)
    class_names = [label_encoder.classes_[i] for i in unique_classes]

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=unique_classes, target_names=class_names))

    # The confusion matrix covers every class that occurs in y_test OR y_pred,
    # so the axis labels must be built from that same union. Using only the
    # y_test classes mislabels the axes whenever the model predicts a class
    # that is absent from y_test.
    cm_labels = np.union1d(unique_classes, np.unique(y_pred))
    cm_names = [label_encoder.classes_[i] for i in cm_labels]
    cm = confusion_matrix(y_test, y_pred, labels=cm_labels)

    # Create confusion matrix visualization
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=cm_names,
                yticklabels=cm_names)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()

    # Create safe filename
    filename = model_name.replace('[', '').replace(']', '').replace(' ', '_').lower()
    plt.savefig(f'confusion_matrix_{filename}.png')
    plt.close()
    print(f"Confusion matrix saved as 'confusion_matrix_{filename}.png'")

    return metrics

def main():
    """
    Run three train/test scenarios and print a summary table.

    Scenario 1: train on all 5 day files, test on Dataset A (10k).
    Scenario 2: reuse the Scenario 1 model, scaler and encoder on Dataset B (10k).
    Scenario 3: train on the first 3 day files, test on Dataset B (10k).

    A scenario is skipped when the data (or, for Scenario 2, the Scenario 1
    model) it needs is unavailable. Trained models are saved as
    'xgboost_5days.json' and 'xgboost_3days.json'.
    """
    # Mount Google Drive if running in Colab
    try:
        from google.colab import drive
        print("Mounting Google Drive...")
        drive.mount('/content/drive')
        print("Google Drive mounted successfully.")
    except Exception:
        # Best effort only. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        print("Not running in Colab or Drive already mounted.")

    # Base directory
    base_dir = "/content/drive/My Drive/CICIDS2017_improved"

    # Define all test files
    dataset_a_path = os.path.join(base_dir, "dataset_A_10k.csv")
    dataset_b_path = os.path.join(base_dir, "balanced_10k_test.csv")

    # Check if files exist
    for path in [dataset_a_path, dataset_b_path]:
        if not os.path.exists(path):
            print(f"File not found: {path}")

    # Print files in directory if needed
    print("\nFiles in directory:")
    if os.path.exists(base_dir):
        files = os.listdir(base_dir)
        for file in files:
            print(f"  - {file}")

    # Table layout shared by the banner and the final summary. The columns are
    # wide enough for the longest title and for the scenario names used below.
    name_width, col_width = 42, 16
    header = (
        f"{'Model':<{name_width}}{'Accuracy':<{col_width}}{'F1 Score':<{col_width}}"
        f"{'False Positive':<{col_width}}{'False Negative':<{col_width}}{'Detection':<{col_width}}"
    )
    rule = "=" * len(header)

    # Print metrics header
    print("\n" + rule)
    print(header)
    print(rule)

    # (scenario name, metrics dict) for every scenario that was evaluated
    results = []

    # Artefacts of Scenario 1 that Scenario 2 reuses; they stay None when
    # Scenario 1 is skipped.
    scaler = None
    label_encoder = None
    model_5days = None

    # Scenario 1: Train on 5 days, test on Dataset A 10k
    print("\n\nScenario 1: Train on 5 days, test on Dataset A 10k")
    print("-" * 80)

    # Load training data (all 5 days)
    train_df_5days = load_combined_data(base_dir)

    # Load testing data (Dataset A 10k)
    test_a_df = load_data(dataset_a_path)

    if train_df_5days is not None and test_a_df is not None:
        # Preprocess training data
        train_processed, scaler, label_encoder = preprocess_data(
            train_df_5days, fit_scaler=True, fit_encoder=True
        )

        # Preprocess test data
        test_a_processed = preprocess_data(
            test_a_df, scaler=scaler, label_encoder=label_encoder,
            fit_scaler=False, fit_encoder=False, handle_unknown=True
        )

        # Prepare features and labels
        X_train = train_processed.drop('Label_Encoded', axis=1)
        y_train = train_processed['Label_Encoded']
        X_test = test_a_processed.drop('Label_Encoded', axis=1)
        y_test = test_a_processed['Label_Encoded']

        # Train model
        model_5days = train_model(X_train, y_train, len(label_encoder.classes_))

        # Evaluate on Dataset A
        scenario_name = "XGBoost [Trained on 5 days] - Dataset A"
        metrics_a = evaluate_model(
            model_5days, X_test, y_test, label_encoder, scenario_name
        )
        results.append((scenario_name, metrics_a))

        # Save model
        model_5days.save_model('xgboost_5days.json')
        print("Model saved as 'xgboost_5days.json'")

    # Scenario 2: Train on 5 days, test on Dataset B 10k
    print("\n\nScenario 2: Train on 5 days, test on Dataset B 10k")
    print("-" * 80)

    # Load testing data (Dataset B 10k)
    test_b_df = load_data(dataset_b_path)

    if model_5days is None:
        # Without Scenario 1 there is no model, scaler or encoder to reuse
        print("Skipping Scenario 2: the 5-day model from Scenario 1 is not available.")
    elif test_b_df is not None:
        # Use the same model and preprocessing from Scenario 1

        # Preprocess test data
        test_b_processed = preprocess_data(
            test_b_df, scaler=scaler, label_encoder=label_encoder,
            fit_scaler=False, fit_encoder=False, handle_unknown=True
        )

        # Prepare features and labels
        X_test = test_b_processed.drop('Label_Encoded', axis=1)
        y_test = test_b_processed['Label_Encoded']

        # Evaluate on Dataset B
        scenario_name = "XGBoost [Trained on 5 days] - Dataset B"
        metrics_b_5days = evaluate_model(
            model_5days, X_test, y_test, label_encoder, scenario_name
        )
        results.append((scenario_name, metrics_b_5days))

    # Scenario 3: Train on 3 days, test on Dataset B 10k
    print("\n\nScenario 3: Train on 3 days, test on Dataset B 10k")
    print("-" * 80)

    # Load training data (first 3 days)
    train_df_3days = load_combined_data(
        base_dir,
        days=["monday.csv", "tuesday.csv", "wednesday.csv"]
    )

    if train_df_3days is not None and test_b_df is not None:
        # Preprocess training data
        train_processed, scaler_3days, label_encoder_3days = preprocess_data(
            train_df_3days, fit_scaler=True, fit_encoder=True
        )

        # Preprocess test data
        test_b_processed = preprocess_data(
            test_b_df, scaler=scaler_3days, label_encoder=label_encoder_3days,
            fit_scaler=False, fit_encoder=False, handle_unknown=True
        )

        # Prepare features and labels
        X_train = train_processed.drop('Label_Encoded', axis=1)
        y_train = train_processed['Label_Encoded']
        X_test = test_b_processed.drop('Label_Encoded', axis=1)
        y_test = test_b_processed['Label_Encoded']

        # Train model
        model_3days = train_model(X_train, y_train, len(label_encoder_3days.classes_))

        # Evaluate on Dataset B
        scenario_name = "XGBoost [Trained on 3 days] - Dataset B"
        metrics_b_3days = evaluate_model(
            model_3days, X_test, y_test, label_encoder_3days, scenario_name
        )
        results.append((scenario_name, metrics_b_3days))

        # Save model
        model_3days.save_model('xgboost_3days.json')
        print("Model saved as 'xgboost_3days.json'")

    # Print summary of all scenarios
    print("\n\nSummary of All Scenarios")
    print(rule)
    print(header)
    print(rule)
    if not results:
        print("No scenario could be evaluated.")
    for scenario_name, scenario_metrics in results:
        print(
            f"{scenario_name:<{name_width}}"
            f"{scenario_metrics['Accuracy']:<{col_width}.8f}"
            f"{scenario_metrics['F1 Score']:<{col_width}.8f}"
            f"{scenario_metrics['False Positives']:<{col_width}}"
            f"{scenario_metrics['False Negatives']:<{col_width}}"
            f"{scenario_metrics['Detection']:<{col_width}.8f}"
        )

    print("\nAnalysis complete!")

# Run the scenarios only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.

Files in directory:
  - monday.csv
  - tuesday.csv
  - wednesday.csv
  - thursday.csv
  - friday.csv
  - combined_dataset.csv
  - balanced_multiclass_10k.csv
  - train.csv
  - metrics_by_class.png
  - class_metrics.csv
  - prediction_results.csv
  - confusion_matrix.csv
  - confusion_metrics_by_class.png
  - fixed_train.csv
  - fixed_test.csv
  - enhanced_train.csv
  - plots_multiclass
  - multiclass_metrics.csv
  - balanced_10k_test.csv
  - test_thurs_friday_10000.csv
  - dataset_A_10k.csv

Model                              Accuracy    F1 Score    False PostiveFalse NegativeDetection   


Scenario 1: Train on 5 days, test on Dataset A 10k
--------------------------------------------------------------------------------
Loading data from days: ['monday.csv', 'tuesday.csv', 'wednesday.csv', 'thursday