In [None]:
## dataset creation

In [None]:
import pandas as pd
import os

def combine_csv_files(input_files, output_file):
    """
    Combine multiple delimited text files into a single tab-separated output file.

    Each input may be tab- or comma-separated. The delimiter is chosen per file
    from a header-only probe, so a comma-separated file is no longer loaded as
    one giant string column.

    Args:
        input_files (list): List of input CSV file paths
        output_file (str): Output file path (written tab-separated, no index)

    Returns:
        pd.DataFrame or None: The concatenated data, or None when no input
        could be read.
    """
    dfs = []

    for file_path in input_files:
        print(f"Reading {file_path}...")
        try:
            # Header-only probe (nrows=0) is cheap even for very large files.
            # If tab parsing yields a single column whose name still contains
            # commas, the file is really comma-separated.
            header = pd.read_csv(file_path, sep='\t', nrows=0)
            sep = '\t'
            if len(header.columns) == 1 and ',' in str(header.columns[0]):
                sep = ','

            df = pd.read_csv(file_path, sep=sep)
            print(f"  Found {len(df)} rows and {len(df.columns)} columns")
            dfs.append(df)
        except Exception as e:
            # Best effort: an unreadable input is reported and skipped so the
            # remaining files can still be combined.
            print(f"Error reading {file_path}: {e}")

    if not dfs:
        print("No data to combine!")
        return None

    combined_df = pd.concat(dfs, ignore_index=True)

    print(f"Writing {len(combined_df)} rows to {output_file}...")
    combined_df.to_csv(output_file, index=False, sep='\t')
    print("Done!")
    return combined_df

# Main execution
if __name__ == "__main__":
    # Daily capture files to merge and the name of the merged training file.
    input_files = ["monday.csv","tuesday.csv","wednesday.csv"]
    output_file = "train_dataset_b.csv"

    # Flag inputs that are absent from the working directory; this only warns,
    # the combine step below still attempts every path.
    for file in input_files:
        if os.path.exists(file):
            continue
        print(f"Warning: {file} does not exist in the current directory")

    combined_data = combine_csv_files(input_files, output_file)

    # Report size and schema only when at least one input was combined.
    if combined_data is not None:
        print(
            "\nSummary:",
            f"Total rows in combined file: {len(combined_data)}",
            f"Columns: {', '.join(combined_data.columns)}",
            sep="\n",
        )

Reading monday.csv...
  Found 371624 rows and 1 columns
Reading tuesday.csv...
  Found 322078 rows and 1 columns
Reading wednesday.csv...
  Found 496641 rows and 1 columns
Writing 1190343 rows to train_dataset_b.csv...
Done!

Summary:
Total rows in combined file: 1190343
Columns: id,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Timestamp,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd RST Flags,Bwd RST Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Packet Length Min,Pac

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import files
import os

def split_csv_file(input_file, train_output, test_output, test_size=0.3, random_state=42):
    """
    Split a delimited text file into training and testing sets.

    The input is parsed as comma-separated first. If that yields a single
    column whose header still contains tab characters, the file is re-read as
    tab-separated. If pandas cannot parse the file at all, a line-by-line
    manual parse is attempted (all values are then strings).

    Both outputs are written tab-separated without the index, then handed to
    `files.download` (the `google.colab` helper imported by this cell).

    Args:
        input_file (str): Path to the input CSV file
        train_output (str): Path to save the training set
        test_output (str): Path to save the testing set
        test_size (float): Proportion of data to include in the test set (default: 0.3)
        random_state (int): Random seed for reproducibility

    Raises:
        ValueError: If the manual fallback is reached and the file is empty.
    """
    print(f"Loading data from {input_file}...")

    try:
        # Comma is tried first.
        df = pd.read_csv(input_file, sep=',')

        # The header is inspected (not the first data cell) so that an empty
        # frame or a non-string first value cannot raise here.
        if len(df.columns) == 1 and '\t' in str(df.columns[0]):
            print("Detected tab-separated values in a comma-parsed file; re-reading with tab delimiter...")
            df = pd.read_csv(input_file, sep='\t')

    except Exception as e:
        print(f"Error reading with standard delimiters: {e}")
        print("Trying to read with custom parsing...")

        # Read the file as text and parse manually
        with open(input_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        if not lines:
            raise ValueError(f"Input file '{input_file}' is empty") from e

        # Find the delimiter by inspecting the first line (tab wins if present)
        first_line = lines[0].strip()
        if '\t' in first_line:
            delimiter = '\t'
        elif ',' in first_line:
            delimiter = ','
        else:
            print("Could not determine delimiter. Defaulting to comma.")
            delimiter = ','

        header = first_line.split(delimiter)

        # Keep only non-empty rows whose field count matches the header
        data = []
        for line in lines[1:]:
            stripped = line.strip()
            if not stripped:
                continue
            values = stripped.split(delimiter)
            if len(values) == len(header):
                data.append(values)
            else:
                print(f"Skipping row with {len(values)} values (header has {len(header)} columns)")

        df = pd.DataFrame(data, columns=header)

    print(f"Dataset loaded with shape: {df.shape}")

    # Split the data
    print(f"Splitting data into {100-test_size*100}% training and {test_size*100}% testing...")
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)

    print(f"Training set shape: {train_df.shape}")
    print(f"Testing set shape: {test_df.shape}")

    # Save the split datasets (tab-separated, matching the other cells' readers)
    print(f"Saving training set to {train_output}...")
    train_df.to_csv(train_output, index=False, sep='\t')

    print(f"Saving testing set to {test_output}...")
    test_df.to_csv(test_output, index=False, sep='\t')

    # Create download links
    print("Creating download links for the files...")
    files.download(train_output)
    files.download(test_output)

    print("Files saved and download links created!")

# Main execution
def main():
    """Split `combined_data.csv` into `train_data.csv` / `test_data.csv`.

    If the input file is not present in the working directory, an error and
    the directory listing are printed and nothing else happens.
    """
    source_path = "combined_data.csv"  # File already in Colab
    split_targets = ("train_data.csv", "test_data.csv")

    if os.path.exists(source_path):
        split_csv_file(source_path, *split_targets)
    else:
        print(f"Error: File '{source_path}' not found in the current directory.")
        print("Current directory contains these files:")
        print(os.listdir())

if __name__ == "__main__":
    main()

Loading data from combined_data.csv...
Dataset loaded with shape: (2099976, 91)
Splitting data into 70.0% training and 30.0% testing...
Training set shape: (1469983, 91)
Testing set shape: (629993, 91)
Saving training set to train_data.csv...
Saving testing set to test_data.csv...
Creating download links for the files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Files saved and download links created!


In [None]:
import csv


def get_delimiter(file_path: str, sample_size: int = 64 * 1024) -> str:
    """
    Detect the field delimiter of a delimited text file with `csv.Sniffer`.

    Only the first `sample_size` characters are read, so the whole file is not
    loaded into memory just to guess the delimiter.

    Args:
        file_path: Path to the file to inspect.
        sample_size: Maximum number of characters to sniff.

    Returns:
        The detected delimiter character.

    Raises:
        csv.Error: If the sniffer cannot determine a delimiter.
    """
    with open(file_path, 'r') as csvfile:
        sample = csvfile.read(sample_size)

    # When the sample was cut short, drop the trailing partial line so the
    # sniffer only sees complete rows.
    if len(sample) == sample_size and '\n' in sample:
        sample = sample[:sample.rfind('\n') + 1]

    return str(csv.Sniffer().sniff(sample).delimiter)

# Print the delimiter detected in monday.csv.
print(get_delimiter("monday.csv"))

,


In [None]:
import pandas as pd
import numpy as np
import os

def _read_flow_table(path):
    """Read a delimited flow file, choosing tab or comma from its header row."""
    # Header-only probe: if tab parsing leaves one column whose name still
    # contains commas, the file is comma-separated.
    probe = pd.read_csv(path, sep='\t', nrows=0)
    if len(probe.columns) == 1 and ',' in str(probe.columns[0]):
        return pd.read_csv(path, sep=',')
    return pd.read_csv(path, sep='\t')


def enhance_training_data(train_file, test_file, output_file, k_samples=1000, random_seed=42):
    """
    Enhance training dataset by including K random samples from the test dataset.

    Both inputs may be tab- or comma-separated (detected from the header). When
    the test data has a 'Label' column, the K rows are drawn per label in
    proportion to each label's share; otherwise a plain random sample is taken.
    A 'data_source' column ('original_train' / 'test_sample') is added unless
    either frame already has one. The result is shuffled and written to
    `output_file` tab-separated.

    NOTE(review): rows copied from `test_file` end up in the training set, so
    any later evaluation on that same test file includes rows seen in training.

    Args:
        train_file (str): Path to the training CSV file (e.g., combined Mon-Wed)
        test_file (str): Path to the test CSV file (e.g., combined Thu-Fri)
        output_file (str): Path to save the enhanced training dataset
        k_samples (int): Number of random samples to include from test dataset
        random_seed (int): Random seed for reproducibility

    Returns:
        pd.DataFrame: The enhanced training dataset, or None if either input
        could not be parsed.

    Raises:
        FileNotFoundError: If `train_file` or `test_file` does not exist.
    """
    print(f"Enhancing training data with {k_samples} samples from test data...")

    # Check if files exist
    for file in [train_file, test_file]:
        if not os.path.exists(file):
            raise FileNotFoundError(f"File not found: {file}")

    # Read the datasets
    try:
        print(f"Reading training file: {train_file}")
        train_df = _read_flow_table(train_file)
        print(f"  Training data: {len(train_df)} rows, {len(train_df.columns)} columns")

        print(f"Reading test file: {test_file}")
        test_df = _read_flow_table(test_file)
        print(f"  Test data: {len(test_df)} rows, {len(test_df.columns)} columns")
    except Exception as e:
        print(f"Error reading files: {e}")
        return None

    # Validate column consistency
    if set(train_df.columns) != set(test_df.columns):
        print("Warning: Column mismatch between training and test datasets")
        print(f"  Training columns: {train_df.columns.tolist()}")
        print(f"  Test columns: {test_df.columns.tolist()}")
        print("  Proceeding with intersection of columns...")
        # Keep the training file's column order so the output schema is
        # deterministic (a plain set intersection has arbitrary order).
        test_columns = set(test_df.columns)
        common_columns = [col for col in train_df.columns if col in test_columns]
        train_df = train_df[common_columns]
        test_df = test_df[common_columns]

    # Kept for backward compatibility: seeds NumPy's global RNG. The sampling
    # below is already made reproducible through `random_state`.
    np.random.seed(random_seed)
    sample_size = min(k_samples, len(test_df))

    if sample_size < k_samples:
        print(f"Warning: Requested {k_samples} samples, but test dataset only has {len(test_df)} rows")
        print(f"  Using {sample_size} samples instead")

    # Take stratified sample if 'Label' column exists
    if 'Label' in test_df.columns:
        print("Taking stratified sample based on 'Label' column...")
        label_share = test_df['Label'].value_counts(normalize=True)

        # Collect per-class samples and concatenate once, instead of growing a
        # DataFrame inside the loop.
        per_class = []
        for label, proportion in label_share.items():
            class_sample_size = int(np.ceil(sample_size * proportion))
            class_data = test_df[test_df['Label'] == label]

            # Classes smaller than their quota contribute all of their rows
            if len(class_data) > class_sample_size:
                class_data = class_data.sample(class_sample_size, random_state=random_seed)
            per_class.append(class_data)

        test_samples = pd.concat(per_class) if per_class else test_df.iloc[0:0]

        # Rounding up per class can overshoot; trim back to the requested size
        if len(test_samples) > sample_size:
            test_samples = test_samples.sample(sample_size, random_state=random_seed)
    else:
        # Take a simple random sample if no 'Label' column
        print("Taking random sample from test dataset...")
        test_samples = test_df.sample(sample_size, random_state=random_seed)

    print(f"Selected {len(test_samples)} samples from test dataset")

    # Track the source of each row (optional). `assign` returns new frames, so
    # no column is written onto a slice of `test_df`.
    if 'data_source' not in train_df.columns and 'data_source' not in test_samples.columns:
        train_df = train_df.assign(data_source='original_train')
        test_samples = test_samples.assign(data_source='test_sample')

    # Combine datasets
    enhanced_df = pd.concat([train_df, test_samples], ignore_index=True)
    print(f"Enhanced training dataset: {len(enhanced_df)} rows")

    # Shuffle the data
    enhanced_df = enhanced_df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # Save to output file
    print(f"Saving enhanced dataset to {output_file}...")
    enhanced_df.to_csv(output_file, sep='\t', index=False)
    print("Done!")

    return enhanced_df

# Example usage
if __name__ == "__main__":
    # Inputs and output for the enhancement run.
    train_file = "train.csv"  # Combined Monday-Wednesday data
    test_file = "test.csv"  # Combined Thursday-Friday data
    output_file = "enhanced_train.csv"

    # Mix 1000 rows sampled from the test file into the training data.
    enhanced_df = enhance_training_data(train_file, test_file, output_file, k_samples=1000)

    # The per-source breakdown is only shown together with the class breakdown,
    # i.e. when the result exists and carries a 'Label' column.
    if enhanced_df is not None and 'Label' in enhanced_df.columns:
        print("\nClass distribution in enhanced dataset:")
        print(enhanced_df['Label'].value_counts())

        if 'data_source' in enhanced_df.columns:
            print("\nSamples by source:")
            print(enhanced_df['data_source'].value_counts())

Enhancing training data with 1000 samples from test data...
Reading training file: train.csv
  Training data: 1190343 rows, 1 columns
Reading test file: test.csv
  Test data: 909633 rows, 1 columns
Taking random sample from test dataset...
Selected 1000 samples from test dataset
Enhanced training dataset: 1191343 rows
Saving enhanced dataset to enhanced_train.csv...
Done!


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, roc_curve, f1_score, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_score, recall_score, classification_report
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

def load_data(file_path):
    """
    Load network flow data from file path, attempting multiple delimiters.

    A one-row probe parsed with tab decides the delimiter: if it yields a
    single column whose header or first value contains a comma, the file is
    read as comma-separated, otherwise as tab-separated. If pandas fails, the
    file is parsed manually line by line (all values are then strings).

    Args:
        file_path: Path to the delimited text file.

    Returns:
        pd.DataFrame: The loaded data.

    Raises:
        Exception: Re-raises whatever the manual fallback raised when both
        strategies fail.
    """
    try:
        # Probe one row instead of fully parsing the file twice. str() keeps
        # the check safe for numeric or missing first values, which previously
        # raised and pushed valid files into the manual parser.
        probe = pd.read_csv(file_path, delimiter='\t', nrows=1)
        looks_comma_separated = len(probe.columns) == 1 and (
            ',' in str(probe.columns[0])
            or (len(probe) > 0 and ',' in str(probe.iloc[0, 0]))
        )

        if looks_comma_separated:
            print("Data loaded as a single column. Trying comma delimiter...")
            df = pd.read_csv(file_path, delimiter=',')
            print(f"Loaded dataset with comma delimiter. Shape: {df.shape}")
            return df

        df = pd.read_csv(file_path, delimiter='\t')
        print(f"Loaded dataset with tab delimiter. Shape: {df.shape}")
        return df
    except Exception as e:
        print(f"Error with standard loading: {e}")

        # Try a manual approach
        try:
            with open(file_path, 'r') as f:
                lines = f.readlines()

            # Detect delimiter from first line; when both appear, the more
            # frequent one wins (comma on a tie).
            first_line = lines[0].strip()
            tab_count = first_line.count('\t')
            comma_count = first_line.count(',')
            delimiter = '\t' if tab_count > comma_count else ','

            print(f"Using manual parsing with delimiter: '{delimiter}'")

            headers = first_line.split(delimiter)
            data = []

            # Keep non-empty rows whose field count matches the header
            for line_number, raw_line in enumerate(lines[1:], start=2):
                stripped = raw_line.strip()
                if not stripped:
                    continue
                row = stripped.split(delimiter)
                if len(row) == len(headers):
                    data.append(row)
                else:
                    print(f"Warning: Line {line_number} has {len(row)} fields, expected {len(headers)}")

            df = pd.DataFrame(data, columns=headers)
            print(f"Manually loaded dataset with shape: {df.shape}")
            return df
        except Exception as parse_error:
            print(f"Error with manual parsing: {parse_error}")
            raise

def preprocess_data(df, scaler=None, fit_scaler=False):
    """
    Preprocess the network flow data according to paper specifications:
    - Remove flow identifiers
    - Apply min-max normalization
    - Convert labels to binary format

    Labels are compared case-insensitively and with surrounding whitespace
    removed, so ' BENIGN' counts as benign (0); every other label is 1.
    Non-numeric values and +/-inf become NaN, all-NaN columns are dropped, and
    remaining NaNs are filled with the mean of their column in `df`.

    Args:
        df: The dataframe to preprocess
        scaler: An optional pre-fitted scaler (for test data); only its
            `transform` method is used
        fit_scaler: Whether to fit the scaler on this data (for train data)

    Returns:
        `(scaled_df, scaler)` when `fit_scaler` is True, otherwise `scaled_df`.
        `scaled_df` holds the scaled feature columns plus 'Label_Binary' and
        'Original_Label' when labels were available.

    Raises:
        ValueError: If neither a scaler nor `fit_scaler=True` is given, or if
            no numeric feature column remains.
    """
    # Fail fast on an unusable argument combination instead of after all the
    # cleaning work has been done.
    if scaler is None and not fit_scaler:
        raise ValueError("Either provide a fitted scaler or set fit_scaler=True")

    print("\nPreprocessing data:")
    print(f"Initial columns: {df.columns.tolist()[:5]}... (total: {len(df.columns)})")

    # Make a copy to avoid modifying the original
    df_processed = df.copy()

    # Convert label column to binary (0 for BENIGN, 1 for attacks)
    if 'Label' in df_processed.columns:
        normalized_labels = df_processed['Label'].astype(str).str.strip().str.upper()
        df_processed['Label_Binary'] = (normalized_labels != 'BENIGN').astype(int)
        # Save the original labels for detailed analysis later
        df_processed['Original_Label'] = df_processed['Label']
        label_counts = df_processed['Label'].value_counts()
        print(f"Label distribution: {label_counts.to_dict()}")
        binary_counts = df_processed['Label_Binary'].value_counts()
        print(f"Binary label distribution: {binary_counts.to_dict()}")

    # Remove flow identifiers as specified, plus any TTL-based feature
    columns_to_drop = [
        'id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp',
        'ICMP Code', 'ICMP Type', 'Total TCP Flow Time', 'Attempted Category', 'Label'
    ]
    columns_to_drop.extend(col for col in df_processed.columns if 'TTL' in col)

    # Only drop columns that exist in the dataframe
    columns_to_drop = [col for col in columns_to_drop if col in df_processed.columns]
    df_cleaned = df_processed.drop(columns=columns_to_drop, errors='ignore')

    # Convert all feature columns to numeric, coercing errors to NaN
    label_columns = ('Label_Binary', 'Original_Label')
    numeric_cols = []
    for col in df_cleaned.columns:
        if col in label_columns:
            continue
        try:
            df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
            numeric_cols.append(col)
        except Exception as e:
            print(f"Error converting column {col} to numeric: {e}")
            df_cleaned = df_cleaned.drop(columns=[col])

    print(f"Number of numeric columns after conversion: {len(numeric_cols)}")
    if len(numeric_cols) == 0:
        raise ValueError("No numeric columns available after preprocessing")

    # Handle NaN values (this count is taken before infinities are converted)
    print(f"NaN values before handling: {df_cleaned[numeric_cols].isna().sum().sum()}")

    # Replace infinity values with NaN
    df_cleaned = df_cleaned.replace([np.inf, -np.inf], np.nan)

    # Columns with no usable value cannot be imputed or scaled
    null_cols = [col for col in numeric_cols if df_cleaned[col].isna().all()]
    if null_cols:
        print(f"Dropping columns with all NaN values: {null_cols}")
        df_cleaned = df_cleaned.drop(columns=null_cols)
        numeric_cols = [col for col in numeric_cols if col not in null_cols]

    # Fill remaining NaN values with column means (only touching columns that
    # actually contain NaN)
    for col in numeric_cols:
        if df_cleaned[col].isna().any():
            df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].mean())

    print(f"NaN values after handling: {df_cleaned[numeric_cols].isna().sum().sum()}")

    # The all-NaN drop above may have removed every feature
    if len(numeric_cols) == 0:
        raise ValueError("No numeric columns available after preprocessing")

    features = df_cleaned[numeric_cols]
    has_labels = 'Label_Binary' in df_cleaned.columns
    labels = df_cleaned['Label_Binary'] if has_labels else None
    original_labels = df_cleaned.get('Original_Label', None) if has_labels else None

    # A supplied scaler is only applied, never refitted; a new one is fitted
    # only when none was given (fit_scaler is guaranteed True in that case).
    if scaler is None:
        scaler = MinMaxScaler()
        scaled_features = scaler.fit_transform(features)
    else:
        scaled_features = scaler.transform(features)

    # Create a new dataframe with scaled features
    scaled_df = pd.DataFrame(scaled_features, columns=features.columns)

    # Add back the label columns if they exist (.values avoids index alignment
    # against the fresh RangeIndex of scaled_df)
    if labels is not None:
        scaled_df['Label_Binary'] = labels.values
    if original_labels is not None:
        scaled_df['Original_Label'] = original_labels.values

    print(f"Preprocessed dataset shape: {scaled_df.shape}")

    if fit_scaler:
        return scaled_df, scaler
    else:
        return scaled_df

def evaluate_performance(y_true, y_pred, y_scores, prediction_time, original_labels=None):
    """
    Calculate all required performance metrics for a binary (0/1) classifier.

    Args:
        y_true: True binary labels (0 = benign, 1 = attack).
        y_pred: Predicted binary labels.
        y_scores: Scores for the positive class, used for AUC.
        prediction_time: Prediction time per sample; stored unchanged under
            'Prediction Time (μs/sample)'.
        original_labels: Optional pandas Series of the original labels,
            positionally aligned with `y_true`; used to break false negatives
            down by attack type.

    Returns:
        tuple: `(metrics, fp_fn_analysis)`. `metrics` maps metric names to
        values. `fp_fn_analysis` is empty unless `original_labels` is given;
        then it holds 'False Positive Count' and, if any false negative
        exists, 'False Negative Types'.
    """
    # Work on plain arrays so every comparison and index below is positional,
    # independent of any pandas index the inputs may carry.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    accuracy = accuracy_score(y_true, y_pred)

    # AUC is undefined when only one class is present; fall back to 0.5.
    # `Exception` (not a bare except) so KeyboardInterrupt still propagates.
    try:
        auc_score = roc_auc_score(y_true, y_scores)
    except Exception:
        auc_score = 0.5  # Default value when AUC can't be calculated
        print("Warning: AUC could not be calculated, possibly due to only one class present")

    f1 = f1_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    # Pinning labels=[0, 1] always yields a 2x2 matrix, even when only one
    # class occurs in y_true / y_pred, so no special-casing of degenerate
    # shapes is needed.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Calculate Detection Rate (DR) and False Alarm Rate (FAR)
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0
    false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

    # Store all metrics
    metrics = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC': auc_score,
        'Detection Rate (DR)': detection_rate,
        'False Alarm Rate (FAR)': false_alarm_rate,
        'True Negatives': tn,
        'False Positives': fp,
        'False Negatives': fn,
        'True Positives': tp,
        'Prediction Time (μs/sample)': prediction_time
    }

    # Create a detailed analysis of false positives and false negatives
    fp_fn_analysis = {}
    if original_labels is not None:
        fp_indices = np.flatnonzero((y_true == 0) & (y_pred == 1))
        fn_indices = np.flatnonzero((y_true == 1) & (y_pred == 0))

        # Count occurrence of each attack type in false negatives
        if len(fn_indices) > 0:
            fn_attack_types = original_labels.iloc[fn_indices].value_counts()
            fp_fn_analysis['False Negative Types'] = fn_attack_types.to_dict()

        # False positives are rows whose true label is 0, so only the count is kept
        fp_fn_analysis['False Positive Count'] = len(fp_indices)

    return metrics, fp_fn_analysis

def main():
    # Load the training data
    try:
        print("Loading training data...")
        train_df = load_data('train_dataset_b.csv')
    except Exception as e:
        print(f"Error loading training data: {e}")
        return

    # Load the test data
    try:
        print("Loading test data...")
        test_df = load_data('balanced_10k_test.csv')
    except Exception as e:
        print(f"Error loading test data: {e}")
        return

    # Display data summary
    print("\nTraining data summary:")
    print(f"Columns: {train_df.columns.tolist()[:5]} ... (total: {len(train_df.columns)})")
    print(f"Number of samples: {len(train_df)}")

    print("\nTest data summary:")
    print(f"Columns: {test_df.columns.tolist()[:5]} ... (total: {len(test_df.columns)})")
    print(f"Number of samples: {len(test_df)}")

    # Preprocess the training data
    try:
        print("\nPreprocessing training data...")
        train_processed, scaler = preprocess_data(train_df, fit_scaler=True)
    except Exception as e:
        print(f"Error preprocessing training data: {e}")
        return

    # Preprocess the test data using the same scaler
    try:
        print("\nPreprocessing test data...")
        test_processed = preprocess_data(test_df, scaler=scaler, fit_scaler=False)
    except Exception as e:
        print(f"Error preprocessing test data: {e}")
        return

    # Check if we have the necessary binary label column in both datasets
    if 'Label_Binary' not in train_processed.columns:
        print("No 'Label_Binary' column found in training data after preprocessing. Cannot proceed.")
        return

    if 'Label_Binary' not in test_processed.columns:
        print("No 'Label_Binary' column found in test data after preprocessing. Cannot proceed.")
        return

    # Check label distribution in training data
    train_label_dist = train_processed['Label_Binary'].value_counts()
    print(f"\nTraining data label distribution: {train_label_dist.to_dict()}")

    # Check label distribution in test data
    test_label_dist = test_processed['Label_Binary'].value_counts()
    print(f"Test data label distribution: {test_label_dist.to_dict()}")

    # Handle case where only one class is present in either dataset
    if train_processed['Label_Binary'].nunique() < 2:
        print(f"Warning: Only one class present in training data ({train_processed['Label_Binary'].unique()[0]})")
        print("Generating synthetic data for demonstration purposes...")

        # Generate synthetic attack samples
        majority_class = train_processed['Label_Binary'].mode()[0]
        minority_class = 1 if majority_class == 0 else 0

        majority_data = train_processed[train_processed['Label_Binary'] == majority_class]

        # Create synthetic minority samples
        num_samples = min(int(len(majority_data) * 0.3), 500)
        synthetic_samples = majority_data.sample(num_samples, replace=(num_samples > len(majority_data)))

        # Add noise to make them different
        for col in synthetic_samples.columns:
            if col != 'Label_Binary' and col != 'Original_Label':
                noise = np.random.normal(0, 0.1, size=len(synthetic_samples))
                synthetic_samples[col] = synthetic_samples[col] + noise
                synthetic_samples[col] = synthetic_samples[col].clip(0, 1)

        # Set minority class
        synthetic_samples['Label_Binary'] = minority_class

        # Set a dummy original label for synthetic samples
        if 'Original_Label' in synthetic_samples.columns:
            synthetic_samples['Original_Label'] = 'SYNTHETIC_ATTACK' if minority_class == 1 else 'SYNTHETIC_BENIGN'

        # Combine with original data
        train_processed = pd.concat([train_processed, synthetic_samples], ignore_index=True)
        print(f"Added {num_samples} synthetic samples of class {minority_class}")
        print(f"New training label distribution: {train_processed['Label_Binary'].value_counts().to_dict()}")

    # Get the original labels for detailed analysis
    original_labels_test = test_processed.get('Original_Label', None)

    # Split features and target for training data
    X_train = train_processed.drop(['Label_Binary'], axis=1)
    if 'Original_Label' in X_train.columns:
        X_train = X_train.drop(['Original_Label'], axis=1)
    y_train = train_processed['Label_Binary']

    # Split features and target for test data
    X_test = test_processed.drop(['Label_Binary'], axis=1)
    if 'Original_Label' in X_test.columns:
        X_test = X_test.drop(['Original_Label'], axis=1)
    y_test = test_processed['Label_Binary']

    print(f"\nTraining features shape: {X_train.shape}")
    print(f"Training target shape: {y_train.shape}")
    print(f"Test features shape: {X_test.shape}")
    print(f"Test target shape: {y_test.shape}")

    # Initialize XGBoost classifier
    model = xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Train the model on the entire training set
    print("\nTraining XGBoost model...")
    model.fit(X_train, y_train)

    # Measure prediction time on test set
    print("\nEvaluating on test set...")
    start_time = time.time()
    y_test_pred_proba = model.predict_proba(X_test)[:, 1]
    y_test_pred = model.predict(X_test)
    end_time = time.time()

    # Calculate prediction time per sample in microseconds
    test_prediction_time = (end_time - start_time) * 1000000 / len(X_test)

    # Calculate final metrics and get detailed false positive/negative analysis
    test_metrics, fp_fn_analysis = evaluate_performance(y_test, y_test_pred, y_test_pred_proba,
                                                       test_prediction_time, original_labels_test)

    # Print test metrics
    print("\nTest set performance metrics:")
    for metric_name, metric_value in test_metrics.items():
        print(f"  {metric_name}: {metric_value:.6f}")

    # Print classification report
    print("\nDetailed classification report:")
    print(classification_report(y_test, y_test_pred, zero_division=0))

    # Print false positive and false negative analysis
    print("\nFalse Positive and False Negative Analysis:")
    if 'False Negative Types' in fp_fn_analysis:
        print("  False Negative breakdown by attack type:")
        for attack_type, count in fp_fn_analysis['False Negative Types'].items():
            print(f"    {attack_type}: {count}")

    print(f"  Total False Positives: {test_metrics['False Positives']}")
    print(f"  Total False Negatives: {test_metrics['False Negatives']}")

    try:
        # Create directory for plots if it doesn't exist
        import os
        if not os.path.exists('plots'):
            os.makedirs('plots')

        # Plot ROC curve
        fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba)
        plt.figure(figsize=(10, 8))
        plt.plot(fpr, tpr, label=f'AUC = {test_metrics["AUC"]:.4f}')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.legend(loc='lower right')
        plt.grid(True)
        plt.savefig('plots/roc_curve.png')
        print("\nROC curve saved as 'plots/roc_curve.png'")
        plt.close()

        # Plot confusion matrix with percentages
        cm = confusion_matrix(y_test, y_test_pred)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        plt.savefig('plots/confusion_matrix.png')
        print("Confusion matrix saved as 'plots/confusion_matrix.png'")
        plt.close()

        # Plot normalized confusion matrix
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Normalized Confusion Matrix')
        plt.savefig('plots/normalized_confusion_matrix.png')
        print("Normalized confusion matrix saved as 'plots/normalized_confusion_matrix.png'")
        plt.close()

        # Create a more detailed visualization of FP and FN
        # Set up the figure with 2 subplots
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

        # Left plot: False Negative Distribution by Type (if available)
        if 'False Negative Types' in fp_fn_analysis and len(fp_fn_analysis['False Negative Types']) > 0:
            fn_types = pd.Series(fp_fn_analysis['False Negative Types'])
            fn_types.sort_values(ascending=False).plot(kind='bar', ax=ax1, color='salmon')
            ax1.set_title('False Negatives by Attack Type')
            ax1.set_ylabel('Count')
            ax1.set_xlabel('Attack Type')
            ax1.tick_params(axis='x', rotation=90)
        else:
            ax1.text(0.5, 0.5, 'No False Negatives Available',
                    horizontalalignment='center', verticalalignment='center')
            ax1.set_title('False Negatives Analysis')

        # Right plot: Metrics Comparison
        metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC']
        values = [test_metrics[metric] for metric in metrics_to_plot]
        ax2.bar(metrics_to_plot, values, color='lightblue')
        ax2.set_title('Performance Metrics')
        ax2.set_ylim(0, 1)
        ax2.set_ylabel('Score')
        ax2.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.savefig('plots/detailed_performance.png')
        print("Detailed performance metrics saved as 'plots/detailed_performance.png'")
        plt.close()

        # Plot feature importance
        plt.figure(figsize=(12, 10))
        xgb.plot_importance(model, max_num_features=20)
        plt.title('Feature Importance')
        plt.tight_layout()
        plt.savefig('plots/feature_importance.png')
        print("Feature importance plot saved as 'plots/feature_importance.png'")
        plt.close()

        # Generate precision-recall curve
        from sklearn.metrics import precision_recall_curve, average_precision_score
        precision, recall, _ = precision_recall_curve(y_test, y_test_pred_proba)
        avg_precision = average_precision_score(y_test, y_test_pred_proba)

        plt.figure(figsize=(10, 8))
        plt.plot(recall, precision, label=f'Average Precision = {avg_precision:.4f}')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curve')
        plt.legend(loc='best')
        plt.grid(True)
        plt.savefig('plots/precision_recall_curve.png')
        print("Precision-recall curve saved as 'plots/precision_recall_curve.png'")
        plt.close()

    except Exception as e:
        print(f"Error generating plots: {e}")

    print("\nAnalysis complete!")

    # Print a note about synthetic data if it was used
    if train_df['Label'].nunique() < 2 or train_processed['Label_Binary'].nunique() < 2:
        print("\nNOTE: This analysis used synthetic data for demonstration purposes.")
        print("In a real-world scenario, you would need balanced class representation for meaningful results.")

# Run main() when this code executes as the top-level module.
if __name__ == "__main__":
    main()

Loading training data...
Data loaded as a single column. Trying comma delimiter...
Loaded dataset with comma delimiter. Shape: (1190343, 91)
Loading test data...
Data loaded as a single column. Trying comma delimiter...
Loaded dataset with comma delimiter. Shape: (10000, 91)

Training data summary:
Columns: ['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP'] ... (total: 91)
Number of samples: 1190343

Test data summary:
Columns: ['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP'] ... (total: 91)
Number of samples: 10000

Preprocessing training data...

Preprocessing data:
Initial columns: ['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP']... (total: 91)
Label distribution: {'BENIGN': 1005850, 'DoS Hulk': 158468, 'DoS GoldenEye': 7567, 'FTP-Patator': 3972, 'DoS Slowloris': 3859, 'DoS Slowhttptest - Attempted': 3368, 'SSH-Patator': 2961, 'DoS Slowloris - Attempted': 1847, 'DoS Slowhttptest': 1740, 'DoS Hulk - Attempted': 581, 'DoS GoldenEye - Attempted': 80, 'SSH-Patator - Attempted': 27, 'FT

<Figure size 1200x1000 with 0 Axes>

In [None]:
import pandas as pd
import os

def _single_column_looks_comma_packed(df):
    """Return True when a one-column frame actually holds comma-separated text."""
    if len(df.columns) != 1:
        return False
    # The header is checked first so that empty frames and non-string cells
    # cannot raise IndexError / TypeError.
    if ',' in str(df.columns[0]):
        return True
    return len(df) > 0 and ',' in str(df.iloc[0, 0])


def _repair_mixed_delimiter_csv(file_path):
    """Rewrite a file that mixes tabs and commas as plain CSV and load it."""
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # Get the headers - assume they're comma-separated in the first line
    headers = [h.strip() for h in lines[0].strip().split(',')]

    temp_file = file_path + ".fixed.csv"
    try:
        with open(temp_file, 'w') as f:
            f.write(','.join(headers) + '\n')
            for line in lines[1:]:
                if not line.strip():  # Skip empty lines
                    continue
                if '\t' in line:
                    # Tabs become commas; a single field is written unchanged.
                    f.write(','.join(line.strip().split('\t')) + '\n')
                else:
                    # Already comma-separated, write as is
                    f.write(line)
        return pd.read_csv(temp_file)
    finally:
        # Remove the temporary file even when re-reading it fails.
        if os.path.exists(temp_file):
            os.remove(temp_file)


def fix_csv_files(input_files, output_file):
    """
    Fix and combine multiple CSV files that may have formatting issues.

    Each file's delimiter is guessed from its first line (tab vs. comma).
    Missing or unreadable files are reported and skipped. When the files do
    not share the same columns, only the common columns are kept, in the
    order they appear in the first successfully loaded file.

    Args:
        input_files (list): List of input CSV file paths
        output_file (str): Output file path (written comma-separated)

    Returns:
        pd.DataFrame or None: The combined dataframe, or None when no input
        file could be loaded.
    """
    dfs = []

    for file_path in input_files:
        print(f"Processing {file_path}...")
        if not os.path.exists(file_path):
            print(f"  Warning: {file_path} does not exist. Skipping.")
            continue

        try:
            with open(file_path, 'r') as f:
                first_line = f.readline().strip()

            tab_count = first_line.count('\t')
            comma_count = first_line.count(',')
            df = None

            if 0 < tab_count < comma_count:
                # Both delimiters present, commas dominate: likely a CSV with
                # stray tabs.
                print(f"  Detected malformed CSV. Attempting to fix...")
                df = pd.read_csv(file_path, sep=',')

                if len(df.columns) > 5:  # Arbitrary threshold for "parsed sensibly"
                    print(f"  Successfully parsed with comma delimiter: {len(df)} rows, {len(df.columns)} columns")
                else:
                    df = _repair_mixed_delimiter_csv(file_path)
                    print(f"  Fixed and loaded: {len(df)} rows, {len(df.columns)} columns")

            elif tab_count:
                df = pd.read_csv(file_path, sep='\t')

                if _single_column_looks_comma_packed(df):
                    print("  Data loaded as a single column. Trying comma delimiter...")
                    df = pd.read_csv(file_path, sep=',')

                print(f"  Loaded dataset: {len(df)} rows, {len(df.columns)} columns")

            elif comma_count:
                df = pd.read_csv(file_path, sep=',')
                print(f"  Loaded dataset: {len(df)} rows, {len(df.columns)} columns")

            else:
                # No delimiter visible in the header: try both common ones.
                try:
                    df = pd.read_csv(file_path, sep=',')
                    print(f"  Loaded with comma delimiter: {len(df)} rows, {len(df.columns)} columns")
                except Exception:
                    try:
                        df = pd.read_csv(file_path, sep='\t')
                        print(f"  Loaded with tab delimiter: {len(df)} rows, {len(df.columns)} columns")
                    except Exception as e:
                        df = None
                        print(f"  Error parsing file with standard delimiters: {e}")

            if df is not None:
                dfs.append(df)

        except Exception as e:
            print(f"  Error processing {file_path}: {e}")

    if not dfs:
        print("No data to combine!")
        return None

    # Check if all dataframes have the same columns
    all_columns = [set(df.columns) for df in dfs]
    union_columns = set.union(*all_columns)
    shared_columns = set.intersection(*all_columns)
    if len(union_columns) > len(shared_columns):
        print("\nWarning: Not all files have the same columns!")
        print(f"  Total unique columns across all files: {len(union_columns)}")
        print(f"  Common columns across all files: {len(shared_columns)}")

        # Keep the first file's column order so the output is deterministic.
        common_columns = [col for col in dfs[0].columns if col in shared_columns]
        print(f"  Using only common columns: {len(common_columns)}")

        dfs = [df[common_columns] for df in dfs]

    combined_df = pd.concat(dfs, ignore_index=True)

    # Write to output file - using comma as a standard delimiter
    print(f"\nWriting {len(combined_df)} rows to {output_file}...")
    combined_df.to_csv(output_file, index=False)
    print("Done!")

    return combined_df


def _read_delimited(path):
    """Load a delimited text file, picking tab or comma from its header line."""
    with open(path, 'r') as f:
        first_line = f.readline()
    # Tab wins only when it yields more splits than comma; comma is the default.
    delimiter = '\t' if first_line.count('\t') > first_line.count(',') else ','
    return pd.read_csv(path, sep=delimiter)


def _stratified_sample(df, label_col, sample_size, random_seed):
    """Draw up to `sample_size` rows, roughly preserving `label_col` proportions."""
    import numpy as np

    parts = []
    for label, proportion in df[label_col].value_counts(normalize=True).items():
        # Rounding up keeps rare classes represented; any surplus is trimmed below.
        class_quota = int(np.ceil(sample_size * proportion))
        class_rows = df[df[label_col] == label]
        if len(class_rows) > class_quota:
            class_rows = class_rows.sample(class_quota, random_state=random_seed)
        parts.append(class_rows)

    if not parts:
        return df.iloc[0:0].copy()

    # One concat of real frames keeps the column dtypes intact.
    sampled = pd.concat(parts)
    if len(sampled) > sample_size:
        sampled = sampled.sample(sample_size, random_state=random_seed)
    return sampled


def enhance_training_data(train_file, test_file, output_file, k_samples=1000, random_seed=42):
    """
    Enhance training dataset by including K random samples from the test dataset.

    The sample is stratified on the 'Label' column when that column exists,
    otherwise it is a simple random sample. A 'data_source' column is added
    (unless one is already present) marking each row as 'original_train' or
    'test_sample'. The combined rows are shuffled and written to `output_file`.

    Note: the returned data contains rows taken from the test file, so
    evaluating on that same test file will include rows seen in training.

    Args:
        train_file (str): Path to the training CSV file (e.g., combined Mon-Wed)
        test_file (str): Path to the test CSV file (e.g., combined Thu-Fri)
        output_file (str): Path to save the enhanced training dataset
        k_samples (int): Number of random samples to include from test dataset
        random_seed (int): Random seed for reproducibility

    Returns:
        pd.DataFrame: The enhanced training dataset, or None when either
        input file could not be parsed.

    Raises:
        FileNotFoundError: If `train_file` or `test_file` does not exist.
    """
    import numpy as np

    print(f"Enhancing training data with {k_samples} samples from test data...")

    # Check if files exist
    for file in [train_file, test_file]:
        if not os.path.exists(file):
            raise FileNotFoundError(f"File not found: {file}")

    # Read the datasets, detecting the delimiter from each header line
    try:
        print(f"Reading training file: {train_file}")
        train_df = _read_delimited(train_file)
        print(f"  Training data: {len(train_df)} rows, {len(train_df.columns)} columns")

        print(f"Reading test file: {test_file}")
        test_df = _read_delimited(test_file)
        print(f"  Test data: {len(test_df)} rows, {len(test_df.columns)} columns")
    except Exception as e:
        print(f"Error reading files: {e}")
        return None

    # Validate column consistency
    if set(train_df.columns) != set(test_df.columns):
        print("Warning: Column mismatch between training and test datasets")
        print(f"  Training columns: {len(train_df.columns)}")
        print(f"  Test columns: {len(test_df.columns)}")
        print("  Proceeding with intersection of columns...")
        shared = set(train_df.columns) & set(test_df.columns)
        # Keep the training file's column order so the output is deterministic.
        common_columns = [col for col in train_df.columns if col in shared]
        train_df = train_df[common_columns].copy()
        test_df = test_df[common_columns].copy()

    # Kept for backward compatibility: seeds NumPy's global RNG as before.
    np.random.seed(random_seed)
    sample_size = min(k_samples, len(test_df))

    if sample_size < k_samples:
        print(f"Warning: Requested {k_samples} samples, but test dataset only has {len(test_df)} rows")
        print(f"  Using {sample_size} samples instead")

    if 'Label' in test_df.columns:
        print("Taking stratified sample based on 'Label' column...")
        test_samples = _stratified_sample(test_df, 'Label', sample_size, random_seed)
    else:
        print("Taking random sample from test dataset...")
        test_samples = test_df.sample(sample_size, random_state=random_seed)

    print(f"Selected {len(test_samples)} samples from test dataset")

    # Add a column to track the source of the data (optional)
    if 'data_source' not in train_df.columns and 'data_source' not in test_samples.columns:
        train_df = train_df.assign(data_source='original_train')
        test_samples = test_samples.assign(data_source='test_sample')

    # Combine datasets
    enhanced_df = pd.concat([train_df, test_samples], ignore_index=True)
    print(f"Enhanced training dataset: {len(enhanced_df)} rows")

    # Shuffle the data
    enhanced_df = enhanced_df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

    # Save to output file
    print(f"Saving enhanced dataset to {output_file}...")
    enhanced_df.to_csv(output_file, index=False)
    print("Done!")

    return enhanced_df

# Example usage
# Builds three files in the working directory: fixed_train.csv, fixed_test.csv
# and enhanced_train.csv.
if __name__ == "__main__":
    # Fix and combine Monday-Wednesday files for training
    train_files = ["monday.csv", "tuesday.csv", "wednesday.csv"]
    train_output = "fixed_train.csv"

    # Fix and combine Thursday-Friday files for testing
    test_files = ["thursday.csv", "friday.csv"]
    test_output = "fixed_test.csv"

    # Fix and combine all files
    print("Fixing and combining training files...")
    fixed_train_df = fix_csv_files(train_files, train_output)

    print("\nFixing and combining test files...")
    fixed_test_df = fix_csv_files(test_files, test_output)

    # Create enhanced training dataset
    # Only runs when both combine steps produced data (fix_csv_files returns
    # None when nothing could be loaded).
    # NOTE(review): enhanced_train.csv contains rows sampled from
    # fixed_test.csv; the later training cell loads enhanced_train.csv and
    # evaluates on fixed_test.csv, so those rows appear in both sets.
    # Confirm this overlap is intended.
    if fixed_train_df is not None and fixed_test_df is not None:
        print("\nCreating enhanced training dataset...")
        enhanced_df = enhance_training_data(
            train_output,
            test_output,
            "enhanced_train.csv",
            k_samples=1000
        )

        # enhance_training_data returns None when reading its inputs fails.
        if enhanced_df is not None and 'Label' in enhanced_df.columns:
            print("\nSummary of enhanced dataset:")
            print(f"Total rows: {len(enhanced_df)}")
            print(f"Class distribution:\n{enhanced_df['Label'].value_counts()}")
            if 'data_source' in enhanced_df.columns:
                print(f"Sample source distribution:\n{enhanced_df['data_source'].value_counts()}")

Fixing and combining training files...
Processing monday.csv...
  Loaded dataset: 371624 rows, 91 columns
Processing tuesday.csv...
  Loaded dataset: 322078 rows, 91 columns
Processing wednesday.csv...
  Loaded dataset: 496641 rows, 91 columns

Writing 1190343 rows to fixed_train.csv...
Done!

Fixing and combining test files...
Processing thursday.csv...
  Loaded dataset: 362076 rows, 91 columns
Processing friday.csv...
  Loaded dataset: 547557 rows, 91 columns

Writing 909633 rows to fixed_test.csv...
Done!

Creating enhanced training dataset...
Enhancing training data with 1000 samples from test data...
Reading training file: fixed_train.csv
  Training data: 1190343 rows, 91 columns
Reading test file: fixed_test.csv
  Test data: 909633 rows, 91 columns
Taking stratified sample based on 'Label' column...
Selected 1000 samples from test dataset
Enhanced training dataset: 1191343 rows
Saving enhanced dataset to enhanced_train.csv...
Done!

Summary of enhanced dataset:
Total rows: 119134

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, roc_curve, f1_score, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_score, recall_score, classification_report
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

def load_data(file_path):
    """
    Load network flow data from file path, attempting multiple delimiters.

    The file is first read as tab-delimited. If that yields a single column
    whose header or first value contains a comma, it is re-read as
    comma-delimited. If pandas cannot parse the file at all, a line-by-line
    fallback splits on the delimiter that dominates the header line; in that
    fallback every value is kept as a string and rows whose field count does
    not match the header are skipped with a warning.

    Args:
        file_path: Path of the delimited text file to load.

    Returns:
        pd.DataFrame: The loaded data.

    Raises:
        Exception: Re-raises whatever the manual fallback hit (e.g. the file
            is missing or empty).
    """
    try:
        # First try tab delimiter
        df = pd.read_csv(file_path, delimiter='\t')

        if len(df.columns) == 1:
            # str() and the length guard avoid TypeError on numeric cells and
            # IndexError on header-only files, which previously forced the
            # all-strings manual parser.
            header_has_comma = ',' in str(df.columns[0])
            first_cell_has_comma = len(df) > 0 and ',' in str(df.iloc[0, 0])
            if header_has_comma or first_cell_has_comma:
                print("Data loaded as a single column. Trying comma delimiter...")
                df = pd.read_csv(file_path, delimiter=',')
                print(f"Loaded dataset with comma delimiter. Shape: {df.shape}")
                return df

        print(f"Loaded dataset with tab delimiter. Shape: {df.shape}")
        return df
    except Exception as e:
        print(f"Error with standard loading: {e}")

        # Try a manual approach
        try:
            with open(file_path, 'r') as f:
                lines = f.readlines()

            # Use tab only when it gives more splits than comma; default to comma.
            first_line = lines[0].strip()
            delimiter = '\t' if first_line.count('\t') > first_line.count(',') else ','

            print(f"Using manual parsing with delimiter: '{delimiter}'")

            headers = first_line.split(delimiter)
            data = []

            # Data starts on file line 2 (line 1 is the header).
            for line_no, line in enumerate(lines[1:], start=2):
                stripped = line.strip()
                if not stripped:  # Skip empty lines
                    continue
                row = stripped.split(delimiter)
                if len(row) == len(headers):
                    data.append(row)
                else:
                    print(f"Warning: Line {line_no} has {len(row)} fields, expected {len(headers)}")

            df = pd.DataFrame(data, columns=headers)
            print(f"Manually loaded dataset with shape: {df.shape}")
            return df
        except Exception as e:
            print(f"Error with manual parsing: {e}")
            raise

def preprocess_data(df, scaler=None, fit_scaler=False):
    """
    Turn raw network-flow records into a scaled feature table.

    Pipeline: derive a binary label from 'Label' (0 = BENIGN, 1 = anything
    else), drop flow identifiers and TTL columns, coerce the remaining
    columns to numeric, replace infinities and fill NaNs with column means,
    then scale the features with the given or a newly fitted scaler.

    Args:
        df: Input dataframe; it is copied, never modified.
        scaler: Optional already-fitted scaler used to transform the features.
        fit_scaler: If True and no scaler is given, fit a new MinMaxScaler.

    Returns:
        (scaled dataframe, scaler) when fit_scaler is True, otherwise just
        the scaled dataframe.

    Raises:
        ValueError: When no numeric column remains, or when neither a scaler
            nor fit_scaler=True is supplied.
    """
    print("\nPreprocessing data:")
    print(f"Initial columns: {df.columns.tolist()[:5]}... (total: {len(df.columns)})")

    # Work on a private copy so the caller's frame stays untouched
    working = df.copy()

    # Binary target plus a preserved copy of the textual label
    if 'Label' in working.columns:
        def _as_binary(raw_label):
            return 0 if str(raw_label).upper() == 'BENIGN' else 1

        working['Label_Binary'] = working['Label'].apply(_as_binary)
        working['Original_Label'] = working['Label']
        print(f"Label distribution: {working['Label'].value_counts().to_dict()}")
        print(f"Binary label distribution: {working['Label_Binary'].value_counts().to_dict()}")

    # Identifier columns and anything TTL-related are not used as features
    identifier_cols = [
        'id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Timestamp',
        'ICMP Code', 'ICMP Type', 'Total TCP Flow Time', 'Attempted Category', 'Label'
    ]
    ttl_cols = [name for name in working.columns if 'TTL' in name]
    droppable = [name for name in identifier_cols + ttl_cols if name in working.columns]
    cleaned = working.drop(columns=droppable, errors='ignore')

    # Coerce every non-label column to numeric; unparseable values become NaN
    label_cols = ('Label_Binary', 'Original_Label')
    numeric_cols = []
    for name in cleaned.columns:
        if name in label_cols:
            continue
        try:
            cleaned[name] = pd.to_numeric(cleaned[name], errors='coerce')
        except Exception as e:
            print(f"Error converting column {name} to numeric: {e}")
            cleaned = cleaned.drop(columns=[name])
        else:
            numeric_cols.append(name)

    print(f"Number of numeric columns after conversion: {len(numeric_cols)}")
    if not numeric_cols:
        raise ValueError("No numeric columns available after preprocessing")

    print(f"NaN values before handling: {cleaned[numeric_cols].isna().sum().sum()}")

    # Infinite values are treated as missing
    cleaned = cleaned.replace([np.inf, -np.inf], np.nan)

    # Columns with no usable value at all are removed
    all_nan_cols = [name for name in numeric_cols if cleaned[name].isna().all()]
    if all_nan_cols:
        print(f"Dropping columns with all NaN values: {all_nan_cols}")
        cleaned = cleaned.drop(columns=all_nan_cols)
        numeric_cols = [name for name in numeric_cols if name not in all_nan_cols]

    # Remaining gaps are filled with the mean of their own column
    for name in numeric_cols:
        column = cleaned[name]
        if column.isna().any():
            cleaned[name] = column.fillna(column.mean())

    print(f"NaN values after handling: {cleaned[numeric_cols].isna().sum().sum()}")

    if not numeric_cols:
        raise ValueError("No numeric columns available after preprocessing")

    # Separate features from label columns before scaling
    features = cleaned[numeric_cols]
    has_binary_label = 'Label_Binary' in cleaned.columns
    labels = cleaned['Label_Binary'] if has_binary_label else None
    original_labels = cleaned.get('Original_Label', None) if has_binary_label else None

    # A supplied scaler is always used as-is; fitting happens only without one
    if scaler is not None:
        scaled_features = scaler.transform(features)
    elif fit_scaler:
        scaler = MinMaxScaler()
        scaled_features = scaler.fit_transform(features)
    else:
        raise ValueError("Either provide a fitted scaler or set fit_scaler=True")

    scaled_df = pd.DataFrame(scaled_features, columns=features.columns)

    # Re-attach label columns positionally
    if labels is not None:
        scaled_df['Label_Binary'] = labels.values
    if original_labels is not None:
        scaled_df['Original_Label'] = original_labels.values

    print(f"Preprocessed dataset shape: {scaled_df.shape}")

    return (scaled_df, scaler) if fit_scaler else scaled_df

def evaluate_performance(y_true, y_pred, y_scores, prediction_time, original_labels=None):
    """
    Calculate all required performance metrics for a binary classifier.

    Args:
        y_true: Ground-truth binary labels (0 = negative, 1 = positive).
        y_pred: Predicted binary labels, same length as `y_true`.
        y_scores: Scores for the positive class, passed to `roc_auc_score`.
        prediction_time: Value stored unchanged under the
            'Prediction Time (μs/sample)' key.
        original_labels: Optional per-sample labels aligned positionally with
            `y_true`; used to break false negatives down by label.

    Returns:
        tuple: (metrics, fp_fn_analysis). `metrics` maps metric names to
        values. `fp_fn_analysis` is empty when `original_labels` is None;
        otherwise it holds 'False Positive Count' and, when any false
        negative exists, 'False Negative Types' (label -> count).
    """
    # Plain arrays make the element-wise comparisons below positional,
    # whatever index the inputs carry.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    accuracy = accuracy_score(y_true, y_pred)

    # AUC is undefined when only one class is present; fall back to 0.5
    try:
        auc_score = roc_auc_score(y_true, y_scores)
    except Exception:
        auc_score = 0.5  # Default value when AUC can't be calculated
        print("Warning: AUC could not be calculated, possibly due to only one class present")

    f1 = f1_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    # Fixing labels=[0, 1] always gives a 2x2 matrix, even when a class is
    # missing from both y_true and y_pred, so no shape special-casing is needed.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Calculate Detection Rate (DR) and False Alarm Rate (FAR)
    detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0
    false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

    metrics = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'AUC': auc_score,
        'Detection Rate (DR)': detection_rate,
        'False Alarm Rate (FAR)': false_alarm_rate,
        'True Negatives': tn,
        'False Positives': fp,
        'False Negatives': fn,
        'True Positives': tp,
        'Prediction Time (μs/sample)': prediction_time
    }

    # Create a detailed analysis of false positives and false negatives
    fp_fn_analysis = {}
    if original_labels is not None:
        fp_mask = (y_true == 0) & (y_pred == 1)
        fn_mask = (y_true == 1) & (y_pred == 0)

        # Count occurrence of each attack type in false negatives
        if fn_mask.any():
            # Wrapping in a fresh Series accepts lists/arrays as well as Series
            # and keeps the selection positional.
            missed = pd.Series(np.asarray(original_labels))[fn_mask]
            fp_fn_analysis['False Negative Types'] = missed.value_counts().to_dict()

        # For false positives, they are all benign
        fp_fn_analysis['False Positive Count'] = int(np.count_nonzero(fp_mask))

    return metrics, fp_fn_analysis

def _add_synthetic_minority(train_processed, seed=42):
    """Append noisy copies of existing rows, relabelled as the missing class.

    Intended only for the degenerate case where 'Label_Binary' holds a single
    class, so that a binary classifier can still be fitted for demonstration.

    Args:
        train_processed (pd.DataFrame): Preprocessed training frame containing
            a 'Label_Binary' column and, optionally, 'Original_Label'.
        seed (int): Seed for row sampling and noise generation, so repeated
            runs produce the same synthetic rows.

    Returns:
        pd.DataFrame: The input rows followed by the synthetic rows, with a
        fresh integer index.
    """
    majority_class = train_processed['Label_Binary'].mode()[0]
    minority_class = 1 if majority_class == 0 else 0

    majority_data = train_processed[train_processed['Label_Binary'] == majority_class]

    # At most 30% of the available rows (capped at 500), so sampling without
    # replacement is always sufficient.
    num_samples = min(int(len(majority_data) * 0.3), 500)
    synthetic_samples = majority_data.sample(num_samples, random_state=seed).copy()

    # Perturb every feature column so the synthetic rows differ from their sources.
    # NOTE(review): the clip to [0, 1] presumes preprocess_data scaled the
    # features into that range - confirm against preprocess_data.
    rng = np.random.default_rng(seed)
    for col in synthetic_samples.columns:
        if col in ('Label_Binary', 'Original_Label'):
            continue
        noise = rng.normal(0, 0.1, size=len(synthetic_samples))
        synthetic_samples[col] = (synthetic_samples[col] + noise).clip(0, 1)

    # Set minority class
    synthetic_samples['Label_Binary'] = minority_class

    # Set a dummy original label for synthetic samples
    if 'Original_Label' in synthetic_samples.columns:
        synthetic_samples['Original_Label'] = 'SYNTHETIC_ATTACK' if minority_class == 1 else 'SYNTHETIC_BENIGN'

    # Combine with original data
    augmented = pd.concat([train_processed, synthetic_samples], ignore_index=True)
    print(f"Added {num_samples} synthetic samples of class {minority_class}")
    print(f"New training label distribution: {augmented['Label_Binary'].value_counts().to_dict()}")
    return augmented


def _save_evaluation_plots(model, y_test, y_test_pred, y_test_pred_proba, test_metrics, fp_fn_analysis):
    """Write the evaluation figures (ROC, confusion matrices, summary, importance, PR) to 'plots/'.

    Args:
        model: Fitted XGBoost classifier, used for the feature-importance plot.
        y_test: True binary labels of the test set.
        y_test_pred: Predicted binary labels for the test set.
        y_test_pred_proba: Predicted positive-class probabilities for the test set.
        test_metrics (dict): Metrics dict returned by evaluate_performance; the
            'Accuracy', 'Precision', 'Recall', 'F1 Score' and 'AUC' keys are read.
        fp_fn_analysis (dict): Analysis dict returned by evaluate_performance;
            the optional 'False Negative Types' key is read.

    Raises:
        Exception: Anything raised by the plotting libraries or the file system
            propagates to the caller, which decides how to report it.
    """
    # Function-scope imports, as in the original code, so this cell does not
    # rely on imports made in other cells.
    import os
    from sklearn.metrics import precision_recall_curve, average_precision_score

    # Create directory for plots if it doesn't exist
    os.makedirs('plots', exist_ok=True)

    # Plot ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_test_pred_proba)
    plt.figure(figsize=(10, 8))
    plt.plot(fpr, tpr, label=f'AUC = {test_metrics["AUC"]:.4f}')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.savefig('plots/roc_curve.png')
    print("\nROC curve saved as 'plots/roc_curve.png'")
    plt.close()

    # Plot confusion matrix with raw counts
    cm = confusion_matrix(y_test, y_test_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.savefig('plots/confusion_matrix.png')
    print("Confusion matrix saved as 'plots/confusion_matrix.png'")
    plt.close()

    # Plot row-normalized confusion matrix. A class that never occurs in
    # y_test has a zero row total; divide by 1 there so the row stays 0
    # instead of turning into NaN.
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    cm_normalized = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Normalized Confusion Matrix')
    plt.savefig('plots/normalized_confusion_matrix.png')
    print("Normalized confusion matrix saved as 'plots/normalized_confusion_matrix.png'")
    plt.close()

    # Two-panel summary: false negatives by attack type, and headline metrics
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

    # Left plot: False Negative Distribution by Type (if available)
    if 'False Negative Types' in fp_fn_analysis and len(fp_fn_analysis['False Negative Types']) > 0:
        fn_types = pd.Series(fp_fn_analysis['False Negative Types'])
        fn_types.sort_values(ascending=False).plot(kind='bar', ax=ax1, color='salmon')
        ax1.set_title('False Negatives by Attack Type')
        ax1.set_ylabel('Count')
        ax1.set_xlabel('Attack Type')
        ax1.tick_params(axis='x', rotation=90)
    else:
        ax1.text(0.5, 0.5, 'No False Negatives Available',
                 horizontalalignment='center', verticalalignment='center')
        ax1.set_title('False Negatives Analysis')

    # Right plot: Metrics Comparison
    metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC']
    values = [test_metrics[metric] for metric in metrics_to_plot]
    ax2.bar(metrics_to_plot, values, color='lightblue')
    ax2.set_title('Performance Metrics')
    ax2.set_ylim(0, 1)
    ax2.set_ylabel('Score')
    ax2.tick_params(axis='x', rotation=45)

    fig.tight_layout()
    fig.savefig('plots/detailed_performance.png')
    print("Detailed performance metrics saved as 'plots/detailed_performance.png'")
    plt.close(fig)

    # Plot feature importance. The axes are passed explicitly: without `ax`,
    # plot_importance opens its own figure, which left an empty 12x10 figure
    # behind and saved the plot at the default size.
    fig, ax = plt.subplots(figsize=(12, 10))
    xgb.plot_importance(model, max_num_features=20, ax=ax)
    ax.set_title('Feature Importance')
    fig.tight_layout()
    fig.savefig('plots/feature_importance.png')
    print("Feature importance plot saved as 'plots/feature_importance.png'")
    plt.close(fig)

    # Generate precision-recall curve
    precision, recall, _ = precision_recall_curve(y_test, y_test_pred_proba)
    avg_precision = average_precision_score(y_test, y_test_pred_proba)

    plt.figure(figsize=(10, 8))
    plt.plot(recall, precision, label=f'Average Precision = {avg_precision:.4f}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='best')
    plt.grid(True)
    plt.savefig('plots/precision_recall_curve.png')
    print("Precision-recall curve saved as 'plots/precision_recall_curve.png'")
    plt.close()


def main():
    """Train an XGBoost binary classifier and evaluate it on a separate test file.

    Loads 'enhanced_train.csv' and 'fixed_test.csv' via load_data, runs both
    through preprocess_data (the scaler returned for the training data is
    passed to the test-data call), fits the model on all training rows, prints
    the metrics returned by evaluate_performance, and writes figures to 'plots/'.

    Every failure in loading or preprocessing is reported with a printed
    message and ends the run early; nothing is returned or raised to the caller.
    """
    # Load the training data
    try:
        print("Loading training data...")
        train_df = load_data('enhanced_train.csv')
    except Exception as e:
        print(f"Error loading training data: {e}")
        return

    # Load the test data
    try:
        print("Loading test data...")
        test_df = load_data('fixed_test.csv')
    except Exception as e:
        print(f"Error loading test data: {e}")
        return

    # Display data summary
    print("\nTraining data summary:")
    print(f"Columns: {train_df.columns.tolist()[:5]} ... (total: {len(train_df.columns)})")
    print(f"Number of samples: {len(train_df)}")

    print("\nTest data summary:")
    print(f"Columns: {test_df.columns.tolist()[:5]} ... (total: {len(test_df.columns)})")
    print(f"Number of samples: {len(test_df)}")

    # Preprocess the training data
    try:
        print("\nPreprocessing training data...")
        train_processed, scaler = preprocess_data(train_df, fit_scaler=True)
    except Exception as e:
        print(f"Error preprocessing training data: {e}")
        return

    # Preprocess the test data using the same scaler
    try:
        print("\nPreprocessing test data...")
        test_processed = preprocess_data(test_df, scaler=scaler, fit_scaler=False)
    except Exception as e:
        print(f"Error preprocessing test data: {e}")
        return

    # Check if we have the necessary binary label column in both datasets
    if 'Label_Binary' not in train_processed.columns:
        print("No 'Label_Binary' column found in training data after preprocessing. Cannot proceed.")
        return

    if 'Label_Binary' not in test_processed.columns:
        print("No 'Label_Binary' column found in test data after preprocessing. Cannot proceed.")
        return

    # Check label distribution in training data
    train_label_dist = train_processed['Label_Binary'].value_counts()
    print(f"\nTraining data label distribution: {train_label_dist.to_dict()}")

    # Check label distribution in test data
    test_label_dist = test_processed['Label_Binary'].value_counts()
    print(f"Test data label distribution: {test_label_dist.to_dict()}")

    # Remember whether synthetic rows were injected. This cannot be worked out
    # afterwards, because the augmented frame always contains both classes.
    used_synthetic = False

    # Handle case where only one class is present in the training data
    if train_processed['Label_Binary'].nunique() < 2:
        print(f"Warning: Only one class present in training data ({train_processed['Label_Binary'].unique()[0]})")
        print("Generating synthetic data for demonstration purposes...")
        train_processed = _add_synthetic_minority(train_processed)
        used_synthetic = True

    # Get the original labels for detailed analysis
    original_labels_test = test_processed.get('Original_Label', None)

    # Split features and target for training data
    X_train = train_processed.drop(['Label_Binary'], axis=1)
    if 'Original_Label' in X_train.columns:
        X_train = X_train.drop(['Original_Label'], axis=1)
    y_train = train_processed['Label_Binary']

    # Split features and target for test data
    X_test = test_processed.drop(['Label_Binary'], axis=1)
    if 'Original_Label' in X_test.columns:
        X_test = X_test.drop(['Original_Label'], axis=1)
    y_test = test_processed['Label_Binary']

    print(f"\nTraining features shape: {X_train.shape}")
    print(f"Training target shape: {y_train.shape}")
    print(f"Test features shape: {X_test.shape}")
    print(f"Test target shape: {y_test.shape}")

    # Initialize XGBoost classifier
    model = xgb.XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Train the model on the entire training set
    print("\nTraining XGBoost model...")
    model.fit(X_train, y_train)

    # Measure prediction time on test set.
    # NOTE: the timed window covers both predict_proba and predict, i.e. two
    # inference passes over X_test.
    print("\nEvaluating on test set...")
    start_time = time.time()
    y_test_pred_proba = model.predict_proba(X_test)[:, 1]
    y_test_pred = model.predict(X_test)
    end_time = time.time()

    # Calculate prediction time per sample in microseconds
    test_prediction_time = (end_time - start_time) * 1000000 / len(X_test)

    # Calculate final metrics and get detailed false positive/negative analysis
    test_metrics, fp_fn_analysis = evaluate_performance(y_test, y_test_pred, y_test_pred_proba,
                                                       test_prediction_time, original_labels_test)

    # Print test metrics
    print("\nTest set performance metrics:")
    for metric_name, metric_value in test_metrics.items():
        print(f"  {metric_name}: {metric_value:.6f}")

    # Print classification report
    print("\nDetailed classification report:")
    print(classification_report(y_test, y_test_pred, zero_division=0))

    # Print false positive and false negative analysis
    print("\nFalse Positive and False Negative Analysis:")
    if 'False Negative Types' in fp_fn_analysis:
        print("  False Negative breakdown by attack type:")
        for attack_type, count in fp_fn_analysis['False Negative Types'].items():
            print(f"    {attack_type}: {count}")

    print(f"  Total False Positives: {test_metrics['False Positives']}")
    print(f"  Total False Negatives: {test_metrics['False Negatives']}")

    # Plotting is best-effort: a failure is reported but does not abort the run.
    try:
        _save_evaluation_plots(model, y_test, y_test_pred, y_test_pred_proba,
                               test_metrics, fp_fn_analysis)
    except Exception as e:
        print(f"Error generating plots: {e}")
        # Drop any half-built figure so it is not rendered or leaked later.
        plt.close('all')

    print("\nAnalysis complete!")

    # Print a note about synthetic data if it was used
    if used_synthetic:
        print("\nNOTE: This analysis used synthetic data for demonstration purposes.")
        print("In a real-world scenario, you would need balanced class representation for meaningful results.")

# Entry point: run the full train/evaluate pipeline when this code is executed
# directly (the captured output below shows it running inside the notebook).
if __name__ == "__main__":
    main()

Loading training data...
Data loaded as a single column. Trying comma delimiter...
Loaded dataset with comma delimiter. Shape: (1191343, 92)
Loading test data...
Data loaded as a single column. Trying comma delimiter...
Loaded dataset with comma delimiter. Shape: (909633, 91)

Training data summary:
Columns: ['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP'] ... (total: 92)
Number of samples: 1191343

Test data summary:
Columns: ['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP'] ... (total: 91)
Number of samples: 909633

Preprocessing training data...

Preprocessing data:
Initial columns: ['id', 'Flow ID', 'Src IP', 'Src Port', 'Dst IP']... (total: 92)
Label distribution: {'BENIGN': 1006478, 'DoS Hulk': 158468, 'DoS GoldenEye': 7567, 'FTP-Patator': 3972, 'DoS Slowloris': 3859, 'DoS Slowhttptest - Attempted': 3368, 'SSH-Patator': 2961, 'DoS Slowloris - Attempted': 1847, 'DoS Slowhttptest': 1740, 'DoS Hulk - Attempted': 581, 'Portscan': 174, 'DDoS': 104, 'DoS GoldenEye - Attempted': 80, 'I

<Figure size 1200x1000 with 0 Axes>