Preprocessing the CSV file

In [6]:
import pandas as pd

def preprocess_cicids_attacks(input_csv_path, output_csv_path, attack_labels=None):
    """Add a binary ``BinaryLabel`` column (BENIGN/ATTACK) to a labelled flow CSV.

    The CSV is read, whitespace is stripped from its column names, and every
    row whose ``Label`` value (ignoring surrounding whitespace) is one of
    ``attack_labels`` is marked ``'ATTACK'``; every other row is marked
    ``'BENIGN'``. The original ``Label`` column is left untouched. Label
    distributions before and after are printed, and the augmented frame is
    written to ``output_csv_path`` without the index.

    Args:
        input_csv_path: Path or file-like object accepted by ``pd.read_csv``.
        output_csv_path: Path or file-like object accepted by
            ``DataFrame.to_csv``.
        attack_labels: Optional iterable of label strings to merge under
            ``'ATTACK'``. Defaults to the DoS/Heartbleed labels listed below.

    Raises:
        ValueError: If the file has no ``Label`` column after column names
            are stripped.

    Warns:
        UserWarning: If labels other than ``'BENIGN'`` and the attack labels
            are present (including missing values). Those rows are still
            marked ``'BENIGN'``, so the warning is the only signal that they
            may be mislabelled.
    """
    # Function-scope import so the block does not depend on the import cell.
    import warnings

    if attack_labels is None:
        attack_labels = (
            'DoS GoldenEye',
            'DoS Hulk',
            'DoS Slowhttptest',
            'DoS slowloris',
            'Heartbleed',
        )

    # Load dataset
    df = pd.read_csv(input_csv_path)

    # Strip leading/trailing spaces from column names
    df.columns = df.columns.str.strip()

    if 'Label' not in df.columns:
        raise ValueError(f"'Label' column not found. Columns present: {df.columns.tolist()}")

    # Display original label counts
    print("Original label distribution:")
    print(df['Label'].value_counts())

    # Match on a whitespace-trimmed copy so padded values such as ' DoS Hulk'
    # are still recognised; non-string values (e.g. NaN) pass through as-is.
    normalized = df['Label'].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
    is_attack = normalized.isin(list(attack_labels))

    # Anything that is neither benign nor a listed attack falls back to BENIGN
    # below; surface it so the silent fallback cannot go unnoticed.
    unexpected = normalized[~is_attack & (normalized != 'BENIGN')].unique()
    if len(unexpected) > 0:
        warnings.warn(
            "Unrecognised labels treated as BENIGN: "
            f"{sorted(str(label) for label in unexpected)}",
            stacklevel=2,
        )

    # Create new column with BENIGN or ATTACK (vectorized, no row-wise apply)
    df['BinaryLabel'] = is_attack.map({True: 'ATTACK', False: 'BENIGN'})

    # Check new label counts
    print("\nNew label distribution after merging:")
    print(df['BinaryLabel'].value_counts())

    # Save to new CSV
    df.to_csv(output_csv_path, index=False)
    print(f"\nPreprocessed dataset saved to: {output_csv_path}")

# Example run on the Wednesday capture.
# Source CSV with the original multi-class labels:
input_file = r"C:\Users\USER\Desktop\Capstone\GeneratedLabelledFlows\TrafficLabelling\Wednesday-workingHours-1.pcap_ISCX.csv"
# Destination CSV that will also carry the BinaryLabel column:
output_file = r"C:\Users\USER\Desktop\Capstone\GeneratedLabelledFlows\TrafficLabelling\Wednesday-workingHours-preprocessed.csv"

preprocess_cicids_attacks(
    input_csv_path=input_file,
    output_csv_path=output_file,
)

Original label distribution:
Label
BENIGN              440031
DoS Hulk            231073
DoS GoldenEye        10293
DoS slowloris         5796
DoS Slowhttptest      5499
Heartbleed              11
Name: count, dtype: int64

New label distribution after merging:
BinaryLabel
BENIGN    440031
ATTACK    252672
Name: count, dtype: int64

Preprocessed dataset saved to: C:\Users\USER\Desktop\Capstone\GeneratedLabelledFlows\TrafficLabelling\Wednesday-workingHours-preprocessed.csv
