In [4]:
# Install missing dependencies
%pip install imbalanced-learn

Collecting imbalanced-learn
  Using cached imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting scipy<2,>=1.10.1 (from imbalanced-learn)
  Downloading scipy-1.16.0-cp313-cp313-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting scikit-learn<2,>=1.3.2 (from imbalanced-learn)
  Using cached scikit_learn-1.7.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Using cached sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Collecting joblib<2,>=1.1.1 (from imbalanced-learn)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl<4,>=2.0.0 (from imbalanced-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting scikit-learn<2,>=1.3.2 (from imbalanced-learn)
  Using cached scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (31 kB)
Using cached imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Using cached joblib-1.5.1-py3-none-any.whl (307 

In [5]:
import os

In [1]:
import pandas as pd

# Read the three source CSVs, in this order, into their own frames.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/metasploitable-2.csv',
        '../data/Normal_data.csv',
        '../data/OVS.csv',
    )
)

# Stack the frames row-wise and renumber the index from zero.
combined_df = pd.concat((df1, df2, df3), ignore_index=True)

# Show every distinct raw value found in the 'Label' column.
print(combined_df.loc[:, 'Label'].unique())

['U2R' 'BFA' 'DDoS' 'DoS' 'Probe' 'Normal' 'DDoS ' 'Web-Attack' 'BOTNET']


In [2]:
# Tally the rows per class using the 'Label' column (most frequent first).
label_counts = combined_df.loc[:, 'Label'].value_counts()
print(label_counts)

Label
Probe         98129
DDoS          73529
Normal        68424
DoS           53616
DDoS          48413
BFA            1405
Web-Attack      192
BOTNET          164
U2R              17
Name: count, dtype: int64


In [3]:
# Some labels carry stray whitespace (e.g. 'DDoS ' next to 'DDoS'), which
# splits one class into two; trim both ends so each class has one spelling.
combined_df['Label'] = combined_df.loc[:, 'Label'].str.strip()

# Recount per class now that the whitespace variants have been merged.
label_counts = combined_df.loc[:, 'Label'].value_counts()
print(label_counts)

Label
DDoS          121942
Probe          98129
Normal         68424
DoS            53616
BFA             1405
Web-Attack       192
BOTNET           164
U2R               17
Name: count, dtype: int64


In [6]:
# Import pandas
import pandas as pd
import glob
import os

# Define data directory (adjust path as needed)
data_dir = '../data'  # Replace with your actual data folder path

# Discover every CSV in the data directory.
# glob returns matches in arbitrary (filesystem-dependent) order, so the list
# is sorted to make the row order of the combined frame reproducible across
# machines and runs.
csv_files = sorted(glob.glob(f'{data_dir}/*.csv'))
if not csv_files:
    raise ValueError(f"No CSV files found in {data_dir}. Please check the directory.")

# Load each file on a best-effort basis: a file that cannot be read is
# reported and skipped, and remembered so the gap can be flagged afterwards.
dfs = []
failed_files = []
for file in csv_files:
    try:
        df = pd.read_csv(file)
    except Exception as e:
        print(f"Error loading {file}: {e}")
        failed_files.append(file)
        continue
    print(f"Loaded {file} with shape: {df.shape}")
    dfs.append(df)

if not dfs:
    raise ValueError("No datasets loaded successfully.")

# Make a partial load impossible to miss: everything downstream would
# otherwise run silently on an incomplete dataset.
if failed_files:
    print(f"\nWarning: {len(failed_files)} file(s) failed to load; "
          f"the combined dataset is incomplete: {failed_files}")

# Stack all successfully loaded frames row-wise with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)
print(f"\nCombined dataset shape: {combined_df.shape}")

# Clean labels: cast to str and trim surrounding whitespace so that spelling
# variants such as 'DDoS ' and 'DDoS' are counted as one class.
combined_df['Label'] = combined_df['Label'].astype(str).str.strip()
print("\nInitial label counts:")
print(combined_df['Label'].value_counts())

Loaded ../data/metasploitable-2.csv with shape: (136743, 84)
Loaded ../data/Normal_data.csv with shape: (68424, 84)
Loaded ../data/OVS.csv with shape: (138722, 84)

Combined dataset shape: (343889, 84)

Initial label counts:
Label
DDoS          121942
Probe          98129
Normal         68424
DoS            53616
BFA             1405
Web-Attack       192
BOTNET           164
U2R               17
Name: count, dtype: int64


In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import glob

# This cell encodes the string labels of `combined_df` as numeric codes, drops
# unused columns, clips negative numeric values, and writes the minority and
# majority class groups to CSV files in `output_dir`.
# It expects `combined_df` to have been created by an earlier loading cell and
# to contain a 'Label' column.

# Define paths
output_dir = '../output'
os.makedirs(output_dir, exist_ok=True)

# Step 1: Verify combined_df
print("\nInitial combined_df shape:", combined_df.shape)
print("\nInitial label counts:")
print(combined_df['Label'].value_counts())

# Step 2: Clean the output folder
# NOTE: this removes every file directly inside output_dir, not only the files
# written below. Entries that cannot be removed (e.g. sub-directories) are
# reported and left in place.
for file_path in glob.glob(f'{output_dir}/*'):
    try:
        os.remove(file_path)
        print(f"Deleted {file_path}")
    except OSError as e:
        print(f"Error deleting {file_path}: {e}")

# Step 3: Encode string labels to numerical values
label_mapping = {
    'DDoS': 0.0,
    'Probe': 1.0,
    'Normal': 2.0,
    'DoS': 3.0,
    'BFA': 4.0,
    'Web-Attack': 5.0,
    'BOTNET': 6.0,
    'U2R': 7.0
}

# If this cell already ran in the current kernel, 'Label' holds the numeric
# codes rather than the names. Mapping those again would turn every label into
# NaN, so detect that case and leave the column alone.
label_codes = set(label_mapping.values())
already_encoded = (
    pd.api.types.is_numeric_dtype(combined_df['Label'])
    and combined_df['Label'].isin(label_codes).all()
)

if already_encoded:
    print("Labels are already numerically encoded; skipping mapping.")
else:
    # Ensure all labels are stripped of whitespace
    raw_labels = combined_df['Label'].astype(str).str.strip()
    print("Unique labels before mapping:", raw_labels.unique())

    # Map into a separate Series so that (a) the original spellings are still
    # available for the error report, and (b) combined_df is left untouched
    # when the mapping is incomplete.
    encoded_labels = raw_labels.map(label_mapping)
    unmapped_mask = encoded_labels.isnull()
    if unmapped_mask.any():
        unmapped_labels = raw_labels[unmapped_mask].unique()
        print("Unmapped labels found:", unmapped_labels)
        raise ValueError(f"Some labels could not be mapped: {unmapped_labels}")

    combined_df['Label'] = encoded_labels

print("\nLabel counts after encoding:")
print(combined_df['Label'].value_counts())

# Step 4: Preprocess data
# Columns removed before saving: identifier/timestamp columns and the columns
# listed as constant. Names that are not present in the frame are ignored.
constant_columns = ['Fwd PSH Flags', 'Fwd URG Flags', 'CWE Flag Count', 'ECE Flag Cnt',
                    'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg',
                    'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Fwd Seg Size Min']
non_numerical = ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp']
columns_to_drop = [col for col in non_numerical + constant_columns if col in combined_df.columns]
combined_df = combined_df.drop(columns=columns_to_drop)

# Replace negative values with 0 in every numeric feature column (the label
# column is excluded). select_dtypes is used for the dtype test because it
# also copes with pandas extension dtypes, which np.issubdtype cannot interpret.
numeric_feature_cols = combined_df.select_dtypes(include='number').columns.drop('Label', errors='ignore')
for col in numeric_feature_cols:
    combined_df[col] = combined_df[col].clip(lower=0)
print(f"\nCombined dataset shape after preprocessing: {combined_df.shape}")

# Step 5: Split into minority and majority groups
# The minority codes are looked up by class name so they cannot drift out of
# sync with label_mapping.
minority_classes = [label_mapping[name] for name in ('BFA', 'Web-Attack', 'BOTNET', 'U2R')]
is_minority = combined_df['Label'].isin(minority_classes)
groups = {
    'minority': combined_df[is_minority],
    'majority': combined_df[~is_minority]
}
for group_name, df in groups.items():
    print(f"\n{group_name} group shape: {df.shape}")
    print(f"{group_name} label counts:")
    print(df['Label'].value_counts())
    if df.empty:
        print(f"Warning: {group_name} group is empty. Check label mapping and data.")

# Step 6: Save grouped datasets only if not empty
for group_name, df in groups.items():
    if not df.empty:
        df.to_csv(f'{output_dir}/{group_name}.csv', index=False)
        print(f"\nSaved {group_name}.csv to {output_dir}/{group_name}.csv")
    else:
        print(f"Skipped saving {group_name}.csv because the group is empty.")

# Step 7: Verify saved files
# Read each written file back and fail if it contains no rows.
for group_name in groups:
    file_path = f'{output_dir}/{group_name}.csv'
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        print(f"\nVerified {file_path} shape: {df.shape}")
        if df.empty:
            raise ValueError(f"Saved {file_path} is empty.")
    else:
        print(f"File {file_path} does not exist, skipping verification.")


Initial combined_df shape: (343889, 84)

Initial label counts:
Label
DDoS          121942
Probe          98129
Normal         68424
DoS            53616
BFA             1405
Web-Attack       192
BOTNET           164
U2R               17
Name: count, dtype: int64
Unique labels before mapping: ['U2R' 'BFA' 'DDoS' 'DoS' 'Probe' 'Normal' 'Web-Attack' 'BOTNET']

Label counts after encoding:
Label
0.0    121942
1.0     98129
2.0     68424
3.0     53616
4.0      1405
5.0       192
6.0       164
7.0        17
Name: count, dtype: int64

Combined dataset shape after preprocessing: (343889, 69)

minority group shape: (1778, 69)
minority label counts:
Label
4.0    1405
5.0     192
6.0     164
7.0      17
Name: count, dtype: int64

majority group shape: (342111, 69)
majority label counts:
Label
0.0    121942
1.0     98129
2.0     68424
3.0     53616
Name: count, dtype: int64

Saved minority.csv to ../output/minority.csv

Saved majority.csv to ../output/majority.csv

Verified ../output/minority.csv

In [8]:
# Reload each exported group and report its dimensions together with the
# distinct label codes it contains.
for file in ('majority.csv', 'minority.csv'):
    saved_path = os.path.join(output_dir, file)
    df = pd.read_csv(saved_path)
    print(f'\n{file} shape: {df.shape}')
    print(f'Labels: {df["Label"].unique().tolist()}')


majority.csv shape: (342111, 69)
Labels: [0.0, 3.0, 1.0, 2.0]

minority.csv shape: (1778, 69)
Labels: [7.0, 4.0, 5.0, 6.0]
