In [1]:

import os

# Getting all the arff files from the current directory
files = [arff for arff in os.listdir('.') if arff.endswith(".arff")]

# Function for converting arff list to csv list
def toCsv(text):
    data = False
    header = ""
    new_content = []
    for line in text:
        if not data:
            if "@ATTRIBUTE" in line or "@attribute" in line:
                attributes = line.split()
                if("@attribute" in line):
                    attri_case = "@attribute"
                else:
                    attri_case = "@ATTRIBUTE"
                column_name = attributes[attributes.index(attri_case) + 1]
                header = header + column_name + ","
            elif "@DATA" in line or "@data" in line:
                data = True
                header = header[:-1]
                header += '\n'
                new_content.append(header)
        else:
            new_content.append(line)
    return new_content


# Main loop for reading and writing files
for file in files:
    with open(file, "r") as inFile:
        content = inFile.readlines()
        name, ext = os.path.splitext(inFile.name)
        new = toCsv(content)
        with open(name + ".csv", "w") as outFile:
            outFile.writelines(new)


In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import requests
from io import StringIO


# Define column names based on the provided header
column_names = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'lnum_compromised', 'lroot_shell', 'lsu_attempted', 'lnum_root', 
    'lnum_file_creations', 'lnum_shells', 'lnum_access_files', 'lnum_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label'
]



df = pd.read_csv('KDDCup99_full.csv', names=column_names, low_memory=False)
# And for the split, remove stratification which may be causing issues with rare classes:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)
print(f"Dataset loaded with shape: {df.shape}")

# 2. Identify and encode categorical features
print("\nConverting categorical features to numerical using Label Encoding...")
categorical_cols = df.select_dtypes(include=['object']).columns
print(f"Categorical columns found: {list(categorical_cols)}")

# Apply label encoding to each categorical column
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    print(f"Encoded {col}: {dict(zip(le.classes_, le.transform(le.classes_)))}")

print("\nSample data after encoding:")
print(df.head())

# 3. Scale numerical features for genetic algorithm (Min-Max scaling)
print("\nScaling numerical features...")
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
numeric_cols = numeric_cols.drop('label') if 'label' in numeric_cols else numeric_cols

# Scale each numeric column to [0,1] range
for col in numeric_cols:
    min_val = df[col].min()
    max_val = df[col].max()
    # Avoid division by zero
    if max_val > min_val:
        df[col] = (df[col] - min_val) / (max_val - min_val)
    else:
        df[col] = 0  # If all values are the same

# 4. Split into training and testing sets
print("\nSplitting data into training and test sets (70/30 split)...")
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

# 5. Save the processed datasets to CSV files
train_df.to_csv('kdd_cup_train.csv', index=False)
test_df.to_csv('kdd_cup_test.csv', index=False)

print("\nProcessed data saved to:")
print("- kdd_cup_train.csv")
print("- kdd_cup_test.csv")

# Display distribution in train/test sets to verify stratification
print("\nLabel distribution in training set:")
print(train_df['label'].value_counts(normalize=True).sort_index())
print("\nLabel distribution in test set:")
print(test_df['label'].value_counts(normalize=True).sort_index())

print("\nPreprocessing complete. Data is ready for genetic algorithm application.")

Dataset loaded with shape: (2204293, 42)

Converting categorical features to numerical using Label Encoding...
Categorical columns found: ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'lnum_compromised', 'lroot_shell', 'lsu_attempted', 'lnum_root', 'lnum_file_creations', 'lnum_shells', 'lnum_access_files', 'lnum_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'label']
Encoded duration: {'0': 0, '1': 1, '10': 2, '100': 3, '1000': 4, '10007': 5, '1001': 6, '10010': 7, '1002': 8, '1002