In [1]:
import pandas as pd
import numpy as np

# Column names for the NSL-KDD file, listed in file order:
# 41 feature columns, followed by the attack type and the difficulty score.
COLUMNS = [
    # per-connection columns
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    # count / rate columns
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    # dst_host_* columns
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    # non-feature columns: target label and difficulty score
    'attack_type',
    'difficulty_level',
]

# Path of the NSL-KDD data file to load (resolved against the current working directory).
file_name = "KDDTrain+.txt"

try:
    # The file is read without a header row; column names come from COLUMNS.
    df = pd.read_csv(file_name, header=None, names=COLUMNS)
except Exception as e:
    # Report the failure, then re-raise. Later cells require `df`; swallowing the
    # error here would make them fail with a confusing NameError, or silently
    # reuse a stale `df` left over in the kernel from a previous run.
    print(f"Error loading file: {e}")
    raise
else:
    # Display is kept outside the try body so a rendering problem (e.g. in
    # to_markdown) is not misreported as a file-loading error.
    print("--- DataFrame Head ---")
    print(df.head().to_markdown(index=False, numalign="left", stralign="left"))
    print("\n--- DataFrame Info ---")
    df.info()
    

--- DataFrame Head ---
| duration   | protocol_type   | service   | flag   | src_bytes   | dst_bytes   | land   | wrong_fragment   | urgent   | hot   | num_failed_logins   | logged_in   | num_compromised   | root_shell   | su_attempted   | num_root   | num_file_creations   | num_shells   | num_access_files   | num_outbound_cmds   | is_host_login   | is_guest_login   | count   | srv_count   | serror_rate   | srv_serror_rate   | rerror_rate   | srv_rerror_rate   | same_srv_rate   | diff_srv_rate   | srv_diff_host_rate   | dst_host_count   | dst_host_srv_count   | dst_host_same_srv_rate   | dst_host_diff_srv_rate   | dst_host_same_src_port_rate   | dst_host_srv_diff_host_rate   | dst_host_serror_rate   | dst_host_srv_serror_rate   | dst_host_rerror_rate   | dst_host_srv_rerror_rate   | attack_type   | difficulty_level   |
|:-----------|:----------------|:----------|:-------|:------------|:------------|:-------|:-----------------|:---------|:------|:--------------------|:------------|:----

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import joblib

# Preprocessing cell: turns the loaded frame `df` into a numeric feature matrix
# (min-max scaled numeric columns + one-hot encoded categorical columns) and an
# integer-encoded target, then writes the results and the fitted transformers to disk.

# 1. Drop the 'difficulty_level' column (it is neither a feature nor the target).
#    errors='ignore' keeps the cell re-runnable: on a second run the column is
#    already gone and a plain drop would raise KeyError.
df = df.drop(columns=['difficulty_level'], errors='ignore')

# 2. Separate Features (X) and Target (y)
X = df.drop(columns=['attack_type'])
y = df['attack_type']

# Partition the feature columns by dtype. Using "number" / "not number" instead of
# the exact dtypes 'int64' / 'float64' / 'object' guarantees that every column lands
# in exactly one group, so no column is silently left out of X_processed.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

# 3. Handle Categorical Features (One-Hot Encoding)
#    get_dummies keeps the row index of its input, so no re-indexing is needed.
X_cat = X[categorical_cols]
X_cat_encoded = pd.get_dummies(X_cat, columns=categorical_cols, prefix=list(categorical_cols))

# 4. Handle Numerical Features (Min-Max Scaling)
#    fit_transform returns a bare array, so column labels and the original row
#    index are restored when wrapping it back into a DataFrame.
scaler = MinMaxScaler()
X_num = X[numerical_cols]
X_num_scaled = scaler.fit_transform(X_num)
X_num_scaled_df = pd.DataFrame(X_num_scaled, columns=numerical_cols, index=X.index)

# 5. Combine Features (both parts share X's index, so rows align one-to-one)
X_processed = pd.concat([X_num_scaled_df, X_cat_encoded], axis=1)
assert len(X_processed) == len(X), "row count changed while combining features"

# 6. Preprocessing of Target Variable (y)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_df = pd.DataFrame(y_encoded, columns=['attack_label'])
# Store the mapping for later use (e.g., in Flask app)
attack_type_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Display results
print("\n--- Processed Features Shape and Head ---")
print(f"Original Features Shape: {X.shape}")
print(f"Processed Features Shape: {X_processed.shape}")
print(X_processed.head().to_markdown(index=False, numalign="left", stralign="left"))

print("\n--- Target Variable Mapping ---")
print(f"Attack Types: {label_encoder.classes_}")
print(f"Mapping: {attack_type_mapping}")

# Save the processed data to a CSV for potential model training steps
X_processed.to_csv("KDDTrain+_features_processed.csv", index=False)
y_df.to_csv("KDDTrain+_target_processed.csv", index=False)

# Save the scaler and encoder objects for use in the Flask application
joblib.dump(scaler, 'min_max_scaler.joblib')
joblib.dump(label_encoder, 'label_encoder.joblib')
# Also persist the exact feature column order. The one-hot columns depend on the
# category values present in this file, so inference code needs this list to
# reindex its own encoded input to the same layout.
joblib.dump(list(X_processed.columns), 'feature_columns.joblib')

print("\nSaved Scaler, Label Encoder, and Processed Data to files for ML training.")


--- Processed Features Shape and Head ---
Original Features Shape: (125973, 41)
Processed Features Shape: (125973, 122)
| duration   | src_bytes   | dst_bytes   | land   | wrong_fragment   | urgent   | hot   | num_failed_logins   | logged_in   | num_compromised   | root_shell   | su_attempted   | num_root   | num_file_creations   | num_shells   | num_access_files   | num_outbound_cmds   | is_host_login   | is_guest_login   | count      | srv_count   | serror_rate   | srv_serror_rate   | rerror_rate   | srv_rerror_rate   | same_srv_rate   | diff_srv_rate   | srv_diff_host_rate   | dst_host_count   | dst_host_srv_count   | dst_host_same_srv_rate   | dst_host_diff_srv_rate   | dst_host_same_src_port_rate   | dst_host_srv_diff_host_rate   | dst_host_serror_rate   | dst_host_srv_serror_rate   | dst_host_rerror_rate   | dst_host_srv_rerror_rate   | protocol_type_icmp   | protocol_type_tcp   | protocol_type_udp   | service_IRC   | service_X11   | service_Z39_50   | service_aol   | service_au