In [16]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle

# ============================
# Preprocessing Training Data
# ============================

# Load the NSL-KDD training dataset from the CSV file
df_train = pd.read_csv("../../data/kdd_train.csv", header=None)

# Fill missing values in the training data, if any
df_train = df_train.fillna(0)

# Column indices for the relevant categorical fields
categorical_columns = [1, 2, 3, 41]  # 'protocol_type', 'service', 'flag', 'class'

# Initialize LabelEncoder
le_dict = {}  # Dictionary to hold the label encoders for each categorical column

# Convert categorical columns using Label Encoding (store encoders to ensure consistent mapping)
for col in categorical_columns:
    le = LabelEncoder()
    df_train[col] = le.fit_transform(df_train[col])
    le_dict[col] = le  # Save the encoder for later use

# Convert binary columns to numeric
binary_columns = [6, 11, 20, 21]
df_train[binary_columns] = df_train[binary_columns].apply(pd.to_numeric, errors='coerce')

# Ensure all columns are numeric before applying the scaler
df_train = df_train.apply(pd.to_numeric, errors='coerce')

# Separate features and target
X_train = df_train.drop(columns=[41])  # Exclude 'class' column
y_train = df_train[41]  # Target column ('class')

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Save scaler for later use
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Save label encoders
with open("label_encoders.pkl", "wb") as f:
    pickle.dump(le_dict, f)

# Combine scaled features with target labels
df_train_scaled = pd.DataFrame(X_train_scaled)
df_train_scaled['class'] = y_train

df_train_scaled = df_test_scaled.dropna()

# Save the scaled training dataset
df_train_scaled.to_csv('../../data/kdd_train_scaled.csv', index=False)

print("Training data preprocessing complete, and scaler/label encoders saved.")
print(df_train_scaled.head())

# ============================
# Preprocessing Test Data
# ============================

# Load the NSL-KDD test dataset
df_test = pd.read_csv("../../data/kdd_test.csv", header=None)

# Fill missing values in the test data
df_test = df_test.fillna(0)

# Load the label encoders
with open("label_encoders.pkl", "rb") as f:
    le_dict = pickle.load(f)

# Apply the same LabelEncoders to the test data
for col in categorical_columns:
    if col in le_dict:
        le = le_dict[col]
        try:
            df_test[col] = le.transform(df_test[col])
        except ValueError:
            # Handle unseen labels by setting them to a default value (e.g., -1)
            df_test[col] = df_test[col].apply(lambda x: le.transform([x])[0] if x in le.classes_ else -1)

# Convert binary columns to numeric
df_test[binary_columns] = df_test[binary_columns].apply(pd.to_numeric, errors='coerce')

# Ensure all columns are numeric
df_test = df_test.apply(pd.to_numeric, errors='coerce')

# Separate features and target
X_test = df_test.drop(columns=[41])
y_test = df_test[41]

# Load the scaler and apply it to the test data
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

X_test_scaled = scaler.transform(X_test)

# Combine scaled features with target labels
df_test_scaled = pd.DataFrame(X_test_scaled)
df_test_scaled['class'] = y_test

df_test_scaled = df_test_scaled.dropna()

# Save the scaled test dataset
df_test_scaled.to_csv('../../data/kdd_test_scaled.csv', index=False)

print("Test data preprocessing complete.")
print(df_test_scaled.head())


  df_train = pd.read_csv("../../data/kdd_train.csv", header=None)


Training data preprocessing complete, and scaler/label encoders saved.
          0         1         2         3         4         5         6  \
1 -0.110249  0.020418  1.065923 -2.223582 -0.007762 -0.004919 -0.014089   
2 -0.110249  0.020418  1.065923 -2.223582 -0.007762 -0.004919 -0.014089   
3 -0.109481  0.020418 -0.685614  0.751096 -0.005551 -0.004919 -0.014089   
4 -0.110249 -3.214973 -1.048000  0.751096 -0.007759 -0.004919 -0.014089   
5 -0.109865  0.020418  1.790697 -1.851747 -0.007762 -0.004915 -0.014089   

          7         8         9  ...        32        33        34        35  \
1 -0.089486 -0.007736 -0.095076  ... -0.954389 -1.071933 -0.121485 -0.480197   
2 -0.089486 -0.007736 -0.095076  ... -1.035688 -1.161030 -0.121485 -0.480197   
3 -0.089486 -0.007736 -0.095076  ... -0.267863  0.197703 -0.227350  1.493939   
4 -0.089486 -0.007736 -0.095076  ... -0.529826  1.066401 -0.439078  2.756092   
5 -0.089486 -0.007736 -0.095076  ... -0.267863 -0.470526  0.460769 -0.383108  

  df_test = pd.read_csv("../../data/kdd_test.csv", header=None)


Test data preprocessing complete.
          0         1         2         3         4         5         6  \
1 -0.110249  0.020418  1.065923 -2.223582 -0.007762 -0.004919 -0.014089   
2 -0.110249  0.020418  1.065923 -2.223582 -0.007762 -0.004919 -0.014089   
3 -0.109481  0.020418 -0.685614  0.751096 -0.005551 -0.004919 -0.014089   
4 -0.110249 -3.214973 -1.048000  0.751096 -0.007759 -0.004919 -0.014089   
5 -0.109865  0.020418  1.790697 -1.851747 -0.007762 -0.004915 -0.014089   

          7         8         9  ...        32        33        34        35  \
1 -0.089486 -0.007736 -0.095076  ... -0.954389 -1.071933 -0.121485 -0.480197   
2 -0.089486 -0.007736 -0.095076  ... -1.035688 -1.161030 -0.121485 -0.480197   
3 -0.089486 -0.007736 -0.095076  ... -0.267863  0.197703 -0.227350  1.493939   
4 -0.089486 -0.007736 -0.095076  ... -0.529826  1.066401 -0.439078  2.756092   
5 -0.089486 -0.007736 -0.095076  ... -0.267863 -0.470526  0.460769 -0.383108   

         36        37        38   