In [None]:
! pip install pandas numpy scikit-learn

Collecting pandas
  Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Using cached numpy-2.2.4-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached pandas-2.2.3-cp312-cp312-win_amd64.whl (11.5 MB)
Downloading numpy-2.2.4-cp312-cp312-win_amd64.whl (12.6 MB)
   ---------------------------------------- 0.0


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd

# Define file paths
train_file = "dataset/KDDTrain+.TXT"
test_file = "dataset/KDDTest+.TXT"

# Column names for the dataset (as per NSL-KDD documentation)
columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", 
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", 
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", 
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds", 
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate", 
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", 
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", 
    "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate", 
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", 
    "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate", 
    "dst_host_srv_rerror_rate", "label", "difficulty_level"
]

# Load dataset
train_df = pd.read_csv(train_file, names=columns)
test_df = pd.read_csv(test_file, names=columns)

# Drop 'difficulty_level' column (not needed for model training)
train_df.drop(columns=['difficulty_level'], inplace=True)
test_df.drop(columns=['difficulty_level'], inplace=True)

# Display dataset info
print(train_df.head())
print("Train set shape:", train_df.shape)
print("Test set shape:", test_df.shape)


   duration protocol_type   service flag  src_bytes  dst_bytes  land  \
0         0           tcp  ftp_data   SF        491          0     0   
1         0           udp     other   SF        146          0     0   
2         0           tcp   private   S0          0          0     0   
3         0           tcp      http   SF        232       8153     0   
4         0           tcp      http   SF        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                  25   
1               0       0    0  ...                   1   
2               0       0    0  ...                  26   
3               0       0    0  ...                 255   
4               0       0    0  ...                 255   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.17                    0.03   
1                    0.00                    0.60   
2                    0.10                    0.05   


In [3]:
from sklearn.preprocessing import LabelEncoder

# List of categorical columns
categorical_columns = ["protocol_type", "service", "flag"]

# Encode categorical variables
encoder = LabelEncoder()
for col in categorical_columns:
    train_df[col] = encoder.fit_transform(train_df[col])
    test_df[col] = encoder.transform(test_df[col])

print("Categorical columns encoded successfully!")
print(train_df.head())


Categorical columns encoded successfully!
   duration  protocol_type  service  flag  src_bytes  dst_bytes  land  \
0         0              1       20     9        491          0     0   
1         0              2       44     9        146          0     0   
2         0              1       49     5          0          0     0   
3         0              1       24     9        232       8153     0   
4         0              1       24     9        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                  25   
1               0       0    0  ...                   1   
2               0       0    0  ...                  26   
3               0       0    0  ...                 255   
4               0       0    0  ...                 255   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.17                    0.03   
1                    0.00                    0.60   
2    

In [4]:
# Convert attack types into binary (normal = 0, attack = 1)
train_df['label'] = train_df['label'].apply(lambda x: 0 if x == "normal" else 1)
test_df['label'] = test_df['label'].apply(lambda x: 0 if x == "normal" else 1)

print("Labels converted to binary classification!")
print(train_df['label'].value_counts())  # Check distribution


Labels converted to binary classification!
label
0    67343
1    58630
Name: count, dtype: int64


In [5]:
from sklearn.preprocessing import MinMaxScaler

# Drop label column for scaling
features = train_df.drop(columns=["label"])
labels = train_df["label"]

# Apply MinMaxScaler
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(features)
test_scaled = scaler.transform(test_df.drop(columns=["label"]))

print("Feature scaling completed!")


Feature scaling completed!


In [6]:
from sklearn.model_selection import train_test_split

# Split training data into training & validation sets
X_train, X_val, y_train, y_val = train_test_split(train_scaled, labels, test_size=0.2, random_state=42)

print("Train set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)

Train set: (100778, 41) (100778,)
Validation set: (25195, 41) (25195,)


In [7]:
! pip install deap scipy

Collecting deap
  Downloading deap-1.4.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading deap-1.4.2-cp312-cp312-win_amd64.whl (109 kB)
Installing collected packages: deap
Successfully installed deap-1.4.2



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import numpy as np
from deap import base, creator, tools, algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Define fitness function (maximize accuracy)
def evaluate(individual):
    selected_features = [index for index, val in enumerate(individual) if val == 1]
    if not selected_features:  # If no features are selected, return worst score
        return 0,
    
    X_selected = X_train[:, selected_features]
    model = RandomForestClassifier(n_estimators=50, random_state=42)
    scores = cross_val_score(model, X_selected, y_train, cv=3, scoring='accuracy')
    return scores.mean(),  # Return accuracy


In [9]:
# Set up GA optimization
num_features = X_train.shape[1]  # 41 features
creator.create("FitnessMax", base.Fitness, weights=(1.0,))  # Maximize accuracy
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)  # Binary (0 or 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=num_features)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("mate", tools.cxTwoPoint)  # Crossover function
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)  # Mutation function
toolbox.register("select", tools.selTournament, tournsize=3)  # Selection function
toolbox.register("evaluate", evaluate)


In [10]:
# Run Genetic Algorithm
population = toolbox.population(n=20)  # Population size
NGEN = 10  # Number of generations
CXPB, MUTPB = 0.5, 0.2  # Crossover & mutation probability

for gen in range(NGEN):
    offspring = algorithms.varAnd(population, toolbox, cxpb=CXPB, mutpb=MUTPB)
    fits = list(map(toolbox.evaluate, offspring))
    for ind, fit in zip(offspring, fits):
        ind.fitness.values = fit
    population = toolbox.select(offspring, k=len(population))

# Best individual (selected features)
best_individual = tools.selBest(population, k=1)[0]
selected_features = [index for index, val in enumerate(best_individual) if val == 1]
print("Selected Features:", selected_features)


Selected Features: [0, 1, 2, 3, 4, 5, 7, 11, 14, 16, 17, 19, 21, 24, 28, 30, 33, 34, 35, 38, 39, 40]


In [11]:
# Train with selected features
X_train_selected = X_train[:, selected_features]
X_val_selected = X_val[:, selected_features]

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_selected, y_train)

# Evaluate the model
accuracy = model.score(X_val_selected, y_val)
print("Validation Accuracy:", accuracy)


Validation Accuracy: 0.998571145068466


In [12]:
! pip install joblib




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import joblib

# Save the trained Random Forest model
joblib.dump(model, "random_forest_nsl_kdd.pkl")

# Save the MinMaxScaler
joblib.dump(scaler, "scaler.pkl")

# Save the selected feature indices
joblib.dump(selected_features, "selected_features.pkl")

print("Model and scaler saved successfully!")


Model and scaler saved successfully!
