In [26]:
import pandas as pd

# Load the datasets
kdd_train = pd.read_csv('KDDTrain+_20Percent.txt', header=None)
kdd_test = pd.read_csv('KDDTest+.csv', header=None)
kdd_test_21 = pd.read_csv('KDDTest-21.csv', header=None)

# Print the first few rows to inspect the structure
print(kdd_train.head())
print(kdd_test.head())
print(kdd_test_21.head())

# Print the number of columns in each dataset
print(f'KDDTrain+ columns: {kdd_train.shape[1]}')
print(f'KDDTest+ columns: {kdd_test.shape[1]}')
print(f'KDDTest-21 columns: {kdd_test_21.shape[1]}')


   0    1         2   3    4     5   6   7   8   9   ...    33    34    35  \
0   0  tcp  ftp_data  SF  491     0   0   0   0   0  ...  0.17  0.03  0.17   
1   0  udp     other  SF  146     0   0   0   0   0  ...  0.00  0.60  0.88   
2   0  tcp   private  S0    0     0   0   0   0   0  ...  0.10  0.05  0.00   
3   0  tcp      http  SF  232  8153   0   0   0   0  ...  1.00  0.00  0.03   
4   0  tcp      http  SF  199   420   0   0   0   0  ...  1.00  0.00  0.00   

     36    37    38    39    40       41  42  
0  0.00  0.00  0.00  0.05  0.00   normal  20  
1  0.00  0.00  0.00  0.00  0.00   normal  15  
2  0.00  1.00  1.00  0.00  0.00  neptune  19  
3  0.04  0.03  0.01  0.00  0.01   normal  21  
4  0.00  0.00  0.00  0.00  0.00   normal  21  

[5 rows x 43 columns]
   0     1         2     3      4   5   6   7   8   9   ...    33    34    35  \
0   0   tcp   private   REJ      0   0   0   0   0   0  ...  0.04  0.06  0.00   
1   0   tcp   private   REJ      0   0   0   0   0   0  ...  0.0

In [27]:
columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
           'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
           'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
           'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
           'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
           'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
           'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
           'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
           'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 
           'attack', 'difficulty_level']


In [28]:
# Add column names to the datasets
kdd_train.columns = columns
kdd_test.columns = columns
kdd_test_21.columns = columns

# Handle missing values if any
kdd_train = kdd_train.dropna()
kdd_test = kdd_test.dropna()
kdd_test_21 = kdd_test_21.dropna()



In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
# Define the encoder with handle_unknown='ignore'
encoder = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Split dataset into features and labels for the full dataset
X_train = kdd_train.drop(['attack', 'difficulty_level'], axis=1)
y_train = kdd_train['attack']
X_test = kdd_test.drop(['attack', 'difficulty_level'], axis=1)
y_test = kdd_test['attack']

# Encode labels
y_train = y_train.map(lambda x: 0 if x == 'normal' else 1)
y_test = y_test.map(lambda x: 0 if x == 'normal' else 1)

# Normalize or standardize the feature values
scaler = StandardScaler()

# Create preprocessing pipeline
preprocessor = Pipeline(steps=[
    ('encoder', encoder),
    ('scaler', scaler)
])

# Preprocess the data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Create and train models with the smaller subset for initial testing
# Random Forest
rf = RandomForestClassifier(random_state=42)
param_dist_rf = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
random_search_rf = RandomizedSearchCV(estimator=rf, param_distributions=param_dist_rf, n_iter=10, cv=3, random_state=42, n_jobs=-1)
random_search_rf.fit(X_train, y_train)

best_rf = random_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)

print("Random Forest Classifier")
print("Best Parameters:", random_search_rf.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Classifier
Best Parameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}
Accuracy: 0.7713726739160107
ROC-AUC: 0.7960585494562669
              precision    recall  f1-score   support

           0       0.66      0.97      0.79      9450
           1       0.97      0.62      0.76     12529

    accuracy                           0.77     21979
   macro avg       0.81      0.80      0.77     21979
weighted avg       0.83      0.77      0.77     21979



In [33]:
from sklearn.svm import SVC


X_train_sample = X_train[:10000]
y_train_sample = y_train[:10000]

# Support Vector Machines
svm = SVC(probability=True, random_state=42)
param_dist_svm = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']
}

# Limit n_iter for faster computation
random_search_svm = RandomizedSearchCV(estimator=svm, param_distributions=param_dist_svm, n_iter=5, cv=3, random_state=42, n_jobs=-1)
random_search_svm.fit(X_train_sample, y_train_sample)

best_svm = random_search_svm.best_estimator_
y_pred_svm = best_svm.predict(X_test)

print("Support Vector Machine Classifier")
print("Best Parameters:", random_search_svm.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))


Support Vector Machine Classifier
Best Parameters: {'kernel': 'rbf', 'gamma': 'scale', 'C': 10}
Accuracy: 0.7968515401064653
ROC-AUC: 0.8125945225067261
              precision    recall  f1-score   support

           0       0.70      0.92      0.80      9450
           1       0.93      0.70      0.80     12529

    accuracy                           0.80     21979
   macro avg       0.81      0.81      0.80     21979
weighted avg       0.83      0.80      0.80     21979



In [31]:
# Deep Neural Networks
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

def create_dnn_model(input_dim):
    model = Sequential([
        Dense(32, activation='relu', input_dim=input_dim),
        BatchNormalization(),
        Dropout(0.5),
        Dense(16, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

input_dim = X_train.shape[1]
dnn_model = create_dnn_model(input_dim)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = dnn_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

y_pred_dnn_prob = dnn_model.predict(X_test)
y_pred_dnn = (y_pred_dnn_prob > 0.5).astype(int)

print("Deep Neural Network Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred_dnn))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_dnn_prob))
print(classification_report(y_test, y_pred_dnn))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 9ms/step - accuracy: 0.7703 - loss: 0.5203 - val_accuracy: 0.9663 - val_loss: 0.1022
Epoch 2/20
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9503 - loss: 0.1646 - val_accuracy: 0.9718 - val_loss: 0.0779
Epoch 3/20
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9639 - loss: 0.1263 - val_accuracy: 0.9740 - val_loss: 0.0647
Epoch 4/20
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.9653 - loss: 0.1090 - val_accuracy: 0.9752 - val_loss: 0.0577
Epoch 5/20
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.9715 - loss: 0.0931 - val_accuracy: 0.9772 - val_loss: 0.0551
Epoch 6/20
[1m630/630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9711 - loss: 0.0911 - val_accuracy: 0.9790 - val_loss: 0.0457
Epoch 7/20
[1m630/630[0m 