In [1]:
# All imports
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:

# File path for the dataset
file_path = '/Users/maryam/ML-IDS/archive/KDDTrain+.txt'

# Column names as per the dataset description
columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes",
    "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label", "difficulty_level"
]

# Load the dataset
df_train = pd.read_csv(file_path, header=None, names=columns)

# Display the first few rows of the dataframe
df_train


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty_level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,tcp,private,S0,0,0,0,0,0,0,...,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,neptune,20
125969,8,udp,private,SF,105,145,0,0,0,0,...,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,normal,21
125970,0,tcp,smtp,SF,2231,384,0,0,0,0,...,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,normal,18
125971,0,tcp,klogin,S0,0,0,0,0,0,0,...,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,20


In [3]:

# Assuming df is your dataframe for the NSL-KDD dataset
unique_attack_types = df_train['label'].unique()

# This will give you a list-like array of all unique attack types in the dataset
print(unique_attack_types)


['normal' 'neptune' 'warezclient' 'ipsweep' 'portsweep' 'teardrop' 'nmap'
 'satan' 'smurf' 'pod' 'back' 'guess_passwd' 'ftp_write' 'multihop'
 'rootkit' 'buffer_overflow' 'imap' 'warezmaster' 'phf' 'land'
 'loadmodule' 'spy' 'perl']


In [4]:

# Check for missing values
missing_values = df_train.isnull().sum()

# Columns to be one-hot encoded because they are categorical variables
categorical_columns = ['protocol_type', 'service', 'flag']

# Remaining columns for scaling because they are numerical variables
numerical_columns = [col for col in df_train.columns if col not in categorical_columns + ['label', 'difficulty_level']]

# Create a ColumnTransformer for preprocessing 
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])

# Creating a pipeline that first preprocesses the data and then applies a placeholder model
# Placeholder model can be replaced later with the chosen ML model
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Apply the preprocessing pipeline to the dataset (excluding the label and difficulty level)
X = df_train.drop(['label', 'difficulty_level'], axis=1)
y = df_train['label']

# Fit and transform the pipeline on the dataset
X_processed = pipeline.fit_transform(X)

# Checking the result of preprocessing
X_processed.shape  # Display the new shape of the data after preprocessing


(125973, 122)

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 43 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  int64  
 1   protocol_type                125973 non-null  object 
 2   service                      125973 non-null  object 
 3   flag                         125973 non-null  object 
 4   src_bytes                    125973 non-null  int64  
 5   dst_bytes                    125973 non-null  int64  
 6   land                         125973 non-null  int64  
 7   wrong_fragment               125973 non-null  int64  
 8   urgent                       125973 non-null  int64  
 9   hot                          125973 non-null  int64  
 10  num_failed_logins            125973 non-null  int64  
 11  logged_in                    125973 non-null  int64  
 12  num_compromised              125973 non-null  int64  
 13 

In [6]:

# import the model
from sklearn.tree import DecisionTreeClassifier

# Create a placeholder model
model = DecisionTreeClassifier()
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# compute the accuracy report of the model
print("Accuracy of the DecisionTreeClassifier is: ", accuracy_score(y_test, y_pred))

# can you try another medel and see if you can get a better accuracy?
# Create a placeholder model
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# compute the accuracy report of the model
print("Accuracy of the RandomForestClassifier is: ", accuracy_score(y_test, y_pred))

# Create a placeholder model
model = SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# compute the accuracy report of the model
print("Accuracy of the SVC is: ", accuracy_score(y_test, y_pred))






Accuracy of the DecisionTreeClassifier is:  0.9967453859892836
Accuracy of the RandomForestClassifier is:  0.9980154792617583
Accuracy of the SVC is:  0.9911887279222068


In [8]:
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score

# Initialize the models
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC()
}

# Dictionary to store the performance metrics of each model
performance_metrics = {}

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    performance_metrics[model_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    }

# Create a DataFrame from the dictionary
performance_df = pd.DataFrame(performance_metrics).T

# Display the DataFrame
print(performance_df)


               Accuracy  Precision    Recall  F1 Score
Decision Tree  0.996547   0.996972  0.996547  0.996717
Random Forest  0.997936   0.997775  0.997936  0.997789
SVM            0.991189   0.990872  0.991189  0.990866


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")


In [21]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier with default parameters
# You can change these to the parameters you've chosen based on hyperparameter tuning
final_rf_model = RandomForestClassifier()

# Assuming X_processed is your entire training features and y is the corresponding labels
final_rf_model.fit(X_processed, y)

# Now the model is trained on the entire training dataset


# TESTING THE MODEL 

In [18]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Assuming that 'pipeline' is your fitted preprocessing pipeline from the training phase
# and 'columns' is the list of column names used in the training dataset

# Load the test data
test_file_path = '/Users/maryam/ML-IDS/archive/KDDTest+.txt'  # Update this to your test data file path
df_test = pd.read_csv(test_file_path, header=None, names=columns)

# Separate features and labels
X_test = df_test.drop(['label', 'difficulty_level'], axis=1)
y_test = df_test['label']

# Apply the same preprocessing to the test data
X_test_processed = pipeline.transform(X_test)

# Now, X_test_processed is ready for evaluation with your trained model



In [22]:
# Assuming X_test_processed is your preprocessed test features and y_test is the true labels
y_pred_test = final_rf_model.predict(X_test_processed)

# Evaluate the model's performance
from sklearn.metrics import classification_report, accuracy_score

print("Accuracy on test data:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))


Accuracy on test data: 0.7219659332860184

Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

        apache2       0.00      0.00      0.00       737
           back       0.74      0.96      0.84       359
buffer_overflow       0.00      0.00      0.00        20
      ftp_write       0.00      0.00      0.00         3
   guess_passwd       0.00      0.00      0.00      1231
     httptunnel       0.00      0.00      0.00       133
           imap       0.00      0.00      0.00         1
        ipsweep       0.76      0.99      0.86       141
           land       1.00      0.29      0.44         7
     loadmodule       0.00      0.00      0.00         2
       mailbomb       0.00      0.00      0.00       293
          mscan       0.00      0.00      0.00       996
       multihop       0.00      0.00      0.00        18
          named       0.00      0.00      0.00        17
        neptune       0.96      1.00      0.98      4657
           nmap       0.58      1.00      0.73        73
         normal       0.64    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Assuming X_processed and y are your preprocessed feature set and labels

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Initialize the MLPClassifier with one hidden layer of 100 neurons
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=200, random_state=42)

# Train the model
mlp_model.fit(X_train, y_train)

# Predict on the test set
y_pred_test = mlp_model.predict(X_test)

# Evaluate the model's performance
print("Accuracy on test data:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_test))


Accuracy on test data: 0.9959912681087517

Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

           back       0.98      0.99      0.99       185
buffer_overflow       1.00      0.22      0.36         9
   guess_passwd       1.00      0.91      0.95        11
           imap       1.00      1.00      1.00         1
        ipsweep       0.98      0.98      0.98       733
           land       0.60      1.00      0.75         3
     loadmodule       0.00      0.00      0.00         0
        neptune       1.00      1.00      1.00      8228
           nmap       0.98      0.97      0.98       313
         normal       1.00      1.00      1.00     13422
           perl       0.50      1.00      0.67         1
            phf       1.00      1.00      1.00         1
            pod       1.00      0.93      0.96        43
      portsweep       1.00      1.00      1.00       573
        rootkit       0.00      0.00      0.00         1
          satan       0.99      0.98      0.98       738
          smurf       0.99    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
from sklearn.metrics import accuracy_score

# Predict on the training set
y_pred_train = mlp_model.predict(X_train)

# Calculate accuracy on the training set
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Training Accuracy:", train_accuracy)

# Predict on the test set
y_pred_test = mlp_model.predict(X_test)

# Calculate accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Testing Accuracy:", test_accuracy)


Training Accuracy: 0.9981245906844748
Testing Accuracy: 0.9959912681087517
