### Intrusion Detection System Model Development


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline


In [12]:
# Define the file path and column names
file_path = '/Users/maryam/ML-IDS/archive/KDDTrain+.txt'


# Column names as per the dataset description
columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes",
    "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label", "difficulty_level"
]

# Load the dataset
df = pd.read_csv(file_path, header=None, names=columns)


In [13]:
# Define categorical and numerical columns
categorical_columns = ['protocol_type', 'service', 'flag']
numerical_columns = [col for col in df.columns if col not in categorical_columns + ['label']]

# Set up the preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(), categorical_columns)
    ])
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Apply preprocessing
X = df.drop('label', axis=1)
y = df['label']
X_processed = pipeline.fit_transform(X)


In [14]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)


In [15]:
# Initialize and train the MLP Classifier
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=200, random_state=42)
mlp_model.fit(X_train, y_train)


In [16]:
# Evaluate the model on the test set
y_pred = mlp_model.predict(X_test)
print("Accuracy on test data:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy on test data: 0.9981742409208176

Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

           back       1.00      1.00      1.00       185
buffer_overflow       1.00      0.44      0.62         9
   guess_passwd       1.00      0.91      0.95        11
           imap       1.00      1.00      1.00         1
        ipsweep       0.98      1.00      0.99       733
           land       1.00      1.00      1.00         3
       multihop       0.00      0.00      0.00         0
        neptune       1.00      1.00      1.00      8228
           nmap       0.98      0.98      0.98       313
         normal       1.00      1.00      1.00     13422
           perl       0.50      1.00      0.67         1
            phf       1.00      1.00      1.00         1
            pod       1.00      0.93      0.96        43
      portsweep       1.00      1.00      1.00       573
        rootkit       0.00      0.00      0.00         1
          satan       1.00      0.99      0.99       738
          smurf       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
