In [None]:
import os  # For file handling
import warnings  # To handle warning messages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [None]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides scikit-learn diagnostics (e.g. convergence or
# undefined-metric warnings); consider narrowing the filter.
warnings.simplefilter("ignore")


In [None]:
# Location of the training file (hard-coded to the Colab /content directory).
file_path = "/content/KDDTrain+.txt"

# Fail fast with a real exception when the dataset is missing. Printing a
# message and carrying on would leave `data` undefined and surface later as
# a confusing NameError in the preprocessing cells.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Error: File not found at {file_path}")

# NOTE(review): read_csv uses its default header handling here, so the first
# line of the file becomes the column names. The categorical column names
# printed further down ('tcp', 'ftp_data', 'SF', 'normal') look like data
# values, which suggests the file has no header row and one record is being
# lost. Confirm; if so, pass header=None with explicit column names and update
# the later cells that refer to the 'normal' column.
data = pd.read_csv(file_path)
print("File loaded successfully!")

File loaded successfully!


In [None]:
# Read the file into `data` again (the loading cell above already did this
# once when the file exists, so this is a second read of the same path).
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
#  Preprocessing
# Discard every row that contains at least one missing value.
data = data.dropna()

In [None]:
# Collect the names of all object-typed (string) columns; these are the ones
# that get label-encoded later.
object_typed = data.select_dtypes(include=['object'])
categorical_columns = object_typed.columns

In [None]:
# Show which columns were detected as categorical.
print(
    "Categorical columns in the dataset:",
    categorical_columns.tolist(),
    sep="\n",
)

Categorical columns in the dataset:
['tcp', 'ftp_data', 'SF', 'normal']


In [None]:
# LabelEncoder instance for turning string categories into integer codes.
label_encoder = LabelEncoder()

In [None]:
# Encode each categorical column with its own LabelEncoder and keep every
# fitted encoder in `label_encoders`, so integer codes (including the class
# ids shown in the classification reports) can be mapped back to their
# original strings with inverse_transform. Re-fitting one shared encoder
# would retain only the mapping of the last column processed.
label_encoders = {}
for col in categorical_columns:
    # categorical_columns was derived from `data` itself, so each name is
    # present in the frame; no membership check is needed.
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

# After the loop `label_encoder` still refers to the encoder fitted on the
# last categorical column, matching the state the original loop left behind.
print("Label encoding applied to categorical columns.")

Label encoding applied to categorical columns.


In [None]:
# Separate the target from the predictors.
# NOTE(review): 'normal' is a column *name* here; it looks like a value taken
# from the first row of the file rather than a real header — confirm.
y = data['normal']                # Target variable
X = data.drop('normal', axis=1)   # Independent variables

In [None]:
# Standardize the features (recommended for SVM).
# The scaler is fitted on the full feature matrix X, i.e. before any
# train/test split takes place.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
#  Split into training and testing sets
# The split is performed on the unscaled features so that the scaler can be
# fitted on the training rows only. Fitting it on all rows would let the test
# rows' mean/variance leak into the features the models are trained on.
# Same test_size and random_state as before, so the row partition is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from training rows
X_test = scaler.transform(X_test_raw)        # apply the training statistics


In [None]:
#  SVM-style model: SGDClassifier with hinge loss, fitted on the training split.
svm_settings = {
    "loss": 'hinge',
    "max_iter": 1000,
    "tol": 1e-3,
    "random_state": 42,  # fixed seed for reproducible runs
}
sgd_svm = SGDClassifier(**svm_settings)
sgd_svm.fit(X_train, y_train)

In [None]:
# Predictions for SVM
# Predict class labels for the held-out test features with the fitted
# SGD model; compared against y_test in the cells below.
y_pred_sgd_svm = sgd_svm.predict(X_test)

In [None]:
# Accuracy of the SGD SVM on the test split, kept for the final comparison.
sgd_svm_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_sgd_svm)


In [None]:
# Summarize SVM performance: accuracy, per-class metrics, confusion matrix.
svm_report = classification_report(y_test, y_pred_sgd_svm)
svm_confusion = confusion_matrix(y_test, y_pred_sgd_svm)

print("SGD SVM Model Performance:")
print(f"Accuracy: {sgd_svm_accuracy:.4f}")
print(svm_report)
print("Confusion Matrix:\n", svm_confusion)


SGD SVM Model Performance:
Accuracy: 0.9757
              precision    recall  f1-score   support

           0       0.95      0.79      0.86       201
           1       0.75      0.43      0.55         7
           3       0.83      0.71      0.77        14
           4       0.00      0.00      0.00         2
           5       0.96      0.92      0.94       743
           6       0.00      0.00      0.00         2
           7       1.00      0.33      0.50         3
           8       0.00      0.00      0.00         1
           9       1.00      1.00      1.00      8238
          10       0.88      0.75      0.81       298
          11       0.97      0.99      0.98     13386
          12       0.00      0.00      0.00         1
          13       1.00      1.00      1.00         1
          14       1.00      0.56      0.71        36
          15       0.97      0.88      0.92       583
          16       0.00      0.00      0.00         2
          17       0.88      0.85    

In [None]:
#  Decision Tree model fitted on the same training split as the SVM.
tree_settings = {
    "max_depth": 5,       # Adjust max_depth if needed
    "random_state": 42,   # fixed seed for reproducible runs
}
dt_model = DecisionTreeClassifier(**tree_settings)
dt_model.fit(X_train, y_train)



In [None]:
# Predictions for Decision Tree
# Predict class labels for the held-out test features with the fitted
# tree; compared against y_test in the cells below.
y_pred_dt = dt_model.predict(X_test)

In [None]:
# Accuracy of the Decision Tree on the test split, kept for the final comparison.
dt_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_dt)

In [None]:
# Summarize Decision Tree performance: accuracy, per-class metrics, confusion matrix.
dt_report = classification_report(y_test, y_pred_dt)
dt_confusion = confusion_matrix(y_test, y_pred_dt)

print("\nDecision Tree Model Performance:")
print(f"Accuracy: {dt_accuracy:.4f}")
print(dt_report)
print("Confusion Matrix:\n", dt_confusion)



Decision Tree Model Performance:
Accuracy: 0.9536
              precision    recall  f1-score   support

           0       0.91      0.97      0.94       201
           1       0.00      0.00      0.00         7
           3       0.00      0.00      0.00        14
           4       0.00      0.00      0.00         2
           5       0.94      0.90      0.92       743
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         3
           8       0.00      0.00      0.00         1
           9       1.00      0.99      0.99      8238
          10       0.90      0.18      0.31       298
          11       0.94      0.99      0.96     13386
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         1
          14       1.00      0.61      0.76        36
          15       0.78      0.90      0.84       583
          16       0.00      0.00      0.00         2
          17       0.98      0

In [None]:
#  Compare results
# Print both test accuracies side by side and state which model scored higher.
print("\nModel Comparison:")
print(f"SGD SVM Accuracy: {sgd_svm_accuracy:.4f}")
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")

if sgd_svm_accuracy > dt_accuracy:
    print("\nSGD SVM performed better than Decision Tree.")
elif sgd_svm_accuracy < dt_accuracy:
    print("\nDecision Tree performed better than SGD SVM.")
else:
    # Equal accuracies: a plain `else` would wrongly credit the Decision Tree.
    print("\nSGD SVM and Decision Tree achieved the same accuracy.")


Model Comparison:
SGD SVM Accuracy: 0.9757
Decision Tree Accuracy: 0.9536

SGD SVM performed better than Decision Tree.
