In [1]:
!pip install pandas scikit-learn matplotlib seaborn tensorflow




In [2]:
import pandas as pd

# Load data (adjust filename if needed)
df = pd.read_csv("kddcup.data_10_percent_corrected", header=None)

# Preview data
df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9.0,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19.0,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29.0,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39.0,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49.0,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [3]:
columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate", "label"
]
df.columns = columns


In [4]:
df['label'] = df['label'].apply(lambda x: 0 if x == 'normal.' else 1)


In [5]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['protocol_type', 'service', 'flag']
encoder = LabelEncoder()

for col in categorical_cols:
    df[col] = encoder.fit_transform(df[col])


In [6]:
X = df.drop("label", axis=1)
y = df["label"]  # For evaluation (but not used for training)


In [7]:
from sklearn.ensemble import IsolationForest

# Use contamination based on expected attack ratio
iso_forest = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)

# Fit the model
iso_forest.fit(X)

# Predict anomalies (-1 = anomaly, 1 = normal)
y_pred_iforest = iso_forest.predict(X)

# Convert predictions to 0 (normal) and 1 (anomaly)
y_pred_iforest = [1 if i == -1 else 0 for i in y_pred_iforest]


In [8]:
from sklearn.metrics import classification_report, confusion_matrix

print("📊 Isolation Forest Results:")
print(confusion_matrix(y, y_pred_iforest))
print(classification_report(y, y_pred_iforest))


📊 Isolation Forest Results:
[[53058  4023]
 [40986  6427]]
              precision    recall  f1-score   support

           0       0.56      0.93      0.70     57081
           1       0.62      0.14      0.22     47413

    accuracy                           0.57    104494
   macro avg       0.59      0.53      0.46    104494
weighted avg       0.59      0.57      0.48    104494



In [9]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)


In [10]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

input_dim = X_scaled.shape[1]

input_layer = Input(shape=(input_dim,))
encoded = Dense(32, activation='relu')(input_layer)
encoded = Dense(16, activation='relu')(encoded)

decoded = Dense(32, activation='relu')(encoded)
output_layer = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer='adam', loss='mse')

autoencoder.summary()


In [11]:
X_normal = X_scaled[y == 0]  # Only normal data for training

history = autoencoder.fit(
    X_normal, X_normal,
    epochs=10,
    batch_size=256,
    shuffle=True,
    validation_split=0.2,
    verbose=1
)


Epoch 1/10
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.1274 - val_loss: 0.0099
Epoch 2/10
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0083 - val_loss: 0.0034
Epoch 3/10
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0027 - val_loss: 0.0016
Epoch 4/10
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0016 - val_loss: 0.0013
Epoch 5/10
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0012 - val_loss: 0.0011
Epoch 6/10
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - loss: 0.0010 - val_loss: 9.0236e-04
Epoch 7/10
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 8.4113e-04 - val_loss: 7.7809e-04
Epoch 8/10
[1m179/179[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 7.3066e-04 - val_loss: 6.9950e-04
Epoch 9/10
[1m179/1

In [12]:
# Reconstruct all data
X_pred = autoencoder.predict(X_scaled)

# Compute mean squared error per sample
mse = tf.keras.losses.mse(X_scaled, X_pred).numpy()


[1m3266/3266[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step


In [13]:
import numpy as np

threshold = np.percentile(mse, 95)
print("Reconstruction error threshold:", threshold)


Reconstruction error threshold: nan


In [14]:
y_pred_ae = [1 if e > threshold else 0 for e in mse]


In [15]:
from sklearn.metrics import classification_report, confusion_matrix

print("📊 Autoencoder Results:")
print(confusion_matrix(y, y_pred_ae))
print(classification_report(y, y_pred_ae))


📊 Autoencoder Results:
[[57081     0]
 [47413     0]]
              precision    recall  f1-score   support

           0       0.55      1.00      0.71     57081
           1       0.00      0.00      0.00     47413

    accuracy                           0.55    104494
   macro avg       0.27      0.50      0.35    104494
weighted avg       0.30      0.55      0.39    104494

