In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

file_path = "/kaggle/input/network-intrusion-dataset/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv"
df = pd.read_csv(file_path)

possible_labels = ['Label', 'label', ' Label', 'attack', 'Attack', 'Attack_type']
label_col = None
for col in df.columns:
    if col.strip() in possible_labels:
        label_col = col
        break

if not label_col:
    raise ValueError("No suitable label column found in the dataset!")


In [18]:
df = df.rename(columns={label_col: 'Label'})
print(f"Detected and renamed label column: '{label_col}'")

df['Label'] = df['Label'].astype(str).str.strip().str.upper()
print(f"Unique values in the original label column after strip and upper: {df['Label'].unique()}") # Add this line

df['Label'] = df['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)
print(f"Value counts for the converted 'Label' column (0 for BENIGN, 1 otherwise):\n{df['Label'].value_counts()}") # Add this line

df_numeric = df.select_dtypes(include=[np.number])

if 'Label' not in df_numeric.columns:
    df_numeric['Label'] = df['Label']

df_numeric = df_numeric.replace([np.inf, -np.inf], np.nan)
df_numeric = df_numeric.fillna(0)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(df_numeric.drop('Label', axis=1))

X_train = X_scaled[df_numeric['Label'] == 0]
X_test = X_scaled

print(f"Shape of X_train after filtering for BENIGN samples: {X_train.shape}") # Add this line

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

input_dim = X_scaled.shape[1]
encoding_dim = 16

input_layer = Input(shape=(input_dim,))
encoded = Dense(32, activation='relu')(input_layer)
encoded = Dense(encoding_dim, activation='relu')(encoded)
decoded = Dense(32, activation='relu')(encoded)
output_layer = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

Detected and renamed label column: 'Label'
Unique values in the original label column after strip and upper: ['1']
Value counts for the converted 'Label' column (0 for BENIGN, 1 otherwise):
Label
1    286467
Name: count, dtype: int64
Shape of X_train after filtering for BENIGN samples: (0, 78)


In [19]:
autoencoder.fit(X_test, X_test,
                epochs=20,
                batch_size=256,
                shuffle=True)

reconstructions = autoencoder.predict(X_test)
reconstruction_error = np.mean(np.power(X_test - reconstructions, 2), axis=1)

train_error = np.mean(np.power(X_test - reconstructions, 2), axis=1)
threshold = train_error.mean() + 3 * train_error.std()

predictions = [1 if err > threshold else 0 for err in reconstruction_error]


Epoch 1/20
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.0467
Epoch 2/20
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 6.6471e-04
Epoch 3/20
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 3.6112e-04
Epoch 4/20
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 2.5901e-04
Epoch 5/20
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 2.2151e-04
Epoch 6/20
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 1.8853e-04
Epoch 7/20
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 1.7628e-04
Epoch 8/20
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 1.6960e-04
Epoch 9/20
[1m1120/1120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 1.5692e-04
Epoch 10/20
[1m1120/1120[0m [32m━━━━━━

In [20]:
y_test = df_numeric['Label'].values

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Confusion Matrix:
[[     0      0]
 [285295   1172]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.00      0.01    286467

    accuracy                           0.00    286467
   macro avg       0.50      0.00      0.00    286467
weighted avg       1.00      0.00      0.01    286467



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
