In [1]:
# TP3: Artificial Neural Networks (ANN) with TensorFlow
### Machine Learning, Deep Learning and Security – 4th Year Engineering Security, USTHB
#### Imane Ameur – Complete Solution Notebook (November 2025)

# In[1]: Imports & Reproducibility
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Reproducibility (Question 1)
# One seed for every random number generator involved. Seeding only NumPy and
# TensorFlow is not enough with Keras 3 (bundled with TF >= 2.16): layers and
# initializers created without an explicit seed draw their default seed from
# Python's built-in `random` module, so weight initialisation and dropout
# masks would still change between runs.
# `set_random_seed` seeds Python `random`, NumPy and TensorFlow in one call.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

print("TensorFlow version:", tf.__version__)

# In[2]: Part 2 – Data Loading and Exploration
url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt"

# The file is read with header=None, so the column names are supplied here.
# The final entry ('difficulty') is dropped right after loading.
columns = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label",
    "difficulty",
]

# Load and discard the unused 'difficulty' column in a single expression.
df = pd.read_csv(url, header=None, names=columns).drop(columns='difficulty')

print("Shape:", df.shape)
df.head()

TensorFlow version: 2.19.0
Shape: (125973, 42)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


**Question 1**  
We set random seeds to ensure **reproducibility**: every run with the same code and data yields identical results (weight initialisation, shuffling, dropout masks, etc.). Reproducibility is a cornerstone of scientific ML.

**Question 2** → 125973 samples  
**Question 3** → 41 features (excluding label and difficulty)  
**Question 4** → Normal: 67343 (53.46 %), Attack: 58630 (46.54 %)  
**Question 5** → The dataset is reasonably balanced for binary classification, which prevents the model from trivially predicting the majority class and gives stable training dynamics.

In [2]:
# In[]:
# Raw per-class counts (every attack type listed separately)
print("Label distribution")
label_counts = df['label'].value_counts()
print(label_counts)

# Collapse every non-'normal' label into 'attack' and report percentages
print("\nNormal vs Attack")
binary_view = df['label'].where(df['label'] == 'normal', 'attack')
print(binary_view.value_counts(normalize=True) * 100)

Label distribution
label
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: count, dtype: int64

Normal vs Attack
label
normal    53.458281
attack    46.541719
Name: proportion, dtype: float64


# In[3]: Part 3 – Data Preprocessing

In [3]:
# Separate features and labels
X = df.drop('label', axis=1)
y_raw = df['label']

# One-hot encoding of categorical variables
categorical_cols = ['protocol_type', 'service', 'flag']
X_encoded = pd.get_dummies(X, columns=categorical_cols, dtype=int)

# Binary labels: 0 = normal traffic, 1 = any attack type
y = y_raw.apply(lambda x: 0 if x == 'normal' else 1)

print("After one-hot →", X_encoded.shape[1], "features")   # 122

# Train/test split is done BEFORE scaling. Fitting the scaler on the full
# dataset would let the test rows influence the mean/std used to transform
# the training data (data leakage) and make the test score optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y)

# Scaling: statistics are learned from the training rows only, then the same
# transformation is applied to the held-out test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept so that cells referring to `X_scaled` keep working.
# It uses the train-fitted scaler, so it carries no test-set statistics.
X_scaled = scaler.transform(X_encoded)

print("Train samples:", X_train.shape[0])
print("Test  samples:", X_test.shape[0])

After one-hot → 122 features
Train samples: 100778
Test  samples: 25195


**Question 6** → One-hot encoding (pd.get_dummies) because the variables are nominal (no order).  
**Question 7** → 122 features (41 − 3 categorical + 84 one-hot columns).  
**Question 8** → Binary classification.  
**Question 9** → Neural networks are trained with gradient descent; features with vastly different scales make training slow and convergence unstable. Standardisation rescales each feature to μ=0, σ=1 → faster and more stable convergence.  
**Question 10** → Train 100778, Test 25195

# In[4]: Part 4 – Model Architecture

In [4]:
def build_model(n_hidden_layers, n_neurons, learning_rate, dropout_rate=0.0,
                input_dim=122):
    """Build and compile a fully connected binary classifier.

    Architecture: Input(input_dim) -> [Dense(n_neurons, relu) (+ Dropout)] x N
    -> Dense(1, sigmoid).

    Parameters
    ----------
    n_hidden_layers : int
        Number of hidden Dense layers. At least one hidden layer is always
        created, so values below 1 behave like 1.
    n_neurons : int
        Number of units in every hidden layer.
    learning_rate : float
        Learning rate passed to the Adam optimizer.
    dropout_rate : float, default 0.0
        If greater than 0, a Dropout layer with this rate follows every
        hidden layer.
    input_dim : int, default 122
        Number of input features. The default matches the feature count
        printed after one-hot encoding in this notebook.

    Returns
    -------
    A compiled Sequential model using binary cross-entropy loss and the
    accuracy metric.
    """
    model = Sequential()

    # Declare the input explicitly instead of passing `input_shape` to the
    # first Dense layer; Keras warns about that pattern in Sequential models.
    model.add(tf.keras.Input(shape=(input_dim,)))

    # Hidden stack (always at least one layer)
    for _ in range(max(1, n_hidden_layers)):
        model.add(Dense(n_neurons, activation='relu'))
        if dropout_rate > 0:
            model.add(Dropout(dropout_rate))

    # Output layer: a single sigmoid unit
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [5]:
# Parameter counts (Question 14)
# Build both reference architectures first, then print their summaries.
shallow = build_model(1, 4, 0.05, 0.0)
deep = build_model(3, 32, 0.001, 0.2)

for heading, network in (("Shallow network", shallow),
                         ("\nDeep network", deep)):
    print(heading)
    network.summary()

# Shallow network compression ratio: 122 inputs → 4 hidden units = 30.5×

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Shallow network



Deep network
