# Machine Learning Project: T7 - Intrusion Detection

## Objectives
- Load and preprocess the KDD dataset.
- Train 2 machine learning models for attack classification.
- Evaluate the models' performance using appropriate metrics.
- Analyze the results and determine the best model for intrusion detection.

## Context

## Methodology
1. **Dataset Loading**: Use pandas to load the KDD dataset (`KDDTrainClean.csv`).
2. **Preprocessing**: Separate the labels from the features, one-hot encode the categorical columns, and scale the numerical columns.
3. **Model Training**: Use various machine learning techniques to train the models.
4. **Evaluation**: Use performance metrics such as accuracy to evaluate the models.
5. **Results Analysis**: Compare the models and select the best one.

# Importing libraries
Let's begin by importing all the required libraries and then loading the dataset from the current directory.

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='keras.src.trainers.data_adapters.py_dataset_adapter')
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Layer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from sklearn.compose import ColumnTransformer
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Activation, Add, GlobalAveragePooling2D, Dense, Flatten
from tensorflow.keras.models import Model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

2025-01-02 16:54:16.376388: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-02 16:54:16.391425: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735833256.418059   34342 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735833256.426742   34342 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-02 16:54:16.455059: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [5]:
# Read the cleaned KDD training set from the working directory and preview
# its first rows (the last expression of the cell is rendered by the notebook).
DATASET_PATH = "KDDTrainClean.csv"
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [None]:
# Print the column dtypes and non-null counts of the loaded dataset.
df.info()

# Collapse every label other than "normal" into a single 'attack' class,
# turning the task into a binary classification problem.
is_attack = df['label'] != "normal"
df.loc[is_attack, "label"] = 'attack'

# The target is the last column; every column before it is a feature.
y = df.iloc[:, -1]
X = df.iloc[:, :-1]

# The train/test split is deferred until after the features are preprocessed.
df.head()

In [None]:
# Count the rows in each class; these counts feed the pie chart below.
# (The original cell computed this twice; once is enough.)
class_counts = df['label'].value_counts()

# Pie chart of the class distribution (normal vs attack).
plt.figure(figsize=(8, 8))
class_counts.plot.pie(autopct='%1.1f%%', labels=class_counts.index, colors=['skyblue', 'salmon'], startangle=90)
plt.title("Distribuzione delle classi (normal vs attack)")
plt.ylabel("")  # Remove the y-axis label pandas adds by default
plt.show()

In [None]:
# Bar chart: number of attack records for each network protocol.
# Only rows labelled "attack" are plotted, so the hue has a single level.
attack_rows = df[df['label'] == "attack"]

plt.figure(figsize=(16, 6))
sns.countplot(data=attack_rows, x='protocol_type', hue='label', palette='husl')
plt.title('Distribuzione degli Attacchi per Tipo di Protocollo', fontdict={'fontsize': 16})
plt.xlabel('Protocollo', fontsize=12)
plt.ylabel('Conteggio', fontsize=12)
plt.xticks(rotation=45)
plt.legend(title='Label', loc='upper right')
plt.show()

In [None]:
# 1. Identify the categorical and numerical feature columns
categorical_cols = ['is_host_login', 'protocol_type', 'service', 'flag', 'land', 'logged_in','is_guest_login']  # Categorical columns
numerical_cols = df.columns.difference(categorical_cols + ['label']).tolist()  # Every remaining non-target column

# 2. Encode the target column as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# 3. Split into train and test BEFORE fitting any preprocessing step, so the
#    scaler statistics and the one-hot vocabulary are learned from the
#    training rows only (fitting on the full data leaks test information).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 4. Feature preprocessing:
#    standard scaling for numerical columns, one-hot encoding for categorical ones.
#    handle_unknown='ignore' encodes categories that appear only in the test
#    rows as all-zero vectors instead of raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Standardise numerical columns
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)  # Encode categorical columns
    ]
)

X_train = preprocessor.fit_transform(X_train)  # fit on the training rows only
X_test = preprocessor.transform(X_test)        # reuse the train-fitted transformer

# Keep X as the fully transformed matrix (using the train-fitted transformer),
# because later cells split X again and expect numeric features.
X = preprocessor.transform(X)

In [None]:
def preprocess(dataframe, target_column='label', categorical_cols=None, numerical_cols=None):
    """Build leakage-free train/test feature matrices for binary attack detection.

    The target is binarised ("normal" vs "attack") and label-encoded, the rows
    are split 70/30 with a fixed seed, and only then are the scaler and the
    one-hot encoder fitted -- on the training rows alone -- and applied to both
    splits. The input ``dataframe`` is not modified.

    Args:
        dataframe: DataFrame holding the feature columns and the target column.
        target_column: Name of the label column; every value other than
            "normal" is treated as "attack".
        categorical_cols: Columns to one-hot encode. Defaults to the seven
            categorical columns used throughout this notebook.
        numerical_cols: Columns to scale with ``RobustScaler``. Defaults to
            every column that is neither categorical nor the target.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``: the feature splits are
        DataFrames (scaled numerical columns followed by one-hot columns) and
        the targets are integer-encoded NumPy arrays.
    """
    # Identify categorical and numerical columns if not provided
    if categorical_cols is None:
        categorical_cols = ['is_host_login', 'protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_guest_login']
    if numerical_cols is None:
        numerical_cols = dataframe.columns.difference(list(categorical_cols) + [target_column]).tolist()
    categorical_cols = list(categorical_cols)
    numerical_cols = list(numerical_cols)

    # Step 1: Binarise and encode the target without mutating the caller's frame.
    labels = dataframe[target_column]
    binary_labels = labels.where(labels == "normal", "attack")
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(binary_labels)

    # Step 2: Split the raw rows first. The index is reset so the returned
    # frames carry positional row labels regardless of the input index.
    features = dataframe[numerical_cols + categorical_cols].reset_index(drop=True)
    raw_train, raw_test, y_train, y_test = train_test_split(
        features, y, test_size=0.3, random_state=42
    )

    # Step 3: Fit the transformers on the training rows only, so no statistic
    # or category vocabulary is learned from the test rows.
    scaler = RobustScaler()
    scaler.fit(raw_train[numerical_cols])
    # Categories that only occur in the test rows become all-zero vectors.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(raw_train[categorical_cols])
    encoded_names = encoder.get_feature_names_out(categorical_cols)

    def _transform(part):
        # Apply the train-fitted scaler/encoder to one split and join the results.
        scaled_df = pd.DataFrame(
            scaler.transform(part[numerical_cols]), columns=numerical_cols, index=part.index
        )
        encoded_df = pd.DataFrame(
            encoder.transform(part[categorical_cols]), columns=encoded_names, index=part.index
        )
        return pd.concat([scaled_df, encoded_df], axis=1)

    # Step 4: Transform both splits with the train-fitted transformers.
    return _transform(raw_train), _transform(raw_test), y_train, y_test

X_train, X_test, y_train, y_test = preprocess(df, target_column='label')

# Logistic Regression

In [None]:
# Fit a logistic-regression classifier on the training split.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = model.predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1 table.
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)

# Decision Tree

In [None]:
# Categorical feature handling.
# NOTE(review): the ColumnTransformer below is only constructed -- this cell
# never fits or applies it, so X is used exactly as earlier cells left it.
categorical_cols = ['protocol_type', 'service', 'flag']
cat_encoding_step = ('cat', OneHotEncoder(), categorical_cols)  # One-hot encode the categorical columns
preprocessor = ColumnTransformer(transformers=[cat_encoding_step])

# Encode the target values in y as integers.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into train and test sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Build and train the decision tree.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the test set.
y_pred = clf.predict(X_test)
tree_accuracy = accuracy_score(y_test, y_pred)
tree_report = classification_report(y_test, y_pred)
print("Accuracy:", tree_accuracy)
print("\nClassification Report:\n", tree_report)
 
 