In [1]:
###########################
# IMPORT
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, cross_val_score
######################################

# Caricamento dati

In [3]:
# Load the raw training table from the CSV file in the working directory
train_df = pd.read_csv(filepath_or_buffer="train_net.csv")

# Elaborazione dati

In [4]:
# Not used in this cell; kept because later cells may rely on the name.
from sklearn.model_selection import train_test_split


def split_maintain_distribution(X, y, test_size=0.2, random_state=9):
    """Split X and y once into train/validation parts, stratified on y.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Labels used for stratification; must have the same length as ``X``.
    test_size : float, default 0.2
        Value forwarded to ``StratifiedShuffleSplit(test_size=...)``.
    random_state : int, default 9
        Seed forwarded to ``StratifiedShuffleSplit`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``.
    """
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_indices, test_indices = next(sss.split(X, y))
    return X.iloc[train_indices], X.iloc[test_indices], y.iloc[train_indices], y.iloc[test_indices]


# Convert the column names to upper case
train_df.columns = train_df.columns.str.upper()

# Replace missing values in the ALERT column with 'No Alert'
train_df['ALERT'] = train_df['ALERT'].fillna('No Alert')

#print(train_df['ALERT'].value_counts())

# Drop the columns that must not be used as features
columns_to_exclude = ['FIRST_SWITCHED', 'FLOW_DURATION_MILLISECONDS', 'LAST_SWITCHED', 'PROTOCOL', 'ID', 'FLOW_ID', 
                      'ANALYSIS_TIMESTAMP','MIN_IP_PKT_LEN', 'MAX_IP_PKT_LEN','TOTAL_PKTS_EXP', 'IPV4_SRC_ADDR', 'IPV4_DST_ADDR',
                      'TOTAL_BYTES_EXP']
train_data = train_df.drop(columns=columns_to_exclude)

# Separate features and target. ALERT has no NaN left after the fillna above,
# so the target needs no further imputation.
X_all = train_data.drop('ALERT', axis=1)
y_all = train_data['ALERT']

# One-hot encode PROTOCOL_MAP on the full frame so that both splits end up
# with the same dummy columns. Missing protocols become the 'MISSING' category.
# drop_first=True avoids perfectly collinear dummy columns.
protocol_dummies = pd.get_dummies(
    X_all['PROTOCOL_MAP'].fillna('MISSING'), prefix='PROTOCOL_MAP', drop_first=True
)
X_all = pd.concat([X_all.drop(['PROTOCOL_MAP'], axis=1), protocol_dummies], axis=1)

# Split BEFORE imputing: the fill values must be learned from the training
# rows only, otherwise validation data leaks into the training features.
X_train, X_val, y_train, y_val = split_maintain_distribution(X_all, y_all)

# Impute numeric NaNs in both splits with the training-set column means.
# select_dtypes('number') leaves the boolean dummy columns out of the means.
numeric_cols = X_train.select_dtypes(include='number').columns
train_means = X_train[numeric_cols].mean()
X_train = X_train.fillna(train_means)
X_val = X_val.fillna(train_means)

print(X_train.dtypes)

print('Train set distribution:')
print(y_train.value_counts(normalize=True))
print()
print('Validation set distribution:')
print(y_val.value_counts(normalize=True))


L4_SRC_PORT                 int64
L4_DST_PORT                 int64
TCP_FLAGS                   int64
TCP_WIN_MAX_IN              int64
TCP_WIN_MAX_OUT             int64
TCP_WIN_MIN_IN              int64
TCP_WIN_MIN_OUT             int64
TCP_WIN_MSS_IN              int64
TCP_WIN_SCALE_IN            int64
TCP_WIN_SCALE_OUT           int64
SRC_TOS                     int64
DST_TOS                     int64
TOTAL_FLOWS_EXP             int64
IN_BYTES                    int64
IN_PKTS                     int64
OUT_BYTES                   int64
OUT_PKTS                    int64
ANOMALY                   float64
PROTOCOL_MAP_icmp            bool
PROTOCOL_MAP_ipv6            bool
PROTOCOL_MAP_ipv6-icmp       bool
PROTOCOL_MAP_skip            bool
PROTOCOL_MAP_tcp             bool
PROTOCOL_MAP_udp             bool
dtype: object
Train set distribution:
ALERT
No Alert             0.867550
Port Scanning        0.120410
Denial of Service    0.011948
Malware              0.000092
Name: proportion, dt

# Data scaling

In [8]:
from sklearn.preprocessing import StandardScaler
from joblib import dump

# Fit the scaler on the training split only, so the validation split is
# transformed with statistics it did not contribute to.
scaler = StandardScaler()
fitter = scaler.fit(X_train)
# Persist the fitted scaler so the same transformation can be reapplied later
dump(fitter, 'scaler_fitter.joblib')

# Scale train and validation sets with the training-set statistics
x_train_scaled = fitter.transform(X_train)
x_validation_scaled = fitter.transform(X_val)

# Wrap the arrays back into DataFrames. The original row index is passed
# explicitly: without it the frames get a fresh 0..n-1 index while
# y_train / y_val keep their original labels, and any later label-aligned
# operation (concat, join, masking by y) would silently misalign rows.
df_feat_train = pd.DataFrame(x_train_scaled, columns=X_train.columns, index=X_train.index)
df_feat_validation = pd.DataFrame(x_validation_scaled, columns=X_val.columns, index=X_val.index)

# Addestramento con Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from joblib import dump

# Random Forest classifier. n_jobs=-1 builds the trees on all available
# cores; random_state stays fixed so the run is reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Train on the scaled training features
rf_model.fit(df_feat_train, y_train)

# Save the trained model to a .joblib file
dump(rf_model, 'modello_addestrato.joblib')

# Evaluate on the validation set.
# NOTE: the classes are heavily imbalanced ('No Alert' is ~87% of the rows),
# so accuracy alone says little about the rare classes; read it together
# with the per-class report and the confusion matrix.
val_predictions = rf_model.predict(df_feat_validation)
val_accuracy = accuracy_score(y_val, val_predictions) * 100
print(f'Validation Accuracy: {val_accuracy:.2f}%')


Validation Accuracy: 99.99%


# Valutazione modello

In [13]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision, recall and F1-score on the validation predictions
report_text = classification_report(y_val, val_predictions)
print("Classification Report:", report_text, sep="\n")

# Confusion matrix of true labels against predicted labels
conf_matrix = confusion_matrix(y_val, val_predictions)
print("Confusion Matrix:", conf_matrix, sep="\n")

Classification Report:
                   precision    recall  f1-score   support

Denial of Service       1.00      1.00      1.00     10078
          Malware       1.00      0.99      0.99        78
         No Alert       1.00      1.00      1.00    731800
    Port Scanning       1.00      1.00      1.00    101569

         accuracy                           1.00    843525
        macro avg       1.00      1.00      1.00    843525
     weighted avg       1.00      1.00      1.00    843525

Confusion Matrix:
[[ 10078      0      0      0]
 [     0     77      1      0]
 [     2      0 731774     24]
 [     0      0     20 101549]]
