In [None]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('../utils')
from preprocessing import load_cicids_csv, preprocess_cicids
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from model import train_random_forest, save_model

In [73]:
# Clean X_test: treat +/-inf as missing, then keep only rows with no missing value.
# NOTE: X_test / y_test are produced by the train/test split cell (In [80]),
# so that cell has to be executed before this one.
X_test_clean = X_test.replace([np.inf, -np.inf], np.nan)
mask = X_test_clean.notna().all(axis=1)
X_test_clean = X_test_clean.loc[mask]

# Apply the same row mask to y_test so labels stay aligned with the kept rows.
y_test_clean = y_test[mask]

In [74]:
# Sanity check: both totals are summed over every cell of the cleaned test frame.
nan_total = X_test_clean.isna().sum().sum()
print("NaNs in X_test_clean:", nan_total)
inf_total = np.isinf(X_test_clean).sum().sum()
print("Infs in X_test_clean:", inf_total)

NaNs in X_test_clean: 0
Infs in X_test_clean: 0


In [75]:
# Path (relative to this notebook) of the labelled traffic CSV to load.
csv_path = '../data/TrafficLabelling/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
df = load_cicids_csv(csv_path)

# Show the first five rows as a quick look at what was loaded.
print("Sample data:")
display(df.head(5))

Sample data:


Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.10.5-104.16.207.165-54865-443-6,104.16.207.165,443,192.168.10.5,54865,6,7/7/2017 3:30,3,2,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,192.168.10.5-104.16.28.216-55054-80-6,104.16.28.216,80,192.168.10.5,55054,6,7/7/2017 3:30,109,1,1,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,192.168.10.5-104.16.28.216-55055-80-6,104.16.28.216,80,192.168.10.5,55055,6,7/7/2017 3:30,52,1,1,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,192.168.10.16-104.17.241.25-46236-443-6,104.17.241.25,443,192.168.10.16,46236,6,7/7/2017 3:30,34,1,1,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,192.168.10.5-104.19.196.102-54863-443-6,104.19.196.102,443,192.168.10.5,54863,6,7/7/2017 3:30,3,2,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [76]:
# Strip leading/trailing whitespace from every column name.
df.columns = df.columns.str.strip()

# Assume the target column is called "Label": split it off from the features.
y = df['Label']
X = df.drop(columns='Label')


In [77]:
# Treat +/-inf as missing and drop every row that contains a missing value.
X = X.replace([np.inf, -np.inf], np.nan).dropna()
# Keep only the labels whose index survived the row drop.
# NOTE(review): the following cell (In [78]) reassigns X and y from
# preprocess_cicids(df), so the cleaned values built here are overwritten.
y = y.loc[X.index]

In [78]:
# Build the feature matrix and label vector from the loaded frame.
X, y = preprocess_cicids(df)

# Report the matrix dimensions and the names of the first ten feature columns.
feature_shape = X.shape
leading_features = X.columns[:10].tolist()
print(f"Shape of X: {feature_shape}")
print(f"Example features:\n{leading_features}")


Shape of X: (225745, 80)
Example features:
['Source Port', 'Destination Port', 'Protocol', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min']


In [79]:
# Quick inspection of what preprocessing returned: container types first,
# then whether either object is None.
print(*map(type, (X, y)))
print(*(obj is None for obj in (X, y)))


<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'>
False False


In [80]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Row counts of each partition.
print(f"Train samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")


Train samples: 180596
Test samples: 45149


In [81]:
# Train model
# X_clean / y_clean are derived here from the training split, so this cell runs
# on a fresh kernel and the held-out test rows are never seen during fitting.
# Rows containing +/-inf or NaN are dropped with the same rule applied to X_test.
train_finite = X_train.replace([np.inf, -np.inf], np.nan).notna().all(axis=1)
X_clean = X_train[train_finite]
# A positional boolean array works whether y_train is a NumPy array or a Series.
y_clean = y_train[train_finite.to_numpy()]
model = train_random_forest(X_clean, y_clean)

In [82]:
# Score the trained model on the cleaned held-out rows.
y_pred = model.predict(X_test_clean)

# Header first, then the true-vs-predicted count matrix on the following line.
print("Confusion Matrix:", confusion_matrix(y_test_clean, y_pred), sep="\n")

# Per-class precision / recall / F1 summary.
print("\nClassification Report:", classification_report(y_test_clean, y_pred), sep="\n")

Confusion Matrix:
[[19395     0]
 [    0 25744]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19395
           1       1.00      1.00      1.00     25744

    accuracy                           1.00     45139
   macro avg       1.00      1.00      1.00     45139
weighted avg       1.00      1.00      1.00     45139



In [84]:
# Persist the trained model; the target path is relative to this notebook.
model_path = '../models/model_cicids.pt'
save_model(model, model_path)
print("Model saved as 'models/model_cicids.pt'")


Model saved as 'models/model_cicids.pt'
