In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, Subset
from collections import Counter

df = pd.read_csv('IoTDIAD_sum.csv')
print(df.head())

                                         Flow ID           Src IP  Src Port  \
0  192.168.137.140-182.92.131.196-27477-32100-17  192.168.137.140     27477   
1             192.168.137.96-8.8.8.8-44596-53-17   192.168.137.96     44596   
2       192.168.137.66-192.168.137.71-42640-80-6   192.168.137.66     42640   
3       51.145.143.28-192.168.137.48-443-57492-6    51.145.143.28       443   
4       192.168.137.175-52.30.64.155-47918-443-6  192.168.137.175     47918   

           Dst IP  Dst Port  Protocol               Timestamp  Flow Duration  \
0  182.92.131.196     32100        17  18/10/2022 02:47:42 PM       87992416   
1         8.8.8.8        53        17  18/10/2022 09:41:43 AM         101826   
2  192.168.137.71        80         6  09/08/2022 10:44:14 AM          29499   
3  192.168.137.48     57492         6  18/10/2022 10:59:50 AM          16398   
4    52.30.64.155       443         6  07/10/2022 06:21:45 PM         818807   

   Total Fwd Packet  Total Bwd packets  ... 

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104947 entries, 0 to 104946
Data columns (total 84 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Flow ID                     104947 non-null  object 
 1   Src IP                      104947 non-null  object 
 2   Src Port                    104947 non-null  int64  
 3   Dst IP                      104947 non-null  object 
 4   Dst Port                    104947 non-null  int64  
 5   Protocol                    104947 non-null  int64  
 6   Timestamp                   104947 non-null  object 
 7   Flow Duration               104947 non-null  int64  
 8   Total Fwd Packet            104947 non-null  int64  
 9   Total Bwd packets           104947 non-null  int64  
 10  Total Length of Fwd Packet  104947 non-null  float64
 11  Total Length of Bwd Packet  104947 non-null  float64
 12  Fwd Packet Length Max       104947 non-null  float64
 13  Fwd Packet Len

In [3]:
df.isna().sum()/len(df)

Flow ID      0.0
Src IP       0.0
Src Port     0.0
Dst IP       0.0
Dst Port     0.0
            ... 
Idle Mean    0.0
Idle Std     0.0
Idle Max     0.0
Idle Min     0.0
Label        0.0
Length: 84, dtype: float64

In [4]:
nan_cols = [col for col in df.columns if df[col].isna().any()]
len(nan_cols)

1

In [5]:
numeric_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]
categorical_cols = [col for col in df.columns if col not in numeric_cols]
print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

Numeric columns: ['Src Port', 'Dst Port', 'Protocol', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets', 'Total Length of Fwd Packet', 'Total Length of Bwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWR Fla

In [6]:
for col in categorical_cols:
  print(df[col].head(1))

0    192.168.137.140-182.92.131.196-27477-32100-17
Name: Flow ID, dtype: object
0    192.168.137.140
Name: Src IP, dtype: object
0    182.92.131.196
Name: Dst IP, dtype: object
0    18/10/2022 02:47:42 PM
Name: Timestamp, dtype: object
0    Spoofing
Name: Label, dtype: object


In [7]:
object_cols = df.select_dtypes(include=['object']).columns.tolist()
if 'type' in object_cols:
    object_cols.remove('type')

print("--- Bắt đầu điều tra các cột 'object' ---")

# Lặp qua TỪNG CỘT trong danh sách
for col in object_cols:
    df[col] = df[col].astype('category').cat.codes
for col in object_cols:
    print(f"{col}: {df[col].unique()[:10]}")  # In ra 10 giá trị đầu tiên của mỗi cột

print("\n--- Điều tra hoàn tất ---")

--- Bắt đầu điều tra các cột 'object' ---
Flow ID: [25173 85223 80731 86071 34866 58310 63105 76088 73673 52464]
Src IP: [270 452 423 695 304 397 378 306 354 228]
Dst IP: [ 592 2129  773  751 1701  307  611  698 1436  746]
Timestamp: [36640 39818 29395 40935 18360   453 12765 28583 28763 16312]
Label: [6 3 0 7 5 2 1 4]

--- Điều tra hoàn tất ---


In [8]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
df = df.sort_values(by='Timestamp').reset_index(drop=True)

# Trích xuất đặc trưng thời gian
for col in ["Year","Month","Day","Hour","Minute","Second"]:
    df[col] = getattr(df["Timestamp"].dt, col.lower())
df.drop(columns=["Timestamp"], inplace=True)

In [9]:
# Mã hóa label
le = LabelEncoder()
df["Label"] = le.fit_transform(df["Label"])

In [10]:
df.head()

Unnamed: 0,Flow ID,Src IP,Src Port,Dst IP,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,...,Idle Std,Idle Max,Idle Min,Label,Year,Month,Day,Hour,Minute,Second
0,86588,907,0,2123,0,0,119999752,269,0,0.0,...,0.0,0.0,0.0,7,1970,1,1,0,0,0
1,48318,362,37106,662,55443,6,90011126,12,6,2904.0,...,3067.527506,29989663.0,29984248.0,7,1970,1,1,0,0,0
2,58348,397,47662,307,80,6,270583,3,3,196.0,...,0.0,0.0,0.0,7,1970,1,1,0,0,0
3,58348,397,47662,307,80,6,136563,2,1,0.0,...,0.0,0.0,0.0,7,1970,1,1,0,0,0
4,35351,306,33596,1112,4070,6,110947268,24,15,121.0,...,305252.096593,10020560.0,8976011.0,7,1970,1,1,0,0,0


In [12]:
X = df.drop(columns=['Label']).values
y = df['Label'].values
X = np.nan_to_num(X)
X = np.clip(X, -1e10, 1e10)

k_features=30
test_size=0.2
random_state=42

# Chia train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, stratify=y, random_state=random_state
)

# Chuẩn hóa
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ANOVA chọn đặc trưng
selector = SelectKBest(f_classif, k=min(k_features, X.shape[1]))
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

trainset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                         torch.tensor(y_train, dtype=torch.long))
testset = TensorDataset(torch.tensor(X_test, dtype=torch.float32),
                        torch.tensor(y_test, dtype=torch.long))

  f = msb / msw


In [13]:
num_clients=5
num_classes=None
alpha=0.5
seed=42

np.random.seed(seed)
labels = np.array([y.item() for _, y in trainset])
idxs = np.arange(len(labels))
if num_classes is None:
    num_classes = len(np.unique(labels))
class_indices = [idxs[labels == c] for c in range(num_classes)]
    
client_dict = {i: [] for i in range(num_clients)}

for c in range(num_classes):
    np.random.shuffle(class_indices[c])
    proportions = np.random.dirichlet(alpha * np.ones(num_clients))
    split_points = (np.cumsum(proportions) * len(class_indices[c])).astype(int)
    split_class = np.split(class_indices[c], split_points[:-1])
    for cid, idx_split in enumerate(split_class):
        client_dict[cid].extend(idx_split)

# Thống kê dữ liệu cho từng client
print("\n [Non-IID Partition Statistics]")
for cid in client_dict:
    client_labels = labels[client_dict[cid]]
    unique, counts = np.unique(client_labels, return_counts=True)
    total = len(client_labels)
    dist = {int(u): round(c/total, 3) for u, c in zip(unique, counts)}
    print(f"Client {cid}: {total} samples | class distribution: {dist}")

for cid in client_dict:
    np.random.shuffle(client_dict[cid])


 [Non-IID Partition Statistics]
Client 0: 27006 samples | class distribution: {0: np.float64(0.193), 2: np.float64(0.162), 3: np.float64(0.083), 4: np.float64(0.146), 5: np.float64(0.008), 6: np.float64(0.318), 7: np.float64(0.09)}
Client 1: 8347 samples | class distribution: {0: np.float64(0.094), 1: np.float64(0.29), 2: np.float64(0.307), 3: np.float64(0.009), 4: np.float64(0.134), 5: np.float64(0.056), 6: np.float64(0.094), 7: np.float64(0.018)}
Client 2: 23748 samples | class distribution: {0: np.float64(0.088), 1: np.float64(0.003), 2: np.float64(0.013), 3: np.float64(0.333), 4: np.float64(0.186), 5: np.float64(0.155), 6: np.float64(0.0), 7: np.float64(0.223)}
Client 3: 10539 samples | class distribution: {0: np.float64(0.051), 1: np.float64(0.0), 2: np.float64(0.418), 3: np.float64(0.158), 4: np.float64(0.062), 5: np.float64(0.055), 6: np.float64(0.212), 7: np.float64(0.043)}
Client 4: 14317 samples | class distribution: {0: np.float64(0.235), 1: np.float64(0.029), 2: np.float64