In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf

In [2]:
# Directory (relative to the working directory) that is scanned for the dataset's .csv files.
DATASET_DIRECTORY = 'CICIoT2023/'

In [3]:
# Collect the names of all .csv files in the dataset directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the slice below pick the same files on every run and on every machine.
df_sets = sorted(k for k in os.listdir(DATASET_DIRECTORY) if k.endswith('.csv'))

# Position (in the sorted listing) of the first file this notebook processes;
# everything before it is skipped.
FIRST_FILE_INDEX = 126
selected_files = df_sets[FIRST_FILE_INDEX:]
if not selected_files:
    # pd.concat() on an empty list raises a generic "No objects to concatenate";
    # report the actual cause instead.
    raise ValueError(
        f"No CSV files selected: found {len(df_sets)} in {DATASET_DIRECTORY!r}, "
        f"but files are taken from index {FIRST_FILE_INDEX} onward"
    )

# Read every selected CSV file into its own DataFrame, showing a progress bar.
dataframes = []
for file in tqdm(selected_files, desc="Reading CSV files"):
    filepath = os.path.join(DATASET_DIRECTORY, file)
    dataframes.append(pd.read_csv(filepath))

# Stack the per-file frames into a single frame with a fresh 0..n-1 index.
merged_df = pd.concat(dataframes, ignore_index=True)

Reading CSV files: 100%|███████████████████████████████████████████████████████████████| 43/43 [00:57<00:00,  1.34s/it]


In [4]:
# Two-class label mapping: 'BenignTraffic' maps to 1, every other listed label maps to 0.
dict_2classes = {
    # DDoS-* labels
    'DDoS-RSTFINFlood': 0,
    'DDoS-PSHACK_Flood': 0,
    'DDoS-SYN_Flood': 0,
    'DDoS-UDP_Flood': 0,
    'DDoS-TCP_Flood': 0,
    'DDoS-ICMP_Flood': 0,
    'DDoS-SynonymousIP_Flood': 0,
    'DDoS-ACK_Fragmentation': 0,
    'DDoS-UDP_Fragmentation': 0,
    'DDoS-ICMP_Fragmentation': 0,
    'DDoS-SlowLoris': 0,
    'DDoS-HTTP_Flood': 0,
    # DoS-* labels
    'DoS-UDP_Flood': 0,
    'DoS-SYN_Flood': 0,
    'DoS-TCP_Flood': 0,
    'DoS-HTTP_Flood': 0,
    # Mirai-* labels
    'Mirai-greeth_flood': 0,
    'Mirai-greip_flood': 0,
    'Mirai-udpplain': 0,
    # Recon / scanning labels
    'Recon-PingSweep': 0,
    'Recon-OSScan': 0,
    'Recon-PortScan': 0,
    'VulnerabilityScan': 0,
    'Recon-HostDiscovery': 0,
    # Spoofing labels
    'DNS_Spoofing': 0,
    'MITM-ArpSpoofing': 0,
    # The only label mapped to 1
    'BenignTraffic': 1,
    # Remaining labels
    'BrowserHijacking': 0,
    'Backdoor_Malware': 0,
    'XSS': 0,
    'Uploading_Attack': 0,
    'SqlInjection': 0,
    'CommandInjection': 0,
    'DictionaryBruteForce': 0,
}

In [5]:
# Split the merged frame into features and a two-class target.
# X: every column of merged_df except 'label'.
X = merged_df.drop(columns=['label'])
# y: the 'label' column translated through dict_2classes.
y = merged_df['label'].map(dict_2classes)

# Series.map() produces NaN for any value that is not a key of the dictionary.
# The following cells select rows with `label != 0`, which NaN also satisfies,
# so an unmapped label would silently be grouped with label 1. Fail loudly instead.
unmapped = merged_df.loc[y.isna(), 'label'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Labels missing from dict_2classes: {list(unmapped)}")

In [6]:
# Put the mapped target back next to the features: the columns of X followed by y.
df = pd.concat((X, y), axis="columns")

In [7]:
# Rows whose label equals 0.
tmp = df.loc[df["label"].eq(0)]
# Number of rows whose label is anything other than 0.
summary = df["label"].ne(0).sum()

In [8]:
# Draw `summary` rows at random from `tmp`; the fixed seed makes the draw repeatable.
maliciousData = tmp.sample(
    n=summary,
    random_state=23,
)

In [9]:
# Display the sampled rows (they keep their original index values from df).
maliciousData

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
6761146,0.000000,54.48,6.00,63.86,7.766448,7.766448,0.0,0.0,1.0,0.0,...,0.550876,54.48,8.308881e+07,9.5,10.405176,0.782690,6.132078,0.05,141.55,0
12095658,0.000000,0.00,1.00,64.00,19.949697,19.949697,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.312806e+07,9.5,9.165151,0.000000,0.000000,0.00,141.55,0
5274149,0.000000,0.00,1.00,64.00,4.994281,4.994281,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.312897e+07,9.5,9.165151,0.000000,0.000000,0.00,141.55,0
4373609,0.117686,24156.50,16.84,64.00,4066.412163,4066.412163,0.0,0.0,0.0,0.0,...,0.391189,50.28,8.309762e+07,9.5,10.009688,0.555344,2.574432,0.06,141.55,0
10114612,0.113376,77.22,6.00,64.00,7.656867,7.656867,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.336540e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2068614,0.000000,54.00,6.00,64.00,6.500789,6.500789,0.0,0.0,0.0,0.0,...,0.000000,54.00,8.307247e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,0
6338989,0.018588,2.20,1.16,64.00,2.904505,2.904505,0.0,0.0,0.0,0.0,...,1.022751,42.33,8.312462e+07,9.5,9.196454,1.448886,8.859046,0.12,141.55,0
3013428,0.000000,0.00,1.00,64.00,13.979452,13.979452,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.314951e+07,9.5,9.165151,0.000000,0.000000,0.00,141.55,0
7967498,0.015541,72.60,6.00,64.00,8.328679,8.328679,0.0,0.0,1.0,0.0,...,11.318969,56.20,8.298535e+07,9.5,10.788676,16.023050,744.415416,0.19,141.55,0


In [10]:
# Rows whose label is not 0.
tmp = df.loc[df["label"].ne(0)]
# Stack the sampled rows on top of the non-zero-label rows and renumber the index.
# The result is not shuffled: all rows from maliciousData come first.
df = pd.concat((maliciousData, tmp), axis="index", ignore_index=True)

In [11]:
# Display the combined frame.
df

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.000000,54.48,6.00,63.86,7.766448,7.766448,0.0,0.0,1.0,0.0,...,0.550876,54.48,8.308881e+07,9.5,10.405176,0.782690,6.132078,0.05,141.55,0
1,0.000000,0.00,1.00,64.00,19.949697,19.949697,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.312806e+07,9.5,9.165151,0.000000,0.000000,0.00,141.55,0
2,0.000000,0.00,1.00,64.00,4.994281,4.994281,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.312897e+07,9.5,9.165151,0.000000,0.000000,0.00,141.55,0
3,0.117686,24156.50,16.84,64.00,4066.412163,4066.412163,0.0,0.0,0.0,0.0,...,0.391189,50.28,8.309762e+07,9.5,10.009688,0.555344,2.574432,0.06,141.55,0
4,0.113376,77.22,6.00,64.00,7.656867,7.656867,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.336540e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607575,0.732928,2250732.00,6.00,93.50,2782.508717,2782.508717,0.0,0.0,0.0,0.0,...,836.076446,917.60,8.809566e-05,5.5,38.208551,1182.390650,885955.437820,0.80,38.50,1
607576,13.401717,11497.60,6.00,59.20,11.688178,11.688178,0.0,0.0,0.0,0.0,...,64.519729,108.20,9.288311e-03,5.5,14.697835,91.244676,4696.483733,0.90,38.50,1
607577,9.528512,116067.30,12.60,82.70,17.366800,17.366800,0.0,0.0,0.0,0.0,...,187.924704,337.50,1.846900e-02,5.5,19.810382,265.765666,61907.935995,0.90,38.50,1
607578,16.250675,85094.50,8.80,103.40,15.286675,15.286675,0.0,0.0,0.0,0.0,...,857.873462,105.80,1.665168e+08,13.5,34.988182,1215.209849,740986.016994,1.00,244.60,1


In [12]:
# Write the combined frame to CSV without the index column.
# to_csv() does not create missing parent directories, so make sure the
# output folder exists first.
output_path = './CICIoT2023MergeData/merged_data4.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)