In [None]:
import pandas as pd
import glob
import os
import gc
from sklearn.utils import shuffle


# Show every column and every row when a DataFrame is printed (no truncation).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

def merge(input1, input2, output, random_state=None):
    """Concatenate two CSV files, shuffle the rows, and write the result as CSV.

    Args:
        input1: Path of the first CSV file.
        input2: Path of the second CSV file.
        output: Path of the CSV file to write.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (the default) keeps the
            previous unseeded behaviour.

    Returns:
        None. The merged, shuffled table is written to ``output`` without an
        index column.
    """
    combined = pd.concat([pd.read_csv(input1), pd.read_csv(input2)])

    # sample(frac=1) returns every row in random order; the old index is
    # dropped so the written rows are numbered from scratch.
    shuffled = combined.sample(frac=1, random_state=random_state).reset_index(drop=True)

    shuffled.to_csv(output, index=False)

def merge_from_parquet(folder_path, output):
    """Stream every ``*.parquet`` file of a folder into a single CSV file.

    Files are read and written one at a time so that only one of them is held
    in memory. The first file creates (or overwrites) ``output`` and writes the
    header; the remaining files are appended without a header. Re-running the
    function therefore rebuilds the CSV instead of appending duplicate rows and
    a second header to the previous result.

    Args:
        folder_path: Folder that is searched (non-recursively) for
            ``*.parquet`` files.
        output: Path of the CSV file to write.

    Returns:
        None. When no parquet file is found a message is printed and nothing
        is written.
    """
    search_pattern = os.path.join(folder_path, '*.parquet')
    # glob returns files in an OS-dependent order; sort so the row order of the
    # resulting CSV is the same on every run.
    parquet_files = sorted(glob.glob(search_pattern))

    if not parquet_files:
        print(f"Lỗi: Không tìm thấy file .parquet nào trong '{folder_path}'.")
        return

    for position, file in enumerate(parquet_files):
        is_first = position == 0
        tmp_df = pd.read_parquet(file)
        tmp_df.to_csv(
            output,
            mode='w' if is_first else 'a',
            index=False,
            header=is_first,
        )
        # Release the chunk before the next file is loaded.
        del tmp_df
        gc.collect()
    

def splitIL(input, output):
    """Write class-incremental CSV sessions from a folder of parquet files.

    Every ``*.parquet`` file under ``input`` is loaded and concatenated.
    ``output[0]`` receives a shuffled base sample (100000 rows for each of the
    labels 0 and 1). Each sorted label after the first then produces one more
    shuffled file (``output[1]``, ``output[2]``, ...) made of a replay sample
    plus 20000 rows of that label.

    Args:
        input: Folder that holds the parquet files (each needs a ``label``
            column).
        output: Sequence of CSV paths indexed by session number.

    Returns:
        None. Prints a message and returns early when no parquet file is found.
    """
    parquet_paths = glob.glob(os.path.join(input, '*.parquet'))

    if len(parquet_paths) == 0:
        print(f"Lỗi: Không tìm thấy file .parquet nào trong '{input}'.")
        return

    df = pd.concat([pd.read_parquet(path) for path in parquet_paths], ignore_index=True)

    labels = sorted(df['label'].unique())
    first_labels, later_labels = labels[:1], labels[1:]

    # config
    replay_budget = 100000      # total replay rows, divided evenly between the replayed labels
    rows_per_new_label = 20000  # rows drawn for the label introduced in a session

    def draw_per_label(wanted, rows_each):
        # Draw the same number of random rows from every wanted label.
        return (
            df[df['label'].isin(wanted)]
            .groupby("label")
            .apply(lambda group: group.sample(rows_each))
            .reset_index(drop=True)
        )

    # NOTE(review): the base session hard-codes labels [0, 1] and 100000 rows
    # each, while `first_labels` only holds the first sorted label -- confirm
    # which of the two is the intended base set.
    base_session = shuffle(draw_per_label([0, 1], 100000))
    base_session.to_csv(output[0], index=False)

    del base_session
    gc.collect()

    for step in range(1, len(later_labels) + 1):
        incoming = later_labels[step - 1]
        # NOTE(review): the slice [:step] already contains `incoming`, so that
        # label is sampled twice (replay + new rows) -- confirm this is intended.
        replay_labels = first_labels + later_labels[:step]
        rows_each = int(replay_budget / len(replay_labels))

        replayed = draw_per_label(replay_labels, rows_each)
        fresh = df[df["label"] == incoming].sample(rows_per_new_label)

        session = shuffle(pd.concat([replayed, fresh]))
        session.to_csv(output[step], index=False)

        gc.collect()
    
# Root folder of the dataset; every path below is derived from it.
_dataset_root = "C:/Users/hoang/Documents/Dataset_KLTN"

# NOTE(review): `input` shadows the builtin of the same name for the rest of
# the session; the name is kept because splitIL is called with it below.
input = f"{_dataset_root}/scaled_output_parquet"

input1 = f"{_dataset_root}/training_dataset.csv"
input2 = f"{_dataset_root}/test_dataset.csv"
output = f"{_dataset_root}/merge.csv"

# One CSV per incremental-learning session (session0 .. session3).
outputIL = [f"{_dataset_root}/session{i}.csv" for i in range(4)]

print(outputIL)
splitIL(input, outputIL)

gc.collect()


['C:/Users/hoang/Documents/Dataset_KLTN/session0.csv', 'C:/Users/hoang/Documents/Dataset_KLTN/session1.csv', 'C:/Users/hoang/Documents/Dataset_KLTN/session2.csv', 'C:/Users/hoang/Documents/Dataset_KLTN/session3.csv']
Lỗi: Không tìm thấy file .parquet nào trong 'C:/Users/hoang/Documents/Dataset_KLTN/scaled_output_parquet'.


0

In [2]:
# ===== REPLACE, DROP, REMAP SAMPLES ===== 
import os, sys
from pathlib import Path

import pandas as pd 
import dask.dataframe as dd

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

import hashlib  
import ipaddress
import json
import glob
import gc

# Show every column and every row when a DataFrame is printed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

dir_in = "C:/Users/hoang/Documents/Dataset_KLTN/ciciot2023_extracted/merge-processed/merge1.3/part.0.parquet"
dir_out = "C:/Users/hoang/Documents/Dataset_KLTN/ciciot2023_extracted/merge-processed/merge_3_4_5_mitm_dns/full.parquet"

# parents=True also creates missing intermediate folders; exist_ok=True makes
# the call a no-op when the folder already exists, so no pre-check is needed.
Path(dir_out).parent.mkdir(parents=True, exist_ok=True)

df = pd.read_parquet(dir_in)

# features.json maps a column name to a dtype name. The `with` block closes the
# file on exit, so no explicit close() is required.
with open('features.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Only "int8" and "float32" are translated to NumPy types; any other dtype name
# is left out of `dtypes`. A comprehension is used so that no loop variable
# (previously one named `type`) leaks into the module namespace and shadows a
# builtin.
_NUMPY_DTYPES = {"int8": np.int8, "float32": np.float32}
dtypes = {
    column: _NUMPY_DTYPES[dtype_name]
    for column, dtype_name in data.items()
    if dtype_name in _NUMPY_DTYPES
}

print(dtypes)

def astype(df):
    """Cast the columns named in the module-level ``data`` mapping, in place.

    ``data`` maps a column name to a dtype name. Columns declared as ``"int8"``
    become ``np.int8`` and columns declared as ``"float32"`` become
    ``np.float32``; columns with any other dtype name are left untouched.

    Args:
        df: DataFrame that contains every column whose declared dtype is
            ``"int8"`` or ``"float32"``.

    Returns:
        The same DataFrame object, modified in place.
    """
    for column in data:
        wanted = data[column]
        if wanted == "int8":
            target = np.int8
        elif wanted == "float32":
            target = np.float32
        else:
            # Dtype names other than the two above are not converted.
            continue
        df[column] = df[column].astype(target)
    return df
try:
    df = astype(df)

    print(df['Label'].value_counts())

    # Drop the unwanted classes. .copy() makes the filtered frame independent
    # of the original so the column assignments below do not operate on a
    # slice (which triggers SettingWithCopyWarning).
    df = df[~df['Label'].isin([5, 6, 8])].copy()

    # Remap the remaining labels in a single pass: 2 -> 5, 3 -> 2, 7 -> 3.
    # A dict is applied to the original values simultaneously, so the result
    # does not depend on the order in which the pairs are listed.
    df['Label'] = df['Label'].replace({2: 5, 3: 2, 7: 3})

    # 0 for label 0, 1 for every other label.
    df['Binary Label'] = df['Label'].ne(0).astype('int64')

    df.to_parquet(dir_out)

except Exception:
    # Keep the cell from aborting, but show the full traceback instead of only
    # the exception message.
    import traceback
    traceback.print_exc()
    gc.collect()

gc.collect()

{'Src IP': <class 'numpy.float32'>, 'Src Port': <class 'numpy.int8'>, 'Dst IP': <class 'numpy.float32'>, 'Dst Port': <class 'numpy.int8'>, 'Protocol': <class 'numpy.int8'>, 'Flow Duration': <class 'numpy.float32'>, 'Total Fwd Packet': <class 'numpy.float32'>, 'Total Bwd packets': <class 'numpy.float32'>, 'Total Length of Fwd Packet': <class 'numpy.float32'>, 'Total Length of Bwd Packet': <class 'numpy.float32'>, 'Fwd Packet Length Max': <class 'numpy.float32'>, 'Fwd Packet Length Min': <class 'numpy.float32'>, 'Fwd Packet Length Mean': <class 'numpy.float32'>, 'Fwd Packet Length Std': <class 'numpy.float32'>, 'Bwd Packet Length Max': <class 'numpy.float32'>, 'Bwd Packet Length Min': <class 'numpy.float32'>, 'Bwd Packet Length Mean': <class 'numpy.float32'>, 'Bwd Packet Length Std': <class 'numpy.float32'>, 'Flow Bytes/s': <class 'numpy.float32'>, 'Flow Packets/s': <class 'numpy.float32'>, 'Flow IAT Mean': <class 'numpy.float32'>, 'Flow IAT Std': <class 'numpy.float32'>, 'Flow IAT Max':

0

In [2]:
# ===== CHIA BATCH INCREMENTAL LEARNING ===== #

import os, sys
from pathlib import Path

import pandas as pd 
import dask.dataframe as dd

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

import hashlib  
import ipaddress
import json
import glob
import gc

# Show every column and every row when a DataFrame is printed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# features.json maps a column name to a dtype name. The `with` block closes the
# file on exit, so no explicit close() is required.
with open('features.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Only "int8" and "float32" are translated to NumPy types; any other dtype name
# is left out of `dtypes`. A comprehension is used so that no loop variable
# (previously one named `type`) leaks into the module namespace and shadows a
# builtin.
_NUMPY_DTYPES = {"int8": np.int8, "float32": np.float32}
dtypes = {
    column: _NUMPY_DTYPES[dtype_name]
    for column, dtype_name in data.items()
    if dtype_name in _NUMPY_DTYPES
}

print(dtypes)

def astype(df):
    """Cast the columns named in the module-level ``data`` mapping, in place.

    ``data`` maps a column name to a dtype name. Columns declared as ``"int8"``
    become ``np.int8`` and columns declared as ``"float32"`` become
    ``np.float32``; columns with any other dtype name are left untouched.

    Args:
        df: DataFrame that contains every column whose declared dtype is
            ``"int8"`` or ``"float32"``.

    Returns:
        The same DataFrame object, modified in place.
    """
    for column in data:
        wanted = data[column]
        if wanted == "int8":
            target = np.int8
        elif wanted == "float32":
            target = np.float32
        else:
            # Dtype names other than the two above are not converted.
            continue
        df[column] = df[column].astype(target)
    return df

def splitIL(input, output, base_classes, incre_classes,
            B_total=1200000, N_new=60000, random_state=None):
    """Build class-incremental learning sessions from one labelled parquet file.

    Session 0 holds ``int(B_total / 3)`` random rows of every base class. Each
    entry of ``incre_classes`` then yields one more session: a replay sample of
    ``E`` rows from every class seen so far, plus a sample of each newly
    introduced class. Every session is shuffled and written to the matching
    path in ``output``.

    Args:
        input: Path of the source parquet file; it needs a ``Label`` column.
        output: Sequence of parquet paths. ``output[0]`` is the base session
            and ``output[k]`` the k-th incremental session, so
            ``len(incre_classes) + 1`` entries are used.
        base_classes: Labels of the base session. The list is copied and is
            not modified.
        incre_classes: One entry per incremental session: either a single
            label or an iterable of labels that are introduced together.
        B_total: Budget from which the per-class sample sizes are derived.
        N_new: Rows drawn for a new class that does not qualify for the
            ``0.9 * E`` rule below.
        random_state: Optional seed. When given, one ``RandomState`` drives
            every sample and shuffle so the split can be reproduced; ``None``
            keeps the previous unseeded behaviour.

    Raises:
        ValueError: Propagated from pandas when a class has fewer rows than
            the sample size requested for it.
    """
    # Cast to compact dtypes to reduce RAM.
    df = pd.read_parquet(input)
    df = astype(df)

    classes = df["Label"].value_counts()
    print(f"===== CÁC NHÃN có trong dữ liệu gốc: {classes} =====")

    rng = None if random_state is None else np.random.RandomState(random_state)

    def _sample_per_class(labels, n):
        # n random rows from every listed label. GroupBy.sample replaces the
        # former groupby(...).apply(lambda x: x.sample(n)), which emits a
        # DeprecationWarning on recent pandas.
        return (
            df[df["Label"].isin(labels)]
            .groupby("Label")
            .sample(n=n, random_state=rng)
            .reset_index(drop=True)
        )

    # NOTE(review): the divisor 3 is hard-coded; it equals len(base_classes)
    # only when there are three base classes -- confirm the intent.
    sample0 = shuffle(_sample_per_class(base_classes, int(B_total / 3)), random_state=rng)
    print(sample0["Label"].value_counts())
    sample0.to_parquet(output[0], index=False)
    print(f"✅ Base phase: {base_classes} → {output[0]}")

    del sample0
    gc.collect()

    # Work on a copy: appending to the caller's list would silently change
    # `base_classes` outside this function.
    old_cls = list(base_classes)
    print(old_cls)
    for session_index, new_cls in enumerate(incre_classes, start=1):
        # Treat a single label as a group of one so both cases share one path.
        new_group = [new_cls] if np.isscalar(new_cls) else list(new_cls)

        # NOTE(review): the "+ 2" in the divisor and the 0.9 factor below are
        # tuning constants carried over unchanged; their rationale is not
        # documented in this file.
        E = int(B_total / (len(old_cls) + len(new_group) + 2))

        print(f"\n--- Increment {new_cls} ---")
        print(f"Old classes: {old_cls}")
        print(f"New class: {new_cls}")
        print(f"Replay mỗi lớp: {E} | Lớp mới: {N_new}")

        # Replay sample from every class learned so far.
        parts = [_sample_per_class(old_cls, E)]

        # Samples of the newly introduced class(es).
        for cls in new_group:
            available = classes.get(cls, 0)
            if available >= N_new and available >= 0.9 * E:
                n_take = int(0.9 * E)
            else:
                n_take = N_new
            parts.append(df[df["Label"] == cls].sample(n_take, random_state=rng))
            # From now on the class counts as already learned.
            old_cls.append(cls)
            print(old_cls)

        session_df = shuffle(pd.concat(parts, ignore_index=True), random_state=rng)

        print(session_df["Label"].value_counts())

        session_df.to_parquet(output[session_index], index=False)
        del parts, session_df
        gc.collect()
        
        
    
    
if __name__ == "__main__":

    # Classes of the base session, then one entry per incremental session
    # (a single label or a list of labels introduced together).
    base_classes = [0, 1, 2]
    incre_classes = [3, [4, 5]]

    session_dir = "C:/Users/hoang/Documents/Dataset_KLTN/ciciot2023_extracted/merge-processed/merge_3_4_5_mitm_dns"
    dir_in = f"{session_dir}/full.parquet"
    # One output file for the base session plus one per incremental entry.
    dir_out = [f"{session_dir}/session{i}.parquet" for i in range(len(incre_classes) + 1)]

    print(dir_out)
    splitIL(dir_in, dir_out, base_classes, incre_classes)

    gc.collect()

{'Src IP': <class 'numpy.float32'>, 'Src Port': <class 'numpy.int8'>, 'Dst IP': <class 'numpy.float32'>, 'Dst Port': <class 'numpy.int8'>, 'Protocol': <class 'numpy.int8'>, 'Flow Duration': <class 'numpy.float32'>, 'Total Fwd Packet': <class 'numpy.float32'>, 'Total Bwd packets': <class 'numpy.float32'>, 'Total Length of Fwd Packet': <class 'numpy.float32'>, 'Total Length of Bwd Packet': <class 'numpy.float32'>, 'Fwd Packet Length Max': <class 'numpy.float32'>, 'Fwd Packet Length Min': <class 'numpy.float32'>, 'Fwd Packet Length Mean': <class 'numpy.float32'>, 'Fwd Packet Length Std': <class 'numpy.float32'>, 'Bwd Packet Length Max': <class 'numpy.float32'>, 'Bwd Packet Length Min': <class 'numpy.float32'>, 'Bwd Packet Length Mean': <class 'numpy.float32'>, 'Bwd Packet Length Std': <class 'numpy.float32'>, 'Flow Bytes/s': <class 'numpy.float32'>, 'Flow Packets/s': <class 'numpy.float32'>, 'Flow IAT Mean': <class 'numpy.float32'>, 'Flow IAT Std': <class 'numpy.float32'>, 'Flow IAT Max':

  df[df["Label"].isin(base_classes)]


Label
1    400000
0    400000
2    400000
Name: count, dtype: int64
✅ Base phase: [0, 1, 2] → C:/Users/hoang/Documents/Dataset_KLTN/ciciot2023_extracted/merge-processed/merge_3_4_5_mitm_dns/session0.parquet
[0, 1, 2]

--- Increment 3 ---
Old classes: [0, 1, 2]
New class: 3
Replay mỗi lớp: 200000 | Lớp mới: 60000


  df[df["Label"].isin(old_cls)]


[0, 1, 2, 3]
Label
2    200000
1    200000
0    200000
3    180000
Name: count, dtype: int64

--- Increment [4, 5] ---
Old classes: [0, 1, 2, 3]
New class: [4, 5]
Replay mỗi lớp: 150000 | Lớp mới: 60000


  df[df["Label"].isin(old_cls)]


[0, 1, 2, 3, 4]
[0, 1, 2, 3, 4, 5]
Label
2    150000
3    150000
0    150000
1    150000
4     60000
5     60000
Name: count, dtype: int64


In [3]:
# ------- CHIA TRAIN TEST IL ------ #

import os, sys
from pathlib import Path

import pandas as pd 
import dask.dataframe as dd

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

import hashlib  
import ipaddress
import json
import glob
import gc

# Show every column and every row when a DataFrame is printed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# features.json maps a column name to a dtype name. The `with` block closes the
# file on exit, so no explicit close() is required.
with open('features.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Only "int8" and "float32" are translated to NumPy types; any other dtype name
# is left out of `dtypes`. A comprehension is used so that no loop variable
# (previously one named `type`) leaks into the module namespace and shadows a
# builtin.
_NUMPY_DTYPES = {"int8": np.int8, "float32": np.float32}
dtypes = {
    column: _NUMPY_DTYPES[dtype_name]
    for column, dtype_name in data.items()
    if dtype_name in _NUMPY_DTYPES
}

print(dtypes)

def astype(df):
    """Cast columns to the dtypes declared in the module-level ``data`` mapping.

    ``data`` maps a column name to a dtype name. Columns declared as ``"int8"``
    become ``np.int8`` and columns declared as ``"float32"`` become
    ``np.float32``. The previous version looped over ``df.dtypes`` and cast
    each column to the dtype it already had, which changed nothing; the loop
    now follows the declared schema.

    Args:
        df: DataFrame to convert. Declared columns that are absent from ``df``
            are skipped.

    Returns:
        The same DataFrame object, modified in place.
    """
    for column, dtype_name in data.items():
        if column not in df.columns:
            continue
        if dtype_name == "int8":
            df[column] = df[column].astype(np.int8)
        elif dtype_name == "float32":
            df[column] = df[column].astype(np.float32)

    return df


if __name__ == '__main__':
    n = 3
    session_dir = "C:/Users/hoang/Documents/Dataset_KLTN/ciciot2023_extracted/merge-processed/merge_3_4_5_mitm_dns"

    # Input sessions and the matching train / test output files.
    dir_in = [f"{session_dir}/session{i}.parquet" for i in range(n)]
    dir_out_train = [f"{session_dir}/train_session{i}.parquet" for i in range(n)]
    dir_out_test = [f"{session_dir}/test_session{i}.parquet" for i in range(n)]

    def _binary_label(label):
        # 0 stays 0; every other label becomes 1.
        return 0 if label == 0 else 1

    for index in range(len(dir_in)):
        filePath = dir_in[index]
        print("\n")
        print(f"===== CURRENT INDEX: {index+1} =====")
        df = astype(pd.read_parquet(filePath))

        # 85 % of the rows go to the training set (fixed seed) ...
        trainDF = df.sample(frac=0.85, random_state=42)
        # ... and the rows that were not drawn form the test set.
        df = df.drop(trainDF.index)

        print(df['Label'].value_counts())

        trainDF['Binary Label'] = trainDF['Label'].apply(_binary_label)
        df['Binary Label'] = df['Label'].apply(_binary_label)

        print(df['Binary Label'].value_counts())

        trainDF.to_parquet(dir_out_train[index], index=False)
        df.to_parquet(dir_out_test[index], index=False)

        del trainDF, df
        gc.collect()

{'Src IP': <class 'numpy.float32'>, 'Src Port': <class 'numpy.int8'>, 'Dst IP': <class 'numpy.float32'>, 'Dst Port': <class 'numpy.int8'>, 'Protocol': <class 'numpy.int8'>, 'Flow Duration': <class 'numpy.float32'>, 'Total Fwd Packet': <class 'numpy.float32'>, 'Total Bwd packets': <class 'numpy.float32'>, 'Total Length of Fwd Packet': <class 'numpy.float32'>, 'Total Length of Bwd Packet': <class 'numpy.float32'>, 'Fwd Packet Length Max': <class 'numpy.float32'>, 'Fwd Packet Length Min': <class 'numpy.float32'>, 'Fwd Packet Length Mean': <class 'numpy.float32'>, 'Fwd Packet Length Std': <class 'numpy.float32'>, 'Bwd Packet Length Max': <class 'numpy.float32'>, 'Bwd Packet Length Min': <class 'numpy.float32'>, 'Bwd Packet Length Mean': <class 'numpy.float32'>, 'Bwd Packet Length Std': <class 'numpy.float32'>, 'Flow Bytes/s': <class 'numpy.float32'>, 'Flow Packets/s': <class 'numpy.float32'>, 'Flow IAT Mean': <class 'numpy.float32'>, 'Flow IAT Std': <class 'numpy.float32'>, 'Flow IAT Max':