In [1]:
import os, sys
from pathlib import Path

import pandas as pd 
import dask.dataframe as dd

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle

import hashlib  
import ipaddress
# Disable truncation when DataFrames are displayed: a limit of None lifts the
# cap on both the number of columns and the number of rows shown.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)


##### FUNC ############
def ip_to_float(ip):
    """Convert an IPv4 address to a scaled float.

    The address is parsed with ``ipaddress.IPv4Address``, turned into its
    integer value and divided by 1e9.

    Args:
        ip: Value accepted by ``ipaddress.IPv4Address`` (typically a
            dotted-quad string such as ``"192.168.1.1"``).

    Returns:
        float: The integer value of the address divided by 1e9, or ``0.0``
        when ``ip`` cannot be parsed as an IPv4 address (empty, missing,
        malformed, ...).
    """
    try:
        return float(int(ipaddress.IPv4Address(ip))) / 1e9
    except (ValueError, TypeError):
        # ipaddress.AddressValueError is a ValueError subclass. Catching only
        # these (instead of a bare ``except``) keeps the "invalid -> 0.0"
        # fallback while letting KeyboardInterrupt/SystemExit propagate.
        return 0.0  # for invalid or empty IPs
    
def sum_of_squares(partition):
    """Return the sum of the squared values of ``partition``.

    Args:
        partition: Object supporting ``** 2`` and ``.sum()`` (presumably a
            pandas/dask partition -- TODO confirm against the caller).

    Returns:
        pd.Series: A one-element Series holding the sum of squares.
    """
    squared = partition ** 2
    total = squared.sum()
    return pd.Series([total])

def string_to_float(s):
    """Hash a value to a deterministic float in ``[0, 1)``.

    ``str(s)`` is UTF-8 encoded and hashed with SHA-256; the digest, read as
    an integer, is reduced modulo 10**8 and divided by 1e8, giving a value
    with eight decimal digits of resolution.

    Args:
        s: Value to encode; it is converted with ``str`` before hashing.

    Returns:
        float: The hashed value in ``[0, 1)``, or ``0.0`` when ``s`` is
        missing according to ``pd.notna``.
    """
    if pd.notna(s):
        digest = hashlib.sha256(str(s).encode('utf-8')).hexdigest()
        return int(digest, 16) % 10**8 / 1e8
    # Return a float (not the int 0) so every code path yields the same type.
    return 0.0


def down_ratio(f):
    """Divide ``f`` by ``f + 1e-9``.

    Because of the 1e-9 offset in the denominator, an input of 0 yields 0.0
    instead of raising ``ZeroDivisionError``.

    Args:
        f: Numeric value (scalar or anything supporting ``+`` and ``/``).

    Returns:
        The quotient ``f / (f + 1e-9)``.
    """
    denominator = f + 1e-9
    return f / denominator

# Bucket port
def bucket_port(port):
    """Map a port number to a coarse range bucket.

    Args:
        port: Numeric port value.

    Returns:
        int: ``0`` when ``port < 1024`` (well-known), ``1`` when
        ``port < 49152`` (registered), ``2`` otherwise (dynamic/private).
    """
    # Exclusive upper bounds of bucket 0 and bucket 1, in ascending order.
    for bucket, upper_bound in enumerate((1024, 49152)):
        if port < upper_bound:
            return bucket
    return 2  # Dynamic/private
##### FUNC ##############

In [2]:
# Source CSV to clean (dir_in) and destination for the cleaned copy (dir_out).
dir_in, dir_out = (
    "C:/Users/hoang/Documents/Dataset_KLTN/ciciot2023_extracted/merge-processed/merge.csv",
    "C:/Users/hoang/Documents/Dataset_KLTN/ciciot2023_extracted/merge-processed/merge1.1.csv",
)
# Disabled preview of a single source file:
# df1 = pd.read_csv("C:/Users/hoang/Documents/Dataset_KLTN/ciciot2023_extracted/DDoS-SYN_Flood.pcap_Flow.csv")
# print(df1.head())

In [3]:
# Clean the merged CSV chunk by chunk and write the result to dir_out.
first_chunk = True
chunk_size = 100000

# Values replaced by 0 through exact (literal) matching.
ZERO_PLACEHOLDERS = [np.inf, -np.inf, "inf", "-inf", "Infinity", "-Infinity", "(empty)"]
# Any-case textual "nan". Anchored so only a cell that is exactly this token is
# replaced, never a longer string that merely contains the letters.
NAN_TEXT_PATTERN = r'^[Nn][Aa][Nn]$'

for index, chunk in enumerate(pd.read_csv(dir_in, chunksize=chunk_size)):
    # Drop incomplete rows and exact duplicates, then the Timestamp column.
    # NOTE: both steps act on the current chunk only, so duplicate rows that
    # land in different chunks are not removed.
    chunk = chunk.dropna()
    chunk = chunk.drop_duplicates()
    chunk = chunk.drop(columns=["Timestamp"])

    # Zero out infinities and placeholder strings.
    chunk = chunk.replace(ZERO_PLACEHOLDERS, 0)
    # The NaN pattern needs regex=True: as an item of the plain list above it
    # was compared literally and therefore never matched anything.
    chunk = chunk.replace(NAN_TEXT_PATTERN, 0, regex=True)
    # Safety net; rows with parsed NaN values were already dropped above.
    chunk = chunk.fillna(0)

    # Collapse port numbers into the three range buckets of bucket_port.
    chunk["Src Port"] = chunk["Src Port"].apply(bucket_port)
    chunk["Dst Port"] = chunk["Dst Port"].apply(bucket_port)

    # Encode identifiers numerically: hash the flow id, scale the IPv4 addresses.
    chunk['Flow ID'] = chunk['Flow ID'].apply(string_to_float)
    chunk['Src IP'] = chunk['Src IP'].apply(ip_to_float)
    chunk['Dst IP'] = chunk['Dst IP'].apply(ip_to_float)

    # The first chunk creates/overwrites the file and writes the header;
    # later chunks are appended without a header.
    chunk.to_csv(dir_out, mode='w' if first_chunk else 'a', header=first_chunk, index=False)
    first_chunk = False
    print(f"Index: {index}") #Chunktail: {chunk.tail()}

Index: 0
Index: 1
Index: 2
Index: 3
Index: 4
Index: 5
Index: 6
Index: 7
Index: 8
Index: 9
Index: 10
Index: 11
Index: 12
Index: 13
Index: 14
Index: 15
Index: 16
Index: 17
Index: 18
Index: 19
Index: 20
Index: 21
Index: 22
Index: 23
Index: 24
Index: 25
Index: 26
Index: 27
Index: 28
Index: 29
Index: 30
Index: 31
Index: 32
Index: 33
Index: 34
Index: 35
Index: 36
Index: 37
Index: 38
Index: 39
Index: 40
Index: 41
Index: 42
Index: 43
Index: 44
Index: 45
Index: 46
Index: 47
Index: 48
Index: 49
Index: 50
Index: 51
Index: 52
Index: 53
Index: 54
Index: 55
Index: 56
Index: 57
Index: 58
Index: 59
Index: 60
Index: 61
Index: 62
Index: 63
Index: 64
Index: 65
Index: 66
Index: 67
Index: 68
Index: 69
Index: 70
Index: 71
Index: 72
Index: 73
Index: 74
Index: 75
Index: 76
Index: 77
Index: 78
Index: 79
Index: 80
Index: 81
Index: 82
Index: 83
Index: 84
Index: 85
Index: 86
Index: 87
Index: 88
Index: 89
Index: 90
Index: 91
Index: 92
Index: 93
Index: 94
Index: 95
Index: 96
Index: 97
Index: 98
Index: 99
Index: 100