
# Introduction
Use vectorizers to read in data. See here: https://stackoverflow.com/questions/31784011/scikit-learn-fitting-data-into-chunks-vs-fitting-it-all-at-once

We cannot fit the data all at the same time on the vectorizer, as it takes too much memory. Luckily, this is not needed. We first iterate over chunks of text data and build up the vocabulary of the corpus. Then we can use it to fit the CountVectorizer efficiently.

Then we can go over the chunks of text data again and transform them with the CountVectorizer into vectors. We can easily store all vectors of the complete data in memory.


In [1]:
from pathlib import Path
import re
import pandas as pd
from tqdm import tqdm
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import pickle
from sklearn.preprocessing import LabelEncoder

import concurrent.futures
import pickle
import os


In [2]:
import sys
sys.path.append(str(Path("./../../../../../").resolve()))

from py_dataset import get_all_files_df
from py_dataset import read_in_files
from py_dataset import feature_plotting
from py_dataset import advacned_sys_log_extraction

In [3]:
# max CPUs to use
max_workers = 3

In [4]:
data_path = Path(
    '/media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced')
assert data_path.exists()

csv_files = [csv_file for csv_file in data_path.glob("**/*.csv")]
assert len(csv_files) > 0

In [5]:
import json

# int(system_calls["calls"]) > 5 and float(process["cpu_usage"]) > 0 ==> 27GB

# def tokenizer(doc):
#     # Using default pattern from CountVectorizer
#     token_pattern = re.compile('(?u)\\b\\w\\w+\\b')
#     return [t for t in token_pattern.findall(doc)]
def tokenizer_vocab(doc):
    y = json.loads(doc.replace("'", '"'))
    vocab = set()
    for process in y:
        for system_calls in process["system_calls"]:
            c = process["service_name"] +" "+ system_calls["syscall"]

            if int(system_calls["calls"]) > 5 and float(process["cpu_usage"]) > 0.5:
                vocab.add(c)
                

            vocab.add(system_calls["syscall"])
    
    return vocab


def yield_dataframe_file():
    # for csv_file in tqdm(csv_files, desc="Reading dataframes from csvs", unit="files"):
    for csv_file in csv_files:
        print(f"Reading {csv_file}")
        df = pd.read_csv(csv_file)

        yield df

def process_df(df):
    documents = df["system_processes_and_calls"].to_numpy()
    vocab_df = set()
    for doc in documents:
        vocabs = set(tokenizer_vocab(str(doc)))
        vocab_df.update(vocabs)
    
    print(f"Processed {len(documents)} documents. Returning {len(vocab_df)} vocabs")
    return vocab_df

In [6]:
for df in yield_dataframe_file():
    break

Reading /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_backdoor_2h_45.3G.zip_logs.csv


In [7]:
import json
x = tokenizer_vocab(df["system_processes_and_calls"].iloc[0])
len(x)

137

In [8]:
import json
x = vectorizer.transform(df["system_processes_and_calls"].iloc[0])
x.shape

NameError: name 'vectorizer' is not defined

In [9]:
biggest_index = np.argmax(x)
x[0, biggest_index]
vectorizer.get_feature_names_out()[biggest_index]

TypeError: 'set' object is not subscriptable

### Creating the Vocab for The CountVectorizer

In [10]:
# vocab_file = data_path / "vocabulary.pkl" #len(tuple(vocabulary)) => 567553
vocab_file = data_path / "vocabulary_topK.pkl" #int(system_calls["calls"]) > 5 and float(process["cpu_usage"]) > 0 ==> 27GB and len 157711


if vocab_file.exists():
    with open(str(vocab_file), 'rb') as f:
        vocabulary = pickle.load(f)
    print("Loaded vocab:", vocabulary)
else:
    vocab_set = set()

    with concurrent.futures.ThreadPoolExecutor(max_workers = max_workers) as executor:
        futures = [executor.submit(process_df, df) for df in yield_dataframe_file()]
        for future in concurrent.futures.as_completed(futures):
            vocab_set.update(future.result())

    vocabulary = tuple(sorted(vocab_set))
    with open(str(vocab_file), 'wb') as f:
        pickle.dump(vocabulary, f)
    
    print("Saved vocab:", vocabulary)

Loaded vocab: ('(mandb) brk', '(mandb) chdir', '(mandb) chmod', '(mandb) clock_gettime', '(mandb) close', '(mandb) connect', '(mandb) fchmod', '(mandb) fchown32', '(mandb) flock', '(mandb) fstat64', '(mandb) fstatat64', '(mandb) fsync', '(mandb) getdents64', '(mandb) getpid', '(mandb) getsockopt', '(mandb) gettimeofday', '(mandb) llseek', '(mandb) lstat64', '(mandb) mmap2', '(mandb) mprotect', '(mandb) msync', '(mandb) munmap', '(mandb) openat', '(mandb) read', '(mandb) recvmsg', '(mandb) rename', '(mandb) rt_sigaction', '(mandb) rt_sigprocmask', '(mandb) socket', '(mandb) stat64', '(mandb) ugetrlimit', '(mandb) unlink', '(mandb) utimensat', '(mandb) write', '(python3) brk', '(python3) close', '(python3) fcntl64', '(python3) fstat64', '(python3) fstatat64', '(python3) futex', '(python3) getcwd', '(python3) getdents64', '(python3) geteuid32', '(python3) getgid32', '(python3) getpid', '(python3) getuid32', '(python3) ioctl', '(python3) llseek', '(python3) lstat64', '(python3) mkdir', '(p

In [11]:
len(tuple(vocabulary))

29699

In [12]:
# vocab_file = data_path / "vocabulary.pkl"

# if vocab_file.exists():
#     with open(str(vocab_file), 'rb') as f:
#         vocabulary = pickle.load(f)
#     print("Loaded vocab:", vocabulary)
# else:
#     vocab_set = set()

#     with concurrent.futures.ThreadPoolExecutor(max_workers = max_workers) as executor:
#         futures = [executor.submit(process_df, df) for df in yield_dataframe_file()]
#         results = [future.result() for future in futures]

#     for vocab_df in results:
#         vocab_set.update(vocab_df)

#     vocab = tuple(sorted(vocab_set))
#     with open(str(vocab_file), 'wb') as f:
#         pickle.dump(vocab, f)

#### Encoding the systemcalls

In [13]:
labels = [
    "1_normal",
    "2_ransomware",
    "3_thetick",
    "4_bashlite",
    "5_httpbackdoor",
    "6_beurk",
    "7_backdoor",
    "8_bdvl",
    "9_xmrig",
]
label_encoder =  LabelEncoder()
label_encoder.fit(labels)

In [14]:
def tokenizer(doc):
    y = json.loads(doc.replace("'", '"'))
    doc_tokenized = list()
    for process in y:
        for system_calls in process["system_calls"]:
            c = process["service_name"] +" "+ system_calls["syscall"]
            c = [c] * int(system_calls["calls"])
            doc_tokenized.extend(c)

            doc_tokenized.append(system_calls["syscall"])
    
    return doc_tokenized

In [15]:
vectorizer = CountVectorizer(vocabulary=vocabulary, tokenizer=tokenizer)


# doc = df["system_processes_and_calls"].iloc[:1]
# x = vectorizer.transform(doc).toarray()
# x[x > 1].shape, X.shape
# vectorizer

In [16]:
import gc
from typing import Tuple
from scipy.sparse import csr_matrix, vstack, hstack, save_npz, load_npz

def process_df(df=None, csv_file="") -> Tuple[csr_matrix, csr_matrix, str]:
    if df is None:
        if csv_file == "":
            raise ValueError("Either df or csv_file should be provided")

        print(f"Reading.. {csv_file}")
        df = pd.read_csv(csv_file)

    len_bf = len(df["system_processes_and_calls"])
    df = df[df["system_processes_and_calls"].notna()]
    print(f"Removed {len_bf} - {len(df)} = {len_bf - len(df)} NaNs from documents in df: ", csv_file)

    if len(df) == 0:
        print("Skipping empty dataframe: ", csv_file)
        return np.array([]), np.array([])

    documents = df["system_processes_and_calls"].to_numpy()
    X_docs = vectorizer.transform(documents)

    labels = label_encoder.transform(df["label"])
    cols = np.column_stack((df["timestamp"].to_numpy(), labels))
    cols = csr_matrix(cols)

    del df

    print(gc.collect())

    return X_docs, cols, csv_file


In [17]:
assert (data_path / 'merged_data_big_dataset').exists()

print(f"Starting with max_workers={max_workers} and {len(csv_files)} csv files.")
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # futures = [executor.submit(process_df, df, csv_file) for df, csv_file in zip(yield_dataframe_file(), csv_files)]
    futures = [executor.submit(process_df, None,csv_file) for csv_file in csv_files]
    with tqdm(total=len(csv_files)) as pbar:
        for future in concurrent.futures.as_completed(futures):
            X_docs, cols, csv_file = future.result()

            assert X_docs.shape[0] == cols.shape[0], f"X.shape[0] = {X_docs.shape[0]}!= Z.shape[0] = {cols.shape[0]}"
            assert X_docs.shape[1] == len(vocabulary), f"X.shape[1] = {X_docs.shape[1]}!= len(vocabulary) = {len(vocabulary)}"
            assert cols.shape[1] == 2, f"cols.shape[1] = {cols.shape[1]}!= 2"

            merged_array = hstack([X_docs, cols])
            assert merged_array.shape[0] == X_docs.shape[0], f"merged_array.shape[0] = {merged_array.shape[0]}!= X.shape[0] = {X_docs.shape[0]}"
            assert merged_array.shape[1] == X_docs.shape[1] + cols.shape[1], f"merged_array.shape[1] = {merged_array.shape[1]}!= X.shape[1] + Z.shape[1] = {X_docs.shape[1] + cols.shape[1]}"
            
            output_file = str(data_path / 'merged_data_big_dataset' / f"{csv_file.stem}.npz")
            save_npz(output_file, merged_array)
            print(f"Saved to {output_file}, shape: {merged_array.shape}")

            del X_docs, cols, merged_array, output_file
            print(gc.collect())

            pbar.update(1)

Starting with max_workers=5 and 19 csv files.
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_backdoor_2h_45.3G.zip_logs.csv
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_backdoor_2h_45.5G.zip_logs.csv
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_bashlite_2h_451.G.zip_logs.csv
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_Bashlite_2h_48.6G.zip_logs.csv
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_devic

  0%|          | 0/19 [00:00<?, ?it/s]

Removed 575 - 575 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_bdvl_2h_44G.zip_logs.csv
Removed 646 - 646 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_bashlite_2h_451.G.zip_logs.csv
Removed 646 - 646 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_backdoor_2h_45.5G.zip_logs.csv
Removed 644 - 644 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_Bashlite_2h_48.6G.zip_logs.csv
Removed 646 - 646 = 0 NaNs from documents 

  5%|▌         | 1/19 [03:02<54:42, 182.36s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_bdvl_2h_44G.zip_logs.npz, shape: (575, 29701)
0
0
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_beurk_2h_43.6G.zip_logs.csv


 11%|█         | 2/19 [03:20<24:13, 85.50s/it] 

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_backdoor_2h_45.5G.zip_logs.npz, shape: (646, 29701)
0
0
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_beurk_2h_45.3G.zip_logs.csv


 16%|█▌        | 3/19 [03:21<12:33, 47.06s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_bashlite_2h_451.G.zip_logs.npz, shape: (646, 29701)
0
0
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_httpbackdoors_2h_47.8G.zip_logs.csv


 21%|██        | 4/19 [03:23<07:18, 29.23s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_backdoor_2h_45.3G.zip_logs.npz, shape: (646, 29701)
0
Removed 578 - 578 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_bdvl_2h_45.9G.zip_logs.csv
0
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_httpbackdoor_2h_46.5G.zip_logs.csv


 26%|██▋       | 5/19 [03:27<04:43, 20.26s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_Bashlite_2h_48.6G.zip_logs.npz, shape: (644, 29701)
0
Removed 647 - 647 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_httpbackdoors_2h_47.8G.zip_logs.csv
Removed 648 - 648 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_beurk_2h_43.6G.zip_logs.csv
Removed 650 - 650 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_beurk_2h_45.3G.zip_logs.csv
Removed 645 - 645 = 0 NaNs from

 32%|███▏      | 6/19 [05:52<13:34, 62.67s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_bdvl_2h_45.9G.zip_logs.npz, shape: (578, 29701)
0
Removed 846 - 846 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_normal_157min_60G.zip_logs.csv
0
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_normal_83min_32.8G.zip_logs.csv


 37%|███▋      | 7/19 [06:51<12:15, 61.30s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_httpbackdoors_2h_47.8G.zip_logs.npz, shape: (647, 29701)
0
0
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_ramsomware_90min_19.2G.zip_logs.csv


 42%|████▏     | 8/19 [06:53<07:49, 42.70s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_httpbackdoor_2h_46.5G.zip_logs.npz, shape: (645, 29701)
0
0
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_ransomware_62min_12.7G.zip_logs.csv


 47%|████▋     | 9/19 [06:55<04:59, 29.91s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_beurk_2h_43.6G.zip_logs.npz, shape: (648, 29701)
0
0
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_ransomware_90min_18.4G.zip_logs.csv


 53%|█████▎    | 10/19 [06:57<03:10, 21.15s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_beurk_2h_45.3G.zip_logs.npz, shape: (650, 29701)
0
Removed 532 - 532 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_normal_83min_32.8G.zip_logs.csv
Removed 341 - 341 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_ransomware_62min_12.7G.zip_logs.csv
Removed 486 - 486 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_ramsomware_90min_19.2G.zip_logs.csv
Removed 489 - 489 = 0 

 58%|█████▊    | 11/19 [08:15<05:08, 38.54s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_normal_83min_32.8G.zip_logs.npz, shape: (532, 29701)
0
0
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_thetick_2h_44.2G.zip_logs.csv


 63%|██████▎   | 12/19 [08:38<03:56, 33.78s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_ransomware_62min_12.7G.zip_logs.npz, shape: (341, 29701)
0
Removed 645 - 645 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_thetick_2h_43.2G.zip_logs.csv
Removed 647 - 647 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_thetick_2h_44.2G.zip_logs.csv
0
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_xmrig_2h_20.3G.zip_logs.csv


 68%|██████▊   | 13/19 [09:21<03:40, 36.72s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_normal_157min_60G.zip_logs.npz, shape: (846, 29701)
0
0
Reading.. /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_xmrig_2h_20.4G.zip_logs.csv
Removed 612 - 612 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_xmrig_2h_20.3G.zip_logs.csv


 74%|███████▎  | 14/19 [09:44<02:43, 32.68s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_ransomware_90min_18.4G.zip_logs.npz, shape: (489, 29701)
0
0


 79%|███████▉  | 15/19 [09:45<01:32, 23.12s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_ramsomware_90min_19.2G.zip_logs.npz, shape: (486, 29701)
0
Removed 609 - 609 = 0 NaNs from documents in df:  /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/device2_xmrig_2h_20.4G.zip_logs.csv
0


 84%|████████▍ | 16/19 [10:52<01:48, 36.17s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_xmrig_2h_20.3G.zip_logs.npz, shape: (612, 29701)
0
0


 89%|████████▉ | 17/19 [11:04<00:57, 28.83s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_xmrig_2h_20.4G.zip_logs.npz, shape: (609, 29701)
0
0


 95%|█████████▍| 18/19 [11:06<00:20, 20.77s/it]

Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_thetick_2h_43.2G.zip_logs.npz, shape: (645, 29701)
0


100%|██████████| 19/19 [11:12<00:00, 35.40s/it]

0
Saved to /media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/sys_system_calls_Heqing_device2/advanced/merged_data_big_dataset/device2_thetick_2h_44.2G.zip_logs.npz, shape: (647, 29701)
0





In [17]:
# vectorizer = CountVectorizer(vocabulary=vocabulary)
# X = np.array([[]]).reshape(0, len(vocabulary))
# Z = np.array([[]]).reshape(0, 2)

# with tqdm(total=len(csv_files)) as pbar:
#     for i, df in enumerate(yield_dataframe_file()):
#         len_bf = len(df["system_calls"])
#         df = df[df["system_calls"].notna()]
#         print(f"Removed {len_bf} - {len(df)} = {len_bf - len(df)} NaNs from documents in df: ", csv_files[i])

#         documents = df["system_calls"].to_numpy()
#         if len(documents) == 0:
#             print("Skipping empty dataframe: ", csv_files[i])
#             continue

#         X_docs = vectorizer.transform(documents)
#         X = np.concatenate([X, X_docs.toarray()], axis=0)

#         labels = label_encoder.transform(df["label"])
#         cols = np.column_stack((df["timestamp"].to_numpy(), labels))
#         Z = np.concatenate([Z, cols], axis=0)

#         assert X.shape[0] == Z.shape[0], f"X.shape[0] = {X.shape[0]} != Z.shape[0] = {Z.shape[0]}"

#         pbar.update(1)


In [18]:
def load_and_combine_npz_sparse(path):
    output_array = None

    for npz_file in path.glob("*.npz"):
        sparse_array = load_npz(npz_file)

        if output_array is None:
            output_array = sparse_array
        else:
            output_array = vstack((output_array, sparse_array))

    return output_array

def load_and_combine_npz(path):
    output_array = None

    for npz_file in path.glob("*.npz"):
        data = np.load(npz_file)
        array = data["arr_0"]

        print(array.shape)

        if output_array is None:
            output_array = array
        else:
            output_array = np.concatenate((output_array, array), axis=0)

    return output_array

path_to_read = data_path / 'merged_data_big_dataset'

merged = load_and_combine_npz_sparse(path_to_read)

In [19]:
merged.shape, len(vectorizer.get_feature_names_out())

((11532, 29701), 29699)

In [20]:
X = merged[:, :-2]
Z = merged[:, -2:]

In [21]:
transformer = TfidfTransformer()
X_tf = transformer.fit_transform(X).toarray()

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# pipeline = Pipeline([
#     ('vectorizer', CountVectorizer(vocabulary=vocabulary)),
#     ('tfidf', TfidfTransformer())
# ])

# # Fit the pipeline to the data and transform it
# X_transformed = pipeline.fit_transform(X)

# X_transformed_dense = X_transformed.toarray()

In [22]:
feature_names = vectorizer.get_feature_names_out()
idf_weights = transformer.idf_
top_features = np.argsort(idf_weights)[::-1][:10]  # get top 10 features
print(feature_names[top_features])

['NetworkManager epoll_wait' 'NetworkManager getpid'
 'SCTP timer newselect' 'NetworkManager write' 'NetworkManager sendmsg'
 'NetworkManager gettimeofday' 'NetworkManager recvmsg'
 'NetworkManager clock_gettime' 'NetworkManager close'
 'NetworkManager read']


In [None]:
# merged_array = np.column_stack([X_tf, Z])

# output_file = str(data_path / 'merged_data')
# np.savez_compressed(output_file, merged_array)