# Idea
Read in all SYS log files of a given sensor and calculate the TF-IDF vector for each file.
- Then use the vocabulary as columns/features and each file as a row.
- Then analyse the columns as before.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
import sys
sys.path.append(str(Path("./../../../../../").resolve()))

from py_dataset import advacned_sys_log_extraction
from py_dataset import get_all_files_df
from py_dataset import read_in_files
from py_dataset import feature_plotting

In [12]:
# Upper bound on the number of worker processes used by the process pool in main().
max_workers = 3

In [3]:
# Root directory of the raw collected data; stop right here if it is missing.
data_path = Path('/media/<User>/DC/MAP_CreationOfNewDatasetsForDFL/code&data/0_raw_collected_data/')
assert data_path.exists()

In [4]:
# Directory that receives the per-archive "<file_name>_logs.csv" files.
# mkdir() is called without parents=True, so the parent directory must already exist.
output_path = Path(
        '/media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/advanced/')
output_path.mkdir(exist_ok=True)

assert output_path.exists()

In [5]:
# Build the file inventory from data_path (see the printed summary below) and show its first row.
df = get_all_files_df.main(data_path)
df.head(1)

txt file found, will drop /media/<User>/DC/MAP_CreationOfNewDatasetsForDFL/code&data/0_raw_collected_data/Heqing/device1/3_thetick_4h/collections.txt
txt file found, will drop /media/<User>/DC/MAP_CreationOfNewDatasetsForDFL/code&data/0_raw_collected_data/Heqing/device2/1_normal/note.txt
                                           file_path
0  /media/<User>/DC/MAP_CreationOfNewDatasetsForDFL/...
RangeIndex(start=0, stop=622, step=1)
filetype
csv    530
zip     92
Name: count, dtype: int64
                                           file_path filetype  filesize_bytes
0  /media/<User>/DC/MAP_CreationOfNewDatasetsForDFL/...      csv           49419
feature_family
SYS_data        92
KERN_data       91
FLS_data        91
RES_data        91
block_data      91
network_data    91
entropy_data    75
Name: count, dtype: int64


Unnamed: 0,file_path,filetype,filesize_bytes,researcher_name,device_name,experiment,feature_family,file_name,device
0,/media//DC/MAP_CreationOfNewDatasetsForDFL/...,csv,49419,Heqing,device1,1_normal,block_data,e4_5f_01_68_35_e6.csv,Heqing_device1


# Keep only data source = SYS_data and device = Heqing_device2

In [6]:
# Restrict the inventory to one device and verify that exactly one device remains.
single_dev = df.loc[df["device"] == "Heqing_device2"]
assert single_dev["device"].nunique() == 1

In [7]:
# Of that device's files, keep only the SYS_data feature family and verify the filter.
single_dev_single_data_source = single_dev.loc[single_dev["feature_family"] == "SYS_data"]
assert single_dev_single_data_source["feature_family"].nunique() == 1
single_dev_single_data_source.shape

(19, 9)

In [8]:
# Inspect the first selected row.
single_dev_single_data_source.head(1)

Unnamed: 0,file_path,filetype,filesize_bytes,researcher_name,device_name,experiment,feature_family,file_name,device
139,/media//DC/MAP_CreationOfNewDatasetsForDFL/...,zip,3819463330,Heqing,device2,1_normal,SYS_data,device2_normal_157min_60G.zip,Heqing_device2


In [9]:
# List the selected archive names. process_file derives its output CSV name from
# file_name, so each name should occur exactly once.
single_dev_single_data_source["file_name"].value_counts()

file_name
device2_normal_157min_60G.zip         1
device2_normal_83min_32.8G.zip        1
device2_ramsomware_90min_19.2G.zip    1
device2_ransomware_62min_12.7G.zip    1
device2_ransomware_90min_18.4G.zip    1
device2_thetick_2h_43.2G.zip          1
device2_thetick_2h_44.2G.zip          1
device2_bashlite_2h_451.G.zip         1
device2_Bashlite_2h_48.6G.zip         1
device2_httpbackdoors_2h_47.8G.zip    1
device2_httpbackdoor_2h_46.5G.zip     1
device2_beurk_2h_43.6G.zip            1
device2_beurk_2h_45.3G.zip            1
device2_backdoor_2h_45.3G.zip         1
device2_backdoor_2h_45.5G.zip         1
device2_bdvl_2h_44G.zip               1
device2_bdvl_2h_45.9G.zip             1
device2_xmrig_2h_20.3G.zip            1
device2_xmrig_2h_20.4G.zip            1
Name: count, dtype: int64

In [10]:
from typing import Iterator, Tuple
import zipfile
from tqdm import tqdm


def yield_log_files_from_zip(zip_file_path) -> Iterator[Tuple[str, str]]:
    """Lazily yield ``(member_name, text)`` for each ``.log`` member of a zip archive.

    Members are visited in the order reported by ``ZipFile.infolist()`` and a
    tqdm progress bar tracks how many have been read. Each member is read
    completely and decoded as UTF-8; members whose name does not end in
    ``.log`` are ignored.

    Args:
        zip_file_path: Location of the zip archive to read.

    Yields:
        Tuples of the member's name inside the archive and its decoded content.

    Raises:
        UnicodeDecodeError: If a member's bytes are not valid UTF-8.
    """
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        log_members = []
        for member in archive.infolist():
            if member.filename.endswith('.log'):
                log_members.append(member)

        progress = tqdm(log_members, desc="Reading log files from zip", unit="files")
        for member in progress:
            with archive.open(member) as handle:
                text = handle.read().decode('utf-8')
                yield member.filename, text

In [11]:
import re


def process_log_file(content: str):
    """Parse the text of one log file and return only its process data.

    ``advacned_sys_log_extraction.parse`` returns a pair; the second element
    (named as the file's uptime) is not needed here and is discarded.

    Args:
        content: Full decoded text of a single log file.

    Returns:
        The first element of the pair returned by the parser.
    """
    parsed_processes, _file_uptime = advacned_sys_log_extraction.parse(content)
    return parsed_processes

In [13]:
import os
import concurrent.futures


def process_file(row):
    """Extract every ``.log`` member of one zip archive into a single CSV.

    The CSV is named ``<file_name>_logs.csv`` inside ``output_path`` and has one
    row per log file with the columns ``timestamp`` (the member's base name
    without extension), ``system_processes_and_calls`` (the result of
    ``process_log_file``) and ``label`` (the row's ``experiment`` value).

    An existing CSV is taken to mean "already processed" and the archive is
    skipped. Because existence is the only completeness marker, the CSV is
    first written to a temporary sibling file and then moved into place with
    ``os.replace``; an interrupted or failed write therefore never leaves a
    truncated CSV that later runs would skip.

    Args:
        row: Mapping (e.g. a DataFrame row) providing ``filetype``,
            ``file_name``, ``file_path`` and ``experiment``.

    Returns:
        None.

    Raises:
        AssertionError: If ``row["filetype"]`` is not ``"zip"``.
    """
    assert row["filetype"] == "zip", "Only zip files are supported"
    csv_file_name = f"{row['file_name']}_logs.csv"
    path = output_path / csv_file_name
    if path.exists():
        print(f"Skipping {csv_file_name}")
        return

    logs = []
    for file_name, content in yield_log_files_from_zip(row["file_path"]):
        base_name = os.path.basename(file_name)
        timestamp = os.path.splitext(base_name)[0]
        processes = process_log_file(content)

        logs.append({
            "timestamp": timestamp,
            "system_processes_and_calls": processes,
            "label": row["experiment"],
        })

    logs_df = pd.DataFrame(logs)

    # The pid keeps temp names distinct between the parallel worker processes.
    tmp_path = path.with_name(f"{csv_file_name}.{os.getpid()}.tmp")
    try:
        logs_df.to_csv(str(tmp_path), index=False)
        # Same-directory rename: the final name appears only once fully written.
        os.replace(tmp_path, path)
    finally:
        # Only still present when the write or the rename failed.
        if tmp_path.exists():
            tmp_path.unlink()
    print(f"Saved {csv_file_name}")


def main():
    """Run ``process_file`` for every selected archive in a pool of worker processes.

    One task is submitted per row of ``single_dev_single_data_source`` and the
    pool is limited to ``max_workers`` processes. Results are collected as tasks
    finish; calling ``result()`` re-raises any exception a worker raised.
    """
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for _, record in single_dev_single_data_source.iterrows():
            pending.append(pool.submit(process_file, record))

        for finished in concurrent.futures.as_completed(pending):
            finished.result()

In [14]:
# Run the extraction for every selected archive; archives whose CSV already
# exists in output_path are skipped by process_file.
main()

Reading log files from zip: 100%|██████████| 486/486 [01:57<00:00,  4.12files/s]
Reading log files from zip:  46%|████▌     | 389/846 [02:24<02:29,  3.06files/s]

Saved device2_ramsomware_90min_19.2G.zip_logs.csv


Reading log files from zip: 100%|██████████| 532/532 [02:42<00:00,  3.27files/s]
Reading log files from zip:  19%|█▉        | 65/341 [00:21<01:07,  4.09files/s]]

Saved device2_normal_83min_32.8G.zip_logs.csv


Reading log files from zip: 100%|██████████| 341/341 [01:37<00:00,  3.48files/s]
Reading log files from zip:  73%|███████▎  | 359/489 [01:37<00:25,  5.16files/s]

Saved device2_ransomware_62min_12.7G.zip_logs.csv


Reading log files from zip: 100%|██████████| 489/489 [02:14<00:00,  3.63files/s]
Reading log files from zip:  96%|█████████▋| 815/846 [05:36<00:12,  2.53files/s]

Saved device2_ransomware_90min_18.4G.zip_logs.csv


Reading log files from zip: 100%|██████████| 846/846 [05:52<00:00,  2.40files/s]
Reading log files from zip:  37%|███▋      | 239/645 [01:33<02:03,  3.28files/s]

Saved device2_normal_157min_60G.zip_logs.csv


Reading log files from zip: 100%|██████████| 645/645 [04:05<00:00,  2.63files/s]
Reading log files from zip:  73%|███████▎  | 475/647 [03:02<01:00,  2.82files/s]

Saved device2_thetick_2h_43.2G.zip_logs.csv


Reading log files from zip: 100%|██████████| 647/647 [04:09<00:00,  2.60files/s]
Reading log files from zip:  26%|██▌       | 167/644 [01:15<02:54,  2.74files/s]

Saved device2_thetick_2h_44.2G.zip_logs.csv


Reading log files from zip: 100%|██████████| 646/646 [04:13<00:00,  2.54files/s]
Reading log files from zip:  36%|███▌      | 229/644 [01:40<02:16,  3.05files/s]

Saved device2_bashlite_2h_451.G.zip_logs.csv


Reading log files from zip: 100%|██████████| 644/644 [04:44<00:00,  2.27files/s]
Reading log files from zip:  70%|███████   | 453/645 [03:09<01:02,  3.08files/s]

Saved device2_Bashlite_2h_48.6G.zip_logs.csv

Reading log files from zip:  78%|███████▊  | 506/647 [03:35<00:45,  3.08files/s]




Reading log files from zip: 100%|██████████| 647/647 [04:31<00:00,  2.39files/s]
Reading log files from zip:  25%|██▌       | 163/648 [01:02<02:38,  3.07files/s]

Saved device2_httpbackdoors_2h_47.8G.zip_logs.csv


Reading log files from zip: 100%|██████████| 645/645 [04:27<00:00,  2.41files/s]
Reading log files from zip:  34%|███▍      | 221/648 [01:25<02:10,  3.27files/s]

Saved device2_httpbackdoor_2h_46.5G.zip_logs.csv


Reading log files from zip: 100%|██████████| 648/648 [03:52<00:00,  2.78files/s]
Reading log files from zip:  67%|██████▋   | 434/646 [02:39<01:10,  3.01files/s]

Saved device2_beurk_2h_43.6G.zip_logs.csv


Reading log files from zip: 100%|██████████| 650/650 [03:52<00:00,  2.80files/s]
Reading log files from zip:  96%|█████████▌| 619/646 [03:40<00:06,  3.90files/s]

Saved device2_beurk_2h_45.3G.zip_logs.csv


Reading log files from zip: 100%|██████████| 646/646 [03:47<00:00,  2.84files/s]
Reading log files from zip:  36%|███▌      | 230/646 [01:14<01:44,  3.98files/s]

Saved device2_backdoor_2h_45.3G.zip_logs.csv


Reading log files from zip: 100%|██████████| 646/646 [03:24<00:00,  3.17files/s]
Reading log files from zip:  67%|██████▋   | 387/578 [02:18<00:59,  3.21files/s]

Saved device2_backdoor_2h_45.5G.zip_logs.csv


Reading log files from zip: 100%|██████████| 575/575 [03:18<00:00,  2.90files/s]
Reading log files from zip:  61%|██████    | 373/612 [00:50<00:38,  6.25files/s]

Saved device2_bdvl_2h_44G.zip_logs.csv


Reading log files from zip: 100%|██████████| 578/578 [03:31<00:00,  2.73files/s]
Reading log files from zip:  92%|█████████▏| 565/612 [01:18<00:10,  4.31files/s]

Saved device2_bdvl_2h_45.9G.zip_logs.csv


Reading log files from zip: 100%|██████████| 612/612 [01:26<00:00,  7.10files/s]
Reading log files from zip:  43%|████▎     | 264/609 [00:38<00:35,  9.60files/s]

Saved device2_xmrig_2h_20.3G.zip_logs.csv


Reading log files from zip: 100%|██████████| 609/609 [01:12<00:00,  8.36files/s]


Saved device2_xmrig_2h_20.4G.zip_logs.csv
