# Idea
Read all SYS files of a given sensor and calculate the TF-IDF vector for each file.
- Then use the vocabulary as columns/features and each file as a row.
- Then analyse the columns as before.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
from py_dataset import get_all_files_df
from py_dataset import read_in_files
from py_dataset import feature_plotting

In [3]:
# Root directory of the raw collected data; the assert stops the notebook early if the path does not exist.
# NOTE(review): hard-coded absolute local path — must be adjusted on any other machine.
data_path = Path('/media/<User>/DC/MAP_CreationOfNewDatasetsForDFL/code&data/0_raw_collected_data/')
assert data_path.exists()

In [4]:
# get_all_files_df.main is a project helper; judging by the output below it indexes the files under
# data_path into a DataFrame with one row per file — confirm against py_dataset.
df = get_all_files_df.main(data_path)
# Show the first row as a quick look at the available columns.
df.head(1)

txt file found, will drop /media/<User>/DC/MAP_CreationOfNewDatasetsForDFL/code&data/0_raw_collected_data/Heqing/device1/3_thetick_4h/collections.txt
txt file found, will drop /media/<User>/DC/MAP_CreationOfNewDatasetsForDFL/code&data/0_raw_collected_data/Heqing/device2/1_normal/note.txt
                                           file_path
0  /media/<User>/DC/MAP_CreationOfNewDatasetsForDFL/...
RangeIndex(start=0, stop=622, step=1)
filetype
csv    530
zip     92
Name: count, dtype: int64
                                           file_path filetype  filesize_bytes
0  /media/<User>/DC/MAP_CreationOfNewDatasetsForDFL/...      csv           49419
feature_family
SYS_data        92
KERN_data       91
FLS_data        91
RES_data        91
block_data      91
network_data    91
entropy_data    75
Name: count, dtype: int64


Unnamed: 0,file_path,filetype,filesize_bytes,researcher_name,device_name,experiment,feature_family,file_name,device
0,/media//DC/MAP_CreationOfNewDatasetsForDFL/...,csv,49419,Heqing,device1,1_normal,block_data,e4_5f_01_68_35_e6.csv,Heqing_device1


# Get only DataSource = Sys and Device = Heqing_device2

In [5]:
# Keep only the rows that belong to the device "Heqing_device2".
single_dev = df[df["device"] == "Heqing_device2"]
# Sanity check: exactly one device value remains (this also fails if the filter matched no rows).
assert len(single_dev["device"].unique()) == 1

In [6]:
# From the single-device rows, keep only those of the "SYS_data" feature family.
single_dev_single_data_source = single_dev[single_dev["feature_family"] == "SYS_data"]
# Sanity check: exactly one feature family remains (this also fails if the filter matched no rows).
assert len(single_dev_single_data_source["feature_family"].unique()) == 1
# Display (rows, columns) of the selection.
single_dev_single_data_source.shape

(19, 9)

In [7]:
# Show one example row of the selection.
single_dev_single_data_source.head(1)

Unnamed: 0,file_path,filetype,filesize_bytes,researcher_name,device_name,experiment,feature_family,file_name,device
139,/media//DC/MAP_CreationOfNewDatasetsForDFL/...,zip,3819463330,Heqing,device2,1_normal,SYS_data,device2_normal_157min_60G.zip,Heqing_device2


In [8]:
# Count how often each file name occurs in the selection.
single_dev_single_data_source["file_name"].value_counts()

file_name
device2_normal_157min_60G.zip         1
device2_normal_83min_32.8G.zip        1
device2_ramsomware_90min_19.2G.zip    1
device2_ransomware_62min_12.7G.zip    1
device2_ransomware_90min_18.4G.zip    1
device2_thetick_2h_43.2G.zip          1
device2_thetick_2h_44.2G.zip          1
device2_bashlite_2h_451.G.zip         1
device2_Bashlite_2h_48.6G.zip         1
device2_httpbackdoors_2h_47.8G.zip    1
device2_httpbackdoor_2h_46.5G.zip     1
device2_beurk_2h_43.6G.zip            1
device2_beurk_2h_45.3G.zip            1
device2_backdoor_2h_45.3G.zip         1
device2_backdoor_2h_45.5G.zip         1
device2_bdvl_2h_44G.zip               1
device2_bdvl_2h_45.9G.zip             1
device2_xmrig_2h_20.3G.zip            1
device2_xmrig_2h_20.4G.zip            1
Name: count, dtype: int64

In [9]:
from typing import Iterator, Tuple
import zipfile
from tqdm import tqdm


def yield_log_files_from_zip(zip_file_path) -> Iterator[Tuple[str, str]]:
    """Lazily yield ``(member_name, text)`` for every ``.log`` member of a zip archive.

    Members are read one at a time, so only a single log is held in memory
    per iteration step. A tqdm progress bar tracks the number of log members.

    Args:
        zip_file_path: Archive to read; passed unchanged to ``zipfile.ZipFile``.

    Yields:
        The member's name inside the archive and its content decoded as UTF-8.
        Bytes that are not valid UTF-8 are replaced with U+FFFD instead of
        raising ``UnicodeDecodeError``, so a single damaged log does not abort
        the processing of the whole archive.
    """
    with zipfile.ZipFile(zip_file_path, 'r') as archive:
        # Collect the matching members up front so the progress bar knows its total.
        log_members = [member for member in archive.infolist() if member.filename.endswith('.log')]
        for member in tqdm(log_members, desc="Reading log files from zip", unit="files"):
            with archive.open(member) as log_file:
                yield member.filename, log_file.read().decode('utf-8', errors='replace')

In [10]:
import re


def process_log_file(content: str) -> str:
    """Extract the system-call names from one log and join them with spaces.

    A name is a run of word characters that is directly preceded by a
    whitespace character and directly followed by ``(arg0``. Names are kept in
    order of appearance, duplicates included; an input without any match
    gives the empty string.
    """
    syscall_pattern = re.compile(r'(?<=\s)(\w+)(?=\(arg0)')
    return ' '.join(match.group(1) for match in syscall_pattern.finditer(content))

In [11]:
import os
import concurrent.futures


def process_file(row, output_dir=None):
    """Convert the log files of one zip archive into a CSV of system-call strings.

    The CSV is named ``<file_name>_logs.csv`` and has one row per ``.log``
    member with the columns ``timestamp`` (the member's base name without its
    extension), ``system_calls`` (the output of ``process_log_file``) and
    ``experiment`` (copied from ``row``).

    An already existing CSV is treated as finished work and skipped. To keep
    that check trustworthy, the data is first written to a temporary file in
    the same directory and only renamed to the final name once it is complete;
    an interrupted run therefore never leaves a truncated CSV that later runs
    would skip.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``filetype``, ``file_name``, ``file_path`` and ``experiment``.
        output_dir: Directory for the CSV. Defaults to the project's data
            directory used so far.

    Raises:
        AssertionError: If ``row["filetype"]`` is not ``"zip"``.
    """
    assert row["filetype"] == "zip", "Only zip files are supported"

    if output_dir is None:
        output_dir = '/media/<User>/DC/IS_Data_Exploration_and_Feature_Engineering_for_an_IoT_Device_Behavior_Fingerprinting_Dataset/'

    csv_file_name = f"{row['file_name']}_logs.csv"
    path = Path(output_dir) / csv_file_name
    if path.exists():
        print(f"Skipping {csv_file_name}")
        return

    logs = []
    for file_name, content in yield_log_files_from_zip(row["file_path"]):
        # The member's base name without extension serves as the timestamp.
        timestamp = os.path.splitext(os.path.basename(file_name))[0]
        logs.append({
            "timestamp": timestamp,
            "system_calls": process_log_file(content),
            "experiment": row["experiment"],
        })

    # Explicit columns keep the header intact even when the archive holds no log files.
    logs_df = pd.DataFrame(logs, columns=["timestamp", "system_calls", "experiment"])

    # The temporary name does not end in "_logs.csv", so it is never mistaken for a finished
    # result; the pid keeps parallel workers from clashing.
    tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
    try:
        logs_df.to_csv(str(tmp_path), index=False)
        os.replace(str(tmp_path), str(path))
    finally:
        # Only still present if writing or renaming failed.
        if tmp_path.exists():
            tmp_path.unlink()


def main():
    """Run ``process_file`` for every selected archive in a pool of worker processes.

    One task is submitted per row of ``single_dev_single_data_source``. The
    results are then collected in completion order; calling ``result()``
    re-raises any exception that occurred inside a worker.
    """
    with concurrent.futures.ProcessPoolExecutor() as pool:
        pending = []
        for _, row in single_dev_single_data_source.iterrows():
            pending.append(pool.submit(process_file, row))
        for finished in concurrent.futures.as_completed(pending):
            finished.result()