# Idea

Read all the csv files containing the features + their importance score

Select the top feature and store them somewhere

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

import sys
repo_base_path = Path("./../").resolve()

output_dir = repo_base_path / "feature_selection" / "result"

assert str(repo_base_path).endswith("csg_is"), f"{repo_base_path} is not a valid path to the CSG_IS repository" 

sys.path.append(str(repo_base_path))

from py_dataset import get_all_files_df
from py_dataset import read_in_files
from py_dataset import feature_plotting
from py_dataset import feature_selection

In [2]:
files = list(output_dir.glob("*.csv"))
files

[PosixPath('/home/<User>/repos/csg_is/feature_selection/result/network_datax960s_Heqing_device2_featurescores.csv'),
 PosixPath('/home/<User>/repos/csg_is/feature_selection/result/network_datax40s_Heqing_device2_featurescores.csv'),
 PosixPath('/home/<User>/repos/csg_is/feature_selection/result/block_data_Heqing_device2_featurescores.csv'),
 PosixPath('/home/<User>/repos/csg_is/feature_selection/result/RES_data_Heqing_device2_featurescores_ROBUST.csv'),
 PosixPath('/home/<User>/repos/csg_is/feature_selection/result/SYS_dataSystemVersion.old-bow_Heqing_device2_featurescores.csv'),
 PosixPath('/home/<User>/repos/csg_is/feature_selection/result/block_data_Heqing_device2_featurescores_ROBUST.csv'),
 PosixPath('/home/<User>/repos/csg_is/feature_selection/result/KERN_data_Heqing_device2_featurescores.csv'),
 PosixPath('/home/<User>/repos/csg_is/feature_selection/result/SYS_dataSystemVersion.TWOxGRAM_Heqing_device2_featurescores.csv'),
 PosixPath('/home/<User>/repos/csg_is/feature_selection/r

In [3]:
import os

dfs = []
for file in files:
    infos = file.stem.split("_")

    df = pd.read_csv(str(file))
    df["data_source"] = "_".join(infos[0:2])
    df["device"] = "_".join(infos[2:4])
    df["is_robust"] = True if len(infos) > 5 and infos[5] == "ROBUST" else False

    dfs.append(df)

df = pd.concat(dfs)
df

Unnamed: 0,chi2,f_classif,mutual_info_classif,feature,label,data_source,device,is_robust
0,0.024650,0.390793,0.018838,(OLD) PacketCount,1_normal,network_datax960s,Heqing_device2,False
1,0.000563,0.008145,0.110309,(OLD) TotalLength,1_normal,network_datax960s,Heqing_device2,False
2,0.015181,0.553157,0.072971,(OLD) AverageLength,1_normal,network_datax960s,Heqing_device2,False
3,0.014969,0.518494,0.000000,(OLD) MedianLength,1_normal,network_datax960s,Heqing_device2,False
4,0.005461,0.251820,0.007398,(OLD) MinLength,1_normal,network_datax960s,Heqing_device2,False
...,...,...,...,...,...,...,...,...
202,51.176344,380.283392,0.045646,writeback:writeback_dirty_page,3_thetick,KERN_data,Heqing_device2,True
203,0.000370,0.012030,0.000000,writeback:writeback_mark_inode_dirty,3_thetick,KERN_data,Heqing_device2,True
204,30.086652,68.701713,0.008235,writeback:writeback_single_inode,3_thetick,KERN_data,Heqing_device2,True
205,30.259659,69.065660,0.008695,writeback:writeback_write_inode,3_thetick,KERN_data,Heqing_device2,True


In [4]:
print(df["device"].unique())
df = df[~df["is_robust"]]

df = df.drop(columns=["is_robust", "device"])

df

['Heqing_device2']


Unnamed: 0,chi2,f_classif,mutual_info_classif,feature,label,data_source
0,0.024650,0.390793,0.018838,(OLD) PacketCount,1_normal,network_datax960s
1,0.000563,0.008145,0.110309,(OLD) TotalLength,1_normal,network_datax960s
2,0.015181,0.553157,0.072971,(OLD) AverageLength,1_normal,network_datax960s
3,0.014969,0.518494,0.000000,(OLD) MedianLength,1_normal,network_datax960s
4,0.005461,0.251820,0.007398,(OLD) MinLength,1_normal,network_datax960s
...,...,...,...,...,...,...
1039,8.739920,72.296545,0.024950,writeback:writeback_single_inode_start,9_xmrig,FLS_data
1040,2.654147,29.874139,0.012072,writeback:writeback_start,9_xmrig,FLS_data
1041,8.899036,69.082521,0.028515,writeback:writeback_write_inode,9_xmrig,FLS_data
1042,8.899036,69.082521,0.029355,writeback:writeback_write_inode_start,9_xmrig,FLS_data


# Remove unnecessary Sys and network with 960s

In [5]:
sys_bow_features = df[df["data_source"]=="SYS_dataSystemVersion.old-bow"]["feature"].unique()

mask = (df["data_source"]=="SYS_dataSystemVersion.TWOxGRAM") & (df["feature"].isin(sys_bow_features))
len(sys_bow_features), mask.value_counts()

df = df[~mask]

In [6]:
mask = df["data_source"]=="SYS_dataSystemVersion.OLDxBOWxWithoutMinMax"
print(df[mask].shape)

df = df[~mask]

(1431, 6)


In [7]:
df = df[~((df["data_source"]=="SYS_dataSystemVersion.TWOxGRAM") | (df["data_source"]=="network_datax960s"))]

# Store Away top k

In [24]:
top_k = 200

for label, group in df.groupby("label"):
    largest_chi = group.nlargest(top_k, columns=["chi2"])#[["chi2", "feature", "data_source"]]
    largest_f_classif = group.nlargest(top_k, columns=["f_classif"])#[["f_classif", "feature", "data_source"]]
    largest_mutual_info_classif = group.nlargest(top_k, columns=["mutual_info_classif"])#[["mutual_info_classif", "feature", "data_source"]]

    # largest_chi["score_type"] = "chi2"
    # largest_chi.rename(columns={"chi2": "score"}, inplace=True)

    # largest_f_classif["score_type"] = "f_classif"
    # largest_f_classif.rename(columns={"f_classif": "score"}, inplace=True)

    # largest_mutual_info_classif["score_type"] = "mutual_info_classif"
    # largest_mutual_info_classif.rename(columns={"mutual_info_classif": "score"}, inplace=True)

    largest = pd.concat([largest_chi, largest_f_classif, largest_mutual_info_classif])


In [25]:
largest.shape

(600, 6)

In [26]:
largest_set = set(largest["feature"].to_list())
largest_set, len(largest_set)

({'(OLD) AverageLength',
  '(OLD) VarianceLength',
  'L1-dcache-load-misses',
  'L1-dcache-loads',
  'L1-dcache-stores',
  'L1-icache-load-misses',
  'L1-icache-loads',
  'LLC-load-misses',
  'LLC-loads',
  'LLC-store-misses',
  'LLC-stores',
  'accept4',
  'access',
  'armv7_cortex_a15/br_mis_pred/',
  'armv7_cortex_a15/br_pred/',
  'armv7_cortex_a15/bus_cycles/',
  'armv7_cortex_a15/bus_cycles/.1',
  'armv7_cortex_a15/cpu_cycles/',
  'armv7_cortex_a15/exc_return/',
  'armv7_cortex_a15/exc_taken/',
  'armv7_cortex_a15/l1d_cache/',
  'armv7_cortex_a15/l1d_cache_refill/',
  'armv7_cortex_a15/l1d_tlb_refill/',
  'armv7_cortex_a15/l1i_cache/',
  'armv7_cortex_a15/l1i_cache_refill/',
  'armv7_cortex_a15/l1i_tlb_refill/',
  'armv7_cortex_a15/l2d_cache/',
  'armv7_cortex_a15/l2d_cache_wb/',
  'armv7_cortex_a15/mem_access/',
  'block:block_bio_backmerge',
  'block:block_bio_queue',
  'block:block_bio_remap',
  'block:block_dirty_buffer',
  'block:block_getrq',
  'block:block_rq_complete',
  '

In [26]:
import pickle

path = repo_base_path / "data_merg" / f"top_{len(largest_set)}_features.pkl"

with open(path, "wb") as f:
    pickle.dump(largest_set, f)