In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [22]:
from typing import List, Union
from time import time

# Data loading

In [3]:
# Read the semicolon-delimited dataset from a local path.
file_path: str = r"D:\Obfuscation\datasets\TOKENIZER_DATASET_UPDATED.csv"
obfuscated_commands = pd.read_csv(filepath_or_buffer=file_path, delimiter=';')

In [4]:
# Every column except the last is treated as a feature; the last column is the target.
X, y = obfuscated_commands.iloc[:, :-1], obfuscated_commands.iloc[:, -1]
# Store the column labels as a plain list so the value matches its List[str]
# annotation and `[...] + original_features` concatenates instead of being
# evaluated as element-wise arithmetic on a pandas Index.
original_features: List[str] = X.columns.tolist()

## additional steps

In [6]:
def scale_data(scaler, X) -> tuple:
    """Run ``scaler.fit_transform`` on ``X`` and return the scaler with the result.

    Args:
        scaler: Object exposing ``fit_transform(X)`` (e.g. ``MinMaxScaler``).
        X: Feature matrix handed to ``fit_transform`` unchanged.

    Returns:
        A ``(scaler, X_scaled)`` tuple: the same scaler object that was passed
        in and whatever ``scaler.fit_transform(X)`` returned.
    """
    X_scaled = scaler.fit_transform(X)
    return scaler, X_scaled

In [7]:
# Scale the features with a fresh MinMaxScaler, then wrap the scaled values in
# a DataFrame that carries the original column labels.
scaler, X_scaled = scale_data(MinMaxScaler(), X)
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)

In [8]:
# The two target feature counts used when configuring the selectors.
pca_n_components: List[int] = [138, 513]

In [9]:
# decorator for measuring execution time of function
def calc_time(func):
    """Wrap ``func`` so that every call prints its duration and returns its result.

    Args:
        func: Callable to time.

    Returns:
        A wrapper with the same call signature as ``func``. The wrapper keeps
        ``func``'s ``__name__`` and ``__doc__`` and prints the elapsed time
        (whole minutes plus remaining seconds) after each call.
    """
    # Local imports keep the decorator usable even if the import cell was skipped.
    from functools import wraps
    from time import perf_counter

    @wraps(func)  # preserve the metadata of the decorated function
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is unaffected by
        # system clock adjustments made while func runs.
        start_time: float = perf_counter()  # in seconds
        result = func(*args, **kwargs)
        execution_time: float = perf_counter() - start_time
        print(f"Время выполнения функции: {execution_time // 60} min {execution_time % 60} s")
        return result
    return wrapper

In [49]:
@calc_time
def select_features(selector, X, y):
    """Fit ``selector`` on ``(X, y)`` and return the columns of ``X`` it keeps.

    ``selector`` must expose ``fit`` and ``get_support``; the value returned by
    ``get_support()`` is used to index ``X.columns``, and ``X`` is then
    subset to those column labels.
    """
    selector.fit(X, y)
    support_mask = selector.get_support()
    kept_columns = X.columns[support_mask]
    return X[kept_columns]

# Filter methods

### SelectKBest and SelectPercentile with chi-square, ANOVA F-value and information gain (IG) score functions

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif

In [12]:
# Score functions passed to the filter selectors, in the order used by the labels.
score_functions = [chi2, f_classif, mutual_info_classif]

# Labels are ordered as: feature count -> selector class -> score function.
filter_methods_labels: List[str] = [
    f"{selector_name}-{score_name}-{n_features}"
    for n_features in (138, 513)
    for selector_name in ("SelectKBest", "SelectPercentile")
    for score_name in ("chi2", "f_classif", "mutual_info_classif")
]

# Container for the output of each filter selector.
filter_methods_result: List = []

In [14]:
# with 138 features
# SelectPercentile takes a percentage, so the feature count is converted to
# its share of the original feature set.
n_components1: int = pca_n_components[0]
filter_methods_1: List = [
    SelectKBest(score_func=score_fn, k=n_components1)
    for score_fn in score_functions
] + [
    SelectPercentile(
        score_func=score_fn,
        percentile=n_components1 / len(original_features) * 100,
    )
    for score_fn in score_functions
]

# with 513 features
n_components2: int = pca_n_components[1]
filter_methods_2: List = [
    SelectKBest(score_func=score_fn, k=n_components2)
    for score_fn in score_functions
] + [
    SelectPercentile(
        score_func=score_fn,
        percentile=n_components2 / len(original_features) * 100,
    )
    for score_fn in score_functions
]

In [15]:
# Start from an empty list so that re-running this cell cannot append a second
# batch of results and push the list out of step with filter_methods_labels.
filter_methods_result = []
for selector in filter_methods_1 + filter_methods_2:
    selected_features = select_features(selector, X_scaled_df.copy(), y)
    filter_methods_result.append(selected_features)

Время выполнения функции: 0.0 min 4.826271057128906 s
Время выполнения функции: 0.0 min 4.880026817321777 s
Время выполнения функции: 7.0 min 30.136009693145752 s
Время выполнения функции: 0.0 min 1.3903450965881348 s
Время выполнения функции: 0.0 min 3.3752481937408447 s
Время выполнения функции: 7.0 min 29.62198281288147 s
Время выполнения функции: 0.0 min 1.1715965270996094 s
Время выполнения функции: 0.0 min 2.729160785675049 s
Время выполнения функции: 7.0 min 30.637426376342773 s
Время выполнения функции: 0.0 min 1.1071209907531738 s
Время выполнения функции: 0.0 min 2.3635640144348145 s
Время выполнения функции: 7.0 min 32.271228313446045 s


In [16]:
# Print the .shape of every entry collected in filter_methods_result.
for selected_features in filter_methods_result:
    print(selected_features.shape)

(29730, 138)
(29730, 138)
(29730, 138)
(29730, 138)
(29730, 138)
(29730, 138)
(29730, 513)
(29730, 513)
(29730, 513)
(29730, 513)
(29730, 513)
(29730, 513)


# Wrapper methods

In [40]:
from sklearn.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [41]:
# Estimator classes used with the wrapper selectors.
estimators = [LogisticRegression, GaussianNB, SVC]

# Labels are ordered as: estimator -> wrapper method -> feature count.
wrapper_methods_labels: List[str] = [
    f"{method_name}-{estimator_name}-{n_features}"
    for estimator_name in ("LR", "GaussianNB", "SVC")
    for method_name in ("SFS", "RFE")
    for n_features in (138, 513)
]

# Container for the output of each wrapper selector.
wrapper_methods_result: List = []

In [43]:
# estimator = LogisticRegression (LR)
# Forward SFS for each feature count, followed by RFE for each feature count;
# every selector receives its own estimator instance.
wrapper_methods_LR: List = [
    SFS(
        estimator=LogisticRegression(solver="saga", max_iter=5000, n_jobs=-1, random_state=42),
        n_features_to_select=n_features,
        direction="forward",
        scoring="accuracy",
        n_jobs=-1,
    )
    for n_features in pca_n_components
] + [
    RFE(
        estimator=LogisticRegression(solver="saga", max_iter=5000, n_jobs=-1, random_state=42),
        n_features_to_select=n_features,
    )
    for n_features in pca_n_components
]

In [48]:
# estimator = Gaussian Naive Bayes (GaussianNB)
# NOTE(review): RFE with its default importance_getter needs the fitted
# estimator to expose coef_ or feature_importances_; GaussianNB appears to
# provide neither, so the two RFE selectors below are expected to fail in
# fit() - confirm and replace or drop them.
wrapper_methods_GNB: List = [
    SFS(
        estimator=GaussianNB(),
        n_features_to_select=n_features,
        direction="forward",
        scoring="accuracy",
        n_jobs=-1,
    )
    for n_features in pca_n_components
] + [
    RFE(
        estimator=GaussianNB(),
        n_features_to_select=n_features,
    )
    for n_features in pca_n_components
]

In [47]:
# estimator = Support Vector Classifier (SVC) from SVM
wrapper_methods_SVC: List = [
    # SFS only needs cross-validated scores, so the RBF kernel works here.
    SFS(estimator=SVC(gamma='auto'),
        n_features_to_select=n_components,
        direction="forward",
        scoring="accuracy",
        n_jobs=-1) for n_components in pca_n_components
] + [
    # RFE ranks features through the fitted estimator's coef_ (or
    # feature_importances_). SVC exposes coef_ only with a linear kernel, so
    # the default RBF kernel would make RFE.fit raise.
    RFE(estimator=SVC(kernel="linear"),
        n_features_to_select=n_components) for n_components in pca_n_components
]

In [None]:
# Start from an empty list so that re-running this cell cannot duplicate
# entries and misalign the results with wrapper_methods_labels.
wrapper_methods_result = []
for selector in wrapper_methods_LR + wrapper_methods_GNB + wrapper_methods_SVC:
    selected_features = select_features(selector, X_scaled_df.copy(), y)
    # Store the returned selection, not the select_features function object.
    wrapper_methods_result.append(selected_features)

In [None]:
# Print the .shape of every entry collected in wrapper_methods_result.
for selected_features in wrapper_methods_result:
    print(selected_features.shape)

# Embedded methods

In [50]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import ElasticNet

In [51]:
# Dedicated containers for the embedded methods. Rebinding the
# wrapper_methods_* names here would discard the wrapper labels and results
# before they are written to the CSV file.
embedded_methods_labels: List[str] = [
]

embedded_methods_result: List[List[str]] = []

In [None]:
# Random forest classifier with a fixed seed, using all available cores.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
# Lasso (least absolute shrinkage and selection operator)
# NOTE(review): Lasso is a regression estimator - confirm it is meant to be
# fit on the class labels used elsewhere in this notebook.
lasso = Lasso(max_iter=5000, random_state=42)

In [None]:
# Ridge classifier fitted with the SAGA solver and a fixed seed.
ridge = RidgeClassifier(random_state=42, max_iter=5000, solver="saga")

In [None]:
# ElasticNet
# NOTE(review): ElasticNet is a regression estimator - confirm it is meant to
# be fit on the class labels used elsewhere in this notebook.
elastic = ElasticNet(max_iter=5000, random_state=42)

# Writing to CSV file

In [25]:
import csv

# Destination CSV file; the header and every result row are passed to write_csv with this path.
filename: str = r"D:\Obfuscation\data\selection_result.csv"

In [37]:
def write_csv(filename: str, row: List[Union[str, int, float]]) -> None:
    """Append ``row`` as a single CSV record to ``filename``.

    The file is opened in append mode with UTF-8 encoding, so rows written by
    earlier calls are kept. ``newline=''`` leaves line-ending handling to the
    ``csv`` writer.
    """
    with open(filename, mode='a', newline='', encoding="UTF8") as csv_file:
        csv.writer(csv_file).writerow(row)

In [38]:
# writing header
# list(...) forces plain list concatenation: adding a list to a pandas Index
# is evaluated as element-wise arithmetic rather than prepending a column name.
header: List[str] = ["Method_with_parameters"] + list(original_features)
write_csv(filename, row=header)

### Processing filter methods result  

In [39]:
# One CSV row per filter method: the method label followed by a '1'/'0' flag
# for every original feature, in the original feature order.
for k, df in enumerate(filter_methods_result):
    selected_features = set(df.columns)
    # The flags are built separately from the label so that flag j always sits
    # at row[j + 1]; writing into row[j] would overwrite the label when j == 0
    # and shift every flag one column to the left.
    flags: List[str] = [
        '1' if feature in selected_features else '0'
        for feature in original_features
    ]
    row: List[str] = [filter_methods_labels[k]] + flags
    write_csv(filename, row)

### Processing wrapper methods result

In [None]:
# One CSV row per wrapper method: the method label followed by a '1'/'0' flag
# for every original feature, in the original feature order.
for k, df in enumerate(wrapper_methods_result):
    selected_features = set(df.columns)
    # The flags are built separately from the label so that flag j always sits
    # at row[j + 1]; writing into row[j] would overwrite the label when j == 0
    # and shift every flag one column to the left.
    flags: List[str] = [
        '1' if feature in selected_features else '0'
        for feature in original_features
    ]
    row: List[str] = [wrapper_methods_labels[k]] + flags
    write_csv(filename, row)

### Processing embedded methods result