In [1]:
# import libraries

import numpy as np
import pandas as pd
import pickle

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [2]:
# defining paths to files

ORIGINAL_TOKENIZER_DATASET_FILENAME: str = r"D:\Obfuscation\data\datasets\TOKENIZER_DATASET.csv"
TOKENIZER_DATASET_FILENAME: str = r"D:\Obfuscation\data\datasets\TOKENIZER_DATASET_WITH_SHORT_CLEAR_COMMANDS.csv"
TRAIN_DATASET_FILENAME: str = r"D:\Obfuscation\data\datasets\balanced\BALANCED_DATASET_770_WITHOUT_CMD.csv"

In [3]:
# import datasets

original_tokenizer_df: pd.DataFrame = pd.read_csv(ORIGINAL_TOKENIZER_DATASET_FILENAME)
tokenizer_df: pd.DataFrame = pd.read_csv(TOKENIZER_DATASET_FILENAME)
train_df: pd.DataFrame = pd.read_csv(TRAIN_DATASET_FILENAME)

In [4]:
def show_info(dataframe: pd.DataFrame) -> None:
    """printing some characteristics about dataframe"""
    print(dataframe.shape)
    print(dataframe.head())

In [5]:
show_info(tokenizer_df)

(29730, 2712)
   index_in_powershell_dataset  \
0                            0   
1                            1   
2                            2   
3                            3   
4                            4   

                                  command_obfuscated  \
0                                    $PSVersionTable   
1  $UserCredential = Get-Credential $Session = Ne...   
2  $members = Import-CSV c:itadd-to-group.csv | S...   
3  $os = Get-WmiObject win32_operatingsystem $upt...   
4  Add-ADGroupMember -Identity group-name -Member...   

   AstGroupedArrayElementRangeCounts_0-10_Count  \
0                                           0.0   
1                                           0.0   
2                                           0.0   
3                                           0.0   
4                                           1.0   

   AstGroupedArrayElementRangeCounts_0-10_Percent  \
0                                             0.0   
1                              

In [6]:
show_info(train_df)

(10728, 2712)
   index_in_powershell_dataset  \
0                            0   
1                            1   
2                            2   
3                            3   
4                            4   

                                  command_obfuscated  \
0                                    $PSVersionTable   
1  $UserCredential = Get-Credential $Session = Ne...   
2  $members = Import-CSV c:itadd-to-group.csv | S...   
3  $os = Get-WmiObject win32_operatingsystem $upt...   
4  Add-ADGroupMember -Identity group-name -Member...   

   AstGroupedArrayElementRangeCounts_0-10_Count  \
0                                           0.0   
1                                           0.0   
2                                           0.0   
3                                           0.0   
4                                           1.0   

   AstGroupedArrayElementRangeCounts_0-10_Percent  \
0                                             0.0   
1                              

In [7]:
show_info(original_tokenizer_df)

(29730, 5001)
                                       command_clear  \
0                                    $PSVersionTable   
1  $UserCredential = Get-Credential $Session = Ne...   
2  $members = Import-CSV c:itadd-to-group.csv | S...   
3  $os = Get-WmiObject win32_operatingsystem $upt...   
4  Add-ADGroupMember -Identity group-name -Member...   

                                  command_obfuscated  \
0                                    $PSVersionTable   
1  $UserCredential = Get-Credential $Session = Ne...   
2  $members = Import-CSV c:itadd-to-group.csv | S...   
3  $os = Get-WmiObject win32_operatingsystem $upt...   
4  Add-ADGroupMember -Identity group-name -Member...   

   AstGroupedArrayElementRangeCounts_0-10_Count  \
0                                           0.0   
1                                           0.0   
2                                           0.0   
3                                           0.0   
4                                           1.0   

   As

In [8]:
class MakerShortCommandsClear(BaseEstimator, TransformerMixin):
    
    def __init__(self, tokenizer_df: pd.DataFrame) -> None:
        super().__init__()
        self.__tokenizer_df: pd.DataFrame = tokenizer_df
    
    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "MakerShortCommandsClear":
        return self
    
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X["obfuscated"] = self.__tokenizer_df["obfuscated"].copy()
        return X
    
    
class CommandsEncodingFixer(BaseEstimator, TransformerMixin):
    
    def __init__(self, tokenizer_df: pd.DataFrame) -> None:
        super().__init__()
        self.__tokenizer_df: pd.DataFrame = tokenizer_df
    
    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "CommandsEncodingFixer":
        return self
    
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        X["command_obfuscated"] = self.__tokenizer_df["command_obfuscated"].copy()
        return X
    
    
class TrainObjectsRemover(BaseEstimator, TransformerMixin):
    
    def __init__(self, tokenizer_df: pd.DataFrame, train_df: pd.DataFrame) -> None:
        super().__init__()
        self.__tokenizer_df: pd.DataFrame = tokenizer_df
        self.__train_df: pd.DataFrame = train_df
            
    def __get_remaining_objects_indexes(self) -> list[int]:
        """substruction one dataset from another"""
        tokenizer_indexes, train_indexes = self.__tokenizer_df["index_in_powershell_dataset"], self.__train_df["index_in_powershell_dataset"]  # index in source dataset
        train_ptr = result_ptr = 0
        remaining_objects_indexes: list[int] = [0] * (len(self.__tokenizer_df) - len(self.__train_df))
        for tokenizer_ptr in np.arange(len(self.__tokenizer_df)):
            if tokenizer_indexes[tokenizer_ptr] == train_indexes[train_ptr]:
                train_ptr += 1
            else:
                remaining_objects_indexes[result_ptr] = tokenizer_ptr
                result_ptr += 1
        return remaining_objects_indexes
    
    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "TrainObjectsRemover":
        return self
    
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        indexes: list[int] = self.__get_remaining_objects_indexes()
        X = X.iloc[indexes, :].reset_index(drop=True)
        return X

In [9]:
class CommandsDropper(BaseEstimator, TransformerMixin):
    
    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "CommandsDropper":
        return self
    
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        return X.drop(["command_clear", "command_obfuscated"], axis=1)
    
    
class UselessFeaturesDropper(BaseEstimator, TransformerMixin):
    
    def __init__(self, original_tokenizer_df: pd.DataFrame) -> None:  # receive original tokenizer df because we need the same dimensionality
        super().__init__()
        self.__original_tokenizer_df: pd.DataFrame = original_tokenizer_df
    
    def __get_features_to_drop(self) -> list[str]:
        features_to_drop: list[str] = [feature for feature in self.__original_tokenizer_df.columns if len(self.__original_tokenizer_df[feature].unique()) == 1]
        return features_to_drop        
    
    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "UselessFeaturesDropper":
        return self
    
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        features_to_drop: list[str] = self.__get_features_to_drop()
        return X.drop(features_to_drop, axis=1)
    

class TargetValuesDropper(BaseEstimator, TransformerMixin):
    
    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "TargetValuesDropper":
        return self
    
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        return X.drop(["obfuscated"], axis=1)

In [10]:
from typing import Union


class Selector(BaseEstimator, TransformerMixin):
    
    def __init__(self, all_commands: pd.Series) -> None:
        super().__init__()
        self._all_commands: pd.Series = all_commands
            
    def get_indexes(self) -> list[int]:
        pass
    
    def fit(self, X: pd.DataFrame, y: pd.Series = None) -> "Selector":
        return self
    
    def transform(self, X: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
        pass

class CmdCommandsSelector(Selector):
    
    def __init__(self, all_commands: pd.Series) -> None:
        super().__init__(all_commands)
    
    def get_indexes(self) -> list[int]:
        indexes: list[int] = []
        for i, command in enumerate(self._all_commands):
            if "cmd" not in command:
                continue
            indexes.append(i)
        return indexes
    
    def transform(self, X: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
        indexes: list[int] = self.get_indexes()
        return X.iloc[indexes, :].reset_index(drop=True) if type(X) is pd.DataFrame else X[indexes].reset_index(drop=True)
    
    
class PslCommandsSelector(Selector):
    
    def __init__(self, all_commands: pd.Series) -> None:
        super().__init__(all_commands)
    
    def get_indexes(self) -> list[int]:
        cmd_indexes: list[int] = CmdCommandsSelector(self._all_commands).get_indexes()
        psl_indexes: list[int] = [0] * (len(self._all_commands) - len(cmd_indexes))
        cmd_ptr = psl_ptr = 0
        for command_index in np.arange(len(self._all_commands)):   
            if command_index == cmd_indexes[cmd_ptr]:
                cmd_ptr += 1
            else:
                psl_indexes[psl_ptr] = command_index
                psl_ptr += 1
        return psl_indexes
    
    def transform(self, X: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
        indexes: list[int] = self.get_indexes()
        return X.iloc[indexes, :].reset_index(drop=True) if type(X) is pd.DataFrame else X[indexes].reset_index(drop=True)


### Handle score results

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score


class ScoreHandler:
    
    def __init__(self, model, X: pd.DataFrame, y_true: pd.Series, commands: pd.Series) -> None:
        self.__model = model
        self.__X: pd.DataFrame = X
        self.__y_true: pd.Series = y_true
        self.__commands: pd.Series = commands
        self.__y_pred: np.ndarray = None
            
    def set_parameters(self, X: pd.DataFrame, y_true: pd.Series, commands: pd.Series = None) -> None:   
        n_features: int = X.shape[1]
        if n_features != self.__model.n_features_in_ or len(X) != len(y_true):
            return
        if commands is not None and len(commands) == len(X):
            self.__commands = commands
        self.__X = X
        self.__y_true = y_true
        
    def show_params_dimensionality(self) -> None:
        print(f"X: {self.__X.shape} || y_true: {self.__y_true.shape} || commands: {self.__commands.shape}")
        
    def calculate_score(self) -> str:
        self.__y_pred = self.__model.predict(self.__X)
        
        accuracy: float = np.round(accuracy_score(y_true=self.__y_true, y_pred=self.__y_pred), 5)  # true predictions / all predictions
        precision: float = np.round(precision_score(y_true=self.__y_true, y_pred=self.__y_pred), 5)  # true positive predictions / all positive predictions
        recall: float = np.round(recall_score(y_true=self.__y_true, y_pred=self.__y_pred), 5)  # true positive predictions / all positive labels
        
        return f"Model: {self.__model} || Accuracy: {accuracy} || Precision: {precision} || Recall: {recall}"
    
    def __compare_prediction_result(self) -> list[str]:
        comparison_result: list[str] = ['+' if self.__y_pred[i] == self.__y_true[i] else '-' for i in np.arange(len(self.__y_pred))]
        return comparison_result
    
    def __build_dataframe(self) -> pd.DataFrame:
        result: pd.DataFrame = pd.DataFrame({
            "command": self.__commands,
            "y_true": self.__y_true,
            "y_pred": self.__y_pred,
            "is_equal": self.__compare_prediction_result()
        })
        return result
    
    @staticmethod
    def save_to_csv(dataframe: pd.DataFrame, csv_filename: str, index: bool = False, header: bool = True) -> None:
        """writing dataframe to csv_filename"""
        dataframe.to_csv(
            path_or_buf=csv_filename, 
            sep=',',
            encoding='UTF8',
            index=index,
            header=header
        )
    
    def handle(self, csv_filename: str = None) -> pd.DataFrame:
        score_result: str = self.calculate_score()
        score_result_extended: pd.DataFrame = self.__build_dataframe()
        
        print(score_result)
        if csv_filename is not None:
            self.save_to_csv(dataframe=score_result_extended, csv_filename=csv_filename)
        return score_result_extended

### Preprocessing and dividing original tokenizer dataset

In [12]:
preprocessing_pipeline: Pipeline = Pipeline([
    ("maker_short_commands_clear", MakerShortCommandsClear(tokenizer_df)),
    ("commands_encoding_fixer", CommandsEncodingFixer(tokenizer_df)),
    ("train_objects_remover", TrainObjectsRemover(tokenizer_df, train_df)),
])
    
dropper_pipeline: Pipeline = Pipeline([
    ("target_values_dropper", TargetValuesDropper()),
    ("commands_dropper", CommandsDropper()),
    ("useless_features_dropper", UselessFeaturesDropper(original_tokenizer_df)),
])

In [13]:
original_tokenizer_df_processed: pd.DataFrame = preprocessing_pipeline.fit_transform(original_tokenizer_df)
target: pd.Series = original_tokenizer_df_processed["obfuscated"]
commands: pd.Series = original_tokenizer_df_processed["command_obfuscated"]

In [14]:
X: pd.DataFrame = dropper_pipeline.fit_transform(original_tokenizer_df_processed)

## Defining CSV filepaths to save score results

In [15]:
# Multinomial Naive Bayes

cmd_mnb_filename: str = r"D:/Obfuscation/data/datasets/additional/tests/mnb/cmd.csv"
others_in_dataset_mnb_filename: str = r"D:/Obfuscation/data/datasets/additional/tests/mnb/others_in_dataset.csv"
poor_obfuscated_mnb_filename: str = r"D:/Obfuscation/data/datasets/additional/tests/mnb/poor_obfuscated.csv"
cyrillic_mnb_filename: str = "D:/Obfuscation/data/datasets/additional/tests/mnb/cyrillic.csv"

In [16]:
# Gaussian Naive Bayes
    
cmd_gnb_filename: str = r"D:/Obfuscation/data/datasets/additional/tests/gnb/cmd.csv"
others_in_dataset_gnb_filename: str = r"D:/Obfuscation/data/datasets/additional/tests/gnb/others_in_dataset.csv"
poor_obfuscated_gnb_filename: str = r"D:/Obfuscation/data/datasets/additional/tests/gnb/poor_obfuscated.csv"
cyrillic_gnb_filename: str = "D:/Obfuscation/data/datasets/additional/tests/gnb/cyrillic.csv"

In [17]:
# Logistic Regression

cmd_logreg_filename: str = r"D:/Obfuscation/data/datasets/additional/tests/logreg/cmd.csv"
others_in_dataset_logreg_filename: str = r"D:/Obfuscation/data/datasets/additional/tests/logreg/others_in_dataset.csv"
poor_obfuscated_logreg_filename: str = r"D:/Obfuscation/data/datasets/additional/tests/logreg/poor_obfuscated.csv"
cyrillic_logreg_filename: str = "D:/Obfuscation/data/datasets/additional/tests/logreg/cyrillic.csv"

In [18]:
# K-Nearest Neighbors

cmd_knn_filename: str = r"D:/Obfuscation/data/datasets/additional/tests/knn/cmd.csv"
others_in_dataset_knn_filename: str = r"D:/Obfuscation/data/datasets/additional/tests/knn/others_in_dataset.csv"
poor_obfuscated_knn_filename: str = r"D:/Obfuscation/data/datasets/additional/tests/knn/poor_obfuscated.csv"
cyrillic_knn_filename: str = "D:/Obfuscation/data/datasets/additional/tests/knn/cyrillic.csv"

## Defining CSV filepaths for additional test datasets

In [19]:
poor_obfuscated_dataset_filename: str = r"D:/Obfuscation/data/datasets/additional/data/POOR_OBFUSCATED_DATASET.csv"
poor_obfuscated_tokenizer_dataset_filename: str = r"D:/Obfuscation/data/datasets/additional/data/POOR_OBFUSCATED_TOKENIZER_DATASET.csv"

cyrillic_dataset_filename: str = r"D:/Obfuscation/data/datasets/additional/data/CYRILLIC_DATASET.csv"
cyrillic_tokenizer_dataset_filename: str = r"D:/Obfuscation/data/datasets/additional/data/CYRILLIC_TOKENIZER_DATASET.csv"

## Selecting Pipeline (for cmd/psl commands from dataset)

In [20]:
def select_data(SelectorClass: Selector, X: pd.DataFrame, commands: pd.Series, target: pd.Series) -> tuple[pd.DataFrame, pd.Series, pd.Series]:
    """
    Select data from dataset according to command type: cmd or others(psl)
    Returns: X_selected, commands_selected, target_selected
    """
    selector: Selector = SelectorClass(commands)
    pipeline: Pipeline = Pipeline([
        ("selector", selector),
        ("min_max_scaler", MinMaxScaler()),
    ])
        
    X_selected: pd.DataFrame = pd.DataFrame(pipeline.fit_transform(X), columns=X.columns)
    commands_selected: pd.Series = selector.fit_transform(commands)
    target_selected: pd.Series = selector.fit_transform(target)
        
    return X_selected, commands_selected, target_selected

In [21]:
# Cmd commands

X_cmd, commands_cmd, target_cmd = select_data(CmdCommandsSelector, X, commands, target)

In [22]:
# Other(psl) commands in remaining objects(no cmd)

X_psl, commands_psl, target_psl = select_data(PslCommandsSelector, X, commands, target)

### Generating and saving poor obfuscated commands

In [23]:
def get_clear_commands(tokenizer_df: pd.DataFrame) -> list[str]:
    """get real clear commands (which are with no one obfuscation technique)"""
    return tokenizer_df[tokenizer_df["obf_methods_combination_number"] == 0]["command_obfuscated"].to_list()

In [24]:
clear_commands: list[str] = get_clear_commands(tokenizer_df)

In [25]:
commands_length: list[int] = [len(command) for command in clear_commands]
print(f"Min length = {min(commands_length)} || Mean length = {sum(commands_length) / len(clear_commands)} || Max length = {max(commands_length)}")

Min length = 2 || Mean length = 100.27333333333333 || Max length = 22017


In [83]:
# 1 char will be inserted randomly ` for every 20 symbols in command if length of command more or equal then 20 else just add 1 char `

def obfuscate(clear_commands: list[str], commands_length: list[int], symbols: int = 20, char: str = '`') -> list[str]:
    obfuscated_command: list[str] = ["" for _ in range(len(clear_commands))]
    for i, command in enumerate(clear_commands):
        parts_quantity: int = commands_length[i] // symbols + 1
        command_slices: list[str] = ['' for _ in range(parts_quantity + 1)]
        prev_index: int = 0
        for j in range(parts_quantity):
            index: int = min(symbols * j + np.random.randint(0, min(symbols, len(command))), len(command))
            command_slices[j] = command[prev_index:index]
            prev_index = index
        command_slices[-1] = command[prev_index:]
        obfuscated_command[i] = char.join(command_slices)
    return obfuscated_command

In [84]:
poor_obfuscated_commands: list[str] = obfuscate(clear_commands, commands_length)
poor_obfuscated_target: list[int] = [1 for _ in range(len(clear_commands))]

In [85]:
# create dataframe of poor_obfuscated commands

poor_obfuscated_df: pd.DataFrame = pd.DataFrame({
    "command_clear": clear_commands,
    "command_obfuscated": poor_obfuscated_commands,
    "obfuscated": poor_obfuscated_target,
})

In [86]:
print(poor_obfuscated_df.shape)
poor_obfuscated_df.head()

(1650, 3)


Unnamed: 0,command_clear,command_obfuscated,obfuscated
0,$PSVersionTable,$PSVersion`Table,1
1,$UserCredential = Get-Credential $Session = Ne...,`$UserCredential = Get-Credential `$Session = ...,1
2,$members = Import-CSV c:itadd-to-group.csv | S...,$members = Import-C`SV c`:itadd-to-group.cs`v ...,1
3,$os = Get-WmiObject win32_operatingsystem $upt...,$os` = Get-WmiObject win32_op`eratingsystem $u...,1
4,Add-ADGroupMember -Identity group-name -Member...,Ad`d-ADGroupMember -Identity group-name` -Memb...,1


In [87]:
# save to csv

ScoreHandler.save_to_csv(dataframe=poor_obfuscated_df, csv_filename=poor_obfuscated_dataset_filename)

### Generating and saving commands with cyrillic symbols

In [61]:
clear_commands_cyrillic: list[str] = [
    "Powershell -command ""Write-Host 'Мой голос это мой паспорт, верифицируй меня.'""",
    "Write-host 'Привет, мир!'",
    "powershell -noProfile -nonInteractive -WindowsStyle Hidden -Execution bypass Write-host ""Старайся сильнее""",
    "Write-host 'Мой голос это мой паспорт, верифицируй меня.'",
    "Write-host 'Обфуска́ция или запутывание кода — приведение исходного текста или исполняемого кода программы к виду, сохраняющему её функциональность, но затрудняющему анализ, понимание алгоритмов работы и модификацию при декомпиляции.'",
    "Write-host 'Машинное обучение — класс методов искусственного интеллекта, характерной чертой которых является не прямое решение задачи, а обучение за счёт применения решений множества сходных задач. Для построения таких методов используются средства математической статистики, численных методов, математического анализа, методов оптимизации, теории вероятностей, теории графов, различные техники работы с данными в цифровой форме.'",
    "Write-host 'Красивое лучше, чем уродливое. Явное лучше, чем неявное. Простое лучше, чем сложное. Сложное лучше, чем запутанное. Плоское лучше, чем вложенное. Разреженное лучше, чем плотное. Читаемость имеет значение. Особые случаи не настолько особые, чтобы нарушать правила. При этом практичность важнее безупречности. Ошибки никогда не должны замалчиваться. Если они не замалчиваются явно. Встретив двусмысленность, отбрось искушение угадать. Должен существовать один и, желательно, только один очевидный способ сделать это.'",
    "Write-host 'Красивое лучше, чем уродливое. Явное лучше, чем неявное. Простое лучше, чем сложное. Сложное лучше, чем запутанное. Плоское лучше, чем вложенное. Разреженное лучше, чем плотное. Читаемость имеет значение. Особые случаи не настолько особые, чтобы нарушать правила. При этом практичность важнее безупречности. Ошибки никогда не должны замалчиваться. Если они не замалчиваются явно. Встретив двусмысленность, отбрось искушение угадать. Должен существовать один и, желательно, только один очевидный способ сделать это. Хотя он поначалу может быть и не очевиден, если вы не голландец. Сейчас лучше, чем никогда. Хотя никогда зачастую лучше, чем прямо сейчас. Если реализацию сложно объяснить — идея плоха. Если реализацию легко объяснить — идея, возможно, хороша. Пространства имён — отличная штука! Будем делать их больше!'",
]

target_cyrillic: list[int] = [0] * len(clear_commands_cyrillic)

cyrillic_df: pd.DataFrame = pd.DataFrame({
    "command_clear": clear_commands_cyrillic,
    "command_obfuscated": clear_commands_cyrillic,
    "obfuscated": target_cyrillic,
})

In [62]:
show_info(cyrillic_df)

(8, 3)
                                       command_clear  \
0  Powershell -command Write-Host 'Мой голос это ...   
1                          Write-host 'Привет, мир!'   
2  powershell -noProfile -nonInteractive -Windows...   
3  Write-host 'Мой голос это мой паспорт, верифиц...   
4  Write-host 'Обфуска́ция или запутывание кода —...   

                                  command_obfuscated  obfuscated  
0  Powershell -command Write-Host 'Мой голос это ...           0  
1                          Write-host 'Привет, мир!'           0  
2  powershell -noProfile -nonInteractive -Windows...           0  
3  Write-host 'Мой голос это мой паспорт, верифиц...           0  
4  Write-host 'Обфуска́ция или запутывание кода —...           0  


In [63]:
ScoreHandler.save_to_csv(dataframe=cyrillic_df, csv_filename=cyrillic_dataset_filename)

## Processing raw datasets with poor obfuscated and cyrillic

In [23]:
# pipeline, containing dropping features and scaling

dropper_scaler_pipeline: Pipeline = Pipeline([
    ("target_values_dropper", TargetValuesDropper()),
    ("commands_dropper", CommandsDropper()),
    ("useless_features_dropper", UselessFeaturesDropper(original_tokenizer_df)),
    ("min_max_scaler", MinMaxScaler()),
])

### Poor obfuscated commands

In [24]:
# import data from csv

poor_obfuscated_df: pd.DataFrame = pd.read_csv(poor_obfuscated_tokenizer_dataset_filename)

In [25]:
show_info(poor_obfuscated_df)

(1408, 5001)
                                       command_clear  \
0  $UserCredential = Get-Credential $Session = Ne...   
1  $members = Import-CSV c:itadd-to-group.csv | S...   
2  Add-ADGroupMember -Identity group-name -Member...   
3                  Backup-Gpo -All -Path E:GPObackup   
4                                Checkpoint-Computer   

                                  command_obfuscated  \
0  `$UserCredential = Get-Credential `$Session = ...   
1  $members = Import-C`SV c`:itadd-to-group.cs`v ...   
2  Ad`d-ADGroupMember -Identity group-name` -Memb...   
3                Backup-Gpo -All` -Path E:GPObackup`   
4                               Checkpoi`nt-Computer   

   AstGroupedArrayElementRangeCounts_0-10_Count  \
0                                           0.0   
1                                           0.0   
2                                           0.0   
3                                           0.0   
4                                           0.0   

   Ast

In [26]:
commands_poor_obfuscated: pd.Series = poor_obfuscated_df["command_obfuscated"]
target_poor_obfuscated: pd.Series = poor_obfuscated_df["obfuscated"]
X_poor_obfuscated: pd.DataFrame = pd.DataFrame(dropper_scaler_pipeline.fit_transform(poor_obfuscated_df), columns=tokenizer_df.columns[2:-2])

### Commands containing cyrillic symbols

In [27]:
# import data from csv

cyrillic_df: pd.DataFrame = pd.read_csv(cyrillic_tokenizer_dataset_filename)

In [28]:
show_info(cyrillic_df)

(8, 5001)
                                       command_clear  \
0  Powershell -command Write-Host 'Мой голос это ...   
1                          Write-host 'Привет, мир!'   
2  powershell -noProfile -nonInteractive -Windows...   
3  Write-host 'Мой голос это мой паспорт, верифиц...   
4  Write-host 'Обфуска́ция или запутывание кода —...   

                                  command_obfuscated  \
0  Powershell -command Write-Host 'Мой голос это ...   
1                          Write-host 'Привет, мир!'   
2  powershell -noProfile -nonInteractive -Windows...   
3  Write-host 'Мой голос это мой паспорт, верифиц...   
4  Write-host 'Обфуска́ция или запутывание кода —...   

   AstGroupedArrayElementRangeCounts_0-10_Count  \
0                                           0.0   
1                                           0.0   
2                                           0.0   
3                                           0.0   
4                                           0.0   

   AstGro

In [29]:
commands_cyrillic: pd.Series = cyrillic_df["command_obfuscated"]
target_cyrillic: pd.Series = cyrillic_df["obfuscated"]
X_cyrillic: pd.DataFrame = pd.DataFrame(dropper_scaler_pipeline.fit_transform(cyrillic_df), columns=tokenizer_df.columns[2:-2])

### Loading trained models

In [30]:
def load_model(pkl_filename: str, mode: str = 'rb'):
    try:
        with open(pkl_filename, mode) as file:
            pickle_model = pickle.load(file)
        return pickle_model
    except Exception as exception_instance:
        print(f"The exception was occured while loading mode from file {pkl_filename}: {exception_instance}")

In [31]:
# declare filename paths to models

gaussianNB_model_filename: str = r"D:\Obfuscation\models\GaussianNB_model.pkl"
multinomialNB_model_filename: str = r"D:\Obfuscation\models\MultinomialNB_model.pkl"
logisticRegression_model_filename: str = r"D:\Obfuscation\models\LogisticRegression_model.pkl"
knn_model_filename: str = r"D:\Obfuscation\models\KNearestNeighbors_model.pkl"

In [32]:
# import trained models

gnb_model = load_model(gaussianNB_model_filename)
mnb_model = load_model(multinomialNB_model_filename)
logreg_model = load_model(logisticRegression_model_filename)
knn_model = load_model(knn_model_filename)

## Running all additional test data on models

### Multinomial Naive Bayes

In [36]:
score_handler_mnb: ScoreHandler = ScoreHandler(
    model=mnb_model,
    X=X_cmd,
    y_true=target_cmd,
    commands=commands_cmd
)

#### Cmd

In [37]:
cmd_mnb_score: pd.DataFrame = score_handler_mnb.handle(cmd_mnb_filename)

Model: MultinomialNB(alpha=1e-05) || Accuracy: 0.44478 || Precision: 1.0 || Recall: 0.41854


In [38]:
cmd_mnb_score["is_equal"].value_counts()

-    3112
+    2493
Name: is_equal, dtype: int64

#### Psl (other commands)

In [39]:
score_handler_mnb.set_parameters(
    X=X_psl,
    y_true=target_psl,
    commands=commands_psl
)

In [40]:
psl_mnb_score: pd.DataFrame = score_handler_mnb.handle(others_in_dataset_mnb_filename)

Model: MultinomialNB(alpha=1e-05) || Accuracy: 0.89087 || Precision: 0.99968 || Recall: 0.8659


In [41]:
psl_mnb_score["is_equal"].value_counts()

+    11935
-     1462
Name: is_equal, dtype: int64

#### Poor obfuscated commands

In [42]:
score_handler_mnb.set_parameters(
    X=X_poor_obfuscated,
    y_true=target_poor_obfuscated,
    commands=commands_poor_obfuscated
)

In [43]:
poor_obfuscated_mnb_score: pd.DataFrame = score_handler_mnb.handle(poor_obfuscated_mnb_filename)

Model: MultinomialNB(alpha=1e-05) || Accuracy: 0.03693 || Precision: 1.0 || Recall: 0.03693


In [44]:
poor_obfuscated_mnb_score["is_equal"].value_counts()

-    1356
+      52
Name: is_equal, dtype: int64

#### Commands with cyrillic symbols

In [45]:
score_handler_mnb.set_parameters(
    X=X_cyrillic,
    y_true=target_cyrillic,
    commands=commands_cyrillic
)

In [46]:
cyrillic_mnb_score: pd.DataFrame = score_handler_mnb.handle(cyrillic_mnb_filename)

Model: MultinomialNB(alpha=1e-05) || Accuracy: 1.0 || Precision: 0.0 || Recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [47]:
cyrillic_mnb_score

Unnamed: 0,command,y_true,y_pred,is_equal
0,Powershell -command Write-Host 'Мой голос это ...,0,0,+
1,"Write-host 'Привет, мир!'",0,0,+
2,powershell -noProfile -nonInteractive -Windows...,0,0,+
3,"Write-host 'Мой голос это мой паспорт, верифиц...",0,0,+
4,Write-host 'Обфуска́ция или запутывание кода —...,0,0,+
5,Write-host 'Машинное обучение — класс методов ...,0,0,+
6,"Write-host 'Красивое лучше, чем уродливое. Явн...",0,0,+
7,"Write-host 'Красивое лучше, чем уродливое. Явн...",0,0,+


### Gaussian Naive Bayes

In [62]:
score_handler_gnb: ScoreHandler = ScoreHandler(
    model=gnb_model,
    X=X_cmd,
    y_true=target_cmd,
    commands=commands_cmd
)

#### Cmd

In [63]:
cmd_gnb_score: pd.DataFrame = score_handler_gnb.handle(cmd_gnb_filename)

Model: GaussianNB(var_smoothing=1e-06) || Accuracy: 0.92471 || Precision: 0.96935 || Recall: 0.95123


In [64]:
cmd_gnb_score["is_equal"].value_counts()

+    5183
-     422
Name: is_equal, dtype: int64

#### Psl (other commands)

In [65]:
score_handler_gnb.set_parameters(
    X=X_psl,
    y_true=target_psl,
    commands=commands_psl
)

In [66]:
psl_gnb_score: pd.DataFrame = score_handler_gnb.handle(others_in_dataset_gnb_filename)

Model: GaussianNB(var_smoothing=1e-06) || Accuracy: 0.98246 || Precision: 0.99378 || Recall: 0.98456


In [67]:
psl_gnb_score["is_equal"].value_counts()

+    13162
-      235
Name: is_equal, dtype: int64

#### Poor obfuscated commands

In [68]:
score_handler_gnb.set_parameters(
    X=X_poor_obfuscated,
    y_true=target_poor_obfuscated,
    commands=commands_poor_obfuscated
)

In [69]:
poor_obfuscated_gnb_score: pd.DataFrame = score_handler_gnb.handle(poor_obfuscated_gnb_filename)

Model: GaussianNB(var_smoothing=1e-06) || Accuracy: 0.96165 || Precision: 1.0 || Recall: 0.96165


In [70]:
poor_obfuscated_gnb_score["is_equal"].value_counts()

+    1354
-      54
Name: is_equal, dtype: int64

#### Commands with cyrillic symbols

In [71]:
score_handler_gnb.set_parameters(
    X=X_cyrillic,
    y_true=target_cyrillic,
    commands=commands_cyrillic
)

In [72]:
cyrillic_gnb_score: pd.DataFrame = score_handler_gnb.handle(cyrillic_gnb_filename)

Model: GaussianNB(var_smoothing=1e-06) || Accuracy: 0.125 || Precision: 0.0 || Recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [73]:
cyrillic_gnb_score

Unnamed: 0,command,y_true,y_pred,is_equal
0,Powershell -command Write-Host 'Мой голос это ...,0,1,-
1,"Write-host 'Привет, мир!'",0,1,-
2,powershell -noProfile -nonInteractive -Windows...,0,0,+
3,"Write-host 'Мой голос это мой паспорт, верифиц...",0,1,-
4,Write-host 'Обфуска́ция или запутывание кода —...,0,1,-
5,Write-host 'Машинное обучение — класс методов ...,0,1,-
6,"Write-host 'Красивое лучше, чем уродливое. Явн...",0,1,-
7,"Write-host 'Красивое лучше, чем уродливое. Явн...",0,1,-


### Logistic Regression

In [47]:
score_handler_logreg: ScoreHandler = ScoreHandler(
    model=logreg_model,
    X=X_cmd,
    y_true=target_cmd,
    commands=commands_cmd
)

#### Cmd

In [48]:
cmd_logreg_score: pd.DataFrame = score_handler_logreg.handle(cmd_logreg_filename)

Model: LogisticRegression(C=0.1, max_iter=1000, multi_class='ovr', n_jobs=-1,
                   penalty='l1', solver='liblinear') || Accuracy: 0.67029 || Precision: 0.99971 || Recall: 0.6549


In [49]:
cmd_logreg_score["is_equal"].value_counts()

+    3757
-    1848
Name: is_equal, dtype: int64

#### Psl (other commands)

In [50]:
score_handler_logreg.set_parameters(
    X=X_psl,
    y_true=target_psl,
    commands=commands_psl
)

In [51]:
psl_logreg_score: pd.DataFrame = score_handler_logreg.handle(others_in_dataset_logreg_filename)

Model: LogisticRegression(C=0.1, max_iter=1000, multi_class='ovr', n_jobs=-1,
                   penalty='l1', solver='liblinear') || Accuracy: 0.99545 || Precision: 1.0 || Recall: 0.99439


In [52]:
psl_logreg_score["is_equal"].value_counts()

+    13336
-       61
Name: is_equal, dtype: int64

#### Poor obfuscated commands

In [53]:
score_handler_logreg.set_parameters(
    X=X_poor_obfuscated,
    y_true=target_poor_obfuscated,
    commands=commands_poor_obfuscated
)

In [54]:
poor_obfuscated_logreg_score: pd.DataFrame = score_handler_logreg.handle(poor_obfuscated_logreg_filename)

Model: LogisticRegression(C=0.1, max_iter=1000, multi_class='ovr', n_jobs=-1,
                   penalty='l1', solver='liblinear') || Accuracy: 0.42543 || Precision: 1.0 || Recall: 0.42543


In [55]:
poor_obfuscated_logreg_score["is_equal"].value_counts()

-    809
+    599
Name: is_equal, dtype: int64

#### Commands with cyrillic symbols

In [56]:
score_handler_logreg.set_parameters(
    X=X_cyrillic,
    y_true=target_cyrillic,
    commands=commands_cyrillic
)

In [57]:
cyrillic_logreg_score: pd.DataFrame = score_handler_logreg.handle(cyrillic_logreg_filename)

Model: LogisticRegression(C=0.1, max_iter=1000, multi_class='ovr', n_jobs=-1,
                   penalty='l1', solver='liblinear') || Accuracy: 1.0 || Precision: 0.0 || Recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [58]:
cyrillic_logreg_score

Unnamed: 0,command,y_true,y_pred,is_equal
0,Powershell -command Write-Host 'Мой голос это ...,0,0,+
1,"Write-host 'Привет, мир!'",0,0,+
2,powershell -noProfile -nonInteractive -Windows...,0,0,+
3,"Write-host 'Мой голос это мой паспорт, верифиц...",0,0,+
4,Write-host 'Обфуска́ция или запутывание кода —...,0,0,+
5,Write-host 'Машинное обучение — класс методов ...,0,0,+
6,"Write-host 'Красивое лучше, чем уродливое. Явн...",0,0,+
7,"Write-host 'Красивое лучше, чем уродливое. Явн...",0,0,+


### K-Nearest Neighbors (KNN)

In [33]:
score_handler_knn: ScoreHandler = ScoreHandler(
    model=knn_model,
    X=X_cmd,
    y_true=target_cmd,
    commands=commands_cmd
)

#### Cmd

In [34]:
cmd_knn_score: pd.DataFrame = score_handler_knn.handle(cmd_knn_filename)

Model: KNeighborsClassifier(n_jobs=-1) || Accuracy: 0.38965 || Precision: 0.9864 || Recall: 0.36584


In [36]:
cmd_knn_score["is_equal"].value_counts()

-    3421
+    2184
Name: is_equal, dtype: int64

#### Psl (other commands)

In [38]:
score_handler_knn.set_parameters(
    X=X_psl,
    y_true=target_psl,
    commands=commands_psl
)

In [39]:
psl_knn_score: pd.DataFrame = score_handler_knn.handle(others_in_dataset_knn_filename)

Model: KNeighborsClassifier(n_jobs=-1) || Accuracy: 0.98238 || Precision: 1.0 || Recall: 0.97831


In [40]:
psl_knn_score["is_equal"].value_counts()

+    13161
-      236
Name: is_equal, dtype: int64

#### Poor obfuscated commands

In [41]:
score_handler_knn.set_parameters(
    X=X_poor_obfuscated,
    y_true=target_poor_obfuscated,
    commands=commands_poor_obfuscated
)

In [42]:
poor_obfuscated_knn_score: pd.DataFrame = score_handler_knn.handle(poor_obfuscated_knn_filename)

Model: KNeighborsClassifier(n_jobs=-1) || Accuracy: 0.17116 || Precision: 1.0 || Recall: 0.17116


In [43]:
poor_obfuscated_knn_score["is_equal"].value_counts()

-    1167
+     241
Name: is_equal, dtype: int64

#### Commands with cyrillic symbols

In [44]:
score_handler_knn.set_parameters(
    X=X_cyrillic,
    y_true=target_cyrillic,
    commands=commands_cyrillic
)

In [45]:
cyrillic_knn_score: pd.DataFrame = score_handler_knn.handle(cyrillic_knn_filename)

Model: KNeighborsClassifier(n_jobs=-1) || Accuracy: 1.0 || Precision: 0.0 || Recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
cyrillic_knn_score

Unnamed: 0,command,y_true,y_pred,is_equal
0,Powershell -command Write-Host 'Мой голос это ...,0,0,+
1,"Write-host 'Привет, мир!'",0,0,+
2,powershell -noProfile -nonInteractive -Windows...,0,0,+
3,"Write-host 'Мой голос это мой паспорт, верифиц...",0,0,+
4,Write-host 'Обфуска́ция или запутывание кода —...,0,0,+
5,Write-host 'Машинное обучение — класс методов ...,0,0,+
6,"Write-host 'Красивое лучше, чем уродливое. Явн...",0,0,+
7,"Write-host 'Красивое лучше, чем уродливое. Явн...",0,0,+
