In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score

import pickle

In [3]:
# File paths used by the notebook.

# Source dataset with the extracted features.
OBFUSCATED_COMMANDS_FILENAME: str = (
    r"D:\Obfuscation\data\datasets\balanced\BALANCED_DATASET_770_WITHOUT_CMD.csv"
)

# Feature matrices of the train / validate / test splits.
TRAIN_FEATURES_FILENAME: str = (
    r"D:\Obfuscation\data\datasets\divided\train_features.csv"
)
VALIDATE_FEATURES_FILENAME: str = (
    r"D:\Obfuscation\data\datasets\divided\validate_features.csv"
)
TEST_FEATURES_FILENAME: str = (
    r"D:\Obfuscation\data\datasets\divided\test_features.csv"
)

# Label columns of the train / validate / test splits.
TRAIN_LABELS_FILENAME: str = (
    r"D:\Obfuscation\data\datasets\divided\train_labels.csv"
)
VALIDATE_LABELS_FILENAME: str = (
    r"D:\Obfuscation\data\datasets\divided\validate_labels.csv"
)
TEST_LABELS_FILENAME: str = (
    r"D:\Obfuscation\data\datasets\divided\test_labels.csv"
)

In [3]:
# Load the balanced dataset from OBFUSCATED_COMMANDS_FILENAME into a DataFrame.
obfuscated_commands: pd.DataFrame = pd.read_csv(OBFUSCATED_COMMANDS_FILENAME)

In [4]:
# Sanity check: print the dataset dimensions, then display the first rows as the cell output.
print(obfuscated_commands.shape)
obfuscated_commands.head()

(10728, 2712)


Unnamed: 0,index_in_powershell_dataset,command_obfuscated,AstGroupedArrayElementRangeCounts_0-10_Count,AstGroupedArrayElementRangeCounts_0-10_Percent,AstGroupedArrayElementRangeCounts_10-20_Count,AstGroupedArrayElementRangeCounts_10-20_Percent,AstGroupedArrayElementRangeCounts_20-30_Count,AstGroupedArrayElementRangeCounts_20-30_Percent,AstGroupedArrayElementRangeCounts_30-40_Count,AstGroupedArrayElementRangeCounts_30-40_Percent,...,AstVariableNameMetrics_Length_Range,AstVariableNameMetrics_Length_Total,AstVariableNameMetrics_UpperAlphaPercent_Average,AstVariableNameMetrics_UpperAlphaPercent_Maximum,AstVariableNameMetrics_UpperAlphaPercent_Median,AstVariableNameMetrics_UpperAlphaPercent_Minimum,AstVariableNameMetrics_UpperAlphaPercent_Mode,AstVariableNameMetrics_UpperAlphaPercent_Range,obfuscated,obf_methods_combination_number
0,0,$PSVersionTable,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,14.0,0.285714,0.285714,0.285714,0.285714,0.285714,0.0,0,0
1,1,$UserCredential = Get-Credential $Session = Ne...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7.0,36.0,0.169643,0.25,0.142857,0.142857,0.142857,0.107143,0,0
2,2,$members = Import-CSV c:itadd-to-group.csv | S...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,3,$os = Get-WmiObject win32_operatingsystem $upt...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,4,Add-ADGroupMember -Identity group-name -Member...,1.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


In [5]:
def get_X_y(dataframe: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """Split the dataset into a feature matrix and the target column.

    Features are selected by position: every column from position 2 up to and
    including the third column from the end. The target is the column named
    "obfuscated".

    Args:
        dataframe: the full dataset.

    Returns:
        A ``(features, target)`` tuple.
    """
    # Positions 2 .. -3 inclusive; the slice end is exclusive, hence -2.
    feature_positions = slice(2, -2)
    features: pd.DataFrame = dataframe.iloc[:, feature_positions]
    target: pd.Series = dataframe["obfuscated"]
    return features, target

In [14]:
def scale_MinMax(X: pd.DataFrame) -> pd.DataFrame:
    """Min-max scale every column of ``X`` and return the result as a DataFrame.

    A fresh ``MinMaxScaler`` with default settings is fitted on ``X`` itself,
    so the scaling statistics come from exactly the rows that are passed in.

    Args:
        X: numeric feature matrix.

    Returns:
        A new DataFrame of scaled values with the same column labels and the
        same row index as ``X``.
    """
    scaler: MinMaxScaler = MinMaxScaler()
    X_scaled_values: np.ndarray = scaler.fit_transform(X)
    # fit_transform returns a bare ndarray. Restore the row index as well as
    # the columns, otherwise a non-default index would be replaced by 0..n-1
    # and the result would no longer align with the labels of the same rows.
    X_scaled: pd.DataFrame = pd.DataFrame(X_scaled_values, columns=X.columns, index=X.index)
    return X_scaled

In [7]:
def get_train_valid_test_data(
    X: pd.DataFrame,
    y: pd.Series,
    valid_size: float = 0.2,
    test_size: float = 0.2,
    random_state: int = 42,
) -> dict[str, tuple[pd.DataFrame, pd.Series]]:
    """Split features and labels into train, validation and test parts.

    The split is done in two shuffled steps: first the combined
    validation + test hold-out is separated from the training part, then the
    hold-out is divided between validation and test.

    Args:
        X: feature matrix.
        y: labels aligned row-by-row with ``X``.
        valid_size: share of the data that goes to the validation part.
        test_size: share of the data that goes to the test part.
        random_state: seed passed to both ``train_test_split`` calls. It
            defaults to 42, the value that used to be hard-coded.

    Returns:
        Dict with the keys "train", "valid" and "test", each mapping to an
        ``(X_part, y_part)`` tuple.
    """
    holdout_size = test_size + valid_size

    # Step 1: training part vs. everything that is held out.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=holdout_size, random_state=random_state, shuffle=True
    )
    # Step 2: divide the hold-out; test_size is re-expressed relative to it.
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_holdout, y_holdout, test_size=test_size / holdout_size, random_state=random_state, shuffle=True
    )

    data: dict[str, tuple[pd.DataFrame, pd.Series]] = {
        "train": (X_train, y_train),
        "valid": (X_valid, y_valid),
        "test": (X_test, y_test),
    }
    return data

In [15]:
# Separate the features from the labels, then min-max scale the features.
# NOTE(review): scale_MinMax fits its scaler on the full dataset, i.e. before the
# train/valid/test split, so validation and test rows influence the scaling
# statistics. Consider fitting the scaler on the training part only.
X, y = get_X_y(obfuscated_commands)
X_scaled: pd.DataFrame = scale_MinMax(X)

In [18]:
# Split the scaled features and the labels into train / validation / test parts.
divided_data: dict[str, tuple[pd.DataFrame, pd.Series]] = get_train_valid_test_data(X_scaled, y)

# Unpack each (features, labels) pair; a missing part falls back to [None, None].
(X_train, y_train), (X_valid, y_valid), (X_test, y_test) = (
    divided_data.get(part_name, [None, None])
    for part_name in ('train', 'valid', 'test')
)

In [19]:
# Share of the whole dataset that ended up in each part (train, valid, test).
for part_labels in (y_train, y_valid, y_test):
    part_share = len(part_labels) / len(y)
    print(np.round(part_share, 2))

0.6
0.2
0.2


In [21]:
# Save every split to its own CSV file.
# index=False means the original row index is NOT written to the files.
all_filenames: tuple[str, ...] = (TRAIN_FEATURES_FILENAME, VALIDATE_FEATURES_FILENAME, TEST_FEATURES_FILENAME, TRAIN_LABELS_FILENAME, VALIDATE_LABELS_FILENAME, TEST_LABELS_FILENAME)

# zip pairs each split with its filename by position, so both tuples must keep the same order.
for data, filename in zip((X_train, X_valid, X_test, y_train, y_valid, y_test), all_filenames):
    data.to_csv(filename, index=False)

## Cross Validation

In [4]:
# Load the train / validate / test splits saved earlier.
#
# Each label file was written from a single Series, so it holds exactly one
# column. read_csv still returns a DataFrame for it; squeeze("columns") turns
# that one-column frame into the pd.Series the annotations promise.

train_features: pd.DataFrame = pd.read_csv(TRAIN_FEATURES_FILENAME)
train_labels: pd.Series = pd.read_csv(TRAIN_LABELS_FILENAME).squeeze("columns")

validate_features: pd.DataFrame = pd.read_csv(VALIDATE_FEATURES_FILENAME)
validate_labels: pd.Series = pd.read_csv(VALIDATE_LABELS_FILENAME).squeeze("columns")

test_features: pd.DataFrame = pd.read_csv(TEST_FEATURES_FILENAME)
test_labels: pd.Series = pd.read_csv(TEST_LABELS_FILENAME).squeeze("columns")

### Multinomial Naive Bayes

In [5]:
# Baseline: 5-fold cross-validation of MultinomialNB with default settings,
# scored with the estimator's default score on the training split.
mnb: MultinomialNB = MultinomialNB()

scores_mnb: np.ndarray[float] = cross_val_score(
    estimator=mnb,
    X=train_features,
    y=train_labels.values.ravel(),
    cv=5,
    n_jobs=-1,
)

In [6]:
# Per-fold scores, followed by their mean as the cell output.
print(scores_mnb)
scores_mnb.mean()

[0.92468944 0.8989899  0.92229992 0.92463092 0.9013209 ]


0.9143862176470872

### Gaussian Naive Bayes

In [7]:
# Baseline: 5-fold cross-validation of GaussianNB with default settings,
# scored with the estimator's default score on the training split.
gnb: GaussianNB = GaussianNB()

scores_gnb: np.ndarray[float] = cross_val_score(
    estimator=gnb,
    X=train_features,
    y=train_labels.values.ravel(),
    cv=5,
    n_jobs=-1,
)

In [8]:
# Per-fold scores, followed by their mean as the cell output.
print(scores_gnb)
scores_gnb.mean()

[0.98602484 0.98756799 0.98290598 0.98834499 0.97902098]


0.9847729565120869

## Hyperparameter tuning

In [9]:
def show_tuning_result(cross_val_result: GridSearchCV) -> None:
    """Print the outcome of a fitted grid search.

    The best hyperparameter combination is printed first, followed by one line
    per candidate with its mean test score, the standard deviation of that
    score and the hyperparameters used.

    Args:
        cross_val_result: a grid search that has already been fitted.
    """
    print(f"Optimal Hyperparams: {cross_val_result.best_params_}\n")

    results = cross_val_result.cv_results_
    candidates = zip(
        results["mean_test_score"],
        results["std_test_score"],
        results["params"],
    )
    for mean, std, params in candidates:
        print(f"Mean: {np.round(mean, 5)} || Standard deviation: {np.round(std, 5)} || Hyperparameters: {params}")

### Multinomial Naive Bayes

In [10]:
# Grid search over the smoothing parameter of MultinomialNB (5-fold CV).
mnb: MultinomialNB = MultinomialNB()

# NOTE(review): the grid jumps from 0.001 straight to 0.1 (no 0.01) - confirm this is intended.
hyperparams_mnb: dict[str, any] = {
    'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000],
}

cross_val_mnb: GridSearchCV = GridSearchCV(
    estimator=mnb,
    param_grid=hyperparams_mnb,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
cross_val_mnb.fit(train_features, train_labels.values.ravel());

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [11]:
# Report the best alpha and the mean/std test score of every candidate.
show_tuning_result(cross_val_mnb)

Optimal Hyperparams: {'alpha': 1e-05}

Mean: 0.91749 || Standard deviation: 0.00991 || Hyperparameters: {'alpha': 1e-05}
Mean: 0.91718 || Standard deviation: 0.01023 || Hyperparameters: {'alpha': 0.0001}
Mean: 0.91672 || Standard deviation: 0.01082 || Hyperparameters: {'alpha': 0.001}
Mean: 0.91578 || Standard deviation: 0.01132 || Hyperparameters: {'alpha': 0.1}
Mean: 0.91439 || Standard deviation: 0.01167 || Hyperparameters: {'alpha': 1}
Mean: 0.91377 || Standard deviation: 0.01125 || Hyperparameters: {'alpha': 10}
Mean: 0.91252 || Standard deviation: 0.01106 || Hyperparameters: {'alpha': 100}
Mean: 0.91097 || Standard deviation: 0.0111 || Hyperparameters: {'alpha': 1000}


In [12]:
# Fit one MultinomialNB per alpha (the three best candidates of the grid search)
# on the training split; fit() returns the fitted estimator itself.
mnb1, mnb2, mnb3 = (
    MultinomialNB(alpha=alpha).fit(train_features, train_labels.values.ravel())
    for alpha in (1e-05, 0.0001, 0.001)
)

### Gaussian Naive Bayes

In [13]:
# Grid search over the variance smoothing of GaussianNB (5-fold CV).
# The estimator is a GaussianNB; the earlier annotation wrongly said MultinomialNB.
gnb: GaussianNB = GaussianNB()

# np.logspace(0, -9, num=10) yields the ten powers of ten from 1e0 down to 1e-9.
hyperparams_gnb: dict[str, any] = {
    'var_smoothing': np.logspace(0,-9, num=10),
}

cross_val_gnb: GridSearchCV = GridSearchCV(gnb, hyperparams_gnb, cv=5, n_jobs=-1, verbose=2)
cross_val_gnb.fit(train_features, train_labels.values.ravel());

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [14]:
# Report the best var_smoothing and the mean/std test score of every candidate.
show_tuning_result(cross_val_gnb)

Optimal Hyperparams: {'var_smoothing': 1e-05}

Mean: 0.70649 || Standard deviation: 0.00737 || Hyperparameters: {'var_smoothing': 1.0}
Mean: 0.87663 || Standard deviation: 0.01062 || Hyperparameters: {'var_smoothing': 0.1}
Mean: 0.90211 || Standard deviation: 0.01047 || Hyperparameters: {'var_smoothing': 0.01}
Mean: 0.92946 || Standard deviation: 0.00376 || Hyperparameters: {'var_smoothing': 0.001}
Mean: 0.98477 || Standard deviation: 0.00239 || Hyperparameters: {'var_smoothing': 0.0001}
Mean: 0.98586 || Standard deviation: 0.00211 || Hyperparameters: {'var_smoothing': 1e-05}
Mean: 0.98571 || Standard deviation: 0.00181 || Hyperparameters: {'var_smoothing': 1e-06}
Mean: 0.98524 || Standard deviation: 0.0022 || Hyperparameters: {'var_smoothing': 1e-07}
Mean: 0.98524 || Standard deviation: 0.0022 || Hyperparameters: {'var_smoothing': 1e-08}
Mean: 0.98477 || Standard deviation: 0.00343 || Hyperparameters: {'var_smoothing': 1e-09}


In [15]:
# Fit one GaussianNB per var_smoothing (the three best candidates of the grid
# search) on the training split; fit() returns the fitted estimator itself.
gnb1, gnb2, gnb3 = (
    GaussianNB(var_smoothing=smoothing).fit(train_features, train_labels.values.ravel())
    for smoothing in (1e-05, 1e-06, 1e-07)
)

## Model Evaluation

In [16]:
def show_score(model, features: pd.DataFrame, y_true: pd.Series) -> None:
    """Print accuracy, precision and recall of ``model`` on the given data.

    Args:
        model: fitted estimator exposing ``predict``.
        features: feature matrix to predict on.
        y_true: true labels for ``features``.
    """
    predictions: np.ndarray = model.predict(features)

    # accuracy  = true predictions / all predictions
    # precision = true positive predictions / all positive predictions
    # recall    = true positive predictions / all positive labels
    accuracy, precision, recall = (
        np.round(metric(y_true=y_true, y_pred=predictions), 5)
        for metric in (accuracy_score, precision_score, recall_score)
    )

    print(f"Model: {model} || Accuracy: {accuracy} || Precision: {precision} || Recall: {recall}")

### Multinomial Naive Bayes

In [17]:
# Score the three best MultinomialNB models (different hyperparameters) on the validation data.

for model in (mnb1, mnb2, mnb3):
    show_score(model, features=validate_features, y_true=validate_labels)

Model: MultinomialNB(alpha=1e-05) || Accuracy: 0.90261 || Precision: 0.98739 || Recall: 0.81303
Model: MultinomialNB(alpha=0.0001) || Accuracy: 0.90214 || Precision: 0.98737 || Recall: 0.81209
Model: MultinomialNB(alpha=0.001) || Accuracy: 0.90214 || Precision: 0.98737 || Recall: 0.81209


In [18]:
# Choose the mnb1 model and score it on the test data.

show_score(model=mnb1, features=test_features, y_true=test_labels)

Model: MultinomialNB(alpha=1e-05) || Accuracy: 0.9068 || Precision: 0.99101 || Recall: 0.82123


### Gaussian Naive Bayes

In [19]:
# Score the three best GaussianNB models (different hyperparameters) on the validation data.

for model in (gnb1, gnb2, gnb3):
    show_score(model, features=validate_features, y_true=validate_labels)

Model: GaussianNB(var_smoothing=1e-05) || Accuracy: 0.98649 || Precision: 0.98954 || Recall: 0.983
Model: GaussianNB(var_smoothing=1e-06) || Accuracy: 0.98742 || Precision: 0.99049 || Recall: 0.98395
Model: GaussianNB(var_smoothing=1e-07) || Accuracy: 0.98695 || Precision: 0.99049 || Recall: 0.983


In [20]:
# Choose the gnb2 model and score it on the test data.

show_score(model=gnb2, features=test_features, y_true=test_labels)

Model: GaussianNB(var_smoothing=1e-06) || Accuracy: 0.98276 || Precision: 0.98777 || Recall: 0.97765


## Saving models

In [25]:
def save_model(model, pkl_filename: str) -> bool:
    """Serialize ``model`` with pickle into the file ``pkl_filename``.

    Args:
        model: the object to pickle.
        pkl_filename: path of the file to write; it is opened in binary write
            mode, so an existing file is overwritten.

    Returns:
        True when the model was written, False when opening the file or
        pickling failed. The reason for a failure is printed to stderr.
    """
    import sys  # local import keeps this cell self-contained

    try:
        with open(pkl_filename, 'wb') as file:
            pickle.dump(model, file=file)
    except Exception as error:
        # Catch Exception rather than using a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit still propagate, and report the cause
        # instead of hiding it behind a plain False.
        print(f"Could not save model to {pkl_filename}: {error!r}", file=sys.stderr)
        return False
    return True

In [21]:
# Declare the .pkl file paths the chosen models are saved to.

mnb_filename: str = r"D:\Obfuscation\models\MultinomialNB_model.pkl"
gnb_filename: str = r"D:\Obfuscation\models\GaussianNB_model.pkl"

In [26]:
# Models chosen in the evaluation section, paired by position with their target files.
models = (mnb1, gnb2)
filenames: tuple[str, ...] = (mnb_filename, gnb_filename)

# Pickle each model and report whether save_model succeeded.
for model, filename in zip(models, filenames):
    saving_result: bool = save_model(model=model, pkl_filename=filename)
    print(f"For model {model} saving to file {filename} was {'successful!' if saving_result else 'failed :|'}")

For model MultinomialNB(alpha=1e-05) saving to file D:\Obfuscation\models\MultinomialNB_model.pkl was successful!
For model GaussianNB(var_smoothing=1e-06) saving to file D:\Obfuscation\models\GaussianNB_model.pkl was successful!
