In [1]:
import numpy as np
import warnings
import pandas as pd
import typing as tp
from sklearn import preprocessing
import xgboost as xgb

from data_preparation_tools import preprocessing_data
from cv_wrappers_and_tools import cv_xgb, plot_roc_auc, make_prediction_xgb, save_model_and_results

# Hide every warning of category UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)
# Set XGBoost's global verbosity to 0.
xgb.set_config(verbosity = 0)

  from pandas import MultiIndex, Int64Index


# Feature Engineering

In [10]:
def preprocessing_cat_features_le(cat_features: list, df: pd.DataFrame) -> tp.Tuple[pd.DataFrame, dict]:
    """
    Label-encode the given columns on a copy of ``df``.

    :param cat_features: names of the columns to encode.
    :param df: source frame; it is copied first, so the caller's frame is untouched.
    :return: the encoded copy and a dict mapping each encoded column name to the
        ``LabelEncoder`` that was fitted on it.
    """
    encoded = df.copy()
    encoders = {}

    for column in cat_features:
        # A fresh encoder per column so each mapping can be inverted independently.
        encoder = preprocessing.LabelEncoder()
        encoded[column] = encoder.fit_transform(encoded[column])
        encoders[column] = encoder

    return encoded, encoders


def preprocessing_data_1(train: pd.DataFrame, test: pd.DataFrame, target: str, black_list: list,
                         var_for_prepocessing: list) -> tp.Tuple[pd.DataFrame, pd.DataFrame, dict, list]:

    """
    Function for preprocessing a training dataset with a target variable and a test dataset without a target variable

    Train and test are concatenated so that category reduction and label
    encoding see the same set of values, then split back apart by whether
    the ``target`` column is present (non-null) on a row.

    :param train: a training dataset with a target variable.
    :param test: a test dataset without a target variable.
    :param target: Predictive variable (name of the target column).
    :param black_list: A list with the features that won't be used in predictions.
        The list is read only; it is not modified.
    :param var_for_prepocessing: A list whose items are each passed to
        ``outlier_filtering`` together with the combined frame, i.e. each item
        is an iterable of ``(feature, n)`` pairs.
    :return: Pre-processed training and test dataset, maps for features and
    features, that will be used for predictions.
    """
    # Missing values are left as they are. Earlier revisions called
    # `train.fillna(-1)` / `test.fillna(-1)` without assigning the result, which
    # never changed the data, so dropping those calls keeps the output identical.

    # df_full is a new object: the column rewrites done by outlier_filtering
    # below do not reach the caller's `train` / `test` frames.
    df_full = pd.concat([train, test])

    for var in var_for_prepocessing:
        outlier_filtering(df_full, var)

    # Build the exclusion list locally instead of appending to `black_list`,
    # so the caller's list does not grow on every call.
    excluded = [*black_list, target]

    features_num = [feature for feature in df_full.select_dtypes('number').columns
                    if feature not in excluded]
    features_cat = [feature for feature in df_full.select_dtypes(exclude='number').columns
                    if feature not in excluded]
    new_df_full, maps = preprocessing_cat_features_le(features_cat, df_full)
    features = features_cat + features_num

    # Rows without a target value are the test rows (the test frame has no
    # target column, so concat filled it with NaN). Use `target` rather than a
    # hard-coded column name so the function works for any target.
    has_target = new_df_full[target].notna()
    df_test = new_df_full.loc[~has_target, features]
    df_train = new_df_full.loc[has_target, features + [target]]
    return df_train, df_test, maps, features

In [20]:
def set_top_n_categories_in_variable(df, feature, n_cat):
    """
    Keep the ``n_cat`` most frequent values of ``df[feature]`` and replace
    every other entry with the string ``'Other'``.

    The column is rewritten on ``df`` itself; nothing is returned.

    :param df: frame holding the column to reduce.
    :param feature: name of the column.
    :param n_cat: how many of the most frequent values to keep.
    """
    frequencies = df[feature].value_counts()
    kept_values = frequencies.nlargest(n_cat).index
    is_kept = df[feature].isin(kept_values)
    df[feature] = df[feature].mask(~is_kept, 'Other')

In [21]:
def outlier_filtering(df, features_n):
    """
    Reduce each listed column of ``df`` to its most frequent values and print
    the resulting value counts.

    :param df: frame to modify; columns are rewritten on this object.
    :param features_n: iterable of ``(feature, n)`` pairs, where ``n`` is the
        number of most frequent values to keep for ``feature``.
    """
    for pair in features_n:
        column, keep = pair
        set_top_n_categories_in_variable(df, column, keep)
        # Show the distribution after the reduction.
        print(df[column].value_counts())

In [13]:
# Read the pipe-separated training and test files; the first column is the index.
df_train = pd.read_csv('input/cybersecurity_training.csv', index_col=0, sep='|')
df_test = pd.read_csv('input/cybersecurity_test.csv', index_col=0, sep='|')

# Training rows followed by test rows, for transformations that need both.
df_full = pd.concat([df_train, df_test])

# One shape per line: training first, then test.
print(df_train.shape, df_test.shape, sep='\n')


(39427, 62)
(20000, 61)


In [25]:
# Each entry is a (column name, n) pair as consumed by outlier_filtering:
# the n most frequent values of the column are kept, the rest become 'Other'.
nominal_features = [
    ("categoryname", 5),
    # ("ip", 15),
    ("ipcategory_name", 4),
    ("ipcategory_scope", 2),
    ("dstipcategory_dominate", 1),
    # Listed once only: a repeated entry makes outlier_filtering process and
    # print the same column twice.
    ("srcipcategory_dominate", 2)
]

numeric_features = [
    ("parent_category", 2),
    ("overallseverity", 3),
    ("alerttype_cd", 3),
    ("direction_cd", 2),
    ("eventname_cd", 3),
    ("severity_cd", 3),
    ("devicetype_cd", 2),
    ("devicevendor_cd", 2),
    ("srcipcategory_cd", 3),
    ("dstipcategory_cd", 2),
    ("trustscore", 2),
    ("dstportcategory_dominate", 3),
    # NOTE(review): "p6" appears twice with different n (3, then 1). If this list
    # is fed to outlier_filtering the second entry is applied on top of the
    # first - confirm whether one of them should be a different column.
    ("p6", 3),
    ("p6", 1),
    ("p5m", 3),
    ("p5w", 2),
    ("p5d", 2),
    ("p8w", 2),
    ("p8d", 2)
]

numeric_continue_features = [
    ("timestamp_dist", 7),
    ("correlatedcount", 10),
    ("srcip_cd", 5), # num_f
    ("dstip_cd", 5), # num_f
    ("srcport_cd", 7), # num_f
    ("dstport_cd", 4), # num_f
    ("reportingdevice_cd", 3), # num_f
    ("protocol_cd", 3), # num_f
    ("username_cd", 2) # num_f
]

numeric_continue_features_for_normalization = ["thrcnt_month", "thrcnt_week", "thrcnt_day"]

In [None]:
# Keyword arguments forwarded to cv_xgb.
# NOTE(review): 'lambdax' is passed through as-is - confirm that cv_xgb maps it
# to the intended XGBoost regularisation parameter.
params = dict(
    subsample=0.9,
    colsample_bytree=0.5,
    lambdax=1,
    max_depth=24,
    early_stopping_rounds=100,
)

# Select target variable
target = 'notified'
# Columns that must not be used as model features.
feature_black_list = ['grandparent_category']
# Each item is one list of (column, n) pairs handed to outlier_filtering.
features_for_preprocessing = [nominal_features]

preprocessed_train_df, preprocessed_test_df, _, features = preprocessing_data_1(
    df_train,
    df_test,
    target,
    feature_black_list,
    features_for_preprocessing,
)

trainResults, testResults, predictions, indices, hists = cv_xgb(
    preprocessed_train_df,
    target,
    features,
    debug=False,
    mlflow_tracking=True,
    exp_name="exp_4_feature_eng",
    **params,
)

print(f'Train mean: {np.mean(trainResults)}, Valid mean: {np.mean(testResults)}')

Attack                  30788
Exploit                 18366
Control and Maintain     4702
Reconnaissance           2635
Other                    2083
Attack Preparation        853
Name: categoryname, dtype: int64
INTERNET    37943
PRIV-10     14517
PRIV-192     3674
PRIV-172     2854
Other         439
Name: ipcategory_name, dtype: int64
Internet           37947
Private network    21457
Other                 23
Name: ipcategory_scope, dtype: int64
INTERNET    55142
Other        4285
Name: dstipcategory_dominate, dtype: int64
INTERNET    29438
PRIV-10     19520
Other       10469
Name: srcipcategory_dominate, dtype: int64
INTERNET    29438
PRIV-10     19520
Other       10469
Name: srcipcategory_dominate, dtype: int64


In [22]:
# Rewrites the nominal columns of df_full on the frame itself (values outside
# each column's top-n become 'Other') and prints the value counts per column.
outlier_filtering(df_full, nominal_features)

Attack                  30788
Exploit                 18366
Control and Maintain     4702
Reconnaissance           2635
Other                    2083
Attack Preparation        853
Name: categoryname, dtype: int64
Other            40235
YT.LB.32.21       4462
YT.LB.34.21       3173
YT.LB.36.21       2882
YT.LB.38.21       2812
MC.ER.197.27      2088
OQ.QJ.38.32       1704
YT.RD.254.202      511
YT.LB.32.10        328
ZU.SK.99.55        266
YT.LB.36.10        203
YT.LB.34.10        178
10.KW.GO.30        174
YT.LB.38.10        149
192.SL.PF.5        137
172.ER.RX.11       125
Name: ip, dtype: int64
INTERNET    37943
PRIV-10     14517
PRIV-192     3674
PRIV-172     2854
Other         439
Name: ipcategory_name, dtype: int64
Internet           37947
Private network    21457
Other                 23
Name: ipcategory_scope, dtype: int64
INTERNET    55142
Other        4285
Name: dstipcategory_dominate, dtype: int64
INTERNET    29438
PRIV-10     19520
Other       10469
Name: srcipcategory_domin