In [2]:
import warnings
import pandas as pd
import typing as tp
from sklearn import preprocessing
import xgboost as xgb

warnings.filterwarnings("ignore", category=UserWarning)
xgb.set_config(verbosity = 0)

  from pandas import MultiIndex, Int64Index


# Feature Engineering

In [3]:
# Label encoder. This func helps to preprocess categorical variables.
def preprocessing_cat_features_le(cat_features: list, df: pd.DataFrame) -> tp.Tuple[pd.DataFrame, dict]:
    df = df.copy()

    maps = {}
    for feature in cat_features:
        le = preprocessing.LabelEncoder()
        df[feature] = le.fit_transform(df[feature].astype(str))
        maps[feature] = le
    return df, maps

# Function for preprocessing a training dataset with a target variable and a test dataset without a target variable
def preprocessing_data(train: pd.DataFrame, test: pd.DataFrame, target: str, var_for_preprocessing: dict,
                       is_one_hot_enc: bool = False, n_cat: int = 8) -> tp.Tuple[pd.DataFrame, pd.DataFrame, list, dict]:
    # Data imputation
    train = train.fillna(-1)
    test = test.fillna(-1)

    # Join training and test dataset
    df_full = pd.concat([train, test])

    # Add some feature to black list, witch will be removed from main dataset
    black_list = var_for_preprocessing.get("feature_black_list")
    if black_list:
        black_list.append(target)
    else:
        black_list = [target]

    for var_type, var in var_for_preprocessing.items():
        # Preprocessing categorical and numeric categorical features
        if var_type == "categorical_features" or var_type == "numeric_categorical_features":
            [set_top_n_categories_in_variable(df_full, feature, n) for feature, n in var]
        # Normalisation
        elif var_type == "continuous_numeric_features":
            df_full[var]= df_full[var].apply(lambda x: (x - x.mean())/x.std())
        # One hot encoding for our selected variables
        elif var_type == "one_hot_encoding":
            df_bin_var = pd.get_dummies(df_full[var], drop_first=True)
            black_list.extend(var)
            df_full = pd.concat([df_full, df_bin_var], axis=1)
    # Automatic One Hot Encoding for categorical variable
    if is_one_hot_enc:
        var_bin_list = list()
        for var_type, var in var_for_preprocessing.items():
            if var_type == "categorical_features" or var_type == "numeric_categorical_features":
                var_list = [feature for feature, n in var if n_cat >= n > 1]
                var_bin_list.extend(var_list)
        df_bin_var = pd.get_dummies(df_full[var_bin_list], drop_first=True)
        black_list.extend(var_bin_list)
        df_full = pd.concat([df_full, df_bin_var], axis=1)

    cat_features = list(set(df_full.select_dtypes(exclude='number').columns.tolist())-set(black_list))

    if cat_features:
        # Label encoding
        new_df_full, maps = preprocessing_cat_features_le(cat_features, df_full)
        # The features, that will be used for modeling.
        features = list(set(new_df_full.columns.tolist())-set(black_list))
    else:
        new_df_full = df_full.copy()
        # The features, that will be used for modeling.
        features = list(set(new_df_full.columns.tolist())-set(black_list))
        maps = None

    df_test = new_df_full.loc[new_df_full.notified.isna()][features]
    df_train = new_df_full.loc[new_df_full.notified.notna()][features+[target]]
    return df_train, df_test, features, maps

# Function to set top N categories in given categorical variable
def set_top_n_categories_in_variable(df: pd.DataFrame, feature: str, n_cat: int):
    top_n_var = df[feature].value_counts().nlargest(n_cat).index
    df[feature] = df[feature].where(df[feature].isin(top_n_var), other='Other')

In [4]:
df_train = pd.read_csv('input/cybersecurity_training.csv', sep='|', index_col=0)
df_test = pd.read_csv('input/cybersecurity_test.csv', sep='|', index_col=0)
print(df_train.shape)
print(df_test.shape)
df_full = pd.concat([df_train, df_test])

(39427, 62)
(20000, 61)


In [5]:
# Selected categor
categorical_features = [
    ("categoryname", 7),
    ("ipcategory_name", 5),
    ("ipcategory_scope", 2),
    ("dstipcategory_dominate", 4),
    ("srcipcategory_dominate", 4)
]

numeric_categorical_features = [
    ("parent_category", 2),
    ("overallseverity", 3),
    ("alerttype_cd", 3),
    ("direction_cd", 2),
    ("eventname_cd", 4),
    ("severity_cd", 3),
    ("devicetype_cd", 2),
    ("devicevendor_cd", 2),
    ("srcipcategory_cd", 3),
    ("dstipcategory_cd", 2),
    ("trustscore", 2),
    ("dstportcategory_dominate", 3),
    ("username_cd", 8),
    ("protocol_cd", 4),
    ("reportingdevice_cd", 5),
    ("dstport_cd", 5),
    ("srcport_cd", 10),
    ("dstip_cd", 6),
    ("srcip_cd", 10),
    ("p6", 3),
    ("p9", 1),
    ("p5m", 3),
    ("p5w", 2),
    ("p5d", 2),
    ("p8w", 2),
    ("p8m", 3),
    ("p8d", 2)
]

numeric_continue_features = ["thrcnt_month", "thrcnt_week", "thrcnt_day", "timestamp_dist", "correlatedcount"]
feature_black_list = ['grandparent_category']
smt_to_do = "ip"
one_hot_encoding = ["weekday"]
features_for_preprocessing = {"categorical_features": categorical_features,
                              "numeric_categorical_features": numeric_categorical_features,
                              # "numeric_continue_features": numeric_continue_features,
                              "feature_black_list": feature_black_list,
                              "one_hot_encoding": one_hot_encoding}

# Select target variable
target = 'notified'

In [6]:
df_train, df_test, *_ =  preprocessing_data(df_train, df_test, target, features_for_preprocessing)
df_train.head()

Unnamed: 0_level_0,ipcategory_name,n1,thrcnt_day,domain_cd,severity_cd,n4,reportingdevice_cd,isiptrusted,alerttype_cd,weekday_Tue,...,untrustscore,weekday_Wed,p5m,correlatedcount,devicetype_cd,thrcnt_month,srcportcategory_dominate,flowscore,srcipcategory_dominate,notified
alert_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Nhq,0,-1.0,675,0,1,-1.0,1,0,1,1,...,2,0,1,69,1,16711,4,3,4,0.0
XZt,4,-1.0,2,0,3,-1.0,1,1,3,0,...,5,0,2,5302,1,15,3,5,4,0.0
bBz,0,-1.0,628,0,1,-1.0,1,0,1,1,...,4,0,1,346,1,7393,4,3,2,0.0
ZNr,0,-1.0,96,0,0,-1.0,0,0,0,1,...,5,0,1,1,0,2048,0,3,0,0.0
poV,0,0.0,632,0,1,0.0,1,0,1,0,...,5,0,0,1,1,2793,4,3,2,0.0


In [7]:
df_test.head()

Unnamed: 0_level_0,ipcategory_name,n1,thrcnt_day,domain_cd,severity_cd,n4,reportingdevice_cd,isiptrusted,alerttype_cd,weekday_Tue,...,n8,untrustscore,weekday_Wed,p5m,correlatedcount,devicetype_cd,thrcnt_month,srcportcategory_dominate,flowscore,srcipcategory_dominate
alert_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Slg,0,0.0,42,0,1,0.0,1,0,1,0,...,0.0,3,1,0,1,1,1302,4,3,2
WKM,0,0.0,3,0,1,0.0,1,0,1,0,...,0.0,5,0,0,1,1,20,3,3,0
dkm,0,0.0,602,1,1,0.0,1,0,1,0,...,0.0,2,0,2,1,1,16131,4,3,2
RIX,3,0.0,4,1,1,1.0,1,1,1,0,...,0.0,2,0,2,1,1,53,4,3,3
qFU,0,0.0,20,0,1,0.0,1,0,1,0,...,0.0,3,0,0,14,1,541,4,3,4


In [None]:
# We can use one hot encoding for almost all categorical variable
df_train, df_test, *_ =  preprocessing_data(df_train, df_test, target, features_for_preprocessing, True, 10)
df_train.shape