In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
from pathlib import Path

In [None]:
# Read the raw intrusion dataset; the location is resolved relative to the
# current working directory (one level up, inside "datasets").
_raw_csv_path = f'{Path.cwd()}/../datasets/Intrusion.csv'
data = pd.read_csv(_raw_csv_path)
data

In [None]:
# Separate the target from the predictors. The labels are kept as a
# one-column DataFrame (not a Series) so the 'class' column name survives.
raw_df_labels = data.loc[:, ['class']]
raw_df_features = data.drop('class', axis=1)
(raw_df_features, raw_df_labels)

In [None]:
# Stratified down-sampling: an integer test_size asks for that many rows in the
# "test" part, so X_test / y_test become a 50,000-row sample with the class
# proportions of the full dataset preserved. The fixed random_state keeps the
# sample reproducible.
_split_parts = train_test_split(
    raw_df_features,
    raw_df_labels,
    stratify=raw_df_labels.to_numpy(),
    test_size=50000,
    random_state=42,
)
X_train, X_test, y_train, y_test = _split_parts

tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Bucket every feature column by its number of distinct values, measured on the
# full dataset (`data`) rather than on the sample:
#   1 distinct value   -> constant
#   2 distinct values  -> binary
#   3..20              -> categorical
#   anything else      -> numerical
# NOTE(review): `constant_columns` is only collected here; the visible cells
# never drop those columns from X_test.
categorical_columns = []
binary_columns = []
numerical_columns = []
constant_columns = []

for col in X_test.columns:
    # nunique() scans the whole column, so compute it once per column
    # instead of once per comparison / log message.
    n_unique = data[col].nunique()

    if 2 < n_unique <= 20:
        categorical_columns.append(col)
        print(f"Cat col: {col}, {n_unique}")
    elif n_unique == 2:
        binary_columns.append(col)
        print(f"Binary col: {col}, {n_unique}")
    elif n_unique == 1:
        constant_columns.append(col)
        print(f"Constant col: {col}, {n_unique}")
    else:
        numerical_columns.append(col)
        print(f"numerical col: {col}, {n_unique}")

In [None]:
# Print a summary of the sampled feature frame (columns, dtypes, non-null counts).
X_test.info()

**Dealing with empty values**

In [None]:
# Total number of missing cells in the raw dataset (per-column counts, then summed).
data.isna().sum().sum()

All the empty values have been filled with the mean. The next step is to check for any duplicates and remove them.

**Dealing with duplicates**

In [None]:
# Number of rows that exactly repeat an earlier row of the raw dataset.
# The actual removal is left disabled:
# data.drop_duplicates()
_duplicate_mask = data.duplicated()
_duplicate_mask.sum()

### Make it a binary classification problem by mapping normal cases (class 11) to 0 and attack cases (all other classes) to 1

In [None]:
# Collapse the target into two groups: label 11 is kept and every other label
# is rewritten to the placeholder value 12 (the LabelEncoder cell that follows
# turns 11/12 into 0/1).
#
# y_test comes out of train_test_split, so take an explicit copy before writing
# to it, and assign through a single .loc[row_mask, column] call. The previous
# chained form (`y_test['class'].loc[...] = 12`) writes to an intermediate
# Series, which triggers SettingWithCopyWarning and does not update y_test at
# all when pandas Copy-on-Write is active.
y_test = y_test.copy()
y_test.loc[y_test['class'] != 11, 'class'] = 12
y_test['class'].value_counts()

In [None]:
# Re-encode the target as consecutive integers starting at 0. LabelEncoder
# orders classes by sorted value, so the smaller label becomes 0.
le = LabelEncoder().fit(y_test['class'])
y_test['class'] = le.transform(y_test['class'])
y_test['class'].value_counts()

In [9]:
# Standardise each numerical column independently (zero mean, unit variance),
# fitting a fresh scaler per column on the sampled data.
# StandardScaler is already imported in the notebook's import cell, so it is
# not re-imported here.
for col in numerical_columns:
    _scaler = StandardScaler()
    # The scaler needs a 2-D (n_samples, 1) input; flatten its output back to
    # 1-D so a plain column vector is assigned to the DataFrame.
    _scaled = _scaler.fit_transform(X_test[col].to_numpy().reshape(-1, 1))
    X_test[col] = _scaled.ravel()

In [None]:
# Map the two values of every binary column to 0/1 with a per-column
# LabelEncoder. Columns are visited in X_test's column order.
for col in X_test.columns.tolist():
    if col not in binary_columns:
        continue

    # Fit once; the earlier fit() followed by fit_transform() fitted the
    # encoder twice on the same data.
    _encoder = LabelEncoder().fit(X_test[col])

    # Some logging
    print(f"Number Unique Classes of LabelEncoding in column {col}: {len(np.unique(_encoder.classes_))}")

    X_test[col] = _encoder.transform(X_test[col])

In [11]:
# One-hot encode the low-cardinality categorical features. Each generated
# column is named "<original column>__<value>".
X_test_one_hot = pd.get_dummies(
    data=X_test,
    prefix_sep="__",
    columns=categorical_columns,
)

In [None]:
# Put the encoded features and the target side by side (joined on the row
# index), then count missing cells in the combined frame as a sanity check.
merged_df = pd.concat(objs=[X_test_one_hot, y_test], axis="columns")
merged_df.isna().sum().sum()

In [None]:
# Summary statistics of the preprocessed frame, one row per column.
merged_df.describe().transpose()

In [None]:
# List every column of the final frame as a quoted bullet item.
for col in merged_df.columns.tolist():
    print(f"- \"{col}\"")

In [14]:
# Persist the preprocessed sample next to the raw dataset, without the row index.
_preprocessed_csv_path = f'{Path.cwd()}/../datasets/Intrusion_preprocessed.csv'
merged_df.to_csv(_preprocessed_csv_path, index=False)

Correlation Matrix


In [None]:
# Pairwise correlation between the columns of the raw dataset.
corr = data.corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric, so
# only the lower half is drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Figure/axes to draw on, and a diverging palette centred on zero correlation.
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Masked heatmap with square cells, thin cell borders and a shrunken colour bar.
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)