In [49]:
import pandas as pd
import numpy as np
from pandas import DataFrame
import os
import re
from sklearn import preprocessing

# Utility functions

In [47]:
def encode_numeric_zscore(df: DataFrame, name: str, mean=None, std=None):
    """Standardize column ``name`` of ``df`` in place as ``(value - mean) / std``.

    Args:
        df: Frame holding the column; it is modified in place.
        name: Label of the numeric column to rescale.
        mean: Centre to subtract. When ``None`` the column's own ``mean()`` is used.
        std: Scale to divide by. When ``None`` the column's own ``std()`` is used.

    Returns:
        None. The rescaled values overwrite ``df[name]``.

    Note:
        There is no guard against a zero ``std``; in that case the division
        produces NaN/inf values in the column.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    scale = column.std() if std is None else std
    df[name] = (column - center) / scale
def encode_text_dummy(df: DataFrame, name: str):
    """Replace column ``name`` with one indicator column per distinct value, in place.

    Each new column is labelled ``"<name>-<value>"`` and is appended to ``df``;
    the original column is then removed.

    Args:
        df: Frame holding the column; it is modified in place.
        name: Label of the categorical column to expand.

    Returns:
        None.
    """
    indicators = pd.get_dummies(df[name])
    for level in indicators.columns:
        df["{}-{}".format(name, level)] = indicators[level]
    # The source column is fully represented by the indicators now.
    df.drop(columns=[name], inplace=True)
def encode_text_index(df: DataFrame, name: str):
    """Overwrite column ``name`` with integer codes from a ``LabelEncoder``, in place.

    Args:
        df: Frame holding the column; it is modified in place.
        name: Label of the column to encode.

    Returns:
        The fitted encoder's ``classes_`` array, i.e. the original values that
        the integer codes stand for.
    """
    label_encoder = preprocessing.LabelEncoder()
    codes = label_encoder.fit_transform(df[name])
    df[name] = codes
    return label_encoder.classes_
def to_xy(df: DataFrame, target: str):
    """Split ``df`` into float32 feature and target arrays.

    Every column other than ``target`` is used as a feature, in the frame's
    column order. A target with any integer dtype is treated as class labels
    and one-hot encoded with ``pd.get_dummies``; any other target dtype is
    treated as a regression value and returned as a single column.

    Args:
        df: Frame containing the feature columns and the target column.
            It is not modified.
        target: Label of the target column.

    Returns:
        A ``(x, y)`` tuple of ``np.float32`` arrays. ``x`` has one column per
        feature. ``y`` has one column per distinct label for an integer
        target, otherwise exactly one column.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    feature_columns = [column for column in df.columns if column != target]
    target_type = df[target].dtype
    x = df[feature_columns].values.astype(np.float32)
    # Accept every integer width (int8, uint8, nullable Int64, ...), not only
    # int32/int64; otherwise narrow label columns are silently handled as
    # regression targets.
    if pd.api.types.is_integer_dtype(target_type):
        # Classification
        y = pd.get_dummies(df[target]).values.astype(np.float32)
    else:
        # Regression
        y = df[[target]].values.astype(np.float32)
    return x, y
        

In [36]:
# Feature names and feature kinds are read one entry per line, with
# surrounding whitespace stripped.
with open("../Dataset/features.names", "r") as names_file:
    features = [raw_line.strip() for raw_line in names_file]
with open("../Dataset/feature_type.names", "r") as types_file:
    feature_types = [raw_line.strip() for raw_line in types_file]
# The label column comes last; column names are supplied explicitly to read_csv.
features += ["class"]
df = pd.read_csv("../Dataset/kddcup.data_10_percent", names=features)

In [37]:
# Summary statistics (count, mean, std, quartiles, min/max) of the "duration" column.
df["duration"].describe()

count    494021.000000
mean         47.979302
std         707.746472
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max       58329.000000
Name: duration, dtype: float64

In [38]:
# Display the "duration" column itself; pandas elides the middle rows of a long Series.
df["duration"]

0         0
1         0
2         0
3         0
4         0
         ..
494016    0
494017    0
494018    0
494019    0
494020    0
Name: duration, Length: 494021, dtype: int64

In [39]:
# For every column: its name, its describe() summary, then a ruler line.
separator = "======================================="
for column in df.columns:
    print(column, df[column].describe(), separator, sep="\n")

duration
count    494021.000000
mean         47.979302
std         707.746472
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max       58329.000000
Name: duration, dtype: float64
protocol_type
count     494021
unique         3
top         icmp
freq      283602
Name: protocol_type, dtype: object
service
count     494021
unique        66
top        ecr_i
freq      281400
Name: service, dtype: object
flag
count     494021
unique        11
top           SF
freq      378440
Name: flag, dtype: object
src_bytes
count    4.940210e+05
mean     3.025610e+03
std      9.882181e+05
min      0.000000e+00
25%      4.500000e+01
50%      5.200000e+02
75%      1.032000e+03
max      6.933756e+08
Name: src_bytes, dtype: float64
dst_bytes
count    4.940210e+05
mean     8.685324e+02
std      3.304000e+04
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      5.155468e+06
Name: dst_bytes, dtype: float64
land
count    49402

count     494021
unique        23
top       smurf.
freq      280790
Name: class, dtype: object


In [40]:
# Encode each feature in place: z-score when its declared kind is
# "continuous", indicator columns otherwise. zip() stops at the shorter
# list, so names without a matching kind entry are not visited here.
for feature, feature_type in zip(features, feature_types):
    encode = encode_numeric_zscore if feature_type == "continuous" else encode_text_dummy
    encode(df, feature)
# Final expression of the cell: integer-encode the label column and show
# the original class names the codes stand for.
encode_text_index(df, "class")

array(['back.', 'buffer_overflow.', 'ftp_write.', 'guess_passwd.',
       'imap.', 'ipsweep.', 'land.', 'loadmodule.', 'multihop.',
       'neptune.', 'nmap.', 'normal.', 'perl.', 'phf.', 'pod.',
       'portsweep.', 'rootkit.', 'satan.', 'smurf.', 'spy.', 'teardrop.',
       'warezclient.', 'warezmaster.'], dtype=object)

In [46]:
# Remove, in place, every column that contains a missing value.
# NOTE(review): presumably these are zero-variance columns whose z-score
# became NaN through division by a zero std -- confirm.
df.dropna(axis="columns", inplace=True)
# Distinct encoded class labels that remain.
set(df["class"].tolist())

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22}

In [48]:
# Build the feature/target arrays with "class" as the target column.
# NOTE(review): the returned tuple is not bound to a name in this cell.
to_xy(df, "class")

int64
