# KDD Cup 1999 Data
## Abstract
This is the data set used for The Third International Knowledge Discovery and Data Mining Tools Competition, which was held in conjunction with KDD-99 The Fifth International Conference on Knowledge Discovery and Data Mining. The competition task was to build a network intrusion detector, a predictive model capable of distinguishing between ``bad`` connections, called intrusions or attacks, and ``good`` normal connections. This database contains a standard set of data to be audited, which includes a wide variety of intrusions simulated in a military network environment.

And....What do we do?
This is an different approach, which can be not so usual...but it works!
1. Get the data
2. Encode numeric features with zscore.
3. Encode objects features with get dummies (One Hot Encoder).
4. Split: Train and test (For good connections - no attack).
5. Transform the DataFrame into a vector
6. Create an autoencoder with the train vector, with multiple output neurons and loss mean_squared_error.
7. Use the RMSE to evaluate: `Attack will have higher RMSE than No-Attack` 

In [1]:
import pandas as pd
from tensorflow.keras.utils import get_file

pd.set_option('display.max_columns', 6)
pd.set_option('display.max_rows', 5)

try:
    path = get_file('kddcup.data_10_percent.gz', origin=\
    'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz')
except:
    print('Error downloading')
    raise
    
print(path) 
df = pd.read_csv(path, header=None)
print("Read {} rows.".format(len(df)))
df.dropna(inplace=True,axis=1)
print("Read {} rows.".format(len(df)))

Downloading data from http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz
/root/.keras/datasets/kddcup.data_10_percent.gz
Read 494021 rows.
Read 494021 rows.


In [2]:
#readind the cols names!
header = pd.read_csv('http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names',skiprows=[0],header=None)
header.columns = ['name']
header['name'] = header['name'].apply(lambda x: x.split(':')[0])
header_cols = list(header['name'])
header_cols.append('outcome')

In [3]:
# The CSV file has no column heads, so add them
df.columns = header_cols

# display 5 rows
pd.set_option('display.max_columns', 5)
pd.set_option('display.max_rows', 5)
df

Unnamed: 0,duration,protocol_type,...,dst_host_srv_rerror_rate,outcome
0,0,tcp,...,0.0,normal.
1,0,tcp,...,0.0,normal.
...,...,...,...,...,...
494019,0,tcp,...,0.0,normal.
494020,0,tcp,...,0.0,normal.


In [4]:
df.groupby('outcome')['outcome'].count()

outcome
back.               2203
buffer_overflow.      30
                    ... 
warezclient.        1020
warezmaster.          20
Name: outcome, Length: 23, dtype: int64

In [5]:
# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd
    
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] 
# for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

In [6]:
numeric_cols = ['duration',
'src_bytes',
'dst_bytes',
'wrong_fragment',
'urgent',
'hot',
'num_failed_logins',
'num_compromised',
'root_shell',
'su_attempted',
'num_root',
'num_file_creations',
'num_shells',
'num_access_files',
'num_outbound_cmds',
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate',]

text_cols = ['protocol_type',
'service',
'flag',
'land',
'logged_in',
'is_host_login',
'is_guest_login']

In [7]:
pd.set_option('display.max_columns', 6)
pd.set_option('display.max_rows', 5)

#encoding text_dummy
for text_col in text_cols:
    encode_text_dummy(df, text_col)

#numeric cols as zscores
for numeric_col in numeric_cols:
    encode_numeric_zscore(df, numeric_col)
    
df.dropna(inplace=True,axis=1)
df[0:5]

Unnamed: 0,duration,src_bytes,dst_bytes,...,is_host_login-0,is_guest_login-0,is_guest_login-1
0,-0.067792,-0.002879,0.138664,...,1,1,0
1,-0.067792,-0.00282,-0.011578,...,1,1,0
2,-0.067792,-0.002824,0.014179,...,1,1,0
3,-0.067792,-0.00284,0.014179,...,1,1,0
4,-0.067792,-0.002842,0.035214,...,1,1,0


In [8]:
normal_mask = df['outcome']=='normal.'
attack_mask = df['outcome']!='normal.'

df.drop('outcome',axis=1,inplace=True)

df_normal = df[normal_mask]
df_attack = df[attack_mask]

print(f"Normal count: {len(df_normal)}")
print(f"Attack count: {len(df_attack)}")

Normal count: 97278
Attack count: 396743


In [9]:
# This is the numeric feature vector, as it goes to the neural net
x_normal = df_normal.values
x_attack = df_attack.values

In [10]:
from sklearn.model_selection import train_test_split

x_normal_train, x_normal_test = train_test_split(
    x_normal, test_size=0.25, random_state=42)

In [11]:
print(f"Normal train count: {len(x_normal_train)}")
print(f"Normal test count: {len(x_normal_test)}")

Normal train count: 72958
Normal test count: 24320


In [12]:
from sklearn import metrics
import numpy as np
import pandas as pd
from IPython.display import display, HTML 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

#Here we go! Our model!
model = Sequential()
model.add(Dense(25, input_dim=x_normal.shape[1], activation='relu'))
model.add(Dense(3, activation='relu')) # size to compress to
model.add(Dense(25, activation='relu'))
model.add(Dense(x_normal.shape[1])) # Multiple output neurons
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_normal_train,x_normal_train,verbose=2,epochs=100)

Epoch 1/100
2280/2280 - 5s - loss: 0.3277
Epoch 2/100
2280/2280 - 3s - loss: 0.2687
Epoch 3/100
2280/2280 - 3s - loss: 0.2098
Epoch 4/100
2280/2280 - 3s - loss: 0.1950
Epoch 5/100
2280/2280 - 3s - loss: 0.1752
Epoch 6/100
2280/2280 - 3s - loss: 0.1728
Epoch 7/100
2280/2280 - 3s - loss: 0.1699
Epoch 8/100
2280/2280 - 3s - loss: 0.1745
Epoch 9/100
2280/2280 - 3s - loss: 0.1709
Epoch 10/100
2280/2280 - 3s - loss: 0.1654
Epoch 11/100
2280/2280 - 3s - loss: 0.1693
Epoch 12/100
2280/2280 - 3s - loss: 0.1600
Epoch 13/100
2280/2280 - 3s - loss: 0.1646
Epoch 14/100
2280/2280 - 3s - loss: 0.1526
Epoch 15/100
2280/2280 - 3s - loss: 0.1541
Epoch 16/100
2280/2280 - 3s - loss: 0.1555
Epoch 17/100
2280/2280 - 3s - loss: 0.1457
Epoch 18/100
2280/2280 - 3s - loss: 0.1453
Epoch 19/100
2280/2280 - 3s - loss: 0.1418
Epoch 20/100
2280/2280 - 3s - loss: 0.1433
Epoch 21/100
2280/2280 - 3s - loss: 0.1429
Epoch 22/100
2280/2280 - 3s - loss: 0.1451
Epoch 23/100
2280/2280 - 3s - loss: 0.1403
Epoch 24/100
2280/22

<tensorflow.python.keras.callbacks.History at 0x7f59700f4390>

In [13]:
#predicting!
pred = model.predict(x_normal_test)
score1 = np.sqrt(metrics.mean_squared_error(pred,x_normal_test))
pred = model.predict(x_normal)
score2 = np.sqrt(metrics.mean_squared_error(pred,x_normal))
pred = model.predict(x_attack)
score3 = np.sqrt(metrics.mean_squared_error(pred,x_attack))
print(f"Out of Sample Normal Score (RMSE): {score1}")
print(f"Insample Normal Score (RMSE): {score2}")
print(f"Attack Underway Score (RMSE): {score3}")

Out of Sample Normal Score (RMSE): 0.3081728513351363
Insample Normal Score (RMSE): 0.32222162701051993
Attack Underway Score (RMSE): 0.5667091608570322
