In [1]:
import pandas as pd
from tensorflow.keras.utils import get_file

# Column names for the KDD Cup 99 records; the CSV is read with header=None,
# so the names have to be supplied by hand.
KDD_COLUMNS = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'outcome'
]
KDD_URL = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'

# Keep DataFrame previews compact. These are set once: nothing is displayed
# before the final `df` expression, so a second, different setting is not needed.
pd.set_option('display.max_columns', 5)
pd.set_option('display.max_rows', 5)

# Load the public KDD dataset (10% sample) through the Keras download helper.
try:
    path = get_file('kddcup.data_10_percent.gz', origin=KDD_URL)
except Exception:
    # Print a short hint, then re-raise so the real traceback is still shown.
    print('Error downloading')
    raise

print(path)

df = pd.read_csv(path, header=None)
print("Read {} rows.".format(len(df)))

# Drop every column that contains a missing value, then name the remaining
# columns. If a column were dropped, the assignment below would fail with a
# length mismatch rather than silently mislabel the data.
df = df.dropna(axis=1)
df.columns = KDD_COLUMNS

df

/Users/barg/.keras/datasets/kddcup.data_10_percent.gz
Read 494021 rows.


Unnamed: 0,duration,protocol_type,...,dst_host_srv_rerror_rate,outcome
0,0,tcp,...,0.0,normal.
1,0,tcp,...,0.0,normal.
...,...,...,...,...,...
494019,0,tcp,...,0.0,normal.
494020,0,tcp,...,0.0,normal.


In [2]:
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardize column ``name`` of ``df`` in place as ``(value - mean) / sd``.

    Args:
        df: DataFrame to modify; the column is overwritten.
        name: Label of the column to standardize.
        mean: Centre to subtract. Defaults to the column's own ``.mean()``.
        sd: Scale to divide by. Defaults to the column's own ``.std()``.

    Returns:
        None. The result is written back into ``df[name]``.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread

def encode_text_dummy(df, name):
    """One-hot encode column ``name`` of ``df`` in place.

    For every distinct value ``x`` in the column, a new 0/1 column named
    ``"{name}-{x}"`` is appended to ``df``; the original column is then
    removed.

    Args:
        df: DataFrame to modify.
        name: Label of the categorical column to expand.

    Returns:
        None. ``df`` is modified in place.
    """
    # Ask for an integer dtype explicitly. pandas >= 2.0 produces bool
    # indicators by default, and bool columns next to float columns turn
    # ``df.values`` into an object array instead of a numeric one.
    dummies = pd.get_dummies(df[name], dtype="uint8")
    for x in dummies.columns:
        df[f"{name}-{x}"] = dummies[x]
    df.drop(name, axis=1, inplace=True)

In [3]:
# Columns handed to encode_numeric_zscore (continuous / count features).
df_numeric = [
    "duration",
    "src_bytes",
    "dst_bytes",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
]

# Columns handed to encode_text_dummy (categorical and 0/1 flag features).
df_dummy = [
    "protocol_type", "service", "flag",
    "land", "logged_in", "is_host_login", "is_guest_login",
]


In [4]:
# Standardize each numeric feature in place, using that column's own mean/std.
for numeric_name in df_numeric:
    encode_numeric_zscore(df, numeric_name)

# Replace each categorical/flag feature with its indicator columns.
for categorical_name in df_dummy:
    encode_text_dummy(df, categorical_name)

# Remove any column that now contains NaN. A column whose standard deviation
# is zero comes out of the z-score as 0/0 = NaN and is discarded here.
df.dropna(axis=1, inplace=True)
df[:5]

Unnamed: 0,duration,src_bytes,...,is_guest_login-0,is_guest_login-1
0,-0.067792,-0.002879,...,1,0
1,-0.067792,-0.00282,...,1,0
2,-0.067792,-0.002824,...,1,0
3,-0.067792,-0.00284,...,1,0
4,-0.067792,-0.002842,...,1,0


In [5]:
# Build the row masks from the label first; the label column is removed next
# so that only feature columns remain in df.
normal_mask = df['outcome'] == 'normal.'
attack_mask = ~normal_mask

df.drop(columns='outcome', inplace=True)

# Rows labelled 'normal.' versus everything else.
df_normal = df.loc[normal_mask]
df_attack = df.loc[attack_mask]

print(f"Normal count: {len(df_normal)}")
print(f"Attack count: {len(df_attack)}")

Normal count: 97278
Attack count: 396743


In [6]:
# Plain NumPy feature matrices for the two groups.
x_normal, x_attack = df_normal.values, df_attack.values

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the normal rows for evaluation; the fixed random_state
# keeps the split the same on every run.
x_normal_train, x_normal_test = train_test_split(
    x_normal,
    random_state=42,
    test_size=0.25,
)

In [13]:

# Report how many normal rows ended up in each split.
n_train = len(x_normal_train)
n_test = len(x_normal_test)
print(f"Normal train count: {n_train}")
print(f"Normal test count: {n_test}")

Normal train count: 72958
Normal test count: 24320


In [7]:
# Preview the encoded rows labelled 'normal.' (the label column has already been dropped).
df_normal

Unnamed: 0,duration,src_bytes,...,is_guest_login-0,is_guest_login-1
0,-0.067792,-0.002879,...,1,0
1,-0.067792,-0.002820,...,1,0
...,...,...,...,...,...
494019,-0.067792,-0.002767,...,1,0
494020,-0.067792,-0.002840,...,1,0


In [9]:
# Preview the NumPy matrix taken from df_normal.values.
x_normal

array([[-0.06779165, -0.00287852,  0.13866427, ...,  1.        ,
         1.        ,  0.        ],
       [-0.06779165, -0.00281983, -0.01157786, ...,  1.        ,
         1.        ,  0.        ],
       [-0.06779165, -0.00282388,  0.0141788 , ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [-0.06779165, -0.00285626,  0.01003231, ...,  1.        ,
         1.        ,  0.        ],
       [-0.06779165, -0.00276721,  0.01003231, ...,  1.        ,
         1.        ,  0.        ],
       [-0.06779165, -0.00284007,  0.01106137, ...,  1.        ,
         1.        ,  0.        ]])

In [14]:

from sklearn import metrics
import numpy as np
import pandas as pd
from IPython.display import display, HTML 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

# Dense autoencoder: features -> 64 -> 32 -> 16 -> 32 -> 64 -> features.
n_features = x_normal.shape[1]

model = Sequential()
# Only the first layer declares the input width. An L2 activity penalty is
# applied to this layer's output.
model.add(Dense(64, input_dim=n_features,
                activity_regularizer=tf.keras.regularizers.l2(0.0001),
                activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))  # narrowest layer
# No input_dim on hidden layers: their input width is the previous layer's
# unit count (16 here), not the number of features.
model.add(Dense(32, activation='relu'))
model.add(Dense(64, activation='relu'))
# Linear output with one unit per input feature, so the network reconstructs its input.
model.add(Dense(n_features))
model.compile(loss='mean_squared_error', optimizer='adam')
# Early stopping: halt training once the training loss has not improved for 2 epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_18 (Dense)             (None, 64)                7744      
_________________________________________________________________
dense_19 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_20 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_21 (Dense)             (None, 32)                544       
_________________________________________________________________
dense_22 (Dense)             (None, 64)                2112      
_________________________________________________________________
dense_23 (Dense)             (None, 120)               7800      
Total params: 20,808
Trainable params: 20,808
Non-trainable params: 0
__________________________________________________

In [22]:
# Train the autoencoder on normal traffic only; input and target are the same array.
# `callbacks` is documented as a list of Callback instances, so the single
# early-stopping callback is wrapped in one. The returned History is kept so
# the per-epoch loss can be inspected afterwards.
history = model.fit(
    x_normal_train,
    x_normal_train,
    verbose=1,
    batch_size=100,
    epochs=20,
    callbacks=[callback],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


<tensorflow.python.keras.callbacks.History at 0x65046a810>

In [23]:
# Reconstruction RMSE on held-out normal rows (never seen during training).
pred = model.predict(x_normal_test)
score1 = np.sqrt(metrics.mean_squared_error(x_normal_test, pred))

# Reconstruction RMSE on the attack rows.
pred = model.predict(x_attack)
score2 = np.sqrt(metrics.mean_squared_error(x_attack, pred))

print(f"normal score : {score1}")
print(f"attack score : {score2}")

normal score : 0.10242798562743476
attack score : 0.4586095413009826
