In [11]:
import json
import csv
import pandas as pd
from sklearn import preprocessing
import numpy as np
import os
from collections.abc import Sequence
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure, show
import sklearn.feature_extraction.text as sk_text
from sklearn import metrics
import tensorflow.keras 
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping , ModelCheckpoint

from collections import Counter
from datetime import datetime

ModuleNotFoundError: No module named 'tensorflow'

In [12]:
### HELPER FUNCTIONS ###

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd
    
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()


def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

In [16]:
# Make a pandas dataframe with csv file
df = pd.read_csv('network_intrusion_data.csv', encoding="utf-8")
df.columns = ['duration', 'protocol_type', 'service', 'flag','src_bytes', 'dst_bytes','land', 'wrong_fragment', 
              'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
              'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 
              'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
              'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 
              'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 
              'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 
              'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'outcome']
# print(df.size)

# Drop redundant records
df.drop_duplicates(subset=None, inplace=True)
# print(df.size)

# Drop records with missing values
df.dropna()
# print(df.size)



# Add new column for encoding
df['attack'] = 0

# Encode outcome of attacks. 0 for normal, 1 for everything else
for index, row in df.iterrows():
    if row['outcome'] != 'normal.':
        row['attack'] = 1
        
#encode the input for categorical data
encode_text_dummy(df, 'protocol_type')
encode_text_dummy(df, 'service')
encode_text_dummy(df, 'flag')
encode_text_dummy(df, 'outcome')
        
# print(df)

In [17]:
print(df)

        duration  src_bytes  dst_bytes  land  wrong_fragment  urgent  hot  \
0              0        239        486     0               0       0    0   
1              0        235       1337     0               0       0    0   
2              0        219       1337     0               0       0    0   
3              0        217       2032     0               0       0    0   
4              0        217       2032     0               0       0    0   
...          ...        ...        ...   ...             ...     ...  ...   
494015         0        310       1881     0               0       0    0   
494016         0        282       2286     0               0       0    0   
494017         0        203       1200     0               0       0    0   
494018         0        291       1200     0               0       0    0   
494019         0        219       1234     0               0       0    0   

        num_failed_logins  logged_in  num_compromised  ...  outcome-phf.  \