In [None]:
#import import_ipynb
#import library

In [5]:
#%%timeit
%time
import collections
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os, io, requests, csv, shutil
import tensorflow as tf
import tensorflow.keras 
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model, Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import roc_curve, auc

import logging
# Disable the logger named 'tensorflow' so records sent to it are not emitted
# (keeps the notebook output free of TensorFlow log messages).
logging.getLogger('tensorflow').disabled = True

#1Hot Encoding for numpy array
def apply_1hot_numpy(npa, size):
    """One-hot encode ``npa`` via ``tf.keras.utils.to_categorical``.

    npa  -- values to encode (passed through unchanged to Keras)
    size -- forwarded as ``num_classes``, i.e. the width of each encoded row

    Returns the array produced by ``to_categorical``.
    """
    encoded = tf.keras.utils.to_categorical(npa, num_classes=size)
    return encoded

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """Replace column ``name`` with one indicator column per distinct value.

    The new columns are named ``"<name>-<value>"`` and are appended to ``df``;
    the original column is then dropped. ``df`` is modified in place and
    nothing is returned.
    """
    indicators = pd.get_dummies(df[name])
    for value in indicators.columns:
        df["{}-{}".format(name, value)] = indicators[value]
    # The source column is redundant once its indicators exist.
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    """Label-encode column ``name`` of ``df`` in place.

    A fresh ``LabelEncoder`` is fitted on the column and the column is
    overwritten with the resulting codes.

    Returns the encoder's ``classes_`` so callers can map codes back to the
    original values.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardise column ``name`` of ``df`` in place as ``(value - mean) / sd``.

    mean -- centre to subtract; defaults to the column's own mean
    sd   -- scale to divide by; defaults to the column's own ``std()``

    Nothing is returned.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    """Fill missing entries of column ``name`` with that column's median, in place."""
    column = df[name]
    df[name] = column.fillna(column.median())


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    """Fill missing entries of column ``name`` with ``default_value``, in place."""
    filled = df[name].fillna(default_value)
    df[name] = filled


# Convert a Pandas dataframe to the x, y inputs that TensorFlow needs, which are numpy arrays
def to_xy(df, target):
    """Split ``df`` into float32 numpy arrays ``(x, y)``.

    df     -- source dataframe
    target -- name of the column to use as ``y``; every other column goes to ``x``

    Returns a 2-tuple:
      * ``x``: all non-target columns as a float32 array.
      * ``y``: if the target dtype is int64/int32 the problem is treated as
        classification and ``y`` is the one-hot encoding of the target
        (one column per distinct value); otherwise it is treated as
        regression and ``y`` is the target column itself, as float32.
    """
    # Imported here so the block is self-contained: the old alias
    # ``collections.Sequence`` was removed in Python 3.10 and raised
    # AttributeError on every call.
    from collections.abc import Sequence

    feature_columns = [column for column in df.columns if column != target]

    # Find out the type of the target column.
    target_type = df[target].dtypes
    if isinstance(target_type, Sequence):
        target_type = target_type[0]

    # TensorFlow likes 32-bit values, so both outputs are cast to float32.
    features = df[feature_columns].values.astype(np.float32)

    if target_type in (np.int64, np.int32):
        # Classification: an integer target is expanded to one-hot rows.
        dummies = pd.get_dummies(df[target])
        return features, dummies.values.astype(np.float32)

    # Regression: the target is used as-is.
    return features, df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration in seconds as ``H:MM:SS.ss``.

    Hours are not padded; minutes are zero-padded to two digits and seconds
    to five characters with two decimals.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)


# Regression chart.
def chart_regression(pred, y, sort=True):
    """Plot predicted against expected values as two line series.

    pred -- predicted values, used as the 'pred' column
    y    -- expected values; flattened before use
    sort -- when true, rows are ordered by the expected value before plotting

    Shows the figure via ``plt.show()``; nothing is returned.
    """
    frame = pd.DataFrame({'pred': pred, 'y': y.flatten()})
    if sort:
        frame = frame.sort_values(by=['y'])
    plt.plot(frame['y'].tolist(), label='expected')
    plt.plot(frame['pred'].tolist(), label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column lies sd or more standard deviations from its mean
def remove_outliers(df, name, sd):
    """Drop, in place, rows whose ``name`` value is far from the column mean.

    A row is removed when the absolute distance between its value and the
    column mean is greater than or equal to ``sd`` times the column's
    standard deviation. Nothing is returned.
    """
    column = df[name]
    distance = np.abs(column - column.mean())
    threshold = sd * column.std()
    outlier_labels = df.index[distance >= threshold]
    df.drop(outlier_labels, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale column ``name`` of ``df`` in place.

    Values are mapped so that ``data_low`` becomes ``normalized_low`` and
    ``data_high`` becomes ``normalized_high``.

    normalized_low, normalized_high -- bounds of the output range
    data_low  -- input value mapped to ``normalized_low``; defaults to the
                 column minimum
    data_high -- input value mapped to ``normalized_high``; defaults to the
                 column maximum

    Each bound is resolved independently. Previously ``data_high`` was only
    filled in when ``data_low`` was also omitted, so passing just ``data_low``
    raised a TypeError and passing just ``data_high`` was silently ignored.

    Nothing is returned.
    """
    column = df[name]
    # Series.min()/max() skip NaN, unlike the built-in min()/max() whose
    # result depends on where a NaN sits in the column.
    if data_low is None:
        data_low = column.min()
    if data_high is None:
        data_high = column.max()

    span = data_high - data_low
    df[name] = ((column - data_low) / span) * (normalized_high - normalized_low) + normalized_low

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a colour-mapped image on a new 15x15 figure.

    cm    -- the confusion matrix to render
    names -- class names, used as tick labels on both axes
    title -- figure title
    cmap  -- matplotlib colour map for the image

    The figure is created but not shown; nothing is returned.
    """
    plt.figure(figsize=(15, 15))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class, labelled identically on both axes.
    positions = np.arange(len(names))
    plt.xticks(positions, names, rotation=45)
    plt.yticks(positions, names)

    plt.xlabel('Predicted label')
    plt.ylabel('True label')


# Read file , fill missing value with NA 
def readfile(filename):
    """Load a CSV from the ``./data/`` directory into a dataframe.

    filename -- path joined onto ``./data/`` with ``os.path.join``

    The strings 'NA' and '?' are read as missing values.
    """
    data_dir = "./data/"
    csv_path = os.path.join(data_dir, filename)
    return pd.read_csv(csv_path, na_values=['NA', '?'])


def get_real_column():
    """Return a fresh list of the 42 dataset column names, in order.

    The final entry, 'outcome', is the label column; the 41 names before it
    are the feature columns.
    """
    feature_names = [
        'duration', 'protocol_type', 'service', 'flag',
        'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
        'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
        'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
        'num_shells', 'num_access_files', 'num_outbound_cmds',
        'is_host_login', 'is_guest_login',
        'count', 'srv_count',
        'serror_rate', 'srv_serror_rate',
        'rerror_rate', 'srv_rerror_rate',
        'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
        'dst_host_count', 'dst_host_srv_count',
        'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
        'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
        'dst_host_serror_rate', 'dst_host_srv_serror_rate',
        'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    ]
    return feature_names + ['outcome']


#Encode outcome , 0 = good   1 = bad 
def outcome_encode(x):
    """Map an outcome label to a binary flag: 0 for 'normal.', 1 for anything else."""
    return 0 if x == 'normal.' else 1
# Simple function to evaluate the coefficients of a regression

%matplotlib inline    
from IPython.display import display   

def report_coef(names, coef, intercept):
    """Display and chart regression coefficients, sorted ascending.

    names     -- labels used as the table index
    coef      -- coefficient values, one per name
    intercept -- printed after the table

    Shows the sorted table with ``display``, prints the intercept, then draws
    a horizontal bar chart with non-negative coefficients in blue and
    negative ones in red. Nothing is returned.
    """
    table = pd.DataFrame({'coef': coef, 'positive': coef >= 0}, index=names)
    table = table.sort_values(by=['coef'])
    display(table)
    print("Intercept: {}".format(intercept))
    bar_colors = table['positive'].map({True: 'b', False: 'r'})
    table['coef'].plot(kind='barh', color=bar_colors)
    
    
def log_report_coef(names, coef, intercept):
    """Chart the larger coefficients and return their names.

    names     -- labels used as the table index
    coef      -- coefficient values, one per name
    intercept -- printed before plotting

    Coefficients are sorted ascending and only those with absolute value
    above 0.010 are kept. The survivors are drawn as a 10x10 horizontal bar
    chart (blue for non-negative, red for negative).

    Returns the kept names as a list, in ascending coefficient order.
    """
    table = pd.DataFrame({'coef': coef, 'positive': coef >= 0}, index=names)
    ordered = table.sort_values(by=['coef'])
    # Threshold for "important" features.
    kept = ordered[ordered['coef'].abs() > 0.010]
    print("Intercept: {}".format(intercept))
    bar_colors = kept['positive'].map({True: 'b', False: 'r'})
    kept['coef'].plot(kind='barh', color=bar_colors, figsize=(10, 10), fontsize=16)
    return list(kept.index)

def grab_important_feature(df):
    """Select features by logistic-regression coefficient size.

    ``df`` is modified in place: 'service' is dropped, 'flag' and
    'protocol_type' are replaced by indicator columns, the leading columns
    are z-score normalised, and 'outcome' is label-encoded.

    A LogisticRegression is then fitted on a 75% training split and the
    coefficients of its first class are passed to ``log_report_coef``, which
    plots them and returns the names it keeps.

    Returns a 2-tuple ``(frame, attacks)``:
      * ``frame``   -- copy of ``df`` restricted to the kept features, plus
                       the encoded 'outcome' column
      * ``attacks`` -- the class labels returned by ``encode_text_index``
    """
    # Drop b/c no prediction power
    df.drop('service' , axis=1 , inplace=True) 
    # Encode categorical features; each call appends the indicator columns
    # to the end of df and drops the source column.
    encode_text_dummy(df, 'flag')
    encode_text_dummy(df, 'protocol_type')
    
    # Normalize columns at positions 0 - 37, skipping positions 16, 17, 18
    # and the outcome column.
    # NOTE(review): assuming df arrives with the columns from
    # get_real_column(), positions 16-18 would be 'num_outbound_cmds',
    # 'is_host_login' and 'is_guest_login' at this point -- TODO confirm.
    outcomeIndex = df.columns.get_loc("outcome") # Get outcome column index
    for i in range(38):
        if i != 17 and i !=18 and i != 16 and i != outcomeIndex:
            encode_numeric_zscore(df, df.columns[i])
            
    # Prepare x and encode y 
    attacks = encode_text_index(df,'outcome')
    # Only x is used from this call; the y returned here is replaced below.
    x,y = to_xy(df,'outcome')
    # Prepare y: use the label-encoded outcome column directly
    y = df.outcome 
    
    # Split x,y
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=45) 
    
    # Use LogisticRegression with L2 to get coef
    import sklearn
    from sklearn.linear_model import LogisticRegression
    # NOTE(review): check that 'multi_class' is still accepted by the
    # installed scikit-learn version.
    regressor = LogisticRegression(penalty='l2',  C=0.001, solver='lbfgs', multi_class='multinomial')
    regressor.fit(x_train,y_train)
    # pred is not used after this line.
    pred = regressor.predict(x_test)

    # Call log_report_coef to generate a visual representation of the important features
    names = list(df.columns.values)
    names.remove("outcome")
    names = np.array(names) # (52,) per the original author's note; depends on the input data
    # Only the first row of coef_ (the first class) is inspected.
    important_feature = log_report_coef(
      names,
      regressor.coef_[0],
      regressor.intercept_)
    
    # Return a df with only the important features + the encoded attacks
    df = df[important_feature]
    foo = df.copy()
    foo['outcome'] = y
    return foo , attacks

#Balance df by making every class have about the same sample. 
def balance_it(df):
    """Down-sample the two largest outcome classes and drop selected ones.

    Rows with outcome 9 are randomly sampled down to a 0.01929 fraction and
    rows with outcome 11 to a 0.01138 fraction (no random seed is fixed).
    The sampled rows are placed ahead of the remaining rows, then every row
    whose outcome is one of 1, 2, 3, 4, 6, 7, 8, 12, 13, 16, 19, 22 is
    removed.

    Prints the per-outcome row counts of the result and returns it as a new
    dataframe.
    """
    is_class_9 = df.outcome == 9
    is_class_11 = df.outcome == 11

    # Sample class 9 first, then class 11.
    sampled_9 = df[is_class_9].sample(frac=0.01929)
    sampled_11 = df[is_class_11].sample(frac=0.01138)
    untouched = df[~is_class_9 & ~is_class_11]

    # Add the reduced classes back in front of everything else.
    balanced = pd.concat([sampled_9, sampled_11, untouched])

    # Remove the outcome codes that are excluded from the balanced set.
    excluded = [1, 2, 3, 4, 6, 7, 8, 12, 13, 16, 19, 22]
    balanced = balanced[~balanced.outcome.isin(excluded)]

    # After balance
    print(balanced.groupby(['outcome']).outcome.count())
    return balanced
# Runs when the cell is executed: confirms that the definitions above were loaded.
print("Lib Loaded")

Wall time: 0 ns
Lib Loaded


In [4]:
def hi():
    """Print a short message confirming the library has been loaded."""
    message = "Library Loaded.. don't worry"
    print(message)