# Reduction of Memory usage

# 1. Load Packages

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)




# 2. Function for reducing memory usage of a pandas dataframe

In [2]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

# 3. Load Data

In [3]:
props = pd.read_csv(r"K:/CIC-2017-dataset/CIC-IDS-2017\MergedML_CSV/MergedML.csv")  #The properties dataset

#train = pd.read_csv(r"../input/train_2016_v2.csv")   # The parcelid's with their outcomes
#samp = pd.read_csv(r"../input/sample_submission.csv")  #The parcelid's for the testset

  interactivity=interactivity, compiler=compiler, result=result)


# 4. Run Function

In [5]:
props, NAlist = reduce_mem_usage(props)
print("_________________")
print("")
print("Warning: the following columns have missing values filled with 'df['column_name'].min() -1': ")
print("_________________")
print("")
print(NAlist)

Memory usage of properties dataframe is : 647.3080892562866  MB
******************************
Column:   Destination Port
dtype before:  uint32
dtype after:  uint32
******************************
******************************
Column:   Flow Duration
dtype before:  int32
dtype after:  int32
******************************
******************************
Column:   Total Fwd Packets
dtype before:  uint32
dtype after:  uint32
******************************
******************************
Column:   Total Backward Packets
dtype before:  uint32
dtype after:  uint32
******************************
******************************
Column:  Total Length of Fwd Packets
dtype before:  uint32
dtype after:  uint32
******************************
******************************
Column:   Total Length of Bwd Packets
dtype before:  uint32
dtype after:  uint32
******************************
******************************
Column:   Fwd Packet Length Max
dtype before:  uint16
dtype after:  uint16
***************

dtype after:  uint32
******************************
******************************
Column:   Subflow Bwd Bytes
dtype before:  uint32
dtype after:  uint32
******************************
******************************
Column:  Init_Win_bytes_forward
dtype before:  int32
dtype after:  int32
******************************
******************************
Column:   Init_Win_bytes_backward
dtype before:  int32
dtype after:  int32
******************************
******************************
Column:   act_data_pkt_fwd
dtype before:  uint32
dtype after:  uint32
******************************
******************************
Column:   min_seg_size_forward
dtype before:  int32
dtype after:  int32
******************************
******************************
Column:  Active Mean
dtype before:  float32
dtype after:  float32
******************************
******************************
Column:   Active Std
dtype before:  float32
dtype after:  float32
******************************
*********************

In [6]:
props.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2542141 entries, 0 to 2542140
Data columns (total 79 columns):
 Destination Port               uint32
 Flow Duration                  int32
 Total Fwd Packets              uint32
 Total Backward Packets         uint32
Total Length of Fwd Packets     uint32
 Total Length of Bwd Packets    uint32
 Fwd Packet Length Max          uint16
 Fwd Packet Length Min          uint16
 Fwd Packet Length Mean         float32
 Fwd Packet Length Std          float32
Bwd Packet Length Max           uint16
 Bwd Packet Length Min          uint16
 Bwd Packet Length Mean         float32
 Bwd Packet Length Std          float32
Flow Bytes/s                    object
 Flow Packets/s                 object
 Flow IAT Mean                  float32
 Flow IAT Std                   float32
 Flow IAT Max                   int32
 Flow IAT Min                   int32
Fwd IAT Total                   uint32
 Fwd IAT Mean                   float32
 Fwd IAT Std             

Reducing DataFrame memory size by ~58%, memory usage was reduced from (1.5+ GB) to (647.3+ MB), that was really a good job!

dtypes before: float64(22), int64(54), object(3)
dtypes after : float32(22), int32(8), int64(2), object(3), uint16(6), uint32(19), uint8(19)


In [6]:
props.to_csv('K:/CIC-2017-dataset/CIC-IDS-2017/MergedML_CSV/reducedDF.csv', index=False)

In [7]:
data = props.drop(['Flow Bytes/s', ' Flow Packets/s'], axis=1)

In [8]:
data = data.drop([" Bwd PSH Flags",  # 0
           " Fwd URG Flags",  # 0
           " Bwd URG Flags",  # 0
           " CWE Flag Count",   # 0
           "Fwd Avg Bytes/Bulk",  # 0
           " Fwd Avg Packets/Bulk", # 0
           " Fwd Avg Bulk Rate",  # 0
           " Bwd Avg Bytes/Bulk",  # 0
           " Bwd Avg Packets/Bulk", # 0
           "Bwd Avg Bulk Rate"], axis=1)  # 0

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2542141 entries, 0 to 2542140
Data columns (total 67 columns):
 Destination Port               uint32
 Flow Duration                  int32
 Total Fwd Packets              uint32
 Total Backward Packets         uint32
Total Length of Fwd Packets     uint32
 Total Length of Bwd Packets    uint32
 Fwd Packet Length Max          uint16
 Fwd Packet Length Min          uint16
 Fwd Packet Length Mean         float32
 Fwd Packet Length Std          float32
Bwd Packet Length Max           uint16
 Bwd Packet Length Min          uint16
 Bwd Packet Length Mean         float32
 Bwd Packet Length Std          float32
 Flow IAT Mean                  float32
 Flow IAT Std                   float32
 Flow IAT Max                   int32
 Flow IAT Min                   int32
Fwd IAT Total                   uint32
 Fwd IAT Mean                   float32
 Fwd IAT Std                    float32
 Fwd IAT Max                    uint32
 Fwd IAT Min            

In [10]:
data.to_csv('K:/CIC-2017-dataset/CIC-IDS-2017/MergedML_CSV/reduced67.csv', index=False)

In [11]:
data.shape   # (row: 2,542,141  column: 67)

(2542141, 67)

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2542141 entries, 0 to 2542140
Data columns (total 67 columns):
 Destination Port               uint32
 Flow Duration                  int32
 Total Fwd Packets              uint32
 Total Backward Packets         uint32
Total Length of Fwd Packets     uint32
 Total Length of Bwd Packets    uint32
 Fwd Packet Length Max          uint16
 Fwd Packet Length Min          uint16
 Fwd Packet Length Mean         float32
 Fwd Packet Length Std          float32
Bwd Packet Length Max           uint16
 Bwd Packet Length Min          uint16
 Bwd Packet Length Mean         float32
 Bwd Packet Length Std          float32
 Flow IAT Mean                  float32
 Flow IAT Std                   float32
 Flow IAT Max                   int32
 Flow IAT Min                   int32
Fwd IAT Total                   uint32
 Fwd IAT Mean                   float32
 Fwd IAT Std                    float32
 Fwd IAT Max                    uint32
 Fwd IAT Min            

In [13]:
# grouping by forest cover type and calculating total occurance
data.groupby(' Label').size()

 Label
BENIGN                        1984531
Bot                              1966
DDoS                           128027
DoS GoldenEye                   10293
DoS Hulk                       231073
DoS Slowhttptest                 5499
DoS slowloris                    5796
FTP-Patator                      7938
Heartbleed                         11
PortScan                       158930
SSH-Patator                      5897
Web Attack � Brute Force         1507
Web Attack � Sql Injection         21
Web Attack � XSS                  652
dtype: int64

In [14]:
data[' Label'] = data[' Label'].map({'BENIGN': 0, 
                                     'FTP-Patator':1, 'SSH-Patator':1,
                                     'DoS Hulk':2, 'DoS GoldenEye':2,
                                     'DoS slowloris':2, 'DoS Slowhttptest':2,
                                     'Heartbleed':3,
                                     'Web Attack � Brute Force':4,
                                     'Web Attack � Sql Injection':4,
                                     'Web Attack � XSS':4,
                                     'Bot':5,
                                     'DDoS':6,
                                     'PortScan':7})

In [15]:
# grouping by forest cover type and calculating total occurance
data.groupby(' Label').size()

 Label
0    1984531
1      13835
2     252661
3         11
4       2180
5       1966
6     128027
7     158930
dtype: int64

In [18]:
print("{:.3f} % of all transactions are normal. ".format(np.sum(data[' Label']) / data.shape[0] * 100))

95.133 % of all transactions are normal. 


In [20]:
## feeding top 15 features in a variable as dataframe including target variable

## AdaBoost Sample
#sample = data[['Wilderness_Area4', 'Elevation','Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology','Aspect','Wilderness_Area4', 'Soil_Type4', 'Soil_Type10' 'Cover_Type']]

sample = data[[' Destination Port', 'Init_Win_bytes_forward', ' Init_Win_bytes_backward', ' Flow IAT Min',
' Fwd IAT Min', ' Bwd IAT Min', ' Average Packet Size', ' Bwd Packet Length Std',
' Fwd Packet Length Std', ' Packet Length Std', ' Total Backward Packets', ' Total Length of Bwd Packets',
' min_seg_size_forward', ' Label']]

In [21]:
sample.shape

(2542141, 14)

In [22]:
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2542141 entries, 0 to 2542140
Data columns (total 14 columns):
 Destination Port               uint32
Init_Win_bytes_forward          int32
 Init_Win_bytes_backward        int32
 Flow IAT Min                   int32
 Fwd IAT Min                    int32
 Bwd IAT Min                    uint32
 Average Packet Size            float32
 Bwd Packet Length Std          float32
 Fwd Packet Length Std          float32
 Packet Length Std              float32
 Total Backward Packets         uint32
 Total Length of Bwd Packets    uint32
 min_seg_size_forward           int32
 Label                          int64
dtypes: float32(4), int32(5), int64(1), uint32(4)
memory usage: 145.5 MB


In [23]:
sample.to_csv('K:/CIC-2017-dataset/CIC-IDS-2017/MergedML_CSV/feature14.csv', index=False)

In [27]:
sample.describe()

Unnamed: 0,Destination Port,Init_Win_bytes_forward,Init_Win_bytes_backward,Flow IAT Min,Fwd IAT Min,Bwd IAT Min,Average Packet Size,Bwd Packet Length Std,Fwd Packet Length Std,Packet Length Std,Total Backward Packets,Total Length of Bwd Packets,min_seg_size_forward,Label
count,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0,2542141.0
mean,8057.795,7148.799,2034.795,172354.3,1057119.0,990803.0,201.624,361.7607,72.16381,315.7866,10.86743,17297.97,-3055.86,0.9513288
std,18368.61,14472.64,8520.327,3039023.0,8732086.0,8411019.0,341.5041,877.7975,291.445,657.6181,1051.811,2387238.0,1144922.0,2.099401
min,0.0,-1.0,-1.0,-14.0,-12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-536870700.0,0.0
25%,53.0,-1.0,-1.0,3.0,0.0,0.0,9.0,0.0,0.0,2.309401,1.0,6.0,20.0,0.0
50%,80.0,251.0,-1.0,4.0,3.0,1.0,74.5,0.0,0.0,27.38613,2.0,130.0,20.0,0.0
75%,443.0,8192.0,235.0,66.0,48.0,45.0,158.0667,130.5307,31.1127,223.5179,5.0,603.0,32.0,0.0
max,65535.0,65535.0,65535.0,120000000.0,120000000.0,120000000.0,3893.333,8194.66,7125.597,4731.522,291922.0,655453000.0,138.0,7.0


In [28]:
# importing feature scaling function
from sklearn.preprocessing import MinMaxScaler

# passing range to the function and then save it
scaler = MinMaxScaler(feature_range = (0,1))

# feeding sample features to var 'X'
X = sample.iloc[:,:-1]

# feeding our target variable to var 'y'
y = sample[' Label']

# apply feature scaling to all features
X_scaled = scaler.fit_transform(X)
#s_sample_2 = scaler.fit_transform(X2)

  return self.partial_fit(X, y)


In [30]:
X_scaled

array([[7.50560769e-01, 5.03540039e-03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 9.99999780e-01],
       [7.50560769e-01, 5.03540039e-03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 9.99999780e-01],
       [7.50560769e-01, 5.03540039e-03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 9.99999780e-01],
       ...,
       [6.75974670e-03, 4.45571899e-01, 9.46044922e-04, ...,
        3.52833976e-04, 3.29395075e-04, 9.99999780e-01],
       [6.75974670e-03, 4.45571899e-01, 9.46044922e-04, ...,
        2.12385500e-04, 1.68105104e-04, 9.99999780e-01],
       [6.75974670e-03, 4.45571899e-01, 9.46044922e-04, ...,
        1.95257637e-04, 1.52640991e-04, 9.99999780e-01]])

Now our data is ready to be splitted into 75%-25% train-test set respectively.

In [31]:
# importing train-test function
from sklearn.model_selection import train_test_split

# split the data in 75%-25% train-test respectively with fixed state
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.25, random_state = 53)

# Modelling and results


# 1. Auto-encoder as unsupervised learning

Parameters

In [32]:
# Parameters
learning_rate = 0.001
training_epochs = 10
batch_size = 256
display_step = 1

# Network Parameters
n_hidden_1 = 9 # 1st layer num features
#n_hidden_2 = 9 # 2nd layer num features
n_input = X_train.shape[1] # 13 input values
data_dir = '.'

In [33]:
X_train.shape[1]

13

In [35]:

import tensorflow as tf
import os
from datetime import datetime 
from sklearn.metrics import roc_auc_score as auc 
import seaborn as sns

X = tf.placeholder("float", [None, n_input])

weights = {
    'encoder_h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    #'encoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'decoder_h1': tf.Variable(tf.random_normal([n_hidden_1, n_input])),
    #'decoder_h2': tf.Variable(tf.random_normal([n_hidden_1, n_input])),
}
biases = {
    'encoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),
    #'encoder_b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'decoder_b1': tf.Variable(tf.random_normal([n_input])),
    #'decoder_b2': tf.Variable(tf.random_normal([n_input])),
}


# Building the encoder
def encoder(x):
    # Encoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.tanh(tf.add(tf.matmul(x, weights['encoder_h1']),
                                   biases['encoder_b1']))
    # Decoder Hidden layer with sigmoid activation #2
    #layer_2 = tf.nn.tanh(tf.add(tf.matmul(layer_1, weights['encoder_h2']),
                                   #biases['encoder_b2']))
    return layer_1


# Building the decoder
def decoder(x):
    # Encoder Hidden layer with sigmoid activation #1
    layer_1 = tf.nn.tanh(tf.add(tf.matmul(x, weights['decoder_h1']),
                                   biases['decoder_b1']))
    # Decoder Hidden layer with sigmoid activation #2
    #layer_2 = tf.nn.tanh(tf.add(tf.matmul(layer_1, weights['decoder_h2']),
                                  # biases['decoder_b2']))
    return layer_1

# Construct model
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

# Prediction
y_pred = decoder_op
# Targets (Labels) are the input data.
y_true = X

# Define batch mse
batch_mse = tf.reduce_mean(tf.pow(y_true - y_pred, 2), 1)

# Define loss and optimizer, minimize the squared error
cost = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)

# TRAIN StARTS
save_model = os.path.join(data_dir, 'temp_saved_model_1layer.ckpt')
saver = tf.train.Saver()

# Initializing the variables
init = tf.global_variables_initializer()

with tf.Session() as sess:
    now = datetime.now()
    sess.run(init)
    total_batch = int(X_train.shape[0]/batch_size)
    # Training cycle
    for epoch in range(training_epochs):
        # Loop over all batches
        for i in range(total_batch):
            batch_idx = np.random.choice(X_train.shape[0], batch_size)
            batch_xs = X_train[batch_idx]
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={X: batch_xs})
            
        # Display logs per epoch step
        if epoch % display_step == 0:
            train_batch_mse = sess.run(batch_mse, feed_dict={X: X_train})
            print("Epoch:", '%04d' % (epoch+1),
                  "cost=", "{:.9f}".format(c), 
                  "Train auc=", "{:.6f}".format(auc(y_train, train_batch_mse)), 
                  "Time elapsed=", "{}".format(datetime.now() - now))

    print("Optimization Finished!")
    
    save_path = saver.save(sess, save_model)
    print("Model saved in file: %s" % save_path)

  from ._conv import register_converters as _register_converters


ValueError: multiclass format is not supported