Cleans the data for a given dataset

Note: To clean all the datasets (1-13), run run_clusters.ipynb
      To clean just one dataset, uncomment the line below and set `file` to that dataset's number

In [55]:
# file = 2

# Imports

In [56]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import preprocessing
from pandas import DataFrame

# Get data from file

In [57]:
# Announce that cleaning has started, then show the value of `file` on its own line.
print('cleaning data', file, sep='\n')

cleaning data


In [58]:
# Load the raw flows for this dataset, keep a random sample of at most
# 200,000 rows (in their original file order), and save the result to
# 'working_data/data.csv'.

import random

FILE = "read_data/read_data_{}.csv".format(file)
data = pd.read_csv(FILE)

n = len(data)
s = 200000  # maximum number of rows carried forward

# Sample only when the file holds more rows than the target; smaller files are
# used whole. Sampling the already-loaded frame avoids parsing the CSV a second
# time and removes the need for a catch-all `except`, which previously also hid
# unrelated failures.
if n > s:
    keep = sorted(random.sample(range(n), s))  # sorted -> preserves file order
    data = data.iloc[keep].reset_index(drop=True)

data.to_csv("working_data/data.csv", index=False)

print('data retrieved')

# Clean the Data

##  Drop missing entries

In [59]:
# Discard every row that contains at least one missing value.

data = data.dropna(axis=0, how='any')

## Convert all the data to numerical values

In [60]:
# Replace the categorical Proto, State and Dir columns with encoded labels.
# ---------------

# A single encoder is re-fitted for each column in turn, so after this cell
# it holds the classes of the last column processed ('Dir').
labelEncoder = preprocessing.LabelEncoder()

for categorical_column in ('Proto', 'State', 'Dir'):
    data[categorical_column] = labelEncoder.fit_transform(data[categorical_column])


In [61]:
# Convert StartTime to epoch
# ---------------

# Parse every StartTime entry on its own and keep the value returned by
# Timestamp.timestamp().
start_time = [
    pd.to_datetime(raw_stamp).timestamp()
    for raw_stamp in data['StartTime'].tolist()
]

data['StartTime'] = start_time


In [62]:
# Convert dst/src addresses to numerical values (delete the row if the value cannot be converted)
# ----------------


def _addr_to_int(addr):
    """Return `addr` with its dots removed, parsed as an int, or None on failure.

    NOTE(review): stripping the dots is not a unique encoding (for example
    '1.23.4.5' and '12.3.4.5' both become 12345). The scheme is left unchanged
    here so the produced values stay the same as before -- confirm whether a
    true IP-to-integer mapping is wanted instead.
    """
    try:
        return int(str(addr).replace('.', ''))
    except ValueError:
        return None


src_addr = [_addr_to_int(addr) for addr in data['SrcAddr'].tolist()]
dst_addr = [_addr_to_int(addr) for addr in data['DstAddr'].tolist()]

# A row is kept only when BOTH of its addresses converted. The rows are
# filtered in one step with a mask: dropping rows one at a time inside the
# conversion loop shifted the positions of the remaining rows (so later drops
# hit the wrong rows) and left the address lists longer than the frame, which
# made the column assignment below fail.
convertible = np.array(
    [src is not None and dst is not None for src, dst in zip(src_addr, dst_addr)],
    dtype=bool,
)
src_addr = [src for src, ok in zip(src_addr, convertible) if ok]
dst_addr = [dst for dst, ok in zip(dst_addr, convertible) if ok]

data = data[convertible].reset_index(drop=True)
data["SrcAddr"] = src_addr
data["DstAddr"] = dst_addr
data.to_csv("working_data/data.csv", index=False)



In [63]:
# Convert Sport and Dport to numerical values
# ----------------

# Each port column is parsed as a number and stored as float32. Entries that
# cannot be parsed as a number are replaced by 0, matching the fallback this
# cell has always used.
#
# The previous per-row loop referenced an undefined name (`row`) for Sport and
# called `.astype` on plain strings; both errors were swallowed by a bare
# `except`, so those ports were silently set to 0 instead of being converted.
for port_column in ('Sport', 'Dport'):
    rows = pd.to_numeric(data[port_column], errors='coerce').fillna(0).astype(np.float32)
    data[port_column] = rows

print('all but botnet converted')

In [64]:
# Classify every flow from its Label text: 1 if botnet, 0 if benign,
# -1 if the label mentions neither TCP nor UDP.
# Store the botnet and benign flows in separate csv files.
# -----------------------
# NOTE(review): the numeric codes are kept in `label_entries` only. The 'Label'
# column of `data`, and of the frames saved below, keeps its original text --
# this cell has never written the codes back. Confirm whether the notebooks
# that read these files expect text or numbers.

label_text = data['Label'].astype(str)

# non-TCP and UDP protocols are outliers, so remove them
is_tcp_or_udp = (
    label_text.str.contains('TCP', regex=False)
    | label_text.str.contains('UDP', regex=False)
).to_numpy()

is_botnet = (
    label_text.str.contains('botnet', regex=False)
    | label_text.str.contains('Botnet', regex=False)
).to_numpy()

# convert botnet entries to 1, benign to 0, everything that is not TCP/UDP to -1
# -------------------
label_codes = np.where(is_tcp_or_udp, np.where(is_botnet, 1, 0), -1)

# grab all of the labels, store them in a list
label_entries = label_codes.tolist()

# Split the flows with boolean masks instead of copying them row by row;
# flows coded -1 end up in neither frame.
botnets = data[label_codes == 1].reset_index(drop=True)
benign = data[label_codes == 0].reset_index(drop=True)

# Save the dataframes to csv files
botnets.to_csv("working_data/botnet{}.csv".format(file), index=False)
benign.to_csv("working_data/benign.csv", index=False)
        


In [65]:
# remove outliers
# ----------------
# A benign flow is an outlier when, in any of the columns checked below, its
# value lies more than three standard deviations away from that column's mean.

# reassigned later, when the cleaned frame is built
no_outliers = list()

# True at position i when benign row i is an outlier in at least one column
outlier_mask = np.zeros(len(benign), dtype=bool)

# loop through each relevant column to find the outliers
for name in ['Dur', 'TotBytes', 'SrcBytes', 'TotPkts']:

    # get the column from the list of non-malware
    col = benign[name].to_numpy(dtype=float)
    if col.size == 0:
        # nothing to measure; also avoids the empty-array warnings from numpy
        break

    # calculate summary statistics
    # np.mean / np.std are spelled out because the bare names `mean` and `std`
    # are not imported anywhere in this notebook. np.std uses ddof=0.
    data_mean, data_std = np.mean(col), np.std(col)

    # identify outliers
    cut_off = data_std * 3
    lower, upper = data_mean - cut_off, data_mean + cut_off

    # find locations of the outliers
    outlier_mask |= (col < lower) | (col > upper)

# positional indices of the outlier rows in `benign`, each listed once
outliers = np.flatnonzero(outlier_mask).tolist()

print('data cleaned')


# Save cleaned data to csv

In [66]:
# Remove the outlier rows from the benign flows, append the botnet flows
# underneath, and write the combined table to csv.

benign_kept = benign.drop(benign.index[outliers])
combined = pd.concat([benign_kept, botnets], ignore_index=True)
no_outliers = DataFrame(combined)

no_outliers.to_csv("working_data/no_outliers{}.csv".format(file), index=False)
    

In [67]:
# Final marker: reaching this line means every cell above ran without raising.
print("success!")

success!
