# Imports

In [21]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import preprocessing
from pandas import DataFrame

# Store Botnet/Benign flows in separate csv files

In [22]:
# Read the raw flow records.
data = pd.read_csv('working_data/data.csv')

flow_labels = data['Label']

# Only flows whose label mentions TCP or UDP are kept; among those, a flow is
# a botnet flow when its label contains 'botnet' or 'Botnet'. The substring
# tests are literal (regex=False), and na=False makes a missing label count
# as "no match" instead of raising.
is_tcp_or_udp = (
    flow_labels.str.contains('TCP', regex=False, na=False)
    | flow_labels.str.contains('UDP', regex=False, na=False)
)
is_botnet = (
    flow_labels.str.contains('botnet', regex=False, na=False)
    | flow_labels.str.contains('Botnet', regex=False, na=False)
)

# Copy of the labels in which every non-TCP/UDP flow is marked with -1.
label_entries = flow_labels.where(is_tcp_or_udp, -1).tolist()

# Select the rows with boolean masks instead of rebuilding the frames one
# row at a time; the index is reset so both frames are numbered from 0.
botnets = data[is_tcp_or_udp & is_botnet].reset_index(drop=True)
benign = data[is_tcp_or_udp & ~is_botnet].reset_index(drop=True)

# Save each group to its own CSV file.
botnets.to_csv("botnet.csv", index=False)
benign.to_csv("benign.csv", index=False)
        



# Remove outliers from the benign flows (values more than 3 standard deviations from the column mean)

In [23]:
#remove outliers

from numpy.random import seed
from numpy.random import randn
from numpy import mean
from numpy import std


# Positional row numbers (into `benign`) of every flow that is an outlier in
# at least one of the inspected columns.
outliers = []

# A value is an outlier when it lies more than three standard deviations
# away from the mean of its column.
for name in ['Dur', 'TotBytes', 'SrcBytes', 'TotPkts']:

    col = benign[name]

    # calculate summary statistics
    data_mean, data_std = mean(col), std(col)

    # acceptance interval for this column
    cut_off = data_std * 3
    lower, upper = data_mean - cut_off, data_mean + cut_off

    # Evaluate the whole column at once and collect the positions that fall
    # outside the interval.
    is_outlier = ((col < lower) | (col > upper)).to_numpy()
    outliers.extend(np.flatnonzero(is_outlier).tolist())

# A row can be an outlier in several columns; keep each position once,
# preserving first-seen order.
outliers = list(dict.fromkeys(outliers))


In [24]:
# Drop the outlier rows from the benign flows, append the botnet flows and
# renumber the combined table from 0.
# `outliers` holds positional row numbers, so they are translated to index
# labels via benign.index before being dropped.
no_outliers = pd.concat(
    [benign.drop(benign.index[outliers]), botnets],
    ignore_index=True,
)

# Save to the same location the following cell reads from
# ('working_data/no_outliers.csv'); writing to the current directory would
# leave that cell loading a stale or missing file.
no_outliers.to_csv('working_data/no_outliers.csv', index=False)
    

# Read from no_outliers

In [25]:
# Load the saved flow table from the working_data directory.
no_outliers = pd.read_csv(filepath_or_buffer='working_data/no_outliers.csv')

# K-Means Clustering

In [26]:
from sklearn.cluster import KMeans

# Feature matrix for clustering: the columns at positions 1 through 13.
# Presumably this skips StartTime (position 0) and Label - confirm against
# the column order of the CSV.
feature_positions = slice(1, 14)
x = no_outliers.iloc[:, feature_positions]


In [27]:
def count_groups(n, labels, centroids, true_labels=None):
    """Count, for each cluster, how many flows of each label it contains.

    Args:
        n: Number of clusters; the result has one entry per cluster.
        labels: Cluster assignment of every flow, as integers in ``[0, n)``,
            in the same order as the flows.
        centroids: Unused; kept so existing callers keep working.
        true_labels: The label of every flow, in the same order as
            ``labels``. Defaults to the ``'Label'`` column of the
            module-level ``no_outliers`` table.

    Returns:
        A list of ``n`` dictionaries. Entry ``k`` maps each label that occurs
        in cluster ``k`` to the number of flows carrying it.

    Raises:
        ValueError: If ``labels`` and ``true_labels`` differ in length.
    """
    if true_labels is None:
        true_labels = no_outliers['Label']
    # Materialise once so the pairing below is purely positional.
    true_labels = list(true_labels)

    if len(labels) != len(true_labels):
        raise ValueError(
            "labels and true_labels must have the same length "
            f"({len(labels)} != {len(true_labels)})"
        )

    counts = [{} for _ in range(n)]

    for cluster, flow_label in zip(labels, true_labels):
        cluster_counts = counts[cluster]
        # Start from 0 on first sight so that the first flow is counted too.
        cluster_counts[flow_label] = cluster_counts.get(flow_label, 0) + 1

    return counts


In [28]:
# For each cluster, print out the number of types of each botnet and benign 
import xlwt
from xlwt import Workbook


# Cluster the flows with K-Means, then tabulate per cluster how many flows of
# each label it contains and export that table to an Excel sheet.
N_CLUSTERS = 30

kmeans = KMeans(n_clusters=N_CLUSTERS)
# fit_predict() fits the model and returns the cluster assignments in one
# call, so a separate fit() beforehand would only train the model twice.
identified_clusters = kmeans.fit_predict(x)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
clusters_set = count_groups(N_CLUSTERS, labels, centroids)


# Write cluster info to excel sheet

# create workbook
wb = Workbook()

# create sheet
sheet1 = wb.add_sheet('Dataset_42')

# One column per label seen in any cluster. Sorting (by string form, so mixed
# label types cannot break the comparison) keeps the column order the same
# from run to run.
flow_types = sorted(set().union(*(d.keys() for d in clusters_set)), key=str)
header = ['Cluster #'] + flow_types

# One row per cluster: its 1-based number followed by the count of each label
# (0 when the label does not occur in that cluster).
rows = [
    [cluster_number] + [cluster.get(flow_type, 0) for flow_type in flow_types]
    for cluster_number, cluster in enumerate(clusters_set, start=1)
]

# Header goes in sheet row 0, the data rows follow.
for col, title in enumerate(header):
    sheet1.write(0, col, title)

for row, values in enumerate(rows, start=1):
    for col, value in enumerate(values):
        sheet1.write(row, col, value)

wb.save('working_data/clusters_chart.xls')

# Report success only once the file has actually been written.
print('success!')

success!


In [None]:
# Report how many botnet flows and how many benign flows there are,
# one number per line.
print(len(botnets), len(benign), sep='\n')

# Useful Resources:

https://www.geeksforgeeks.org/convert-a-categorical-variable-into-dummy-variables/?ref=rp