Given a cleaned dataset, this notebook performs k-means clustering and counts how many flows of each malware (botnet) or benign label fall into each cluster. It then saves the counts to an Excel sheet and graphs them as a stacked bar chart.

# Imports

In [58]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import preprocessing
import xlwt
from xlwt import Workbook

# Perform K-Means Clustering

Define the number of clusters

In [59]:
# Number of clusters requested from KMeans; also passed to count_groups as the
# number of per-cluster count dictionaries to create.
NUM_CLUSTERS = 15

In [60]:
# Load the combined flow dataset; later cells read its 'Label' column.
botnets = pd.read_csv("combined_data/combined_datasets.csv")

# Feature matrix for clustering: the columns at positions 1 through 13.
# NOTE(review): the original comment says this is "all the features except
# StartTime and Label" -- presumably column 0 is StartTime and Label sits at
# position 14 or later; confirm against the CSV schema.
x = botnets.iloc[:,1:14]
# Per-flow labels as a plain Python list (not referenced again in the cells
# visible here).
botnet_header = botnets['Label'].tolist()

In [61]:
# Counts the number of botnets and benign flows in each cluster, given a list of 
# the labels and the # of clusters
# -------------------

def count_groups(n, labels, centroids, flow_labels=None):
    """Count how many flows of each label type fall into each cluster.

    Args:
        n: Number of clusters; the returned list has exactly ``n`` entries.
        labels: Cluster index assigned to each flow (for example
            ``KMeans.labels_``). Every value must be a valid index into a
            list of length ``n``.
        centroids: Unused. Kept so existing callers keep working.
        flow_labels: Label type of each flow, aligned by position with
            ``labels``. When omitted, the module-level ``botnets['Label']``
            column is used.

    Returns:
        A list of ``n`` dictionaries. ``result[c][t]`` is the number of flows
        of type ``t`` assigned to cluster ``c``; types that never occur in a
        cluster are simply absent from that cluster's dictionary.

    Raises:
        ValueError: If there are fewer flow labels than cluster labels.
    """
    if flow_labels is None:
        flow_labels = botnets['Label']

    if len(flow_labels) < len(labels):
        raise ValueError(
            "got {} cluster labels but only {} flow labels".format(
                len(labels), len(flow_labels)))

    # One independent counter dictionary per cluster.
    counts = [{} for _ in range(n)]

    # Walk the flows (not the clusters): each flow adds one to the tally of
    # its own label type inside the cluster it was assigned to.
    for cluster_index, flow_type in zip(labels, flow_labels):
        cluster_counts = counts[cluster_index]
        # The first sighting of a type counts as 1. Starting from 0 would
        # under-report every type by one and hide single-occurrence types.
        cluster_counts[flow_type] = cluster_counts.get(flow_type, 0) + 1

    return counts


In [62]:
# Fit k-means with NUM_CLUSTERS clusters on the feature matrix x, then tally
# the label types found inside every cluster.

# fit() returns the fitted estimator itself, so construction and fitting chain.
kmeans = KMeans(n_clusters=NUM_CLUSTERS).fit(x)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
clusters_set = count_groups(NUM_CLUSTERS, labels, centroids)

In [63]:
# Write the per-cluster label counts to an Excel sheet, then read the sheet
# back into a dataframe for charting.
# ---------------

# create workbook
wb = Workbook()

# create sheets
sheet1 = wb.add_sheet('botnet')

# Table header: every botnet/benign label type seen in any cluster. Sorted by
# string form so the column order is the same on every run instead of
# following arbitrary set iteration order.
header = sorted(set().union(*(d.keys() for d in clusters_set)), key=str)

# One row per cluster, one entry per header label; a label that never occurs
# in a cluster counts as 0 there.
rows = [[cluster.get(item, 0) for item in header] for cluster in clusters_set]

# Next free column in the sheet. All-zero columns are skipped, so this can lag
# behind the position in `header`.
bot_col = 0

# Per-cluster total of every count written to the sheet (all label types).
combined_data = {'malware': [0] * len(rows)}

# write table of counts to excel sheet
for col, label in enumerate(header):
    column = [row[col] for row in rows]

    # a column with no non-zero count carries no data, so it is not written
    if not any(count != 0 for count in column):
        continue

    # header cell
    sheet1.write(0, bot_col, label)

    # one cell per cluster, offset by one row to leave room for the header
    for row_num, count in enumerate(column):
        if count == 0:
            # zero is shown as a blank cell
            sheet1.write(row_num + 1, bot_col, '')
        else:
            sheet1.write(row_num + 1, bot_col, count)
            combined_data['malware'][row_num] += count

    bot_col += 1


# Save excel sheet
FILE = "combined_data/combined_clusters_chart.xls"
wb.save(FILE)


# create a dataframe from the excel sheet that was just written
botnet_df = pd.read_excel(FILE, sheet_name=0)

# Column names double as the chart series names. Every column is then moved
# into the index, so the frame keeps its values as index levels and has no
# remaining data columns.
cat_botnet = list(botnet_df.columns.values)
botnet_df.set_index(cat_botnet, inplace=True)


In [64]:

# Write the count table to a second workbook and add a stacked bar chart of
# it; the workbook is stored under combined_data/.
# --------------

# The label counts may be stored as index levels of botnet_df; bring them back
# as ordinary columns (in cat_botnet order) so they are written as plain,
# unmerged cells: header in sheet row 0, one data row per cluster below it.
chart_df = botnet_df.reset_index()[cat_botnet]

# NOTE(review): xlsxwriter produces .xlsx-format workbooks; the .xls file name
# is kept so anything that already reads this path keeps working -- consider
# renaming the output to .xlsx.
# The context manager saves and closes the workbook on exit (ExcelWriter.save()
# no longer exists in pandas 2.x).
with pd.ExcelWriter("combined_data/combined_clusters_graph.xls", engine='xlsxwriter') as writer:
    chart_df.to_excel(writer, sheet_name='botnet', index=False)

    # create workbook/worksheets
    workbook = writer.book
    mal_ws = writer.sheets['botnet']

    # create one stacked bar chart
    mal_chart = workbook.add_chart({'type': 'bar', 'subtype': 'stacked'})

    # Zero-based sheet rows holding data: row 0 is the header, so the values
    # run from row 1 through row len(chart_df).
    first_data_row = 1
    last_data_row = len(chart_df)

    # One series per label column. The range is given as
    # [sheet, first_row, first_col, last_row, last_col], which avoids building
    # column letters by hand (that breaks past column Z).
    for col_idx, series_name in enumerate(cat_botnet):
        mal_chart.add_series({
            'name': series_name,
            'values': ['botnet', first_data_row, col_idx, last_data_row, col_idx],
        })

    # insert chart below the table
    mal_ws.insert_chart('A40', mal_chart)

In [65]:
# Completion marker: printed once execution reaches this final cell.
print('success!')

success!
