In [None]:
## Imports
import glob
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import shutil
import csv
import yaml

In [None]:
# Parse the YAML settings file and echo the two entries that identify this run.
with open('config.yaml') as configFile:
    config = yaml.safe_load(configFile)

for key in ('appname', 'options'):
    print(config[key])


In [None]:
def prepareFolder(path):
    """Make sure `path` exists as an empty directory.

    An existing directory at `path` is deleted together with everything in it
    and then recreated; missing parent directories are created as needed.

    Args:
        path: Directory to (re)create, e.g. 'plots/timeseries'.

    Raises:
        OSError: If the old directory cannot be removed or the new one cannot
            be created (for example because `path` is a regular file).
    """
    if os.path.isdir(path):
        # Start from a clean slate so plots from a previous run never mix in.
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)

In [None]:
## Constants
# Colour palette shared by all plots in this notebook.
COLORS = ['#1f77b4',
          '#ff7f0e',
          '#2ca02c',
          '#d62728',
          '#9467bd',
          '#8c564b',
          '#e377c2',
          '#7f7f7f',
          '#bcbd22',
          '#17becf',
          '#1a55FF']

# Maps a metric (dataframe column) name to the category shown in plot titles.
CAT = {
    'usr': 'total-cpu-usage',
    'sys': 'total-cpu-usage',
    'idl': 'total-cpu-usage',
    'wai': 'total-cpu-usage',
    'hiq': 'total-cpu-usage',
    'siq': 'total-cpu-usage',

    'used' : 'memory usage',
    'buff' : 'memory usage',
    # The dataframes name this column 'cach'; 'cache' is kept so that any
    # existing lookup with the old key still works.
    'cach' : 'memory usage',
    'cache' : 'memory usage',
    'free' : 'memory usage',

    'files': 'filesystem',
    'inodes' : 'filesystem',

    'read' : 'dsk/total',
    'writ' : 'dsk/total',
    'reads' : 'dsk/total',
    'writs' : 'dsk/total',

    'recv' : 'net/eth0',
    'send' : 'net/eth0',

    'lis' : 'tcp sockets',
    'act' : 'tcp sockets',
    'syn' : 'tcp sockets',
    'tim' : 'tcp sockets',
    'clo' : 'tcp sockets',

    'tot': 'sockets',
    'tcp': 'sockets',
    'udp': 'sockets',
    'raw': 'sockets',
    'frg': 'sockets',

    'int' : 'system',
    'csw' : 'system',

    'run' : 'procs',
    'blk' : 'procs',
    'new' : 'procs'
}

# File-name-safe category names, used for titles and file names of the
# standard-deviation plots.
CAT_NAMES = ['total-cpu-usage', 'memory-usage', 'filesystem', 'dsk-total1', 'dsk-total2', 'net-eth0', 'tcp sockets', 'sockets', 'system', 'procs',]

# Long-form reference description of every metric column, one entry per column
# in the format '<column>: <description> [<unit>]'.
COL_NAMES_DOCUMENTATION_DO_NOT_DELETE = [
    # total-cpu-usage (all values are percentages of CPU time)
    'usr: CPU usage by user processes [%]', 
    'sys: CPU usage by system processes [%]',
    'idl: CPU time spent idle [%]',
    'wai: CPU time spent waiting for I/O [%]',
    'hiq: CPU time spent servicing hard interrupts [%]',
    'siq: CPU time spent servicing soft interrupts [%]',
    # memory usage
    # https://www.tecmint.com/dstat-monitor-linux-server-performance-process-memory-network/
    # https://linuxtect.com/linux-dstat-command-monitor-linux-performance-and-resource-usage/
    # https://www.thegeekdiary.com/understanding-proc-meminfo-file-analyzing-memory-utilization-in-linux/
    # /proc/meminfo
    'used: Amount of used memory [Bytes]', # / 1024 = KB
    'buff: Amount of buffered memory [Bytes]', # / 1024 / 1024 = MB
    'cach: Amount of cached memory [Bytes]',
    'free: Amount of free memory [Bytes]',

    # filesystem
    # https://www.linuxquestions.org/questions/linux-general-1/proc-sys-fs-file-nr-57646/
    # /proc/sys/fs/file-nr
    'files: Number of allocated file handles [#]',
    'inodes: Number of used inodes [#]',

    # dsk/total
    'read: Amount of read bytes on disk [Bytes]', # not 100% sure
    'writ: Amount of written bytes on disk [Bytes]', # / 1024 = KB

    # dsk/total
    'reads: Number of read operations on disk [#]',
    'writs: Number of write operations on disk [#]',
    
    # net/eth0
    'recv: Amount of received bytes on eth0 [B]', # /1024 = k
    'send: Amount of sent bytes on eth0 [B]',

    # tcp sockets
    # http://karunsubramanian.com/network/what-is-syn_sent-socket-status/
    # /proc/net/tcp
    # /proc/net/tcp6
    'lis: Number of TCP connections with status "listening" [#]', 
    'act: Number of TCP connections with status "established" (active) [#]',
    'syn: Number of TCP connections with status "syn_sent", "syn_receive" or "last_ack" [#]',
    'tim: Number of TCP connections with status "waiting"[#]',
    'clo: Number of TCP connections with status "fin-wait1/2", "close/_wait" or "closing" (closed) [#]',

    # sockets
    # https://community.hpe.com/t5/Networking/Regarding-meaning-for-quot-cat-proc-net-sockstat-quot-results/td-p/3922867#.Yvnld3VBxhE
    # /proc/net/sockstat
    'tot: Number of total sockets [#]',
    'tcp: Number of tcp sockets [#]',
    'udp: Number of udp sockets [#]',
    'raw: Number of raw (using no protocol) sockets [#]',
    'frg: Number of frag sockets [#]',

    # system
    # https://www.tecmint.com/dstat-monitor-linux-server-performance-process-memory-network/
    'int: Number of interrupts [#]', 
    'csw: Number of context switches [#]', 

    # procs
    # /proc/stat
    'run: Number of processes with status "running" [#]', 
    'blk: Number of processes with status "blocked" [#]',
    'new: Number of processes with status "new" [#]' 
]

# Axis / legend label for every metric column, one entry per column in the
# format '<column>: <description> [<unit>]'. The order matters: the time-series
# plots pick the label by the position of the column in the dataframe.
COL_NAMES = [
    # total-cpu-usage (all values are percentages of CPU time)
    'usr: CPU usage by user processes [%]', 
    'sys: CPU usage by system processes [%]',
    'idl: CPU time spent idle [%]',
    'wai: CPU time spent waiting for I/O [%]',
    'hiq: CPU time spent servicing hard interrupts [%]',
    'siq: CPU time spent servicing soft interrupts [%]',
    # memory usage
    # https://www.tecmint.com/dstat-monitor-linux-server-performance-process-memory-network/
    # https://linuxtect.com/linux-dstat-command-monitor-linux-performance-and-resource-usage/
    # https://www.thegeekdiary.com/understanding-proc-meminfo-file-analyzing-memory-utilization-in-linux/
    # /proc/meminfo
    'used: Amount of used memory [Bytes]', # / 1024 = KB
    'buff: Amount of buffered memory [Bytes]', # / 1024 / 1024 = MB
    'cach: Amount of cached memory [Bytes]',
    'free: Amount of free memory [Bytes]',

    # filesystem
    # https://www.linuxquestions.org/questions/linux-general-1/proc-sys-fs-file-nr-57646/
    # /proc/sys/fs/file-nr
    'files: Number of allocated file handles [#]',
    'inodes: Number of used inodes [#]',

    # dsk/total
    'read: Amount of read bytes on disk [Bytes]', # not 100% sure
    'writ: Amount of written bytes on disk [Bytes]', # / 1024 = KB

    # dsk/total
    'reads: Number of read operations on disk [#]',
    'writs: Number of write operations on disk [#]',
    
    # net/eth0
    'recv: Amount of received bytes on eth0 [B]', # /1024 = k
    'send: Amount of sent bytes on eth0 [B]',

    # tcp sockets
    # http://karunsubramanian.com/network/what-is-syn_sent-socket-status/
    # /proc/net/tcp
    # /proc/net/tcp6
    'lis: Number of TCP connections "listening" [#]', 
    'act: Number of TCP connections "established" (active) [#]',
    'syn: Number of TCP connections "syn_s", "syn_r" or "last_a" [#]',
    'tim: Number of TCP connections with status "waiting"[#]',
    'clo: Number of TCP connections with status "closing"  [#]',

    # sockets
    # https://community.hpe.com/t5/Networking/Regarding-meaning-for-quot-cat-proc-net-sockstat-quot-results/td-p/3922867#.Yvnld3VBxhE
    # /proc/net/sockstat
    'tot: Number of total sockets [#]',
    'tcp: Number of tcp sockets [#]',
    'udp: Number of udp sockets [#]',
    'raw: Number of raw sockets [#]',
    'frg: Number of frag sockets [#]',

    # system
    # https://www.tecmint.com/dstat-monitor-linux-server-performance-process-memory-network/
    'int: Number of interrupts [#]', 
    'csw: Number of context switches [#]', 

    # procs
    # /proc/stat
    'run: Number of processes with status "running" [#]', 
    'blk: Number of processes with status "blocked" [#]',
    'new: Number of processes with status "new" [#]' 
]

## Create dataframes

In [None]:
import processCSV

# Read all datasets; `dfs` and `names` are iterated side by side below.
dfs, names = processCSV.processCSV()

# Sanity check: first 'usr' sample of every dataset next to its name.
for df, name in zip(dfs, names):
    print(df['usr'].iloc[0], name)


In [None]:
# Time Series
# For every dataset and every column, save a two-panel figure:
#   top    - the series with its mean / max / min,
#   bottom - the series with its median and the 0.75 / 0.95 quantiles.
prepareFolder('plots/timeseries')

# Look labels up by column key (the text before ':') rather than by position,
# so a reordered or extra column cannot be mislabelled or raise IndexError.
COL_LABELS = {entry.split(':', 1)[0]: entry for entry in COL_NAMES}

for (df, name) in zip(dfs, names):
    print(df['usr'].iloc[0], name)
    os.mkdir('plots/timeseries/' + name)
    for column in df.columns:
        series = df[column]
        label = COL_LABELS.get(column, column)  # fall back to the raw column name
        title = '{} - {}'.format(CAT.get(column), column)

        # Explicit axes: nothing is drawn on an implicit axes hidden under the
        # two panels (the original drew a stray histogram there).
        fig, (axStats, axQuantiles) = plt.subplots(2, 1, figsize=(10, 8), dpi=100)

        series.plot(ax=axStats, marker="o", label=label)
        axStats.axhline(y=series.mean(), color=COLORS[1], linestyle=':', linewidth=3, label='mean = {:.2f}'.format(series.mean()))
        axStats.axhline(y=series.max(), color=COLORS[2], linestyle='-', linewidth=1, label='max = {:.2f}'.format(series.max()))
        axStats.axhline(y=series.min(), color=COLORS[3], linestyle='-', linewidth=1, label='min = {:.2f}'.format(series.min()))

        series.plot(ax=axQuantiles, marker="o", label=label)
        axQuantiles.axhline(y=series.median(), color=COLORS[1], linestyle='-.', linewidth=1, label='median = {:.2f}'.format(series.median()))
        axQuantiles.axhline(y=series.quantile(q=0.75), color=COLORS[2], linestyle='-.', linewidth=1, label='quantile (0.75) = {:.2f}'.format(series.quantile(q=0.75)))
        axQuantiles.axhline(y=series.quantile(q=0.95), color=COLORS[3], linestyle='-.', linewidth=1, label='quantile (0.95) = {:.2f}'.format(series.quantile(q=0.95)))

        for ax in (axStats, axQuantiles):
            ax.set_title(title)
            ax.set_ylabel(label)
            ax.legend(bbox_to_anchor=(1, 1), loc="upper left")
            # The grid has to be enabled before saving to appear in the file.
            ax.grid(True)

        fig.savefig('plots/timeseries/{}/{}-timeseries.png'.format(name, column), facecolor='white', transparent=False)
        # Close every figure right away; this loop creates one per column.
        plt.close(fig)

In [None]:
# Boxplot
# Save one box plot per dataset and column under plots/boxplot/<dataset>/.
prepareFolder('plots/boxplot')

for (df, name) in zip(dfs, names):
    os.mkdir('plots/boxplot/' + name)
    for column in df.columns:
        fig, ax = plt.subplots(figsize=(10, 8), dpi=100)
        # Pass the series by keyword: the meaning of the first positional
        # argument of sns.boxplot differs between seaborn versions.
        sns.boxplot(x=df[column], ax=ax)
        ax.set_title('{} - {}'.format(CAT.get(column), column))
        fig.savefig('plots/boxplot/{}/{}-boxplot.png'.format(name, column), facecolor='white', transparent=False)
        plt.close(fig)



In [None]:
# Decompose
# Save an additive seasonal decomposition plot per dataset and column under
# plots/decompose/<dataset>/.
prepareFolder('plots/decompose')

# NOTE(review): with period=1 there is no seasonal cycle to extract, so the
# seasonal and residual panels are presumably flat - confirm the intended period.
DECOMPOSE_PERIOD = 1

for (df, name) in zip(dfs, names):
    os.mkdir('plots/decompose/' + name)
    for column in df.columns:
        decomposition = sm.tsa.seasonal_decompose(df[column], model="add", period=DECOMPOSE_PERIOD)
        # DecomposeResult.plot() creates its own figure, so the size and dpi
        # must be supplied through rcParams; a separate plt.figure() call would
        # only leave an unused empty figure behind.
        with plt.rc_context({'figure.figsize': (10, 8), 'figure.dpi': 100}):
            fig = decomposition.plot()
        fig.savefig('plots/decompose/{}/{}-decompose.png'.format(name, column), facecolor='white', transparent=False)
        plt.close('all')


In [None]:
import processCSV

# Load the datasets again together with their names.
dfs, names = processCSV.processCSV()

# Sanity check: print the first 'usr' value of each dataset and its name.
for df, name in zip(dfs, names):
    print(df['usr'].iloc[0], name)

In [None]:
# Standard deviation
# For every dataset, save one bar chart per metric category showing the
# standard deviation of each metric in that category.
prepareFolder('plots/standard-deviation')

# Each category name is paired with its columns explicitly. The previous
# version matched frames and names by position, and because the 'sockets'
# group was missing from the frames, the 'system' chart was saved as 'sockets'
# and the 'procs' chart as 'system'.
STD_GROUPS = [
    ('total-cpu-usage', ['usr', 'sys', 'idl', 'wai', 'hiq', 'siq']),
    ('memory-usage', ['used', 'buff', 'cach', 'free']),
    ('filesystem', ['files', 'inodes']),
    ('dsk-total1', ['read', 'writ']),
    ('dsk-total2', ['reads', 'writs']),
    ('net-eth0', ['recv', 'send']),
    ('tcp sockets', ['lis', 'act', 'syn', 'tim', 'clo']),
    ('sockets', ['tot', 'tcp', 'udp', 'raw', 'frg']),
    ('system', ['int', 'csw']),
    ('procs', ['run', 'blk', 'new']),
]

for (df, name) in zip(dfs, names):
    print(df['usr'].iloc[0], name)
    os.mkdir('plots/standard-deviation/' + name)
    for groupName, groupColumns in STD_GROUPS:
        stds = df[groupColumns].std()
        # A new figure per chart: the size then applies to every chart, not
        # only to the first one.
        fig, ax = plt.subplots(figsize=(10, 8), dpi=100)
        # Keyword arguments are required by current seaborn releases.
        sns.barplot(x=stds.index.to_list(), y=stds.to_list(), ax=ax)
        ax.set_xlabel('metrics')
        ax.set_ylabel('standard deviation')
        ax.set_title('{}'.format(groupName))
        fig.savefig('plots/standard-deviation/{}/{}-std.png'.format(name, groupName), facecolor='white', transparent=False)
        plt.close(fig)

## Threshold

In [None]:
import processCSV

# Fetch the datasets and their names once more.
dfs, names = processCSV.processCSV()

# Sanity check: first 'usr' reading per dataset, then the full list of names.
for df, name in zip(dfs, names):
    print(df['usr'].iloc[0], name)
print(names)

In [None]:
import processCSV

# Re-read the configuration so this cell does not depend on the config cell.
with open('config.yaml') as stream:
    config = yaml.safe_load(stream)

# load dataset and names (the frames are modified in place further down)
dfs, names = processCSV.processCSV()

# sanity check
for (df, name) in zip(dfs, names):
    print(df['usr'].iloc[0], name)

prepareFolder('plots/threshold')

WINDOWS = config['windows'] # number of datapoints of the non-overlapping windows
INFECTED = config['infected'] # positions of the infected datasets
HEALTHY = config['healthy'] # position of the healthy dataset
FUNCTIONS = config['functions'] # aggregation functions
SUPPORTED_FUNCTIONS = ('mean', 'max', 'min')
# Length of the dataset-name prefix in front of the malware name:
# 01csv-infected(MALWARE) from '0' to '('
FOLDER_PREFIX_LENGTH = 15
# One colour per dataset, used for the series and for its min/max lines.
HEALTHY_COLOR = COLORS[1]
INFECTED_COLOR = COLORS[2]
print(INFECTED, HEALTHY)

# Fail early with a clear message instead of a KeyError deep inside the loops.
unsupported = [function for function in FUNCTIONS if function not in SUPPORTED_FUNCTIONS]
if unsupported:
    raise ValueError('unsupported aggregation function(s) in config.yaml: {} (supported: {})'.format(unsupported, SUPPORTED_FUNCTIONS))


def windowedAggregate(series, window, function):
    """Aggregate `series` over consecutive, non-overlapping windows.

    Args:
        series: Series to aggregate.
        window: Number of data points per window.
        function: Name of the rolling aggregation: 'mean', 'max' or 'min'.

    Returns:
        A Series on the index of `series`. Every row holds the aggregate of the
        window it belongs to; rows after the last complete window are NaN.
    """
    # Keep only the rolling value at the end of each complete window ...
    perWindow = getattr(series.rolling(window), function)()[window - 1::window]
    # ... and spread it backwards over the rows of that window.
    return perWindow.reindex(series.index).bfill()


def appendPolicyRule(malware, metric, operator, threshold, function, window):
    """Append the rule `metric operator threshold` for `malware` to the policy CSV.

    NOTE(review): the file is opened in append mode, so re-running this cell
    adds the same rules again - confirm whether stale policy files should be
    removed before a run.
    """
    with open('policy({}-{}).csv'.format(function, window), 'a', newline='') as policy:
        # MALWARE; METRIC; OPERATOR; VALUE
        csv.writer(policy).writerow([malware, metric, operator, threshold])


# reset index (can't plot a time series)
healthyIndex = HEALTHY[0]
dfs[healthyIndex].reset_index(inplace=True)
healthyDf = dfs[healthyIndex]

# export col names for labeling
cols = healthyDf.columns.values.tolist()
cols = cols[1:] # remove time col

for infIndex in INFECTED:
    print(infIndex)

    # reset index
    startDate = dfs[infIndex].index[0]
    dfs[infIndex].reset_index(inplace=True)
    infectedDf = dfs[infIndex]
    infectedName = names[infIndex]
    malware = infectedName[FOLDER_PREFIX_LENGTH:-1]
    path = 'plots/threshold/' + infectedName
    os.makedirs(path)
    for window in WINDOWS:
        dirPath = f'{path}/{window}'
        os.makedirs(dirPath)
        for function in FUNCTIONS:
            for i in cols:
                # Column / plot name such as 'usr-mean'. It is deliberately not
                # called `str`, which would shadow the builtin for the rest of
                # the kernel session.
                metric = i + '-' + function

                # rolling, non-overlapping aggregation function such as mean, max and min
                healthyDf[metric] = windowedAggregate(healthyDf[i], window, function)
                infectedDf[metric] = windowedAggregate(infectedDf[i], window, function)

                fig, ax = plt.subplots(dpi=200, figsize=(7, 4))
                healthyDf[metric].plot(ax=ax, marker="o", color=HEALTHY_COLOR, linestyle='-', linewidth=0.5, label=metric + " healthy")
                infectedDf[metric].plot(ax=ax, marker="o", color=INFECTED_COLOR, linestyle='-', linewidth=0.5, label=metric + " infected")

                # min and max
                minHealthy = healthyDf[metric].min()
                maxHealthy = healthyDf[metric].max()
                minInfected = infectedDf[metric].min()
                maxInfected = infectedDf[metric].max()
                ax.axhline(y=minHealthy, color=HEALTHY_COLOR, linestyle='-.', linewidth=0.5, label='min Healthy = {:.2f}'.format(minHealthy))
                ax.axhline(y=maxHealthy, color=HEALTHY_COLOR, linestyle='-.', linewidth=0.5, label='max Healthy = {:.2f}'.format(maxHealthy))
                ax.axhline(y=minInfected, color=INFECTED_COLOR, linestyle='-.', linewidth=0.5, label='min Infected = {:.2f}'.format(minInfected))
                ax.axhline(y=maxInfected, color=INFECTED_COLOR, linestyle='-.', linewidth=0.5, label='max Infected = {:.2f}'.format(maxInfected))

                # visualize the easy thresholds: the two value ranges do not
                # overlap, so the midpoint of the gap separates the datasets
                if maxHealthy < minInfected:
                    gap = (maxHealthy, minInfected, '>=')
                elif maxInfected < minHealthy:
                    gap = (maxInfected, minHealthy, '<=')
                else:
                    gap = None

                if gap is not None:
                    lower, upper, operator = gap
                    threshold = lower + (0.5 * (upper - lower))
                    ax.axhspan(lower, upper, color=COLORS[0], alpha=0.25)
                    ax.axhline(y=threshold, color=COLORS[0], linestyle='-', linewidth=1, alpha=1, label='threshold = {:.2f}'.format(threshold))
                    print('proposed threshold for {} regardin metric {}: {} {} {} '.format(malware, metric, metric, operator, threshold))
                    appendPolicyRule(malware, metric, operator, threshold, function, window)

                ax.set_xlabel('Timestamp [hh:mm:ss] from {}'.format(startDate))
                ax.set_ylabel(metric)
                ax.legend(bbox_to_anchor=(1, 0.75), loc="upper left")
                ax.set_title(metric + infectedName[5:])
                fig.tight_layout()
                fig.savefig(dirPath + '/' + metric + '({})-'.format(window) + infectedName[6:], facecolor='white', transparent=False)
                # One figure is created per metric, function, window and
                # dataset; close it so they do not pile up in memory.
                plt.close(fig)