# DATASETS AND PCAP FILE SOURCES

## UNSW-NB15
### source: https://research.unsw.edu.au/projects/unsw-nb15-dataset

## KDD CUP99 (no pcap)
### source: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

## CIC-IDS
### source: https://www.unb.ca/cic/datasets/ids-2017.html



# CODE

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option("display.max_columns", 200)
import warnings
warnings.filterwarnings('ignore')
from socket import getservbyname as getServNum
import socket
from datetime import datetime

### Read datasets and labels

In [2]:
filepath = "./csv/attack/" #"./csv/bonafide"

# Physical lines of a Zeek log that are skipped on load; line 6 is kept and
# becomes the header row.
ZEEK_SKIPROWS = [0, 1, 2, 3, 4, 5, 7]


def _read_zeek_log(path):
    """Load a tab-separated Zeek log file into a DataFrame.

    The column names are shifted one position to the left and the surplus
    last column is dropped (presumably because the header line starts with
    an extra marker token -- confirm against the log files).

    Parameters
    ----------
    path : str
        Location of the Zeek log file.

    Returns
    -------
    pandas.DataFrame
        The log contents with re-aligned column names.
    """
    log = pd.read_csv(path, sep='\t', skiprows=ZEEK_SKIPROWS)
    log.columns = np.concatenate([log.columns[1:], ['drop']])  # mark extra column for drop
    log.drop('drop', axis=1, inplace=True)                     # drop marked column
    return log


# The Zeek logs are optional; later cells branch on these flags.
HAS_CONN = os.path.isfile(filepath + "conn.log")
HAS_HTTP = os.path.isfile(filepath + "http.log")
HAS_FTP = os.path.isfile(filepath + "ftp.log")

df = pd.read_csv(filepath + "argus.csv")          # dataset Argus
print("argus.csv", df.shape)

if HAS_CONN:
    zconn = _read_zeek_log(filepath + "conn.log")  # dataset Zeek Conn
    print("conn.log", zconn.shape)
else:
    # Previously reported "no argus.csv", which named the wrong file.
    print("no conn.log")

if HAS_HTTP:
    zhttp = _read_zeek_log(filepath + "http.log")  # dataset Zeek http
    print("http.log", zhttp.shape)
else:
    print("no http.log")
# trans_depth and response_body_len

if HAS_FTP:
    zftp = _read_zeek_log(filepath + "ftp.log")    # dataset Zeek ftp
    print("ftp.log", zftp.shape)
else:
    print("no ftp.log")

argus.csv (233215, 36)
conn.log (180263, 21)
http.log (4, 30)
no ftp.log


# Features taken from Argus and Zeek log files

### As shown in original UNSW-NB15 CSV file available online 
dur, proto, service, state, spkts, dpkts, sbytes, dbytes, rate, sttl, dttl, sload, dload, sloss, dloss, sinpkt, dinpkt, sjit, djit, swin, stcpb, dtcpb, dwin, tcprtt, synack, ackdat, smean, dmean, trans_depth, response_body_len, ct_srv_src, ct_state_ttl, ct_dst_ltm, ct_src_dport_ltm, ct_dst_sport_ltm, ct_dst_src_ltm, is_ftp_login, ct_ftp_cmd, ct_flw_http_mthd, ct_src_ltm, ct_srv_dst, is_sm_ips_ports

> ### Argus
> - 1 SrcAddr
> - 2 Sport
> - 3 DstAddr
> - 4 Dport
> - 5 Proto
> - 6 State
> - 7 dur
> - 8 SrcBytes
> - 9 DstBytes
> - 10 sTtl
> - 11 dTtl
> - 12 SrcLoss
> - 13 DstLoss
> - 14
> - 15 SrcLoad
> - 16 DstLoad
> - 17 SrcPkts
> - 18 DstPkts
> - 19 SrcWin (swin)
> - 20 DstWin (dwin)
> - 21 SrcTCPBase
> - 22 DstTCPBase
> - 23 sPktSz (smeansz/sMeanPktSz)
> - 24 dPktSz (dmeansz/dMeanPktSz)
> - 25
> - 26
> - 27 SrcJitter
> - 28 DstJitter
> - 29 StartTime
> - 30 LastTime
> - 31 SIntPkt
> - 32 DIntPkt
> - 33 TcpRtt
> - 34 SynAck
> - 35 AckDat
> - @ Rate @
> - @ TotAppByte @
> - @ PCRatio @
> - Additionally: Trans, Min, Max, Sum.

> ### conn.log
> - 14 service
> - 7 duration
> - conn_state

> ### http.log
> - 25 trans_depth
> - 26 response_body_len
> - 38! method

> ### ftp.log
> - user 
> - password
> - command 
> - Additionally: arg, mime_type, file_size, reply_code, reply_msg, data_channel.passive, data_channel.orig_h, data_channel.resp_h, data_channel.resp_p

# Formatting and Merging Data

### Format Argus.csv data

In [3]:
## convert to int
def portsAsInt(x):
    """Convert a textual port value to an integer.

    The string is tried, in order, as a decimal integer ("80"), as a decimal
    float that is truncated ("80.0"), and as a hexadecimal number ("0x50" or
    "1f").

    Parameters
    ----------
    x : object
        The raw port value; only ``str`` inputs are parsed.

    Returns
    -------
    int
        The parsed port, or 0 when ``x`` is not a string or cannot be parsed
        by any of the three forms (previously such strings raised an
        uncaught ValueError/OverflowError).
    """
    if not isinstance(x, str):
        return 0
    try:
        return int(x)
    except ValueError:
        pass
    try:
        # OverflowError covers "inf", which float() accepts but int() rejects.
        return int(float(x))
    except (ValueError, OverflowError):
        pass
    try:
        return int(x, 16)
    except ValueError:
        return 0

# Cast the flow-identifying columns to text first so that portsAsInt receives
# every port as a string.
df = df.astype({'SrcAddr':'string', 'Sport':'string', 'DstAddr':'string', 'Dport':'string', 'Proto':'string', 'State':'string'})
df['Dport'] = df['Dport'].apply(portsAsInt)
df['Sport'] = df['Sport'].apply(portsAsInt)
# Assign the result back: calling fillna(inplace=True) on a column selection
# only modifies a temporary copy and leaves df untouched.
df[['Sport','Dport']] = df[['Sport','Dport']].fillna(0)

# Sanity check of the port conversion.
if (df['Dport'].notna().all() and df['Sport'].notna().all()):
    if (df['Dport'].apply(lambda x: isinstance(x,int)).all() and df['Sport'].apply(lambda x: isinstance(x,int)).all()):
        print("all ports are properly parsed")
    else:
        print("not all port properly parsed")
else:
    print("some ports are NA")

df = df.astype({'SrcAddr':'string', 'Sport':'int32', 'DstAddr':'string', 'Dport':'int32', 'Proto':'int32', 'State':'string'})
# Timestamps that were read as text are converted to floats; the positional
# lookup plus the emptiness guard avoids a KeyError on an empty frame.
if not df.empty and isinstance(df['StartTime'].iloc[0], str):
    df['StartTime'] = df['StartTime'].astype(float)
    df['LastTime'] = df['LastTime'].astype(float)
#    df['StartTime'] = df['StartTime'].apply(lambda x: datetime.strptime(x, '%H:%M:%S.%f').time())
#    df['LastTime'] = df['LastTime'].apply(lambda x: datetime.strptime(x, '%H:%M:%S.%f').time())
df.head(5)

all ports are properly parsed


Unnamed: 0,SrcAddr,Sport,DstAddr,Dport,Proto,State,Dur,SrcBytes,DstBytes,sTtl,dTtl,SrcLoss,DstLoss,SrcLoad,DstLoad,SrcPkts,DstPkts,SrcWin,DstWin,SrcTCPBase,DstTCPBase,sMeanPktSz,dMeanPktSz,SrcJitter,DstJitter,StartTime,LastTime,SIntPkt,DIntPkt,TcpRtt,SynAck,AckDat,Trans,Min,Max,Sum
0,00:00:00:00:00:00,0,00:00:00:00:00:00,0,0,INT,0.0,60,0,,,0,0,0.0,0.0,1,0,,,,,60.0,0.0,,,4026532000.0,0.0,0.0,,0.0,0.0,0.0,1,0.0,0.0,0.0
1,10.0.2.15,0,10.10.10.254,0,2054,CON,2.068213,336,42,,,0,0,1137.213623,0.0,8,1,,,,,42.0,42.0,447.082375,,7.944996,10.013209,295.448438,,0.0,0.0,0.0,1,2.068213,2.068213,2.068213
2,::,143,ff02::16,0,58,MHR,1.064836,290,0,1.0,,0,0,1457.501343,0.0,3,0,,,,,96.666664,0.0,456.441,,9.840128,10.904964,532.418,,0.0,0.0,0.0,1,1.064836,1.064836,1.064836
3,10.0.2.15,50496,91.189.92.40,443,6,FIN,3.610742,108,0,64.0,,0,0,119.642998,0.0,2,0,40880.0,,2396930000.0,,54.0,0.0,0.0,,10.01393,13.624673,3610.742,,0.0,0.0,0.0,1,3.610742,3.610742,3.610742
4,::,135,ff02::1:ff55:da3f,0,58,NNS,0.0,78,0,255.0,,0,0,0.0,0.0,1,0,,,,,78.0,0.0,0.0,,10.74837,10.748372,3610.742,,0.0,0.0,0.0,1,0.0,0.0,0.0


## conn.log

### Format conn.log data

In [4]:
if HAS_CONN:
    # Only clean and cast while the original Zeek column names are still in
    # place; all four are required by the statements below, so test for all
    # of them rather than for any one.
    if {'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p'}.issubset(zconn.columns):
        # Rows without any port cannot be matched to a flow and are removed.
        badIndex = zconn[['id.orig_p','id.resp_p']].isna().all(axis=1)
        badIndex = badIndex[badIndex].index
        zconn.drop(badIndex, axis=0, inplace=True)
        zconn = zconn.astype({'id.orig_h':'string', 'id.orig_p':'int32', 'id.resp_h':'string', 'id.resp_p':'int32', 'proto':'string','service':'string'})

    # Rename to the Argus column names used as merge keys later on.
    zconn.columns = ['StartTime', 'uid', 'SrcAddr', 'Sport', 'DstAddr','Dport','Proto', 'service', 'duration', 'orig_bytes', 'resp_bytes','conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history','orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes','tunnel_parents']

    # Translate protocol names to their protocol numbers in one vectorised
    # step. The previous row loop indexed .iloc with index labels, which is
    # only correct while the index has no gaps, and wrote through chained
    # assignment. Values that are not listed here are left unchanged.
    proto_numbers = {'tcp': '6', 'udp': '17', 'ipv4': '4', 'icmp': '1', 'igmp': '2'}
    zconn['Proto'] = zconn['Proto'].replace(proto_numbers)

    zconn = zconn.astype({'Proto':'int32'})

    zconn['StartTime'] = zconn['StartTime'].astype(float)
    print("Unique protocol list :", zconn['Proto'].unique())
    zconn.head(5)
else:
    print("No conn.log")

Unique protocol list : [ 6  1 17]


### Merging data from conn.log

In [5]:
if HAS_CONN:
    # A flow is matched on its endpoints, protocol and start time.
    conn_keys = ['SrcAddr', 'Sport', 'DstAddr', 'Dport', 'Proto', 'StartTime']
    conn_fields = zconn[conn_keys + ['service', 'duration', 'conn_state']]

    # Keep every Argus flow; conn.log fields are attached where a match exists.
    DS = df.merge(conn_fields, how='left', on=conn_keys)
    # The inner merge is only needed for its row count, so compute it once.
    matched = df.merge(conn_fields, how='inner', on=conn_keys).shape[0]
    print("Flows: ",DS.shape[0], "\nFlows not in conn.log: ", df.shape[0] - matched, "\nFlows only in conn.log: ", zconn.shape[0] - matched)
    DS.fillna(value={'service': '-','duration': 0,'conn_state': '-'}, inplace=True)
else:
    # Copy so that the placeholder columns are not written into df as well.
    DS = df.copy()
    DS[['service','duration','conn_state']] = ['-',0,'-']

Flows:  233215 
Flows not in conn.log:  141062 
Flows only in conn.log:  88109


## http.log

### Formatting http.log

In [6]:
if not HAS_HTTP:
    print("No http.log")
else:
    # Rename only while at least one original Zeek column name is present.
    zeek_http_names = ['ts','id.orig_h','id.orig_p','id.resp_h','id.resp_p']
    if zhttp.columns.isin(zeek_http_names).any():
        zhttp.columns = ['StartTime', 'uid', 'SrcAddr', 'Sport', 'DstAddr','Dport','trans_depth', 'method', 'host', 'uri', 'referrer', 'version','user_agent', 'origin', 'request_body_len', 'response_body_len','status_code', 'status_msg', 'info_code', 'info_msg', 'tags','username', 'password', 'proxied', 'orig_fuids', 'orig_filenames','orig_mime_types', 'resp_fuids', 'resp_filenames', 'resp_mime_types']

    # Discard rows in which both ports are missing.
    ports_missing = zhttp[['Sport','Dport']].isna().all(axis=1)
    zhttp.drop(ports_missing[ports_missing].index, axis=0, inplace=True)

    # Constant columns used as merge keys against the flow table.
    zhttp['service'] = 'http'
    zhttp['Proto'] = 6

    http_dtypes = {
        'StartTime':'float',
        'SrcAddr':'string',
        'Sport':'int32',
        'DstAddr':'string',
        'Dport':'int32',
        'Proto':'int32',
        'service':'string',
        'trans_depth':'int32',
        'response_body_len':'int32',
        'method':'string',
    }
    zhttp = zhttp.astype(http_dtypes)

### Merging data from http.log (port 80)

In [7]:
if HAS_HTTP:
    # http.log rows are matched on endpoints, protocol and service.
    # NOTE(review): StartTime is not part of the key, so several http.log
    # rows for one connection would duplicate that flow in DS2 -- confirm
    # this is intended.
    http_keys = ['SrcAddr', 'Sport', 'DstAddr', 'Dport', 'Proto', 'service']
    http_fields = zhttp[http_keys + ['trans_depth', 'response_body_len', 'method']]

    DS2 = DS.merge(http_fields, how='left', on=http_keys)
    # The inner merge is only needed for its row count, so compute it once.
    matched = DS.merge(http_fields, how='inner', on=http_keys).shape[0]
    print("Flows: ", DS2.shape[0], "\nFlows not in http.log: ", DS2.shape[0] - matched, "\nFlows only in http.log: ", zhttp.shape[0] - matched, "\nFlows in http.log: ", zhttp.shape[0])
    print("HTTP Flows in DS: ", DS[DS['service']=='http'].shape[0])
    DS2.fillna(value={'trans_depth': 0,'response_body_len': 0,'method': '-'}, inplace=True)
    DS2.head(5)
else:
    # Copy so that the placeholder columns are not written into DS as well.
    DS2 = DS.copy()
    DS2[['trans_depth','response_body_len','method']] = [0,0,'-']

Flows:  233215 
Flows not in http.log:  233212 
Flows only in http.log:  0 
Flows in http.log:  3
HTTP Flows in DS:  3


## ftp.log

### Formatting ftp.log data

In [8]:
if not HAS_FTP:
    print("No ftp.log")
else:
    # Rename only while at least one original Zeek column name is present.
    zeek_ftp_names = ['id.orig_h','id.orig_p','id.resp_h','id.resp_p']
    if zftp.columns.isin(zeek_ftp_names).any():
        zftp.columns = ['StartTime', 'uid', 'SrcAddr', 'Sport', 'DstAddr','Dport','user','password','command','arg','mime_type','file_size','reply_code','reply_msg','data_channel.passive','data_channel.orig_h','data_channel.resp_h','data_channel.resp_p','fuid']

    # Discard rows in which both ports are missing.
    ports_missing = zftp[['Sport','Dport']].isna().all(axis=1)
    zftp.drop(ports_missing[ports_missing].index, axis=0, inplace=True)

    # Constant columns used as merge keys against the flow table.
    zftp['service'] = 'ftp'
    zftp['Proto'] = 6

    ftp_dtypes = {
        'StartTime':'float',
        'SrcAddr':'string',
        'Sport':'int32',
        'DstAddr':'string',
        'Dport':'int32',
        'Proto':'int32',
        'service':'string',
        'user':'string',
        'password':'string',
        'command':'string',
    }
    zftp = zftp.astype(ftp_dtypes)

No ftp.log


### Merging data from ftp.log (port 21)

In [9]:
if HAS_FTP:
    # ftp.log rows are matched on endpoints, protocol and service; a flow
    # with several matching ftp.log rows appears once per row in DS3, which
    # is what "New Flows" reports below.
    ftp_keys = ['SrcAddr', 'Sport', 'DstAddr', 'Dport', 'Proto', 'service']
    ftp_fields = zftp[ftp_keys + ['user', 'password', 'command']]

    DS3 = DS2.merge(ftp_fields, how='left', on=ftp_keys)
    matched = DS2.merge(ftp_fields, how='inner', on=ftp_keys).shape[0]
    print("Flows in DS3: ", DS3.shape[0], "\nFlows in ftp.log: ", zftp.shape[0], "\nFlow in both: ", matched, "\nNew Flows: ", DS3.shape[0]-DS2.shape[0])
    # duplicated(keep='first') flags rows that repeat an earlier key, so the
    # figure is the number of repeated rows (the old label said "Non repeated").
    print("Repeated in zftp", zftp[zftp.duplicated(subset=ftp_keys, keep='first')].shape[0])
    DS3.fillna(value={'user': '-','password': '-','command': '-'}, inplace=True)
    DS3[DS3['service']=='ftp'].head(5)
else:
    # Copy so that the placeholder columns are not written into DS2 as well.
    DS3 = DS2.copy()
    DS3[['user','password','command']] = ['-','-','-']

## Fitting into UNSW-NB15 format

In [10]:
# (column in the merged frame, column name in the output frame), listed in
# output order.
column_pairs = [
    ('SrcAddr', 'srcip'), ('Sport', 'sport'), ('DstAddr', 'dstip'), ('Dport', 'dport'),
    ('Proto', 'proto'), ('State', 'state'), ('Dur', 'dur'),
    ('SrcBytes', 'sbytes'), ('DstBytes', 'dbytes'), ('sTtl', 'sttl'), ('dTtl', 'dttl'),
    ('SrcLoss', 'sloss'), ('DstLoss', 'dloss'), ('service', 'service'),
    ('SrcLoad', 'sload'), ('DstLoad', 'dload'), ('SrcPkts', 'spkts'), ('DstPkts', 'dpkts'),
    ('SrcWin', 'swin'), ('DstWin', 'dwin'), ('SrcTCPBase', 'stcpb'), ('DstTCPBase', 'dtcpb'),
    ('sMeanPktSz', 'smeansz'), ('dMeanPktSz', 'dmeansz'),
    ('trans_depth', 'trans_depth'), ('response_body_len', 'res_bdy_len'),
    ('SrcJitter', 'sjit'), ('DstJitter', 'djit'), ('StartTime', 'stime'), ('LastTime', 'ltime'),
    ('SIntPkt', 'sintpkt'), ('DIntPkt', 'dintpkt'),
    ('TcpRtt', 'tcprtt'), ('SynAck', 'synack'), ('AckDat', 'ackdat'),
    ('Trans', 'Trans'), ('Min', 'Min'), ('Max', 'Max'), ('Sum', 'Sum'),
    ('duration', 'duration'), ('conn_state', 'conn_state'), ('method', 'method'),
    ('user', 'user'), ('password', 'password'), ('command', 'command'),
]
DS = DS3[[merged_name for merged_name, _ in column_pairs]]
DS.columns = [output_name for _, output_name in column_pairs]
print(DS.shape)
DS.head(5)

(233215, 45)


Unnamed: 0,srcip,sport,dstip,dport,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,service,sload,dload,spkts,dpkts,swin,dwin,stcpb,dtcpb,smeansz,dmeansz,trans_depth,res_bdy_len,sjit,djit,stime,ltime,sintpkt,dintpkt,tcprtt,synack,ackdat,Trans,Min,Max,Sum,duration,conn_state,method,user,password,command
0,00:00:00:00:00:00,0,00:00:00:00:00:00,0,0,INT,0.0,60,0,,,0,0,-,0.0,0.0,1,0,,,,,60.0,0.0,0.0,0.0,,,4026532000.0,0.0,0.0,,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,-,-,-,-,-
1,10.0.2.15,0,10.10.10.254,0,2054,CON,2.068213,336,42,,,0,0,-,1137.213623,0.0,8,1,,,,,42.0,42.0,0.0,0.0,447.082375,,7.944996,10.013209,295.448438,,0.0,0.0,0.0,1,2.068213,2.068213,2.068213,0.0,-,-,-,-,-
2,::,143,ff02::16,0,58,MHR,1.064836,290,0,1.0,,0,0,-,1457.501343,0.0,3,0,,,,,96.666664,0.0,0.0,0.0,456.441,,9.840128,10.904964,532.418,,0.0,0.0,0.0,1,1.064836,1.064836,1.064836,0.0,-,-,-,-,-
3,10.0.2.15,50496,91.189.92.40,443,6,FIN,3.610742,108,0,64.0,,0,0,-,119.642998,0.0,2,0,40880.0,,2396930000.0,,54.0,0.0,0.0,0.0,0.0,,10.01393,13.624673,3610.742,,0.0,0.0,0.0,1,3.610742,3.610742,3.610742,3.610742,SH,-,-,-,-
4,::,135,ff02::1:ff55:da3f,0,58,NNS,0.0,78,0,255.0,,0,0,-,0.0,0.0,1,0,,,,,78.0,0.0,0.0,0.0,0.0,,10.74837,10.748372,3610.742,,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,-,-,-,-,-


In [11]:
#--------------------------#
# General Purpose Features #
#--------------------------#

#'is_sm_ips_ports'
# 1 when source and destination share both the IP address and the port,
# otherwise 0. The previous version compared the addresses only, although
# the feature (per the UNSW-NB15 feature description) also requires equal
# port numbers.
same_endpoint = (DS['srcip'] == DS['dstip']) & (DS['sport'] == DS['dport'])
# Missing comparisons count as "not equal" before the cast to 0/1.
DS['is_sm_ips_ports'] = same_endpoint.fillna(False).astype(int)

# 'ct_state_ttl'
# Every flow receives the number of flows sharing its (state, sttl, dttl)
# combination. The column is created up front so that it exists (as 0) even
# when no group is found; rows whose TTLs are missing belong to no group and
# therefore stay at 0.
DS['ct_state_ttl'] = 0
state_ttl_sizes = DS.groupby(['state','sttl','dttl'], as_index=False).size()
for state, sttl, dttl, flows in state_ttl_sizes.itertuples(index=False, name=None):
    same_group = (DS['state'] == state) & (DS['sttl'] == sttl) & (DS['dttl'] == dttl)
    DS.loc[same_group, 'ct_state_ttl'] = flows

# 'ct_flw_http_mthd'
# Every flow receives the number of flows using the same HTTP method; flows
# without a method (placeholder '-') keep the value 0.
DS['ct_flw_http_mthd'] = 0
method_sizes = DS.groupby(['method'], as_index=False).size()
for method, flows in method_sizes.itertuples(index=False, name=None):
    if method == '-':
        continue
    DS.loc[DS['method'] == method, 'ct_flw_http_mthd'] = flows

# is_ftp_login
# 1 for FTP flows that carry a user name other than '-', '<unknown>' or
# 'anonymous' together with a password other than '-'; 0 for every other
# flow. Computing the flag directly keeps the column purely numeric: the
# previous version left the '-' placeholder on rows that matched neither
# rule.
no_credentials = DS['user'].isin(['-', '<unknown>', 'anonymous']) | (DS['password'] == '-')
ftp_login = (DS['service'] == 'ftp') & ~no_credentials
DS['is_ftp_login'] = ftp_login.fillna(False).astype(int)

# ct_ftp_cmd
# Step 1: one row per (srcip, dstip, sport, dport, command) combination among
# the FTP flows.
teste = DS[DS['service']=='ftp'].groupby(['srcip','dstip','sport','dport','command'], as_index=False).size()
# Step 2: ignore the '-' placeholder command.
teste.drop(index=teste[teste['command']=='-'].index, inplace=True)
# Step 3: regroup by the four endpoint columns; 'size' is now the number of
# different remaining commands for each of them.
teste = teste.groupby(['srcip','dstip','sport','dport'], as_index=False).size()
# 'service' is added so it can serve as a merge key below.
teste['service'] = 'ftp'
teste.rename(columns={"size":"ct_ftp_cmd"}, inplace=True)
# Merge only when no column name containing 'ct_ftp_cmd' exists yet, so that
# re-running the cell does not add the column a second time.
if not(DS.columns.str.contains('ct_ftp_cmd', regex=False).any()):
    DS = DS.merge(teste, how='left', left_on=['srcip','dstip','sport','dport','service'],
                                right_on=['srcip','dstip','sport','dport','service'])
    # Flows without a match get 0 and the column is converted to integers.
    DS.fillna(value={'ct_ftp_cmd': 0}, inplace=True)
    DS['ct_ftp_cmd'] = DS['ct_ftp_cmd'].apply(int)
# Bare expression; its value is not assigned.
teste.columns

#---------------------#
# Connection Features #
#---------------------#
from collections import Counter, deque

# Number of immediately preceding flows (in 'ltime' order) inspected for
# every flow.
# NOTE(review): the value 101 is kept from the original loop; confirm
# whether 100 was intended.
PRIOR_WINDOW = 101

# Feature name -> columns that must be equal for an earlier flow to count.
# The order matches the order in which the columns are added to DS.
CONNECTION_FEATURES = {
    'ct_srv_src': ['srcip', 'service'],
    'ct_srv_dst': ['dstip', 'service'],
    'ct_dst_ltm': ['dstip'],
    'ct_src_ltm': ['srcip'],
    'ct_src_dport_ltm': ['srcip', 'dport'],
    'ct_dst_sport_ltm': ['dstip', 'sport'],
    'ct_dst_src_ltm': ['srcip', 'dstip'],
}


def _count_prior_matches(key_frame, window):
    """Count, for every row, the earlier rows in the window with equal keys.

    Parameters
    ----------
    key_frame : pandas.DataFrame
        Key columns, one row per flow, already in the desired order.
    window : int
        Maximum number of directly preceding rows that are inspected.

    Returns
    -------
    numpy.ndarray
        Integer array; element i is the number of rows among the `window`
        rows before row i whose key values all equal those of row i. The
        row itself is not counted. Rows with a missing key value get 0 and
        are never counted as a match for other rows.
    """
    matches = np.zeros(len(key_frame), dtype=int)
    in_window = Counter()   # key tuple -> occurrences inside the window
    recent = deque()        # keys of the rows currently inside the window
    for position, key in enumerate(key_frame.itertuples(index=False, name=None)):
        if any(pd.isna(part) for part in key):
            key = None      # placeholder: occupies a window slot, matches nothing
        else:
            matches[position] = in_window[key]
            in_window[key] += 1
        recent.append(key)
        if len(recent) > window:
            expired = recent.popleft()
            if expired is not None:
                in_window[expired] -= 1
                if not in_window[expired]:
                    del in_window[expired]
    return matches


# Stable sort so that flows with equal 'ltime' keep their relative order; the
# index is renumbered 0..n-1.
DS.sort_values('ltime', inplace=True, kind='mergesort', ignore_index=True)

# One sliding-window pass per feature replaces the seven groupby calls that
# were previously executed for every single row; the resulting counts are the
# same. The columns are now also created when DS is empty.
for feature, key_columns in CONNECTION_FEATURES.items():
    DS[feature] = _count_prior_matches(DS[key_columns], PRIOR_WINDOW)

DS.head(5)

Unnamed: 0,srcip,sport,dstip,dport,proto,state,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,service,sload,dload,spkts,dpkts,swin,dwin,stcpb,dtcpb,smeansz,dmeansz,trans_depth,res_bdy_len,sjit,djit,stime,ltime,sintpkt,dintpkt,tcprtt,synack,ackdat,Trans,Min,Max,Sum,duration,conn_state,method,user,password,command,is_sm_ips_ports,ct_state_ttl,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
0,00:00:00:00:00:00,0,00:00:00:00:00:00,0,0,INT,0.0,60,0,,,0,0,-,0.0,0.0,1,0,,,,,60.0,0.0,0.0,0.0,,,4026532000.0,0.0,0.0,,0.0,0.0,0.0,1,0.0,0.0,0.0,0,-,-,-,-,-,1,0,0,0,0,0,0,0,0,0,0,0
1,10.0.2.15,0,10.10.10.254,0,2054,CON,2.068213,336,42,,,0,0,-,1137.213623,0.0,8,1,,,,,42.0,42.0,0.0,0.0,447.082375,,7.944996,10.013209,295.448438,,0.0,0.0,0.0,1,2.068213,2.068213,2.068213,0,-,-,-,-,-,0,0,0,0,0,0,0,0,0,0,0,0
2,::,135,ff02::1:ff55:da3f,0,58,NNS,0.0,78,0,255.0,,0,0,-,0.0,0.0,1,0,,,,,78.0,0.0,0.0,0.0,0.0,,10.74837,10.748372,3610.742,,0.0,0.0,0.0,1,0.0,0.0,0.0,0,-,-,-,-,-,0,0,0,0,0,0,0,0,0,0,0,0
3,::,143,ff02::16,0,58,MHR,1.064836,290,0,1.0,,0,0,-,1457.501343,0.0,3,0,,,,,96.666664,0.0,0.0,0.0,456.441,,9.840128,10.904964,532.418,,0.0,0.0,0.0,1,1.064836,1.064836,1.064836,0,-,-,-,-,-,0,0,0,0,0,1,0,0,1,1,0,0
4,fe80::a00:27ff:fe55:da3f,143,ff02::16,0,58,MHR,0.559562,220,0,1.0,,0,0,-,1572.658569,0.0,2,0,,,,,110.0,0.0,0.0,0.0,0.0,,11.74826,12.307827,559.562,,0.0,0.0,0.0,1,0.559562,0.559562,0.559562,0,-,-,-,-,-,0,0,0,0,0,0,1,1,0,0,1,0


In [12]:
# Remaining gaps in these numeric features are exported as 0.
DS.fillna(value={'sttl': 0, 'dttl': 0, 'swin': 0, 'dwin': 0, 'stcpb': 0, 'dtcpb': 0, 'sjit': 0, 'djit': 0,'dintpkt': 0}, inplace=True)
# Helper columns that are not part of the exported feature set.
# errors='ignore' removes whichever of them are present; the previous
# "any present -> drop all" test raised a KeyError when only some remained.
DS.drop(columns=['Trans', 'Min', 'Max', 'Sum', 'duration', 'conn_state', 'method', 'user', 'password', 'command'], errors='ignore', inplace=True)
DS.to_csv('./attack_NB15.csv', index=None, header=True)

# Labels

In [14]:
# Make sure the helper columns are gone even if the export cell was skipped.
# errors='ignore' removes whichever of them are present; the previous
# "any present -> drop all" test raised a KeyError when only some remained.
DS.drop(columns=['Trans', 'Min', 'Max', 'Sum', 'duration', 'conn_state', 'method', 'user', 'password', 'command'], errors='ignore', inplace=True)
labels = pd.read_csv("./labels/attack_labels.csv")
labels.head(5)

Unnamed: 0,ip,label
0,172.16.0.3,nmap_tcp_syn
1,172.16.0.4,nmap_tcp_conn
2,172.16.0.5,nmap_tcp_null
3,172.16.0.6,nmap_tcp_xmas
4,172.16.0.7,nmap_tcp_fin


### insert attack category and label

In [16]:
# Inspect the column names of the label table before renaming them.
labels.columns

Index(['ip', 'label'], dtype='object')

In [17]:
# Align the label table with the flow table: 'ip' becomes the merge key
# 'srcip' and 'label' becomes 'attack_cat'.
labels.rename(columns={'ip':'srcip', 'label':'attack_cat'}, inplace=True)
labels['attack_cat'] = labels['attack_cat'].str.strip()
# Remove label columns left over from an earlier run. errors='ignore' copes
# with only one of them being present, which previously raised a KeyError.
DS.drop(columns=['Label','attack_cat'], errors='ignore', inplace=True)
# Every flow whose source address appears in the label table receives that
# address's attack category.
DS = DS.merge(labels[['srcip','attack_cat']], how='left', on='srcip')
DS.fillna(value={'attack_cat': '-'}, inplace=True)
# Label is 1 for flows with an attack category and 0 for the '-' placeholder.
DS['Label'] = (DS['attack_cat'] != '-').astype(int)
DS.groupby(['attack_cat','Label'], as_index=False).size()

Unnamed: 0,attack_cat,Label,size
0,-,0,147587
1,hping_tcp_ack,1,1000
2,hping_tcp_fin,1,1000
3,hping_tcp_null,1,1000
4,hping_tcp_syn,1,1000
...,...,...,...
60,unicornscan_tcp_fin,1,1014
61,unicornscan_tcp_fxmas,1,1014
62,unicornscan_tcp_null,1,1014
63,unicornscan_tcp_syn,1,1014


In [14]:
# List the distinct attack categories present in the labelled data.
DS['attack_cat'].unique()

array(['-', 'Exploits', 'Fuzzers', 'Reconnaissance', 'Backdoor', 'DoS',
       'Generic', 'Analysis', 'Shellcode', 'Worms', 'Backdoors'],
      dtype=object)

In [18]:
# Write the labelled data set to CSV with a header row.
DS.to_csv('./dataset/attack_NB15.csv', index=None, header=True)

# CIC Flow Meter

### source: https://pypi.org/project/cicflowmeter/


| Feature Name | Description |
| :--- | :---: |
| Duration | Duration of the flow in microseconds |
| Flow Duration | Duration of the flow in microseconds |
| total Fwd Packet | Total packets in the forward direction |
| total Bwd packets | Total packets in the backward direction |
| total Length of Fwd Packet | Total size of packet in forward direction |
| total Length of Bwd Packet | Total size of packet in backward direction |
| Fwd Packet Length Min | Minimum size of packet in forward direction |
| Fwd Packet Length Max | Maximum size of packet in forward direction |
| Fwd Packet Length Mean | Mean size of packet in forward direction |
| Fwd Packet Length Std | Standard deviation size of packet in forward direction |
| Bwd Packet Length Min | Minimum size of packet in backward direction |
| Bwd Packet Length Max | Maximum size of packet in backward direction |
| Bwd Packet Length Mean | Mean size of packet in backward direction |
| Bwd Packet Length Std | Standard deviation size of packet in backward direction |
| Flow Bytes/s | Number of flow bytes per second |
| Flow Packets/s | Number of flow packets per second |
| Flow IAT Mean | Mean time between two packets sent in the flow |
| Flow IAT Std | Standard deviation time between two packets sent in the flow |
| Flow IAT Max | Maximum time between two packets sent in the flow |
| Flow IAT Min | Minimum time between two packets sent in the flow |
| Fwd IAT Min | Minimum time between two packets sent in the forward direction |
| Fwd IAT Max | Maximum time between two packets sent in the forward direction |
| Fwd IAT Mean | Mean time between two packets sent in the forward direction |
| Fwd IAT Std | Standard deviation time between two packets sent in the forward direction |
| Fwd IAT Total | Total time between two packets sent in the forward direction |
| Bwd IAT Min | Minimum time between two packets sent in the backward direction|
| Bwd IAT Max | Maximum time between two packets sent in the backward direction |
| Bwd IAT Mean | Mean time between two packets sent in the backward direction |
| Bwd IAT Std | Standard deviation time between two packets sent in the backward direction |
| Bwd IAT Total | Total time between two packets sent in the backward direction |
| Fwd PSH flag | Number of times the PSH flag was set in packets travelling in the forward direction (0 for UDP) |
| Bwd PSH Flag | Number of times the PSH flag was set in packets travelling in the backward direction (0 for UDP) |
| Fwd URG Flag | Number of times the URG flag was set in packets travelling in the forward direction (0 for UDP) |
| Bwd URG Flag | Number of times the URG flag was set in packets travelling in the backward direction (0 for UDP) |
| Fwd Header Length | Total bytes used for headers in the forward direction |
| Bwd Header Length | Total bytes used for headers in the backward direction |
| FWD Packets/s | Number of forward packets per second |
| Bwd Packets/s | Number of backward packets per second |
| Min Packet Length | Minimum length of a packet |
| Max Packet Length | Maximum length of a packet |
| Packet Length Mean | Mean length of a packet |
| Packet Length Std | Standard deviation length of a packet |
| Packet Length Variance | Variance length of a packet |
| FIN Flag Count | Number of packets with FIN |
| SYN Flag Count | Number of packets with SYN |
| RST Flag Count | Number of packets with RST |
| PSH Flag Count | Number of packets with PUSH |
| ACK Flag Count | Number of packets with ACK |
| URG Flag Count | Number of packets with URG |
| CWR Flag Count | Number of packets with CWR |
| ECE Flag Count | Number of packets with ECE |
| down/Up Ratio | Download and upload ratio |
| Average Packet Size | Average size of packet |
| Avg Fwd Segment Size | Average size observed in the forward direction |
| AVG Bwd Segment Size | Average size observed in the backward direction |
| Fwd Header Length | Length of header for forward packet |
| Fwd Avg Bytes/Bulk | Average number of bytes bulk rate in the forward direction |
| Fwd AVG Packet/Bulk | Average number of packets bulk rate in the forward direction |
| Fwd AVG Bulk Rate | Average number of bulk rate in the forward direction |
| Bwd Avg Bytes/Bulk | Average number of bytes bulk rate in the backward direction |
| Bwd AVG Packet/Bulk | Average number of packets bulk rate in the backward direction |
| Bwd AVG Bulk Rate | Average number of bulk rate in the backward direction |
| Subflow Fwd Packets | The average number of packets in a sub flow in the forward direction |
| Subflow Fwd Bytes | The average number of bytes in a sub flow in the forward direction |
| Subflow Bwd Packet | The average number of packets in a sub flow in the backward direction |
| Subflow Bwd Bytes | The average number of bytes in a sub flow in the backward direction |
| Init_Win_bytes_forward | The total number of bytes sent in initial window in the forward direction |
| Init_Win_bytes_backward | The total number of bytes sent in initial window in the backward direction |
| Act_data_pkt_forward | Count of packets with at least 1 byte of TCP data payload in the forward direction |
| min_seg_size_forward | Minimum segment size observed in the forward direction |
| Active Min | Minimum time a flow was active before becoming idle |
| Active Mean | Mean time a flow was active before becoming idle |
| Active Max | Maximum time a flow was active before becoming idle |
| Active Std | Standard deviation time a flow was active before becoming idle |
| Idle Min | Minimum time a flow was idle before becoming active |
| Idle Mean | Mean time a flow was idle before becoming active |
| Idle Max | Maximum time a flow was idle before becoming active |
| Idle Std | Standard deviation time a flow was idle before becoming active |
| total_fpackets | Total packets in the forward direction |
| total_bpackets | Total packets in the backward direction |
| total_fpktl | Total size of packet in forward direction |
| total_bpktl | Total size of packet in backward direction |
| min_fpktl | Minimum size of packet in forward direction |
| min_bpktl | Minimum size of packet in backward direction |
| max_fpktl | Maximum size of packet in forward direction |
| max_bpktl | Maximum size of packet in backward direction |
| mean_fpktl | Mean size of packet in forward direction |
| mean_bpktl | Mean size of packet in backward direction |
| std_fpktl | Standard deviation size of packet in forward direction |
| std_bpktl | Standard deviation size of packet in backward direction |
| total_fiat | Total time between two packets sent in the forward direction |
| total_biat | Total time between two packets sent in the backward direction |
| min_fiat | Minimum time between two packets sent in the forward direction |
| min_biat | Minimum time between two packets sent in the backward direction |
| max_fiat | Maximum time between two packets sent in the forward direction |
| max_biat | Maximum time between two packets sent in the backward direction |
| mean_fiat | Mean time between two packets sent in the forward direction |
| mean_biat | Mean time between two packets sent in the backward direction |
| std_fiat | Standard deviation time between two packets sent in the forward direction |
| std_biat | Standard deviation time between two packets sent in the backward direction |
| fpsh_cnt | Number of times the PSH flag was set in packets travelling in the forward direction (0 for UDP) |
| bpsh_cnt | Number of times the PSH flag was set in packets travelling in the backward direction (0 for UDP) |
| furg_cnt | Number of times the URG flag was set in packets travelling in the forward direction (0 for UDP) |
| burg_cnt | Number of times the URG flag was set in packets travelling in the backward direction (0 for UDP) |
| total_fhlen | Total bytes used for headers in the forward direction |
| total_bhlen | Total bytes used for headers in the backward direction |
| fPktsPerSecond | Number of forward packets per second |
| bPktsPerSecond | Number of backward packets per second |
| flowPktsPerSecond | Number of flow packets per second |
| flowBytesPerSecond | Number of flow bytes per second |
| min_flowpktl | Minimum packet length in a flow |
| max_flowpktl | Maximum packet length in a flow |
| mean_flowpktl | Mean packet length in a flow |
| std_flowpktl | Standard deviation of packet length in a flow |
| min_flowiat | Minimum inter-arrival time of packet |
| max_flowiat | Maximum inter-arrival time of packet |
| mean_flowiat | Mean inter-arrival time of packet |
| std_flowiat | Standard deviation inter-arrival time of packet |
| flow_fin | Number of packets with FIN |
| flow_syn | Number of packets with SYN |
| flow_rst | Number of packets with RST |
| flow_psh | Number of packets with PUSH |
| flow_ack | Number of packets with ACK |
| flow_urg | Number of packets with URG |
| flow_cwr | Number of packets with CWR |
| flow_ece | Number of packets with ECE |
| downUpRatio | Download and upload ratio |
| avgPacketSize | Average size of packet |
| fAvgSegmentSize | Average size observed in the forward direction |
| fAvgBytesPerBulk | Average number of bytes bulk rate in the forward direction |
| fAvgPacketsPerBulk | Average number of packets bulk rate in the forward direction |
| fAvgBulkRate | Average bulk rate in the forward direction |
| bAvgSegmentSize | Average size observed in the backward direction |
| bAvgBytesPerBulk | Average number of bytes bulk rate in the backward direction |
| bAvgPacketsPerBulk | Average number of packets bulk rate in the backward direction |
| bAvgBulkRate | Average bulk rate in the backward direction |
| sflow_fpacket | The average number of packets in a sub flow in the forward direction |
| sflow_fbytes | The average number of bytes in a sub flow in the forward direction |
| sflow_bpacket | The average number of packets in a sub flow in the backward direction |
| sflow_bbytes | The average number of bytes in a sub flow in the backward direction |
| min_active | Minimum time a flow was active before becoming idle |
| mean_active | Mean time a flow was active before becoming idle |
| max_active | Maximum time a flow was active before becoming idle |
| std_active | Standard deviation time a flow was active before becoming idle |
| min_idle | Minimum time a flow was idle before becoming active |
| mean_idle | Mean time a flow was idle before becoming active |
| max_idle | Maximum time a flow was idle before becoming active |
| std_idle | Standard deviation time a flow was idle before becoming active |
| Init_Win_bytes_forward | The total number of bytes sent in initial window in the forward direction |
| Init_Win_bytes_backward | The total number of bytes sent in initial window in the backward direction |
| Act_data_pkt_forward | Count of packets with at least 1 byte of TCP data payload in the forward direction |
| min_seg_size_forward | Minimum segment size observed in the forward direction |

In [19]:
# Load the CICFlowMeter flow export (comma-separated) for the attack capture
# and preview its first two rows.
cic_csv_path = "./csv/attack/cic.csv"
cicfm = pd.read_csv(cic_csv_path, sep=",")
cicfm.head(2)

Unnamed: 0,src_ip,dst_ip,src_port,dst_port,protocol,timestamp,flow_duration,flow_byts_s,flow_pkts_s,fwd_pkts_s,bwd_pkts_s,tot_fwd_pkts,tot_bwd_pkts,totlen_fwd_pkts,totlen_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,fwd_pkt_len_std,bwd_pkt_len_max,bwd_pkt_len_min,bwd_pkt_len_mean,bwd_pkt_len_std,pkt_len_max,pkt_len_min,pkt_len_mean,pkt_len_std,pkt_len_var,fwd_header_len,bwd_header_len,fwd_seg_size_min,fwd_act_data_pkts,flow_iat_mean,flow_iat_max,flow_iat_min,flow_iat_std,fwd_iat_tot,fwd_iat_max,fwd_iat_min,fwd_iat_mean,fwd_iat_std,bwd_iat_tot,bwd_iat_max,bwd_iat_min,bwd_iat_mean,bwd_iat_std,fwd_psh_flags,bwd_psh_flags,fwd_urg_flags,bwd_urg_flags,fin_flag_cnt,syn_flag_cnt,rst_flag_cnt,psh_flag_cnt,ack_flag_cnt,urg_flag_cnt,ece_flag_cnt,down_up_ratio,pkt_size_avg,init_fwd_win_byts,init_bwd_win_byts,active_max,active_min,active_mean,active_std,idle_max,idle_min,idle_mean,idle_std,fwd_byts_b_avg,fwd_pkts_b_avg,bwd_byts_b_avg,bwd_pkts_b_avg,fwd_blk_rate_avg,bwd_blk_rate_avg,fwd_seg_size_avg,bwd_seg_size_avg,cwe_flag_count,subflow_fwd_pkts,subflow_bwd_pkts,subflow_fwd_byts,subflow_bwd_byts
0,10.0.2.15,91.189.92.40,50496,443,6,1969-12-31 21:00:10,68217192.0,3.957947,0.073295,0.073295,0.0,5,0,270,0,54.0,54.0,54.0,0.0,0.0,0.0,0.0,0.0,54,54,54.0,0.0,0.0,100,0,20,0,17054298.0,36908945.0,3610742.0,12628600.0,68217192.0,36908945.0,3610742.0,17054298.0,12628600.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0.0,54.0,40880,0,0.0,0.0,0.0,0.0,18453748.0,3610742.0,9227236.25,5692156.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,0.0,0,5,0,270,0
1,10.10.10.13,10.10.10.255,138,138,17,1969-12-31 21:01:22,0.0,0.0,0.0,0.0,0.0,1,0,243,0,243.0,243.0,243.0,0.0,0.0,0.0,0.0,0.0,243,243,243.0,0.0,0.0,8,0,8,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,243.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,243.0,0.0,0,1,0,243,0


In [21]:

# flow ID, Source IP, Source Port, Destination IP, Destination Port, Protocol, Timestamp,
# Flow Duration, Total Fwd Packets, Total Backward Packets, Total Length of Fwd Packets,
# Total Length of Bwd Packets, Fwd Packet Length Max, Fwd Packet Length Min,
# Fwd Packet Length Mean, Fwd Packet Length Std, Bwd Packet Length Max,
# Bwd Packet Length Min, Bwd Packet Length Mean, Bwd Packet Length Std, 
# Flow Bytes/s, Flow Packets/s,
# Flow IAT Mean, Flow IAT Std, Flow IAT Max, Flow IAT Min,Fwd IAT Total, Fwd IAT Mean, Fwd IAT Std, Fwd IAT Max, Fwd IAT Min,
# Bwd IAT Total, Bwd IAT Mean, Bwd IAT Std, Bwd IAT Max, Bwd IAT Min,Fwd PSH Flags, Bwd PSH Flags, Fwd URG Flags, Bwd URG Flags,
# Fwd Header Length, Bwd Header Length, Fwd Packets/s, Bwd Packets/s,
# Min Packet Length, Max Packet Length, Packet Length Mean, Packet Length Std, Packet Length Variance,
# FIN Flag Count, SYN Flag Count, RST Flag Count, PSH Flag Count, ACK Flag Count, URG Flag Count, 
# CWE Flag Count,ECE Flag Count, Down/Up Ratio, Average Packet Size,
# Avg Fwd Segment Size, Avg Bwd Segment Size, Fwd Header Length,
# Fwd Avg Bytes/Bulk, Fwd Avg Packets/Bulk, Fwd Avg Bulk Rate, Bwd Avg Bytes/Bulk, Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,
# Subflow Fwd Packets, Subflow Fwd Bytes, Subflow Bwd Packets, Subflow Bwd Bytes,
# Init_Win_bytes_forward, Init_Win_bytes_backward, act_data_pkt_fwd, min_seg_size_forward,Active Mean, Active Std, Active Max, Active Min,
# Idle Mean, Idle Std, Idle Max, Idle Min, Label

# Build a per-flow identifier of the form
# "<dst_ip>-<dst_port>-<src_ip>-<src_port>-<protocol>". The port and protocol
# columns are converted to text so they can be concatenated with the addresses.
cicfm['flow_ID'] = (
    cicfm['dst_ip'] + '-' + cicfm['dst_port'].apply(str) + '-'
    + cicfm['src_ip'] + '-' + cicfm['src_port'].apply(str) + '-'
    + cicfm['protocol'].apply(str)
)

# Align the label table's column names with the CICFlowMeter frame.
# This renames `labels` in place; if the columns were already renamed
# (e.g. on a re-run of this cell) the call leaves them unchanged.
labels.rename(columns={'srcip': 'src_ip', 'attack_cat': 'Label'}, inplace=True)

# Make the cell safe to re-run: merging while 'Label' already exists would
# produce 'Label_x'/'Label_y' (pandas' default overlap suffixes), leaving no
# 'Label' column for the fillna below or for later cells.
if 'Label' in cicfm.columns:
    del cicfm['Label']

# Left join keeps every flow. Rows are matched on source IP only, so every
# flow originating from a labelled address receives that address's label.
# NOTE(review): if `labels` lists the same src_ip more than once, this join
# will duplicate the matching flows -- confirm the label table is unique per IP.
cicfm = cicfm.merge(labels[['src_ip', 'Label']], how='left', on='src_ip')

# Flows whose source address has no entry in `labels` are marked as benign.
cicfm.fillna(value={'Label': 'benign'}, inplace=True)
cicfm.head(5)

Unnamed: 0,src_ip,dst_ip,src_port,dst_port,protocol,timestamp,flow_duration,flow_byts_s,flow_pkts_s,fwd_pkts_s,bwd_pkts_s,tot_fwd_pkts,tot_bwd_pkts,totlen_fwd_pkts,totlen_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,fwd_pkt_len_std,bwd_pkt_len_max,bwd_pkt_len_min,bwd_pkt_len_mean,bwd_pkt_len_std,pkt_len_max,pkt_len_min,pkt_len_mean,pkt_len_std,pkt_len_var,fwd_header_len,bwd_header_len,fwd_seg_size_min,fwd_act_data_pkts,flow_iat_mean,flow_iat_max,flow_iat_min,flow_iat_std,fwd_iat_tot,fwd_iat_max,fwd_iat_min,fwd_iat_mean,fwd_iat_std,bwd_iat_tot,bwd_iat_max,bwd_iat_min,bwd_iat_mean,bwd_iat_std,fwd_psh_flags,bwd_psh_flags,fwd_urg_flags,bwd_urg_flags,fin_flag_cnt,syn_flag_cnt,rst_flag_cnt,psh_flag_cnt,ack_flag_cnt,urg_flag_cnt,ece_flag_cnt,down_up_ratio,pkt_size_avg,init_fwd_win_byts,init_bwd_win_byts,active_max,active_min,active_mean,active_std,idle_max,idle_min,idle_mean,idle_std,fwd_byts_b_avg,fwd_pkts_b_avg,bwd_byts_b_avg,bwd_pkts_b_avg,fwd_blk_rate_avg,bwd_blk_rate_avg,fwd_seg_size_avg,bwd_seg_size_avg,cwe_flag_count,subflow_fwd_pkts,subflow_bwd_pkts,subflow_fwd_byts,subflow_bwd_byts,flow_ID,Label
0,10.0.2.15,91.189.92.40,50496,443,6,1969-12-31 21:00:10,68217192.0,3.957947,0.073295,0.073295,0.0,5,0,270,0,54.0,54.0,54.0,0.0,0.0,0.0,0.0,0.0,54,54,54.0,0.0,0.0,100,0,20,0,17054298.0,36908945.0,3610742.0,12628600.0,68217192.0,36908945.0,3610742.0,17054298.0,12628600.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0.0,54.0,40880,0,0.0,0.0,0.0,0.0,18453748.0,3610742.0,9227236.25,5692156.0,0.0,0.0,0.0,0.0,0.0,0.0,54.0,0.0,0,5,0,270,0,91.189.92.40-443-10.0.2.15-50496-6,benign
1,10.10.10.13,10.10.10.255,138,138,17,1969-12-31 21:01:22,0.0,0.0,0.0,0.0,0.0,1,0,243,0,243.0,243.0,243.0,0.0,0.0,0.0,0.0,0.0,243,243,243.0,0.0,0.0,8,0,8,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,243.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,243.0,0.0,0,1,0,243,0,10.10.10.255-138-10.10.10.13-138-17,benign
2,10.10.10.13,10.10.10.255,138,138,17,1969-12-31 21:05:23,0.0,0.0,0.0,0.0,0.0,1,0,243,0,243.0,243.0,243.0,0.0,0.0,0.0,0.0,0.0,243,243,243.0,0.0,0.0,8,0,8,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,243.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,243.0,0.0,0,1,0,243,0,10.10.10.255-138-10.10.10.13-138-17,benign
3,10.10.10.13,10.10.10.255,138,138,17,1969-12-31 21:13:21,0.0,0.0,0.0,0.0,0.0,1,0,243,0,243.0,243.0,243.0,0.0,0.0,0.0,0.0,0.0,243,243,243.0,0.0,0.0,8,0,8,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,243.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,243.0,0.0,0,1,0,243,0,10.10.10.255-138-10.10.10.13-138-17,benign
4,10.10.10.13,10.10.10.255,138,138,17,1969-12-31 21:25:21,0.0,0.0,0.0,0.0,0.0,1,0,243,0,243.0,243.0,243.0,0.0,0.0,0.0,0.0,0.0,243,243,243.0,0.0,0.0,8,0,8,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,243.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,243.0,0.0,0,1,0,243,0,10.10.10.255-138-10.10.10.13-138-17,benign


In [23]:
# Persist the labelled CICFlowMeter flows. Column names are written as the
# header row and the row index is left out of the file.
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
os.makedirs('./dataset', exist_ok=True)
cicfm.to_csv('./dataset/attack_CIC.csv', index=False, header=True)

# Show the distinct labels present after the merge.
cicfm['Label'].unique()

array(['benign', 'nmap_tcp_syn', 'nmap_tcp_conn', 'nmap_tcp_null',
       'nmap_tcp_xmas', 'nmap_tcp_fin', 'nmap_tcp_ack', 'nmap_tcp_window',
       'nmap_tcp_maimon', 'unicornscan_tcp_syn', 'unicornscan_tcp_conn',
       'unicornscan_tcp_null', 'unicornscan_tcp_xmas',
       'unicornscan_tcp_fxmas', 'unicornscan_tcp_fin',
       'unicornscan_tcp_ack', 'hping_tcp_syn', 'hping_tcp_null',
       'hping_tcp_xmas', 'hping_tcp_fin', 'hping_tcp_ack',
       'masscan_tcp_syn', 'nmap_ping_scan', 'nmap_vvv', 'nmap_connect',
       'nmap_fast', 'nmap_servinfo', 'nmap_reason', 'nmap_open',
       'nmap_top10', 'nmap_fragv', 'nmap_mtu', 'nmap_spoof_idle',
       'nmap_noping', 'nmap_udping', 'nmap_ack_syn', 'nmap_syn',
       'nmap_ipproto', 'nmap_udp', 'nmap_null', 'nmap_fin',
       'nmap_stealth', 'nmap_spoof_mac', 'nmap_fake_srcprt',
       'nmap_data_length', 'nmap_bad_checksum', 'nmap_random_host',
       'nmap_fw_bypass', 'nmap_firewalk', 'nmap_http_methods',
       'nmap_rpcinfo', 'nmap_ba

In [26]:
# Count how many flows carry each label; 'Label' stays a regular column
# (as_index=False) next to the resulting 'size' column.
flows_by_label = cicfm.groupby(by=['Label'], as_index=False)
flows_by_label.size()

Unnamed: 0,Label,size
0,benign,16072
1,hping_tcp_ack,1000
2,hping_tcp_fin,1000
3,hping_tcp_null,1000
4,hping_tcp_syn,1000
...,...,...
60,unicornscan_tcp_fin,1014
61,unicornscan_tcp_fxmas,1014
62,unicornscan_tcp_null,1014
63,unicornscan_tcp_syn,1014


# THE END!