In [15]:
import os 
import pandas as pd
import numpy as np

# Define a Goal
Our Goal is to create a Network Intrusion Detection System (NIDS).
Categorize each object that is a raw packet as:
- Attack or Not Attack (label 1 or 0)
- Category of the Attack: Fuzzers, Analysis, Backdoors, DoS, Exploits, Generic, Reconnaissance, Shellcode and Worms.

# Data AcQuisition (DAQ)
We used a non cleaned dataset found on kaggle.com: **UNSW-NB15**. The raw packet was created by the *'IXIA PerfectStorm tool'*. This dataset is a labeled datset and in particular has nine types of attacks: 
- Generic: Broad category of general attacks.
- Fuzzers: Send random data to find vulnerabilities.
- Backdoors: Hidden access for unauthorized control.
- DoS: Overwhelm a system to disrupt service.
- Exploits: Use vulnerabilities for unauthorized access.
- Reconnaissance: Gather info to find vulnerabilities.
- Shellcode: Malicious code for system control.
- Worms: Self-spreading malware across networks.

First we assign a column name according to *NUSW-NB15_features.csv*.

In [16]:
column_name = [
    'srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur',
    'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service',
    'Sload', 'Dload', 'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb',
    'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit',
    'Stime', 'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
    'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
    'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
    'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat',
    'Label'
]
df = pd.read_csv('UNSW-NB15_4.csv', header=None, names=column_name)

# Data Exploration
The exploration of the data is made in order to identify the dataset content, to extract and visualize data. We will use this in order to indentify possible features in the dataset that we do not want.

In [17]:
df.head(5)

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,59.166.0.9,7045,149.171.126.7,25,tcp,FIN,0.201886,37552,3380,31,...,,2,2,7,4,1,1,3,,0
1,59.166.0.9,9685,149.171.126.2,80,tcp,FIN,5.864748,19410,1087890,31,...,,3,1,4,4,1,1,1,,0
2,59.166.0.2,1421,149.171.126.4,53,udp,CON,0.001391,146,178,31,...,,3,5,2,7,1,1,4,,0
3,59.166.0.2,21553,149.171.126.2,25,tcp,FIN,0.053948,37812,3380,31,...,,1,1,4,7,1,1,3,,0
4,59.166.0.8,45212,149.171.126.4,53,udp,CON,0.000953,146,178,31,...,,2,5,2,1,1,1,2,,0


We make a describe() only for usefull feature:
- Dur: Analyzes the duration of connections.
- Sbytes, Dbytes: Amount of bytes exchanged between source and destination.
- Sload, Dload: Data transmission speed.
- Spkts, Dpkts: Number of packets sent and received.
- Sjit, Djit: Jitter of the connection (variation in delay).
- Sintpkt, Dintpkt: Interval between packets.
- Tcprtt, Synack, Ackdat: TCP round-trip time, SYN-ACK, and acknowledgment time.

In [46]:
column_described = [
    'dur', 'sbytes', 'dbytes', 'Sload', 'Dload', 'sloss', 'dloss',
    'Spkts', 'Dpkts', 'Sjit', 'Djit',
    'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat'
]

df[column_described].describe()


Unnamed: 0,dur,sbytes,dbytes,Sload,Dload,sloss,dloss,Spkts,Dpkts,Sjit,Djit,Sintpkt,Dintpkt,tcprtt,synack,ackdat
count,440044.0,440044.0,440044.0,440044.0,440044.0,440044.0,440044.0,440044.0,440044.0,440044.0,440044.0,440044.0,440044.0,440044.0,440044.0,440044.0
mean,0.58089,3805.922,26933.48,66943440.0,1951188.0,4.164484,12.246309,26.208541,32.320084,1425.334,539.643135,127.166633,54.118351,0.00893,0.004706,0.004225
std,3.694473,65268.56,137865.7,153132400.0,3915264.0,25.464147,48.760595,73.360219,104.588023,17110.96,2005.982783,2170.642473,1119.445521,0.068126,0.039393,0.035075
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9e-06,146.0,0.0,304345.3,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.008,0.0,0.0,0.0,0.0
50%,0.004552,534.0,304.0,923525.6,76385.22,0.0,0.0,4.0,4.0,1.636482,0.374054,0.323818,0.249588,0.0,0.0,0.0
75%,0.089172,2646.0,10168.0,76000000.0,1053726.0,7.0,8.0,34.0,30.0,97.3669,37.919519,2.475747,1.673626,0.000673,0.000536,0.00013
max,59.999989,11063470.0,12838550.0,5600000000.0,22906900.0,4158.0,4829.0,8324.0,9660.0,1201182.0,120773.4484,84371.496,52133.24,10.037506,4.525272,5.512234


## Validating value 0 for the Duration of the packet
Value 0 in duration could be an error. let's compare it with other values relevant like *sbytes, dbytes, Sjit, Djit, sloss, or dloss*.

In [63]:
dur_zero = df[df['dur'] == 0]
dur_zero.count().to_frame().T

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,441,441,441,441,441,441,441,441,441,441,...,441,441,441,441,441,441,441,441,1,441


In [64]:
columns_of_interest = ['sbytes', 'dbytes', 'Sjit', 'Djit', 'sloss', 'dloss']
dur_zero[columns_of_interest].describe()

Unnamed: 0,sbytes,dbytes,Sjit,Djit,sloss,dloss
count,441.0,441.0,441.0,441.0,441.0,441.0
mean,50.678005,0.0,4.031095,0.0,0.0,0.0
std,70.226165,0.0,6.172316,0.0,0.0,0.0
min,28.0,0.0,0.0,0.0,0.0,0.0
25%,46.0,0.0,0.0,0.0,0.0,0.0
50%,46.0,0.0,0.0,0.0,0.0,0.0
75%,46.0,0.0,10.954515,0.0,0.0,0.0
max,1504.0,0.0,15.492038,0.0,0.0,0.0


Instead it seems that everything is ok because every packet with *dur* == 0 have a *sbytes* not zero and instead the *dbytes* are zero so no bytes are reaching the destination due to an error in the trasmission. 
No packet loss is encountered and we have only jitter for the source (*Sjit*) -> OK.

## Categoric Feature

In [45]:
column_1=[
    'proto', 'state', 'service', 'attack_cat', 'Label'
    ]

df[column_1].value_counts().to_frame().reset_index().rename(columns={0: 'count'}).head(10)

Unnamed: 0,proto,state,service,attack_cat,Label,count
0,udp,INT,dns,Generic,1,60344
1,tcp,FIN,http,Exploits,1,2966
2,tcp,FIN,-,Fuzzers,1,2825
3,unas,INT,-,Exploits,1,1800
4,unas,INT,-,DoS,1,1680
5,udp,INT,-,Fuzzers,1,1418
6,tcp,FIN,-,Exploits,1,1265
7,tcp,FIN,-,Reconnaissance,1,1180
8,udp,INT,-,Reconnaissance,1,1169
9,tcp,FIN,smtp,Exploits,1,1035
