# Feature Selection using the Adjusted Mutual Information (AMI) Score 

### (DNS analysis only)

This notebook uses the Adjusted Mutual Information Score to determine the most relevant features in the DDoS attacks. It uses the bigflows pcap from [http://tcpreplay.appneta.com/wiki/captures.html] containing normal network traffic. The DDoS attack pcaps from DDoSDB [https://ddosdb.org/] are used to compare it with the normal network traffic.

In [73]:
#all relevant libraries need to be imported
from sklearn.metrics.cluster import adjusted_mutual_info_score, mutual_info_score, normalized_mutual_info_score
from sklearn.feature_selection import mutual_info_classif
import numpy as np
import dpkt
import Utils
import ipaddress
import pandas as pd
import time
import os
import json
import operator
import sys

### Load bigflows containing only UDP traffic

In [67]:
start = time.time()

input = open('normal_pcaps/bigflows_dns.pcap', 'rb')
normal = dpkt.pcap.Reader(input)
length = 4372 #this is the number of packets in bigflows_udp.pcap   #791615 #791179

ips_n = np.empty(length)
protocol_n = np.empty(length)
total_length_n = np.empty(length)
src_port_n = np.empty(length)
dst_port_n = np.empty(length)
udp_length_n = np.empty(length)

dns_opcode_n = np.empty(length)
dns_answerlength_n = np.empty(length)
dns_questionlength_n = np.empty(length)

dns_flag_response_n = np.empty(length)
dns_flag_opcode_n = np.empty(length)
dns_flag_truncated_n = np.empty(length)
dns_flag_authorative_n = np.empty(length)
dns_flag_recursiond_n = np.empty(length)
dns_flag_recursiona_n = np.empty(length)

dns_reply_code_n = np.empty(length)

dns_answer_type_n = np.empty(length)
dns_answer_class_n = np.empty(length)
dns_answer_name_n = np.empty(length)
dns_answer_size_n = np.empty(length)

dns_question_name_n = np.empty(length)
dns_question_class_n = np.empty(length)
dns_question_type_n = np.empty(length)

labels_n = np.zeros(length)

udp_packet = 0
tcp_packet = 0
dns_packet = 0
other = 0
total_packet = 0
 
    
printb = 0
i=0
for ts, buf in normal:
    eth = dpkt.ethernet.Ethernet(buf)
    total_packet += 1 

    if eth.type == 2048:
        ip = eth.data
        ips_n[i] = int.from_bytes(ip.src,"little")
        protocol_n[i] = ip.p
        total_length_n[i] = ip.len
        
        if ip.p == 6:
            tcp_packet += 1
            TCP=ip.data 
            dst_port_n[i] = TCP.dport
            src_port_n[i] = TCP.sport
        elif ip.p == 17:
            udp_packet += 1
            UDP=ip.data
            if UDP.dport == 53 or UDP.sport == 53:
                dns_packet += 1
                dns = dpkt.dns.DNS(UDP.data)
                
                #flags
                dns_opcode_n[i] =  dns.opcode
                dns_answerlength_n[i] = len(dns.an)
                dns_questionlength_n[i] = len(dns.qd)
                
                dns_flag_opcode_n[i] = dns.opcode
                dns_flag_response_n[i] = dns.qr
                dns_flag_truncated_n[i] = dns.tc
                dns_flag_authorative_n[i] = dns.aa #authorative answer
                dns_flag_recursiond_n[i] = dns.rd
                dns_flag_recursiona_n[i] = dns.ra
                
                dns_reply_code_n[i] = dns.rcode
                
                if (len(dns.an) != 0):
                    dns_answer_type_n[i] = dns.an[0].type
                    dns_answer_name_n[i] = int.from_bytes(bytes(dns.an[0].name,'utf-8'), "little")
                    dns_answer_class_n[i] = dns.an[0].cls
                    dns_answerlength_n[i] = dns.an[0].rlen
                else:
                    dns_answer_type_n[i] = -1
                    dns_answer_name_n[i] = -1
                    dns_answer_class_n[i] = -1
                    dns_answer_size_n[i] = -1
                
                if (len(dns.qd) != 0 ):
                    dns_question_class_n[i] = dns.qd[0].cls
                    dns_question_name_n[i] = int.from_bytes(bytes(dns.qd[0].name,'utf-8'), "little")
                    dns_question_type_n[i] = dns.qd[0].type
                else:
                    dns_question_class_n[i] = -1
                    dns_question_name_n[i] = -1
                    dns_question_type_n[i] = -1
                    
                    
                dst_port_n[i] = UDP.dport
                src_port_n[i] = UDP.sport
                udp_length_n[i] = UDP.ulen
            else:
                other +=1
                
        else:
            other += 1
            dst_port_n[i] = 0
            src_port_n[i] = 0
    else:
        other += 1
        
    i += 1

input.close()
print("total # of packets: %s"  % (total_packet))
print("# of UDP packets: %s" % (udp_packet))
print("# of TCP packets: %s" % (tcp_packet))
print("# of DNS packets: %s" % (dns_packet))
print("# of other packets except for TCP or UDP: %s"  % (other))

end = time.time()
print(end - start)


total # of packets: 4372
# of UDP packets: 4372
# of TCP packets: 0
# of DNS packets: 4372
# of other packets except for TCP or UDP: 0
0.27992868423461914


### Get attack keys of DDoS attacks containing only DNS traffic

In [77]:
path_to_json = 'signatures/'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

dns_keys=[]

for jf in json_files:
    with open('signatures/' + jf) as f:
        data=json.load(f)
    protocol = data.get("protocol")

    if protocol == "DNS":
        dns_keys.append(jf[:-5])

print("# of DNS DDoS attacks: ", len(dns_keys))

# of DNS DDoS attacks:  79


### Get AMI score and other stats per DNS DDoS Attack

In [None]:
start = time.time()

counter = 1
most_important_features = []

#iterate over all attack keys with DNS attacks
for attack_key in dns_keys:
    try:
        input = open('attack_pcaps/' + attack_key + '.pcap', 'rb')
        attack = dpkt.pcap.Reader(input)

        udp_packet = 0
        tcp_packet = 0
        other = 0
        total_packet = 0
        i = 0
        
        ############### 
        #loop trough packets to get the number of packets in the attack
        #############
        for ts, buf in attack:
            eth = dpkt.ethernet.Ethernet(buf)
            total_packet += 1 

        input.close()
        
        print("attackkey: ", attack_key)
        print("total # of packets: %s"  % (total_packet))
        #######################
        #initialize numpy arrays for all the to measure features
        ##############################
        ips_a = np.empty(total_packet)
        protocol_a = np.empty(total_packet)
        total_length_a = np.empty(total_packet)
        src_port_a = np.empty(total_packet)
        dst_port_a = np.empty(total_packet)
        udp_length_a = np.empty(total_packet)
        
        dns_opcode_a = np.empty(total_packet)
        dns_answerlength_a = np.empty(total_packet)
        dns_questionlength_a = np.empty(total_packet)

        dns_flag_response_a = np.empty(total_packet)
        dns_flag_opcode_a = np.empty(total_packet)
        dns_flag_truncated_a = np.empty(total_packet)
        dns_flag_authorative_a = np.empty(total_packet)
        dns_flag_recursiond_a = np.empty(total_packet)
        dns_flag_recursiona_a = np.empty(total_packet)

        dns_reply_code_a = np.empty(total_packet)

        dns_answer_type_a = np.empty(total_packet)
        dns_answer_class_a = np.empty(total_packet)
        dns_answer_name_a = np.empty(total_packet)
        dns_answer_size_a = np.empty(total_packet)

        dns_question_name_a = np.empty(total_packet)
        dns_question_class_a = np.empty(total_packet)
        dns_question_type_a = np.empty(total_packet)
        
        labels_a = np.ones(total_packet)    
        
        ###############
        #loop trough packets and gather features
        #########
        input = open('attack_pcaps/' + attack_key + '.pcap', 'rb')
        attack = dpkt.pcap.Reader(input)
        i = 0
        for ts, buf in attack:
            eth = dpkt.ethernet.Ethernet(buf)
            if eth.type == 2048:
                ip = eth.data
                ips_a[i] = int.from_bytes(ip.src,"little")
                protocol_a[i] = ip.p
                total_length_a[i] = ip.len

                if ip.p == 6:
                    tcp_packet += 1
                    TCP=ip.data 
                    dst_port_a[i] = TCP.dport
                    src_port_a[i] = TCP.sport
                elif ip.p == 17:
                    udp_packet += 1
                    UDP=ip.data
                    if UDP.dport == 53 or UDP.sport == 53:
                        dns_packet += 1
                        try:
                            dns = dpkt.dns.DNS(UDP.data)
                        except:
                            continue
                        #flags
                        dns_opcode_a[i] =  dns.opcode
                        dns_answerlength_a[i] = len(dns.an)
                        dns_questionlength_a[i] = len(dns.qd)

                        dns_flag_opcode_a[i] = dns.opcode
                        dns_flag_response_a[i] = dns.qr
                        dns_flag_truncated_a[i] = dns.tc
                        dns_flag_authorative_a[i] = dns.aa #authorative answer
                        dns_flag_recursiond_a[i] = dns.rd
                        dns_flag_recursiona_a[i] = dns.ra

                        dns_reply_code_a[i] = dns.rcode

                        if (len(dns.an) != 0):
                            dns_answer_type_a[i] = -1 #dns.an[0].type
                            dns_answer_name_a[i] = int.from_bytes(bytes(dns.an[0].name,'utf-8'), "little")
                            dns_answer_class_a[i] = dns.an[0].cls
                            dns_answer_size_a[i] = dns.an[0].rlen
                        else:
                            dns_answer_type_a[i] = -1
                            dns_answer_name_a[i] = -1
                            dns_answer_class_a[i] = -1
                            dns_answer_size_a[i] = -1

                        if (len(dns.qd) != 0 ):
                            dns_question_class_a[i] = dns.qd[0].cls
                            dns_question_name_a[i] = int.from_bytes(bytes(dns.qd[0].name,'utf-8'), "little")
                            dns_question_type_a[i] = -1 #dns.qd[0].type
                        else:
                            dns_question_class_a[i] = -1
                            dns_question_name_a[i] = -1
                            dns_question_type_a[i] = -1


                        dst_port_a[i] = UDP.dport
                        src_port_a[i] = UDP.sport
                        udp_length_a[i] = UDP.ulen
                else:
                    other += 1
                    dst_port_a[i] = 0
                    src_port_a[i] = 0
            else:
                other += 1

            i += 1
            
        ###############
        #make Numpy arrays with attack features and normal features together
        ################
        ips = np.append(ips_a, ips_n)
        protocol = np.append(protocol_a, protocol_n)
        total_length = np.append(total_length_a, total_length_n)
        src_port = np.append(src_port_a, src_port_n)
        dst_port = np.append(dst_port_a, dst_port_n)
        udp_length = np.append(udp_length_a, udp_length_n)
        
        dns_opcode = np.append(dns_opcode_a, dns_opcode_n)
        dns_answerlength = np.append(dns_answerlength_a, dns_answerlength_n)
        dns_questionlength = np.append(dns_questionlength_a, dns_questionlength_n)

        dns_flag_response = np.append(dns_flag_response_a, dns_flag_response_n)
        dns_flag_opcode = np.append(dns_flag_opcode_a, dns_flag_opcode_n)
        dns_flag_truncated = np.append(dns_flag_truncated_a, dns_flag_truncated_n)
        dns_flag_authorative = np.append(dns_flag_authorative_a, dns_flag_authorative_n)
        dns_flag_recursiond = np.append(dns_flag_recursiond_a, dns_flag_recursiond_n)
        dns_flag_recursiona = np.append(dns_flag_recursiona_a, dns_flag_recursiona_n)

        dns_reply_code = np.append(dns_reply_code_a, dns_reply_code_n)

        dns_answer_type = np.append(dns_answer_type_a, dns_answer_type_n)
        dns_answer_class = np.append(dns_answer_class_a, dns_answer_class_n)
        dns_answer_name = np.append(dns_answer_name_a, dns_answer_name_n)
        dns_answer_size = np.append(dns_answer_size_a, dns_answer_name_n)

        dns_question_name = np.append(dns_question_name_a, dns_question_name_n)
        dns_question_class = np.append(dns_question_class_a, dns_question_name_n)
        dns_question_type = np.append(dns_question_type_a, dns_question_type_n)
        
        labels = np.append(labels_a, labels_n)

        features = [ips,protocol,total_length,src_port,dst_port,udp_length,dns_opcode,dns_answerlength,dns_questionlength,dns_flag_response,dns_flag_opcode,dns_flag_truncated,dns_flag_authorative,dns_flag_recursiond,dns_flag_recursiona,dns_reply_code,dns_answer_type,dns_answer_class,dns_answer_name,dns_answer_size,dns_question_name,dns_question_class,dns_question_type]

        
        ##########
        #Calculate adjusted mutual information score
        ############
        
        scores = []
        for x in features:
            score = adjusted_mutual_info_score(np.repeat(x,4), np.repeat(labels,4), "arithmetic")
            scores.append(score)
            
        ###########
        #Print all stats per attack
        #######

        print("# of UDP packets: %s" % (udp_packet))
        print("# of TCP packets: %s" % (tcp_packet))
        print("# of DNS packets: %s" % (dns_packet))
        print("# of other packets except for TCP or UDP: %s"  % (other))
        print("Features: ips,protocol,total_length,src_port,dst_port,udp_length")
        print("AMI scores: ", scores)
        
        max_index, max_value = max(enumerate(scores), key=operator.itemgetter(1))
        most_important_features.append(max_index)
        print("Most important feature: ", max_index)
        end = time.time()
        print(end - start)
        print(counter,"/",len(dns_keys))
        counter += 1
        print("#####################################")
    except Exception as e:
        print("something went wrong here ", attack_key)
        exc_type, exc_obj, exc_tb = sys.exc_info()
        fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, fname, exc_tb.tb_lineno)
        print(e)
print(end - start)

attackkey:  41fd4b0156a33263722ee03c74e238a4
total # of packets: 13483
# of UDP packets: 13483
# of TCP packets: 0
# of DNS packets: 749249
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.25686533456314203, 0.06267065062436858, 0.24137759417033883, 0.12356628570330584, 0.2669391946337126, 0.24004417275803083, 0.031132611257206903, 0.24079408912210978, 0.02905852818970957, 0.12386406998884587, 0.02907114105423345, 0.029083754878815815, 0.029051148478346098, 0.028774781147169224, 0.12388715985760088, 0.04346079803790393, 0.1007974929486909, 0.14298384611512965, 0.23425269870967894, 0.21788949057721516, 0.20484056882016125, 0.20511988189186708, 0.27845496876232606]
Most important feature:  22
2.8738136291503906
1 / 79
#####################################
attackkey:  b3ff5d184e6e3c08ed25483c2cc4a9c3
total # of packets: 2061061


### Analysis and Conclusions

In [18]:
for i in range(0,6):
    print(i,most_important_features.count(i))

0 0
1 31
2 3
3 93
4 4
5 26


The counts of most important feature per feature are: <br/>

0 IP address: 0 <br/>
1 Protocol: 31 <br/>
2 Total packet length: 3 <br/>
3 UDP source port: 93 <br/>
4 Destination port: 4 <br/>
5 UDP payload length: 26 <br/>

None of the attacks has its IP address as most important feature. This can be explained by the fact that there are too much different IP addresses. Furthermore by the fact that the values of the IP addresses are laying in between each other for the normal and attack traffic.

The 31 at the protocol is remarkable, since this AMI value should be zero since all packets should be UDP packets. With manual inspection we discovered that some of the attacks are mixed with ICMP packets. 

The UDP source port is for most attacks the distinguishing feature. UDP payload second best.
