# Feature Selection using the Adjusted Mutual Information (AMI) Score 

### (UDP analysis only)

This notebook uses the Adjusted Mutual Information (AMI) score to determine which packet features are most relevant for distinguishing DDoS attack traffic from normal traffic. The normal traffic comes from the bigflows pcap (http://tcpreplay.appneta.com/wiki/captures.html). The DDoS attack pcaps come from DDoSDB (https://ddosdb.org/); each attack is compared against the normal traffic.

In [1]:
#all relevant libraries need to be imported
from sklearn.metrics.cluster import adjusted_mutual_info_score, mutual_info_score, normalized_mutual_info_score
from sklearn.feature_selection import mutual_info_classif
import numpy as np
import dpkt
import Utils
import ipaddress
import pandas as pd
import time
import os
import json
import operator

### Load bigflows containing only UDP traffic

In [2]:
start = time.time()

# Numeric identifiers used while dissecting each frame.
ETH_TYPE_IPV4 = 0x0800   # EtherType of IPv4 (decimal 2048)
IP_PROTO_TCP = 6
IP_PROTO_UDP = 17

# One list per feature; every packet in the capture contributes exactly one
# entry to each list so that the feature arrays and the labels stay aligned.
ip_rows = []
protocol_rows = []
total_length_rows = []
src_port_rows = []
dst_port_rows = []
udp_length_rows = []

udp_packet = 0
tcp_packet = 0
other = 0
total_packet = 0

# The context manager closes the capture even if parsing raises, and the
# builtin `input` is no longer shadowed by the file handle.
with open('normal_pcaps/bigflows_udp.pcap', 'rb') as pcap_file:
    for ts, buf in dpkt.pcap.Reader(pcap_file):
        eth = dpkt.ethernet.Ethernet(buf)
        total_packet += 1

        # Fields that do not apply to a packet stay 0 instead of holding
        # whatever np.empty() happened to leave in memory.
        src_ip = proto = ip_len = sport = dport = ulen = 0

        if eth.type == ETH_TYPE_IPV4:
            ip = eth.data
            # The address is only used as a categorical value. The byte order
            # must match the one used when the attack pcaps are parsed, so the
            # original "little" interpretation is kept.
            src_ip = int.from_bytes(ip.src, "little")
            proto = ip.p
            ip_len = ip.len

            if ip.p == IP_PROTO_TCP:
                tcp_packet += 1
                TCP = ip.data
                dport = TCP.dport
                sport = TCP.sport
            elif ip.p == IP_PROTO_UDP:
                udp_packet += 1
                UDP = ip.data
                dport = UDP.dport
                sport = UDP.sport
                ulen = UDP.ulen
            else:
                # IPv4 but neither TCP nor UDP: counted as "other", the same
                # way the attack pcaps are counted.
                other += 1
        else:
            other += 1

        ip_rows.append(src_ip)
        protocol_rows.append(proto)
        total_length_rows.append(ip_len)
        src_port_rows.append(sport)
        dst_port_rows.append(dport)
        udp_length_rows.append(ulen)

# The number of packets is taken from the capture itself rather than from a
# hard-coded constant, so a different capture cannot overflow the arrays or
# leave unfilled rows behind.
length = total_packet

ips_n = np.array(ip_rows, dtype=float)
protocol_n = np.array(protocol_rows, dtype=float)
total_length_n = np.array(total_length_rows, dtype=float)
src_port_n = np.array(src_port_rows, dtype=float)
dst_port_n = np.array(dst_port_rows, dtype=float)
udp_length_n = np.array(udp_length_rows, dtype=float)

# Label 0 marks normal traffic.
labels_n = np.zeros(length)

print("total # of packets: %s"  % (total_packet))
print("# of UDP packets: %s" % (udp_packet))
print("# of TCP packets: %s" % (tcp_packet))
print("# of other packets except for TCP or UDP: %s"  % (other))

end = time.time()
print(end - start)


total # of packets: 152733
# of UDP packets: 152733
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
2.1693918704986572


### Get attack keys of DDoS attacks containing only UDP traffic

In [3]:
# Directory that holds one JSON signature file per attack.
path_to_json = 'signatures/'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

# Signatures whose "protocol" is one of these are skipped. Every other
# signature is kept, including one that has no "protocol" entry at all
# (data.get returns None in that case).
non_udp_protocols = ("TCP", "HTTP", "ICMP")

udp_keys = []

for jf in json_files:
    # Build the path from path_to_json so the directory is defined in one place.
    with open(os.path.join(path_to_json, jf)) as f:
        data = json.load(f)
    protocol = data.get("protocol")

    if protocol not in non_udp_protocols:
        # The attack key is the file name without its ".json" extension.
        udp_keys.append(os.path.splitext(jf)[0])

print("# of UDP DDoS attacks: ", len(udp_keys))

# of UDP DDoS attacks:  253


### Get AMI score and other stats per UDP DDoS Attack

In [5]:
start = time.time()

# Numeric identifiers used while dissecting each frame.
ETH_TYPE_IPV4 = 0x0800   # EtherType of IPv4 (decimal 2048)
IP_PROTO_TCP = 6
IP_PROTO_UDP = 17

# Order of the features. The position of a feature in this list is the index
# that ends up in most_important_features.
FEATURE_NAMES = ["ips", "protocol", "total_length", "src_port", "dst_port", "udp_length"]


def _read_attack_features(pcap_path):
    """Parse one pcap file in a single pass.

    Parameters
    ----------
    pcap_path : str
        Path of the capture to read.

    Returns
    -------
    arrays : dict
        Maps every name in FEATURE_NAMES to a float numpy array with one
        entry per packet in the capture. Fields that do not apply to a packet
        (for example the ports of a non-IPv4 frame) are 0.
    counts : dict
        Packet counters under the keys "total", "udp", "tcp" and "other".
        "other" covers non-IPv4 frames as well as IPv4 packets that are
        neither TCP nor UDP.
    """
    columns = {name: [] for name in FEATURE_NAMES}
    counts = {"total": 0, "udp": 0, "tcp": 0, "other": 0}

    # The context manager guarantees the capture is closed, also on errors.
    with open(pcap_path, 'rb') as pcap_file:
        for ts, buf in dpkt.pcap.Reader(pcap_file):
            eth = dpkt.ethernet.Ethernet(buf)
            counts["total"] += 1

            # Start from zeros so no field is ever left uninitialised.
            row = dict.fromkeys(FEATURE_NAMES, 0)

            if eth.type == ETH_TYPE_IPV4:
                ip = eth.data
                # Same byte order as used for the normal traffic; the value
                # only serves as a categorical label.
                row["ips"] = int.from_bytes(ip.src, "little")
                row["protocol"] = ip.p
                row["total_length"] = ip.len

                # If the transport header could not be parsed, the attributes
                # below are missing; such packets keep 0 instead of aborting
                # the whole attack.
                transport = ip.data
                if ip.p == IP_PROTO_TCP:
                    counts["tcp"] += 1
                    row["dst_port"] = getattr(transport, "dport", 0)
                    row["src_port"] = getattr(transport, "sport", 0)
                elif ip.p == IP_PROTO_UDP:
                    counts["udp"] += 1
                    row["udp_length"] = getattr(transport, "ulen", 0)
                    row["dst_port"] = getattr(transport, "dport", 0)
                    row["src_port"] = getattr(transport, "sport", 0)
                else:
                    counts["other"] += 1
            else:
                counts["other"] += 1

            for name in FEATURE_NAMES:
                columns[name].append(row[name])

    arrays = {name: np.array(values, dtype=float) for name, values in columns.items()}
    return arrays, counts


# Feature arrays of the normal traffic, keyed like the attack features.
normal_columns = {
    "ips": ips_n,
    "protocol": protocol_n,
    "total_length": total_length_n,
    "src_port": src_port_n,
    "dst_port": dst_port_n,
    "udp_length": udp_length_n,
}

most_important_features = []

# Iterate over all attack keys with UDP attacks.
for counter, attack_key in enumerate(udp_keys, start=1):
    try:
        attack_columns, packet_counts = _read_attack_features('attack_pcaps/' + attack_key + '.pcap')
        total_packet = packet_counts["total"]
        udp_packet = packet_counts["udp"]
        tcp_packet = packet_counts["tcp"]
        other = packet_counts["other"]

        # Attack samples first (label 1), followed by the normal samples.
        features = [np.append(attack_columns[name], normal_columns[name]) for name in FEATURE_NAMES]
        labels = np.append(np.ones(total_packet), labels_n)

        # Adjusted mutual information between every feature and the labels.
        # Each sample is repeated 4 times, exactly as in the original analysis.
        # NOTE(review): the reason for the repetition is not documented;
        # presumably it influences the chance-adjustment term - confirm intent.
        # average_method is passed by keyword because recent scikit-learn
        # releases no longer accept it positionally.
        scores = [
            float(adjusted_mutual_info_score(np.repeat(x, 4), np.repeat(labels, 4), average_method="arithmetic"))
            for x in features
        ]

        # Print all stats per attack.
        print("attackkey: ", attack_key)
        print("total # of packets: %s"  % (total_packet))
        print("# of UDP packets: %s" % (udp_packet))
        print("# of TCP packets: %s" % (tcp_packet))
        print("# of other packets except for TCP or UDP: %s"  % (other))
        print("Features: ips,protocol,total_length,src_port,dst_port,udp_length")
        print("AMI scores: ", scores)

        # Store the scores in the signature of the attack. The document is
        # serialised before the file is reopened for writing, so a
        # serialisation error cannot leave a truncated signature behind.
        fname = "signatures/" + attack_key + ".json"
        with open(fname) as jsonfile:
            sig = json.load(jsonfile)
        sig['AMI'] = scores
        serialized = json.dumps(sig, indent=2)
        with open(fname, mode='w') as f:
            f.write(serialized)

        # The feature with the highest score wins for this attack.
        max_index, max_value = max(enumerate(scores), key=operator.itemgetter(1))
        most_important_features.append(max_index)
        print("Most important feature: ", max_index)
        print(time.time() - start)
        print(counter,"/",len(udp_keys))
        print("#####################################")
    except Exception as e:
        # Best effort: report the failing attack and continue with the next.
        print("something went wrong with attack ", attack_key)
        print(str(e))

# Total running time, measured after the loop so it is defined and current
# even if every attack failed.
end = time.time()
print(end - start)

attackkey:  a43a626aff8968889e14096fcdc5e1f5
total # of packets: 182836
# of UDP packets: 182836
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.32804197845782124, 2.4166171588213676e-15, 0.5457104444959394, 0.5212409165991726, 0.5117158798018506, 0.545733562057418]
Most important feature:  5
8.7776198387146
1 / 253
#####################################
attackkey:  dce5ef87684753c1c5588f1197ac59f6
total # of packets: 18259
# of UDP packets: 18259
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.16103044626432686, 9.39522067887482e-15, 0.21394613954999, 0.19084951316012302, 0.20732361141108954, 0.21396049758791405]
Most important feature:  5
11.285322666168213
2 / 253
#####################################
attackkey:  3bc4411e6ad43580551f36a9fffc1347
total # of packets: 2
# of UDP packets: 2
# of T

attackkey:  95120088fead57b6620bba06ee0ba5ca
total # of packets: 170
# of UDP packets: 170
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.0047366852357821985, 1.2272552635563947e-12, 0.0055106668513872385, 0.006060160852004992, 0.0057218160020911105, 0.0055112646219592516]
Most important feature:  3
41.452938079833984
20 / 253
#####################################
attackkey:  68387e4ccd91c2f0a78bd8050165b33a
total # of packets: 106
# of UDP packets: 106
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.0031262318950289864, 3.023270100609267e-16, 0.0035460912858169148, 0.004003494117569772, 0.003801094634059248, 0.003546497874512349]
Most important feature:  3
42.55676054954529
21 / 253
#####################################
attackkey:  b0e94727d740256be0aa28ac8ccdc5d4
total # of packets: 1071
# of

attackkey:  da27a3365d19888db6a6e3be0b29e669
total # of packets: 544
# of UDP packets: 544
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.012839614927554852, 1.4962982640940657e-13, 0.015725710347998915, 0.016427343596577958, 0.015415058669390564, 0.015727180948097864]
Most important feature:  3
67.32504963874817
38 / 253
#####################################
attackkey:  e6c3668c7cd00f4b287b6fc92a0afcb9
total # of packets: 319
# of UDP packets: 319
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.008168936358077167, 7.117474725007162e-13, 0.010208246371887325, 0.010444008460021506, 0.009869985129128976, 0.010209258671937756]
Most important feature:  3
68.48605513572693
39 / 253
#####################################
attackkey:  368726a5d6fdb862413347cd6a8e8bed
total # of packets: 24620
# of UDP p

attackkey:  20bb770b6cc349185770a35cbeda3956
total # of packets: 8
# of UDP packets: 8
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.0002905725619100433, -6.2483243833916365e-12, 0.00038217843377178965, 0.0003821668372818057, 0.00036317456816925943, 0.00038222412506693035]
Most important feature:  5
101.73777770996094
57 / 253
#####################################
attackkey:  1fe0c0580a97bc027388276a7a9e0137
total # of packets: 3
# of UDP packets: 3
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.00011305154527387258, 4.583746133227516e-11, 0.00014390587750115016, 0.00015171978228607912, 0.0001438781324274639, 0.00014392317211591422]
Most important feature:  3
102.77652049064636
58 / 253
#####################################
attackkey:  6850a3840f24fbe3f3990b2b0a09e0a4
total # of packets: 2
# 

attackkey:  b4348b2293cf64d074c1248b78f4b27d
total # of packets: 2594
# of UDP packets: 2594
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.045120599798430625, 1.2143028675175405e-13, 0.05477347718306615, 0.057619257003227106, 0.05475553315498958, 0.054778238257662634]
Most important feature:  3
131.15994024276733
75 / 253
#####################################
attackkey:  700062ab3a066e721cc674c40a089795
total # of packets: 1677
# of UDP packets: 1677
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.03232311149167387, -1.7510883865939518e-15, 0.04368480137979166, 0.0412027800451178, 0.03872076364030938, 0.04368870095646072]
Most important feature:  5
132.51098370552063
76 / 253
#####################################
attackkey:  c093f324dc7908241b9be60bb8b58473
total # of packets: 382
# of UDP pac

attackkey:  14dd8e6fdcecd5d02573af4ae38f2449
total # of packets: 13422
# of UDP packets: 13422
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.13778648026716972, 3.395106006625635e-14, 0.17503852019441474, 0.15680409708624335, 0.1331433501383501, 0.17505106826406613]
Most important feature:  5
171.27180433273315
94 / 253
#####################################
attackkey:  9805c4aa14103ea03aa57bb35cd1a0d5
total # of packets: 2
# of UDP packets: 2
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [7.612429097709338e-05, 0.0, 9.526109185639655e-05, 3.983486658902074e-05, 9.776512584925293e-05, 9.527257174165251e-05]
Most important feature:  4
172.2989764213562
95 / 253
#####################################
attackkey:  324fe87b957724e969ef1e3893ea67b6
total # of packets: 7612
# of UDP packets: 7612
# of TC

attackkey:  d0a8baa3a49351a64abe15bacfcbfd4c
total # of packets: 94
# of UDP packets: 94
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.0028085802950286844, -3.3601718803933094e-16, 0.0029753753485372266, 0.003600115364685638, 0.003405257391033414, 0.0029757246480007758]
Most important feature:  3
196.4095013141632
112 / 253
#####################################
attackkey:  2bb0fc4cdf229a8b2114cec0d6362597
total # of packets: 2497
# of UDP packets: 2497
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.04363057041019476, 1.2722116642259785e-13, 0.05693968666698791, 0.05596294020179934, 0.05403623748679062, 0.056944667615798264]
Most important feature:  5
197.81024813652039
113 / 253
#####################################
attackkey:  bd3dd0d241ed5dafb0837c557ffeb3c3
total # of packets: 182
# of UDP

attackkey:  4c853fcff01b06cd9571173df0ddfdae_3
total # of packets: 3
# of UDP packets: 3
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.00011305154527387256, 4.583746133227516e-11, 0.00014390587750115016, 0.0001517197822860791, 0.0001438781324274639, 0.00014392317211591422]
Most important feature:  3
219.22757077217102
130 / 253
#####################################
attackkey:  4820f8926277836e275742eb24ce7f6f
total # of packets: 42
# of UDP packets: 42
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.0013616327729007187, 1.4033379944685298e-12, 0.0017853248872682012, 0.0016888151045881818, 0.0016753192392925111, 0.0017855348894124843]
Most important feature:  5
220.30144214630127
131 / 253
#####################################
attackkey:  b0bed1288522fdc2bffea0396ba00084
total # of packets: 157

attackkey:  97ae14212b7569d75cc1f350bd479cf8
total # of packets: 4
# of UDP packets: 4
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.00014941150394139554, 3.523433499126527e-11, 0.000192262957511155, 0.00019924963539462467, 0.00018902542284419483, 0.0001922860255760757]
Most important feature:  3
256.13492250442505
148 / 253
#####################################
attackkey:  674cdc64e1c911f123891e4a4c61bc1f
total # of packets: 10
# of UDP packets: 10
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.00035915233248510906, 0.0, 0.00018598796053637964, 0.0004705069742246982, 0.0004474572361599182, 0.00018601732301240357]
Most important feature:  3
257.1787841320038
149 / 253
#####################################
attackkey:  5bdf5fc1273bda23ccaff90e5cffb40f
total # of packets: 204
# of UDP packets: 2

attackkey:  49abe772eb3942b7587e50a3f705e7ed
total # of packets: 16404
# of UDP packets: 16404
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.15411408500237106, -1.0460294060587072e-15, 0.2038000768580757, 0.1785488765920473, 0.1434935934619744, 0.20381400980752054]
Most important feature:  5
279.6681923866272
166 / 253
#####################################
attackkey:  fc81d769d7832fa40a4cda6ae7877e8b
total # of packets: 8
# of UDP packets: 8
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.0002905725619100433, -6.2483243833916365e-12, 0.00014634969818396772, 0.00037314791674140316, 0.00036317456816925943, 0.0001463730632482749]
Most important feature:  3
280.70485377311707
167 / 253
#####################################
attackkey:  bf1727bdaefe5ddd4cef7fdfecdf2379
total # of packets: 1000
# of 

attackkey:  e3339b9b837343c4aac4616f6ef57448
total # of packets: 43173
# of UDP packets: 43173
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.19503398128253965, 1.1324008676170557e-14, 0.18032165487433263, 0.32673934609292543, 0.3182024144998237, 0.1803353029812606]
Most important feature:  3
311.9067921638489
184 / 253
#####################################
attackkey:  f2724bf2d516c91e8373f7153a595ff1_2
total # of packets: 85883
# of UDP packets: 85883
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.22650543590658098, 5.783282838156912e-15, 0.42849594411301334, 0.4230343485505718, 0.38890703524969183, 0.42878587062767776]
Most important feature:  5
316.76315116882324
185 / 253
#####################################
attackkey:  79c0560b679610429eb83424c59852fc_2
total # of packets: 8
# of UDP pac

attackkey:  c9518f49edfb54b9e668e656ae1f3729
total # of packets: 7913
# of UDP packets: 7913
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.09921382638915166, 1.7177335372965752e-14, 0.13381433489017205, 0.10806829012557337, 0.10416951098420339, 0.13382465334245458]
Most important feature:  5
346.4999084472656
202 / 253
#####################################
attackkey:  c8bb46a460eb7c5a0801e9dfa11cf4a3
total # of packets: 82619
# of UDP packets: 82619
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.261222482028717, 0.0, 0.37198974840137633, 0.395677546025606, 0.2535832809732589, 0.37200629890938014]
Most important feature:  3
351.71178460121155
203 / 253
#####################################
attackkey:  2d276c708de961e0f5576c1a23c07ed7
total # of packets: 1743
# of UDP packets: 1743
# of TCP pac

attackkey:  13073741f538935aa8adfe90bad2ed52
total # of packets: 8
# of UDP packets: 8
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.0002905725619100433, -6.2483243833916365e-12, 0.00014634969818396772, 0.0003756157311691188, 0.0003631745681692594, 0.0001463730632482749]
Most important feature:  3
396.2286305427551
221 / 253
#####################################
attackkey:  19655b0e20c74dcffa056a78cde7ee0f
total # of packets: 20726
# of UDP packets: 20726
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.15428344518535858, -2.5594523239840367e-14, 0.2152262683955371, 0.20598383568048625, 0.16620650949512647, 0.21733645384965347]
Most important feature:  5
398.70028281211853
222 / 253
#####################################
attackkey:  35b310bf1cae16a5f37ae483b74a200f_3
total # of packets: 4
# of U

attackkey:  03c18359ae35b93a3657b5971fd952fa
total # of packets: 295
# of UDP packets: 295
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.007637804595017677, 2.5393386792286606e-13, 0.009848924300284017, 0.009766376775888992, 0.009314646021536001, 0.009849908151508971]
Most important feature:  5
436.0433373451233
239 / 253
#####################################
attackkey:  ad0ba2fa0338aaa18aefe31edb295d09
total # of packets: 2060
# of UDP packets: 2060
# of TCP packets: 0
# of other packets except for TCP or UDP: 0
Features: ips,protocol,total_length,src_port,dst_port,udp_length
AMI scores:  [0.03803732596116569, -1.9262998456335032e-15, 0.05405690189684963, 0.036963852256847285, 0.0440422416806919, 0.05406173171059493]
Most important feature:  5
437.4670889377594
240 / 253
#####################################
attackkey:  a3644f57a18ccb958a3d64f96e943828
total # of packets: 306
# of UDP pa

### Analysis and Conclusions

In [7]:
# For each of the six feature indices, show how many attacks had that
# feature as their highest-scoring one.
for feature_index in range(6):
    occurrences = most_important_features.count(feature_index)
    print(feature_index, occurrences)

0 0
1 0
2 2
3 144
4 13
5 94


The counts of most important feature per feature are: <br/>

0 IP address: 0 <br/>
1 Protocol: 0 <br/>
2 Total packet length: 2 <br/>
3 UDP source port: 144 <br/>
4 Destination port: 13 <br/>
5 UDP payload length: 94 <br/>

None of the attacks has the IP address as its most important feature. This can be explained by the fact that there are too many different IP addresses. Furthermore, the IP address values of the normal traffic and the attack traffic are interleaved, so they do not separate the two classes.

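A count of 31 for the protocol is remarkable, since its AMI value should be zero: all packets should be UDP packets. (Note: the output shown above reports 0 for the protocol, so the 31 presumably stems from an earlier run.) With manual inspection we discovered that some of the attacks are mixed with ICMP packets.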

For most attacks the UDP source port is the distinguishing feature. The UDP payload length is the second best.
