# Multi-vector DDoS attack fingerprinting
# Extending https://bitbucket.org/nbip/pat_rec_tool together with NBIP

## Libraries for data analysis

In [1]:
import pandas as pd
import numpy as np

## Enrichment functions 

In [2]:
%run libs/enrichment_functions.ipynb

#### Usage examples of enrichment functions: 

In [3]:
get_ip_proto_name(6)

'TCP'

In [4]:
get_port_name(80)

'HTTP service port'

In [5]:
get_tcp_flag_name('S')

'SYN'

## Convert pcap to dataframe

In [6]:
%run libs/pcap2dataframe.ipynb

## DDoS attack vector (pattern) identification/recognition

In [7]:
import collections

def analyse(df, debug=False, ttl_variation_threshold = 4):
    """
    Analysis only top traffic stream

    :param dataframe (df) containing the pcap/pcapng file converted:
    :return (1) print the summary of attack vectors and :
    """

    attack_case = "-1"
    reflection_label=""
    spoofed_label=""
    fragment_label=""

    allpatterns = {
        "dst_ip" : "",
        "patterns": []
    }
#     result_structure = {
#         "start_timestamp":0,
#         "end_timestamp":0,
#         "ip_protocol":0,
#         "dst_ip":[],
#         "src_ips":[],
#         "dst_ports":[], #(port,share)
#         "src_ports":[], #(port,share)
#         "reflected":False,
#         "spoofed":False,
#         "fragmented":False,
#         "pattern_traffic_share":0.0,
#         "pattern_packet_count":0,
#         "pattern_total_megabytes":0,
#         "ttl_variation":[],
# #         "packets":[]
#     }    
    
    if debug: print "Total number packets: "+ str(len(df))
    if debug: print "\n###################################\nIDENTIFYING MAIN CHARACTERISTICS:\n###################################"
    top_ip_dst = df['ip_dst'].value_counts().index[0]
    if debug: print "Target (destination) IP: "+ top_ip_dst
    allpatterns["dst_ip"] = top_ip_dst
    
    #Restricting attacks from outside the network!
    #df_filtered = df[(df['ip_dst'] == top_ip_dst) & ~df['ip_src'].str.contains(".".join(top_ip_dst.split('.')[0:2]), na=False) ]

    df_filtered = df[(df['ip_dst'] == top_ip_dst) ]
    total_packets_to_target = len(df_filtered)
    if debug: print "Number of packets: "+str(total_packets_to_target)    
        
##############################
##############################
    while (len(df_filtered)>0):
        if debug: print "\n###################################################################################################################"
        result = {}
        top_ip_proto = df[df['ip_dst'] == top_ip_dst]['ip_proto'].value_counts().index[0]
        result['ip_protocol']=top_ip_proto
        if debug: print "IP protocol used in packets going to target IP: "+str(top_ip_proto)
        
        df_filtered = df_filtered[df_filtered['ip_proto'] == top_ip_proto]

        # Performing a first filter based on the top_ip_dst (target IP), the source IPs canNOT be from the \16 of the
        # target IP, and the top IP protocol that targeted the top_ip_dst

        ####
        # Calculating the number of packets after the first filter 
        total_packets_filtered = len(df_filtered)
        if debug: print "Number of packets: "+str(total_packets_filtered)
        result["total_nr_packets"] = total_packets_filtered
    
        ####
        # For attacks in the IP protocol level
        attack_label = get_ip_proto_name(top_ip_proto) + "-based attack"
        result["transport_protocol"] = get_ip_proto_name(top_ip_proto)

        ####
        # For attacks based on TCP or UDP, which have source and destination ports
        if ((top_ip_proto == 6) or (top_ip_proto == 17)):

            if debug: print "\n#############################\nPORT FREQUENCY OF REMAINING PACKETS\n##############################"
            ####
            # Calculating the distribution of source ports based on the first filter
            percent_src_ports = df_filtered['sport'].value_counts().divide(float(total_packets_filtered) / 100)

            if debug: print "SOURCE ports frequency" 
            if debug: print percent_src_ports.head() 

            ####
            # Calculating the distribution of destination ports after the first filter
            percent_dst_ports = df_filtered['dport'].value_counts().divide(float(total_packets_filtered) / 100)
            if debug: print "\nDESTINATION ports frequency" 
            if debug: print percent_dst_ports.head()

            #####
            ## WARNING packets are filtered here again#####
            # Using the top 1 (source or destination) port to analyse a pattern of packets
            if (len(percent_src_ports) > 0) and (len(percent_dst_ports) > 0):
                if percent_src_ports.values[0] > percent_dst_ports.values[0]:
                    if debug: print "\nUsing top source port: ", percent_src_ports.keys()[0] 
                    df_pattern = df_filtered[df_filtered['sport'] == percent_src_ports.keys()[0]]
                    result["selected_port"] = "src_" + str(percent_src_ports.keys()[0])
                else:
                    if debug: print "\n Using top dest port: ", percent_dst_ports.keys()[0]
                    df_pattern = df_filtered[df_filtered['dport'] == percent_dst_ports.keys()[0]]
                    result["selected_port"] = "dst_" + str(percent_dst_ports.keys()[0])
            else:
                if debug: print 'no top source/dest port' 
                return None

            

            #####
            # Calculating the total number of packets involved in the attack
            pattern_packets = len(df_pattern)
            result["pattern_packet_count"] = pattern_packets

            #WARNING Can be wrong
            result['raw_attack_size_megabytes'] = (df_pattern['raw_size'].sum() /1000000).item()
            result["pattern_total_megabytes"] = (df_pattern[df_pattern['fragments'] == 0]['ip_length'].sum() / 1000000).item()

            #####
            # Calculating the percentage of the current pattern compared to the raw input file
            representativeness = float(pattern_packets) * 100 / float(total_packets_to_target)
            result["pattern_traffic_share"] = representativeness
            attack_label = 'In %.2f' % representativeness + "\n " + attack_label

            #####
            # Checking the existence of HTTP data
            http_data = df_pattern['http_data'].value_counts().divide(float(pattern_packets) / 100)

            #####
            # Checking the existence of TCP flags
            percent_tcp_flags = df_pattern['tcp_flag'].value_counts().divide(float(pattern_packets) / 100)

            #####
            # Calculating the number of source IPs involved in the attack
            ips_involved = df_pattern['ip_src'].unique()
######            
            if len(ips_involved) < 5:
                if debug: print "\n###################################################################################################################"
                if debug: print "\n###################################################################################################################"
                if debug: print "\n###################################################################################################################"
                if debug: print("\nNO MORE PATTERNS")
                break
            
            if debug: print("\n############################\nPATTERN (ATTACK VECTOR) LABEL "+ "\n############################")
            attack_label = attack_label + "\n"+ str(len(ips_involved)) + " source IPs"
            result["src_ips"] = ips_involved.tolist()

            #####
            # Calculating the number of source IPs involved in the attack
            result["start_timestamp"] = df_pattern['timestamp'].min().item()
            result["end_timestamp"] = df_pattern['timestamp'].max().item()

            ####
            # Calculating the distribution of TTL variation (variation -> number of IPs)
            ttl_variations = df_pattern.groupby(['ip_src'])['ip_ttl'].agg(np.ptp).value_counts().sort_index()
    #         if debug: print('TTL variation : NR of source IPs')
    #         if debug: print(ttl_variations)
            ips_ttl_greater_4 = ttl_variations.groupby(np.where(ttl_variations.index > 4, '>4', ttl_variations.index)).sum()
#             if debug: print('\n IPs TTL variation >4')
#             if debug: print(ips_ttl_greater_4)
            result["ttl_variation"] = ttl_variations.to_dict()

            ####
            # Calculating the distribution of IP fragments (fragmented -> percentage of packets)
            percent_fragments = df_pattern['fragments'].value_counts().divide(float(pattern_packets) / 100)
            ####
            # Calculating the distribution of source ports that remains
            percent_src_ports = df_pattern['sport'].value_counts().divide(float(pattern_packets) / 100)
            result["src_ports"] = percent_src_ports.to_dict()

            ####
            # Calculating the distribution of destination ports after the first filter
            percent_dst_ports = df_pattern['dport'].value_counts().divide(float(pattern_packets) / 100)
            result["dst_ports"] = percent_dst_ports.to_dict()

            ####
            # There are 3 possibilities of attacks cases!
            if (percent_src_ports.values[0] == 100):
                df_filtered = df_filtered[df_filtered['sport'].isin(percent_src_ports.keys()) == False]
                if (len(percent_dst_ports) == 1):
                    # if debug: print("\nCASE 1: 1 source port to 1 destination port") if debug else next
                    port_label = "From " + get_port_name(
                        percent_src_ports.keys()[0]) + "\n   - Against " + get_port_name(
                        percent_dst_ports.keys()[0]) + "[" + '%.1f' % percent_dst_ports.values[0] + "%]"
                else:
                    # if debug: print("\nCASE 2: 1 source port to a set of destination ports") if debug else next
                    if (percent_dst_ports.values[0] >= 50):
                        port_label = "From " + get_port_name(
                            percent_src_ports.keys()[0]) + "\n   - Against a set of (" + str(
                            len(percent_dst_ports)) + ") ports, such as " + get_port_name(
                            percent_dst_ports.keys()[0]) + "[" + '%.2f' % percent_dst_ports.values[
                            0] + "%]" + " and " + get_port_name(percent_dst_ports.keys()[1]) + "[" + '%.2f' % \
                                                                                                     percent_dst_ports.values[
                                                                                                         1] + "%]"
                    elif (percent_dst_ports.values[0] >= 33):
                        port_label = "From " + get_port_name(
                            percent_src_ports.keys()[0]) + "\n   - Against a set of (" + str(
                            len(percent_dst_ports)) + ") ports, such as " + get_port_name(
                            percent_dst_ports.keys()[0]) + "[" + '%.2f' % percent_dst_ports.values[
                            0] + "%]" + "; " + get_port_name(percent_dst_ports.keys()[1]) + "[" + '%.2f' % \
                                                                                                  percent_dst_ports.values[
                                                                                                      1] + "%], and " + get_port_name(
                            percent_dst_ports.keys()[2]) + "[" + '%.2f' % percent_dst_ports.values[2] + "%]"
                    else:
                        port_label = "From " + get_port_name(
                            percent_src_ports.keys()[0]) + "\n   - Against a set of (" + str(
                            len(percent_dst_ports)) + ") ports, such as " + get_port_name(
                            percent_dst_ports.keys()[0]) + "[" + '%.2f' % percent_dst_ports.values[
                            0] + "%]" + "; " + get_port_name(percent_dst_ports.keys()[1]) + "[" + '%.2f' % \
                                                                                                  percent_dst_ports.values[
                                                                                                      1] + "%], and " + get_port_name(
                            percent_dst_ports.keys()[2]) + "[" + '%.2f' % percent_dst_ports.values[2] + "%]"
            else:
                if (len(percent_src_ports) == 1):
                    df_filtered = df_filtered[df_filtered['sport'].isin(percent_src_ports.keys()) == False]

                    # if debug: print("\nCASE 1: 1 source port to 1 destination port") if debug else next
                    port_label = "Using " + get_port_name(percent_src_ports.keys()[0]) + "[" + '%.1f' % \
                                                                                                                  percent_src_ports.values[
                                                                                                                      0] + "%]" + "\n   - Against " + get_port_name(
                        percent_dst_ports.keys()[0]) + "[" + '%.1f' % percent_dst_ports.values[0] + "%]"


                else:
                    # if debug: print("\nCASE 3: 1 source port to a set of destination ports") if debug else next
                    df_filtered = df_filtered[df_filtered['sport'].isin(percent_src_ports.keys()) == False]

                    if (percent_src_ports.values[0] >= 50):
                        port_label = "From a set of (" + str(
                            len(percent_src_ports)) + ") ports, such as " + get_port_name(
                            percent_src_ports.keys()[0]) + "[" + '%.2f' % percent_src_ports.values[
                            0] + "%] and " + get_port_name(percent_src_ports.keys()[1]) + "[" + '%.2f' % \
                                                                                                percent_src_ports.values[
                                                                                                    1] + "%]" + "\n   - Against " + get_port_name(
                            percent_dst_ports.keys()[0]) + "[" + '%.1f' % percent_dst_ports.values[0] + "%]"
                    elif (percent_src_ports.values[0] >= 33):
                        port_label = "From a set of (" + str(
                            len(percent_src_ports)) + ") ports, such as " + get_port_name(
                            percent_src_ports.keys()[0]) + "[" + '%.2f' % percent_src_ports.values[
                            0] + "%], " + get_port_name(percent_src_ports.keys()[1]) + "[" + '%.2f' % \
                                                                                             percent_src_ports.values[
                                                                                                 1] + "%], and " + get_port_name(
                            percent_src_ports.keys()[2]) + "[" + '%.2f' % percent_src_ports.values[
                            2] + "%]" + "\n   - Against " + get_port_name(percent_dst_ports.keys()[0]) + "[" + '%.1f' % \
                                                                                                            percent_dst_ports.values[
                                                                                                                0] + "%]"
                    else:
                        df_filtered = df_filtered[df_filtered['dport'].isin(percent_dst_ports.keys()) == False]
                        port_label = "From a set of (" + str(
                            len(percent_src_ports)) + ") ports, such as " + get_port_name(
                            percent_src_ports.keys()[0]) + "[" + '%.2f' % percent_src_ports.values[
                            0] + "%], " + get_port_name(percent_src_ports.keys()[1]) + "[" + '%.2f' % \
                                                                                             percent_src_ports.values[
                                                                                                 1] + "%], " + get_port_name(
                            percent_src_ports.keys()[2]) + "[" + '%.2f' % percent_src_ports.values[
                            2] + "%]" + "; and " + get_port_name(percent_src_ports.keys()[3]) + "[" + '%.2f' % \
                                                                                                      percent_src_ports.values[
                                                                                                          3] + "%]" + "\n   - Against " + get_port_name(
                            percent_dst_ports.keys()[0]) + "[" + '%.1f' % percent_dst_ports.values[0] + "%]"

            ####
            # Testing HTTP request
            if len(http_data) > 0 and ((percent_dst_ports.index[0] == 80) or (percent_dst_ports.index[0] == 443)):
                attack_label = attack_label + "; " + http_data.index[0]

            ####
            # Testing TCP flags
            if (len(percent_tcp_flags) > 0) and (percent_tcp_flags.values[0] > 50):
                attack_label = attack_label + "; TCP flags: " + get_tcp_flag_name(
                    percent_tcp_flags.index[0]) + "[" + '%.1f' % percent_tcp_flags.values[0] + "%]"

            ####
            # IP fragmentation
            if '1' in percent_fragments.keys():
                if (percent_fragments['1'] > 0.3):
                    fragment_label = "%.2f" % percent_fragments['1'] + "packets with fragments marked"
                    result["fragmented"] = True

            ####
            # IP spoofing (if (more than 0) src IPs had the variation of the ttl higher than a treshold)
            if '>4' in ips_ttl_greater_4.keys():
                if (ips_ttl_greater_4['>4'] > len(ips_involved)*0.1 ):
                    result["spoofed"]=True
                    spoofed_label = "Likely involving spoofed IPs"
                else:
                    ####involved in 
                    # Reflection and Amplification
                    if percent_src_ports.values[0] >= 1:
                        result["reflected"]=True
                        reflection_label = "Reflection & Amplification"

            print "\nSUMMARY:\n"\
                    +"- %.2f" % representativeness +"% of the packets targeting "+top_ip_dst+"\n"\
                    +"   - Involved "+str(len(ips_involved))+" source IP addresses\n"\
                    +"   - Using IP protocol "+get_ip_proto_name(top_ip_proto)+"\n"\
                    +"   - "+port_label+"\n"\
                    +"   - "+fragment_label\
                    +"   - "+reflection_label\
                    +"   - "+spoofed_label
            
            allpatterns["patterns"].append(result)


    return allpatterns

<h1 align='center'> !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!<br>  THE DEMO <br>!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!</h1>


- 1: UDP attack against HTTP (~0.2'')
- 2: NTP  against HTTP (~20'')
- 3: Multi-vector attack (DNS reflection and netbios) (1' 20'')

In [10]:
!editcap -F libpcap -T ether 0323.2143.anonymous-stresser.net.ssyn__00004_20150323220443.pcap ../test.pcap

In [11]:
input_file='../test.pcap' 

In [12]:
# %%time
df = pcap2dataframe(input_file)

In [15]:
df.head()

Unnamed: 0,timestamp,ip_ttl,ip_proto,ip_length,ip_src,ip_dst,sport,dport,tcp_flag,fragments,http_data,raw_size
0,1427145000.0,61,6,52,62.141.46.54,192.42.116.16,48212,443,....A...,0,,66
1,1427145000.0,247,6,40,60.181.117.160,145.111.250.92,11053,1234,.S......,0,,60
2,1427145000.0,247,6,40,110.33.131.212,145.111.250.92,36949,1234,.S......,0,,60
3,1427145000.0,247,6,40,150.140.52.142,145.111.250.92,21540,1234,.S......,0,,60
4,1427145000.0,63,17,100,145.220.0.46,88.184.232.154,17273,55318,,0,,114


In [29]:
len(df[df['ip_dst']=='145.111.250.92'])

326105

In [30]:
len(df)

1417546

In [13]:
df.head(100)

Unnamed: 0,timestamp,ip_ttl,ip_proto,ip_length,ip_src,ip_dst,sport,dport,tcp_flag,fragments,http_data,raw_size
0,1.427145e+09,61,6,52,62.141.46.54,192.42.116.16,48212,443,....A...,0,,66
1,1.427145e+09,247,6,40,60.181.117.160,145.111.250.92,11053,1234,.S......,0,,60
2,1.427145e+09,247,6,40,110.33.131.212,145.111.250.92,36949,1234,.S......,0,,60
3,1.427145e+09,247,6,40,150.140.52.142,145.111.250.92,21540,1234,.S......,0,,60
4,1.427145e+09,63,17,100,145.220.0.46,88.184.232.154,17273,55318,,0,,114
5,1.427145e+09,63,17,68,145.220.0.46,157.56.144.215,17273,3544,,0,,82
6,1.427145e+09,63,17,68,145.220.0.46,157.56.106.189,17273,3544,,0,,82
7,1.427145e+09,50,6,1500,198.101.15.94,145.220.0.15,52271,9001,....A...,0,,1514
8,1.427145e+09,58,6,52,178.33.112.171,192.150.94.49,443,37298,....A...,0,,66
9,1.427145e+09,61,6,52,62.141.46.54,192.42.116.16,48212,443,....A...,0,,66


In [14]:
# %%time
allpatterns = analyse(df, True)

Total number packets: 1417546

###################################
IDENTIFYING MAIN CHARACTERISTICS:
###################################
Target (destination) IP: 145.111.250.92
Number of packets: 326105

###################################################################################################################
IP protocol used in packets going to target IP: 6
Number of packets: 326105

#############################
PORT FREQUENCY OF REMAINING PACKETS
##############################
SOURCE ports frequency
64324    0.005826
21360    0.004906
63068    0.004600
3152     0.004600
59778    0.004600
Name: sport, dtype: float64

DESTINATION ports frequency
1234    100.0
Name: dport, dtype: float64

 Using top dest port:  1234

############################
PATTERN (ATTACK VECTOR) LABEL 
############################

SUMMARY:
- 100.00% of the packets targeting 145.111.250.92
   - Involved 326092 source IP addresses
   - Using IP protocol TCP
   - From a set of (65077) ports, such as port 

# Let's take a look on the attack pattern!!!!

In [32]:
print allpatterns.keys()
print "\n"
print allpatterns['patterns'][0].keys()

['dst_ip', 'patterns']




IndexError: list index out of range

In [21]:
allpatterns['patterns'][0]['src_ports']

IndexError: list index out of range

In [33]:
allpatterns['patterns'][0]['dst_ports']

IndexError: list index out of range

In [15]:
allpatterns['patterns'][0]['src_ips']

['200.45.216.96',
 '171.37.157.18',
 '122.137.117.247',
 '180.123.27.16',
 '124.236.1.161',
 '184.43.33.18',
 '184.39.242.182',
 '124.125.46.8',
 '172.3.83.62',
 '162.198.246.54',
 '162.233.186.54',
 '172.6.53.230',
 '172.12.42.206',
 '172.5.178.166',
 '162.233.186.158',
 '172.6.53.94',
 '172.13.243.182',
 '172.6.52.118',
 '172.13.23.118',
 '172.3.83.198',
 '172.14.81.70',
 '172.10.106.230',
 '162.224.246.30',
 '172.10.98.222',
 '162.196.242.62',
 '172.10.3.246',
 '162.203.12.182']

In [16]:
allpatterns['dst_ip']

'172.31.28.132'

<h1 align='center'> !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
################################################</h1>

## Generating IPS/IDS (ex. SNORT, SURICATA) rules

## Generating an easy-to-understand report  

## Filtering ONLY the attack trace (WITHOUT target IP) into another pcap/ipfix/sflow file 

## Request permission to share IPS rules and filtered attack trace with DDoSDB.org