In [1]:
import pandas as pd
import numpy as np
import collections
import math
from datetime import datetime
import json
import hashlib
import time
from tabulate import tabulate

import warnings
warnings.filterwarnings('ignore')

In [2]:
# from functions.protocolnumber2name import *
# from functions.portnumber2name import *
# from functions.tcpflagletters2names import *
# from functions.pcap2dataframe_tshark import *

In [3]:
# FOR TESTING PURPOSES: import the helper modules directly instead of via the `functions` package
from protocolnumber2name import *
from portnumber2name import *
from tcpflagletters2names import *
from pcap2dataframe_tshark import *

In [12]:
def _debug_report(enabled, title, data):
    """Print ``title`` followed by ``data`` when ``enabled`` is truthy."""
    if enabled:
        # Explicit concatenation yields the same text whether ``print`` is a
        # function or a statement (no tuple repr under an old kernel).
        print(title + ' ' + str(data))


def _as_builtin(value):
    """Turn a numpy scalar into the equivalent built-in value (JSON friendly)."""
    return value.item() if isinstance(value, np.generic) else value


def _unique_ports(port_column):
    """Return the distinct non-missing ports of ``port_column`` as ints, in order of appearance."""
    return [int(port) for port in port_column.dropna().unique()]


def _top_port_condition(protocol_rows, debug):
    """Build the filter condition for the most frequent source or destination port.

    Returns a ``(label, column, value)`` tuple, or ``None`` when the rows carry
    no port at all (both port columns are entirely missing).
    """
    port_source_distribution = protocol_rows['srcport'].value_counts().head()
    _debug_report(debug, '\nDISTRIBUTION OF SOURCE PORT:\n', port_source_distribution)
    port_destination_distribution = protocol_rows['dstport'].value_counts().head()
    _debug_report(debug, '\nDISTRIBUTION OF DESTINATION PORTS:\n', port_destination_distribution)

    # value_counts() drops missing values, so an empty result means "no ports".
    source_hits = port_source_distribution.iloc[0] if len(port_source_distribution) else 0
    destination_hits = port_destination_distribution.iloc[0] if len(port_destination_distribution) else 0
    if source_hits == 0 and destination_hits == 0:
        return None

    # The source port wins only when it is strictly more frequent.
    if source_hits > destination_hits:
        column, distribution = 'srcport', port_source_distribution
    else:
        column, distribution = 'dstport', port_destination_distribution
    top1_port = int(math.floor(distribution.index[0]))
    return ("['" + column + "']==" + str(top1_port), column, top1_port)


def _conditions_mask(frame, conditions):
    """Return the boolean mask of the rows of ``frame`` that satisfy every condition."""
    mask = pd.Series(True, index=frame.index)
    for _, column, value in conditions:
        mask &= frame[column] == value
    return mask


def analyse_df_pcap_tshark(df, debug=False, ttl_variation_threshold = 4, output_dir='../output/'):
    """
    Extract attack-vector fingerprints from the traffic sent to the top destination.

    Only the traffic towards the most frequent destination is analysed. The
    function repeatedly picks the dominant protocol of the remaining traffic,
    narrows it down (top port, TCP flag, DNS query, ICMP type or fragmentation
    flag), records a fingerprint for the selected packets, removes them and
    starts over. It stops when at most one packet is left or when a candidate
    vector involves fewer than two source IPs.

    Filters are applied as boolean masks built from the column values; no
    capture-derived text is ever evaluated as code.

    :param df: dataframe containing the converted pcap/pcapng file. Columns read:
        '_ws.col.Destination', '_ws.col.Source', '_ws.col.Protocol', 'srcport',
        'dstport', 'frame.time_epoch' and, depending on the protocol,
        'fragmentation', 'icmp.type', 'tcp.flags.str', 'dns.qry.name',
        'dns.qry.type'.
    :param debug: when truthy, print the intermediate distributions.
    :param ttl_variation_threshold: currently unused; kept for backward compatibility.
    :param output_dir: directory receiving one JSON file per fingerprint
        (created when missing).
    :return: tuple (top destination, list of fingerprint dicts); (None, [])
        when the dataframe holds no destination at all.

    Side effects: prints a summary per attack vector and the source-IP
    intersection matrix, and writes '<md5 of start timestamp>.json' files.
    """
    import os

    fingerprints = []
    attackvectors_labels = []
    attackvectors_source_ips = []

    dst_ip_distribution = df['_ws.col.Destination'].value_counts()
    _debug_report(debug, '\nDISTRIBUTION OF DESTINATION IPs:\n', dst_ip_distribution)
    if dst_ip_distribution.empty:
        # Nothing to analyse (empty capture or no destination information).
        return None, fingerprints
    top1_dst_ip = dst_ip_distribution.index[0]

    df_remaining = df[df['_ws.col.Destination'] == top1_dst_ip]

    while len(df_remaining) > 1:
        # Analysing the distribution of protocols (and defining the top1)
        protocol_distribution = df_remaining['_ws.col.Protocol'].value_counts()
        _debug_report(debug, '\nDISTRIBUTION OF PROTOCOLS:\n', protocol_distribution)
        if protocol_distribution.empty:
            break
        top1_protocol = protocol_distribution.index[0]
        protocol_rows = df_remaining[df_remaining['_ws.col.Protocol'] == top1_protocol]

        attack_vector = {'file_type': 'pcap', 'protocol': top1_protocol}
        # Each condition is (printable label, column, value); all are ANDed.
        conditions = [("['_ws.col.Protocol']=='" + str(top1_protocol) + "'",
                       '_ws.col.Protocol', top1_protocol)]

        if top1_protocol == 'IPv4':
            fragmentation_distribution = protocol_rows['fragmentation'].value_counts()
            _debug_report(debug, '\nFRAGMENTATION DISTRIBUTION:\n', fragmentation_distribution)
            if not fragmentation_distribution.empty:
                # Filter on (and report) the dominant value, whichever it is.
                top1_fragmentation = _as_builtin(fragmentation_distribution.index[0])
                conditions.append(("['fragmentation']==" + str(top1_fragmentation),
                                   'fragmentation', top1_fragmentation))
                attack_vector['additional'] = {'fragmentation': top1_fragmentation}

        elif top1_protocol == 'ICMP':
            # ICMP vectors are identified by their type, not by a port.
            icmp_type_distribution = protocol_rows['icmp.type'].value_counts()
            _debug_report(debug, '\nDISTRIBUTION ICMP TYPES:\n', icmp_type_distribution)
            if not icmp_type_distribution.empty:
                top1_icmp_type = _as_builtin(icmp_type_distribution.index[0])
                conditions.append(("['icmp.type']=='" + str(top1_icmp_type) + "'",
                                   'icmp.type', top1_icmp_type))
                attack_vector['additional'] = {'icmp_type': top1_icmp_type}

        else:
            # Checking which port type (source or destination) AND number had most occurrences
            port_condition = _top_port_condition(protocol_rows, debug)
            if port_condition is not None:
                conditions.append(port_condition)

            if top1_protocol == 'TCP':
                tcp_flag_distribution = protocol_rows['tcp.flags.str'].value_counts()
                _debug_report(debug, '\nDISTRIBUTION TCP FLAGS:\n', tcp_flag_distribution.head())
                if not tcp_flag_distribution.empty:
                    top1_tcp_flag = tcp_flag_distribution.index[0]
                    conditions.append(("['tcp.flags.str']=='" + str(top1_tcp_flag) + "'",
                                       'tcp.flags.str', top1_tcp_flag))
                    attack_vector['additional'] = {'tcp_flag': top1_tcp_flag}

            if top1_protocol == 'DNS':
                top1_dns_query = None
                dns_query_distribution = protocol_rows['dns.qry.name'].value_counts()
                _debug_report(debug, '\nDISTRIBUTION DNS QUERIES:\n', dns_query_distribution.head())
                if not dns_query_distribution.empty:
                    top1_dns_query = dns_query_distribution.index[0]
                    conditions.append(("['dns.qry.name']=='" + str(top1_dns_query) + "'",
                                       'dns.qry.name', top1_dns_query))

                top1_dns_type = None
                dns_type_distribution = protocol_rows['dns.qry.type'].value_counts()
                _debug_report(debug, '\nDISTRIBUTION DNS TYPES:\n', dns_type_distribution.head())
                if not dns_type_distribution.empty:
                    top1_dns_type = _as_builtin(dns_type_distribution.index[0])
                attack_vector['additional'] = {'dns_query': top1_dns_query,
                                               'dns_type': top1_dns_type}

        attackvector_label = '&'.join('(' + label + ')' for label, _, _ in conditions)
        attackvector_mask = _conditions_mask(df_remaining, conditions)
        df_attackvector_current = df_remaining[attackvector_mask]
        src_ips_attackvector_current = df_attackvector_current['_ws.col.Source'].unique().tolist()

        # If the number of source IPs involved in this potential attack vector
        # is below 2, then it is NOT a DDoS: stop the analysis.
        if len(src_ips_attackvector_current) < 2:
            if debug: print('\nSTOP ANALYSIS!!! THERE IS ONLY ONE SOURCE IP RELATED TO THIS ATTACK VECTOR!\n')
            break

        # Kept for the source-IP intersection matrix printed at the end.
        attackvectors_labels.append(attackvector_label)
        attackvectors_source_ips.append(src_ips_attackvector_current)

        # min/max rather than first/last row: independent of the row order.
        start_time = float(df_attackvector_current['frame.time_epoch'].min())
        end_time = float(df_attackvector_current['frame.time_epoch'].max())

        attack_vector['src_ips'] = src_ips_attackvector_current
        attack_vector['src_ports'] = _unique_ports(df_attackvector_current['srcport'])
        attack_vector['dst_ports'] = _unique_ports(df_attackvector_current['dstport'])
        attack_vector['start_timestamp'] = start_time
        attack_vector['start_time'] = datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S')
        attack_vector['duration_sec'] = end_time - start_time

        print('\nATTACK VECTOR '+str(len(fingerprints) + 1)+': '+attackvector_label)
        print('  - Packets:'+str(len(df_attackvector_current)))
        print('  - #Src_IPs:'+str(len(src_ips_attackvector_current)))

        fingerprints.append(attack_vector)

        # NOTE: the file name depends on the start timestamp only, so two
        # vectors starting at the very same timestamp share one file.
        md5 = hashlib.md5(str(start_time).encode()).hexdigest()
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        with open(os.path.join(output_dir, md5 + '.json'), 'w') as outfile:
            json.dump(attack_vector, outfile)

        # Drop the packets of this vector and analyse what is left.
        df_remaining = df_remaining[~attackvector_mask]

    # Pairwise count of source IPs shared between the attack vectors.
    vector_ids = [str(position + 1) for position in range(len(attackvectors_source_ips))]
    if vector_ids:
        matrix_source_ip_intersection = pd.DataFrame(
            [[len(np.intersect1d(ips_m, ips_n)) for ips_n in attackvectors_source_ips]
             for ips_m in attackvectors_source_ips],
            index=vector_ids, columns=vector_ids)
        matrix_source_ip_intersection['Attack vector'] = attackvectors_labels
    else:
        matrix_source_ip_intersection = pd.DataFrame()

    print('\nINTERSECTION OF SOURCE IPS IN ATTACK VECTORS:\n' + ' ' + str(matrix_source_ip_intersection))

    return top1_dst_ip, fingerprints

In [13]:
# For testing purposes: convert a sample capture into a dataframe and analyse
# it with the debug output switched on.

input_file = '../input4test/1.pcap'
df = pcap2dataframe_tshark(input_file)
dst_ip, fingerprints = analyse_df_pcap_tshark(df, debug=True)

('\nDISTRIBUTION OF DESTINATION IPs:\n', 172.31.28.132        2559
172.31.0.2           1172
200.45.216.96          34
124.236.1.161          26
122.137.117.247        23
184.43.33.18           22
184.39.242.182         22
162.198.246.54         18
180.123.27.16          18
162.233.186.54         18
172.6.53.230           18
172.3.83.62            18
172.12.42.206          17
162.233.186.158        16
172.5.178.166          16
172.13.243.182         15
172.6.52.118           15
172.6.53.94            15
172.3.83.198           14
172.14.81.70           14
172.13.23.118          14
172.10.106.230         13
162.224.246.30         13
172.10.98.222          13
162.196.242.62         13
172.10.3.246           12
171.37.157.18          11
06:90:f4:0e:e0:ff       9
06:9a:6c:66:90:33       9
162.203.12.182          9
124.125.46.8            6
89.248.161.150          4
114.32.207.183          1
211.104.160.20          1
61.240.144.66           1
121.8.241.143           1
Name: _ws.col.Destinati

In [None]:
# display(dst_ip)
# display(fingerprints[0])