In [1]:
import xml.etree.ElementTree as ET
import re, glob, os, pathlib, subprocess,time
import pandas as pd
import msgpack
import matplotlib.pyplot as plt
from collections import defaultdict
import matplotlib
import numpy as np

In [2]:
# Dataset selection: base directory and file-name prefix used by every later cell.
# NOTE(review): this first pair is dead code — it is overwritten a few lines below.
__directory__ = "/home/leo/Dev/Data_Stream/2018/04/"
__filedate__ = "20180418"

# NOTE(review): the outputs saved further down (anomaly XML, ~550k nodes) may have been produced
# with the 20180418 pair above rather than with Rollernet — confirm which one is intended.
__directory__ = "/home/leo/Dev/Data_Stream/Rollernet/"
__filedate__ = "rollernet"

# Derived locations, built by plain string concatenation (hence the trailing slashes).
preprocess_data = __directory__+"preprocess_data/"
figures_data = __directory__+"figures/"
storage_kcores = __directory__+__filedate__+"_kcores_storage/"

In [3]:
def parse_xml(path_xml, type=None):
    """Read an anomaly XML file and return its ``<anomaly>`` elements as dicts.

    Parameters
    ----------
    path_xml : path or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.
    type : str, optional
        When given, only anomalies whose ``type`` attribute equals this value
        are kept. (The name shadows the builtin but is part of the interface.)

    Returns
    -------
    list of dict
        One dict per kept anomaly with the keys:
        ``'type'``  - the ``type`` attribute,
        ``'value'`` - the first run of letters/underscores in the ``value`` attribute,
        ``'begin'`` / ``'end'`` - the ``sec`` attribute (a string) of the ``<from>`` /
        ``<to>`` children, when those children exist,
        ``'info'``  - list of attribute dicts of every ``<filter>`` grandchild;
        the key is absent when the anomaly has no filter.

    Raises
    ------
    KeyError
        If an anomaly lacks a ``type``/``value`` attribute or a ``<from>``/``<to>``
        element lacks ``sec``.
    IndexError
        If the ``value`` attribute contains no letter or underscore.
    """
    root = ET.parse(path_xml).getroot()
    anomalies = []
    for child in root:
        if child.tag != "anomaly":
            continue
        anom = {'type': child.attrib['type']}
        if type and anom['type'] != type:
            continue
        anom['value'] = re.findall(r'[^\d\W]+', child.attrib['value'])[0]
        for c in child:
            if c.tag == "from":
                anom['begin'] = c.attrib['sec']
            if c.tag == "to":
                anom['end'] = c.attrib['sec']
            # Filters are grandchildren of the anomaly: collect them from every child.
            for d in c:
                if d.tag == "filter":
                    # setdefault creates the list on the first filter only, so the
                    # 'info' key stays absent for anomalies without any filter.
                    anom.setdefault('info', []).append(d.attrib)
        anomalies.append(anom)
    return anomalies

In [4]:
def replace_ip_with_id(anomalies, path_dict):
    """Replace the IP addresses stored in anomaly filters by node labels, in place.

    The mapping is read with ``msgpack.load`` from
    ``path_dict + "_dict_ip_2_nodes_label.mspk"``.

    Parameters
    ----------
    anomalies : list of dict
        Anomaly dicts; each optional ``'info'`` entry is a list of filter dicts
        that may contain ``'src_ip'`` and/or ``'dst_ip'``.
    path_dict : str
        Prefix of the mapping file.

    Returns
    -------
    list of dict
        The same ``anomalies`` object (its filter dicts are mutated).

    Notes
    -----
    Addresses missing from the mapping are left untouched, as are values that
    are not strings (for instance when the function is applied twice).
    The UTF-8 encoded address is tried first, then the plain string, so the
    lookup works whether the loaded mapping has bytes or str keys.
    """
    with open(path_dict + "_dict_ip_2_nodes_label.mspk", 'rb') as ipt:
        ip_2_nodes_label = msgpack.load(ipt)

    def _label_of(ip):
        # Map one address to its label; return the value unchanged when unknown.
        if not isinstance(ip, str):
            return ip
        encoded = ip.encode('utf-8')
        if encoded in ip_2_nodes_label:
            return ip_2_nodes_label[encoded]
        return ip_2_nodes_label.get(ip, ip)

    for anom in anomalies:
        # Anomalies without any filter carry no 'info' key at all.
        for i in anom.get('info', ()):
            for field in ('src_ip', 'dst_ip'):
                if field in i:
                    i[field] = _label_of(i[field])
    return anomalies

In [5]:
def _relative_anomaly_windows(anomalies, max_duration=900):
    """Yield ``(anomaly, (begin, end))`` with times shifted to the earliest anomaly.

    The origin ``t_min`` is the smallest ``begin``/``end`` among anomalies whose
    two timestamps are both non-zero (0 when there is no such anomaly).
    A window that starts exactly at the origin, or that lasts longer than
    ``max_duration``, is reported as ``(-1, -1)``.
    """
    stamps = [(int(a['begin']), int(a['end'])) for a in anomalies]
    t_min = min((min(b, e) for b, e in stamps if b != 0 and e != 0), default=0)
    for anom, (begin, end) in zip(anomalies, stamps):
        b = begin - t_min
        e = end - t_min
        if b == 0 or (e - b) > max_duration:
            b = e = -1
        yield anom, (b, e)


def preprocess_anomalies_nodes(anomalies, max_duration=900):
    """Map every anomalous node to the ``(begin, end)`` window of its anomaly.

    Only filters naming a single endpoint (``'src_ip'`` without ``'dst_ip'`` or
    the reverse) are considered; filters naming both describe a link.

    Parameters
    ----------
    anomalies : list of dict
        Dicts with ``'begin'`` and ``'end'`` (convertible with ``int``) and an
        optional ``'info'`` list of filter dicts.
    max_duration : int, optional
        Windows longer than this are replaced by ``(-1, -1)``. Default 900.

    Returns
    -------
    dict
        ``{node: (begin, end)}`` with times relative to the earliest anomaly;
        ``(-1, -1)`` for windows starting at the origin or longer than
        ``max_duration``. A node named by several anomalies keeps the window
        of the last one. Empty input yields an empty dict.
    """
    dict_anomalies = {}
    for anom, window in _relative_anomaly_windows(anomalies, max_duration):
        for flt in anom.get('info', ()):
            has_src = 'src_ip' in flt
            has_dst = 'dst_ip' in flt
            if has_src and not has_dst:
                dict_anomalies[flt['src_ip']] = window
            elif has_dst and not has_src:
                dict_anomalies[flt['dst_ip']] = window
    print("Anomalous nodes : ", dict_anomalies)
    return dict_anomalies


def preprocess_anomalies_links(anomalies, max_duration=900):
    """Map every anomalous link to the ``(begin, end)`` window of its anomaly.

    Only filters naming both ``'src_ip'`` and ``'dst_ip'`` are considered; each
    one is stored under both orientations, ``(src, dst)`` and ``(dst, src)``.

    Parameters
    ----------
    anomalies : list of dict
        Dicts with ``'begin'`` and ``'end'`` (convertible with ``int``) and an
        optional ``'info'`` list of filter dicts.
    max_duration : int, optional
        Windows longer than this are replaced by ``(-1, -1)``. Default 900.

    Returns
    -------
    dict
        ``{(u, v): (begin, end)}``, same time conventions as
        ``preprocess_anomalies_nodes``. Empty input yields an empty dict.
    """
    dict_anomalies = {}
    for anom, window in _relative_anomaly_windows(anomalies, max_duration):
        for flt in anom.get('info', ()):
            if 'src_ip' in flt and 'dst_ip' in flt:
                src = flt['src_ip']
                dst = flt['dst_ip']
                dict_anomalies[(src, dst)] = window
                dict_anomalies[(dst, src)] = window
    print("Anomalous Links : ", dict_anomalies)
    return dict_anomalies


In [6]:
def anomalies_in_kcores(anomalies, storage_kcores):
    """Sort stored k-core segments into anomalous and non-anomalous groups.

    The file at ``storage_kcores`` is streamed with ``msgpack.Unpacker``; each
    record is read as ``(k, segments)`` where every segment unpacks to
    ``(t0, t1, u)``.

    A segment goes to the anomalous group when its node ``u`` is an anomalous
    node whose window is ``(-1, -1)`` or intersects ``[t0, t1]``; it goes to the
    other group when ``u`` is not an anomalous node at all.
    NOTE(review): a segment of an anomalous node that lies outside the node's
    window is stored in neither group — confirm this is intended.

    Returns
    -------
    (defaultdict, defaultdict)
        ``k -> list of segments`` for the anomalous and the non-anomalous group.
    """
    anomalous_nodes = preprocess_anomalies_nodes(anomalies)
    print("nb anomalous nodes :",len(anomalous_nodes))
    kcores_with_anomalies = defaultdict(list)
    kcores_without_anomalies = defaultdict(list)
    with open(storage_kcores, 'rb') as ipt:
        for record in msgpack.Unpacker(ipt,use_list=False):
            k = record[0]
            print(" k : ",k)
            for segment in record[1]:
                t0, t1, u = segment
                if u not in anomalous_nodes:
                    kcores_without_anomalies[k].append(segment)
                    continue
                b, e = anomalous_nodes[u]
                # -1 flags a whole-trace anomaly; otherwise require interval overlap.
                if b == -1 or (e >= t0 and t1 >= b):
                    kcores_with_anomalies[k].append(segment)
            # Indexing the defaultdict registers k even when nothing was flagged.
            flagged = kcores_with_anomalies[k]
            if len(flagged) > 0:
                print("Current K : ",k)
                print("Nb nodes anomalies : ", len(flagged))
                print("Nb nodes without anomalies : ",len(kcores_without_anomalies[k]))
    return kcores_with_anomalies,kcores_without_anomalies

In [8]:
# Load the entries of type 'anomalous' from the labelled XML, swap the IP addresses of their
# filters for node labels, then build the node -> (begin, end) lookup (which is also printed).
anomalies = parse_xml(__directory__ + __filedate__ + "_anomalous_suspicious.xml", type='anomalous')
anomalies = replace_ip_with_id(anomalies, __directory__ + __filedate__)
anomalous_nodes = preprocess_anomalies_nodes(anomalies)

Anomalous nodes :  {3015481: (-1, -1), 1830: (-1, -1), 286: (-1, -1), 4: (-1, -1), 535769: (22, 241), 23573: (-1, -1), 4143169: (218, 555), 24: (-1, -1), 349: (-1, -1), 577: (-1, -1), 214428: (-1, -1), 2143: (-1, -1), 698: (-1, -1), 664: (-1, -1), 85653: (-1, -1), 78767: (-1, -1), 162252: (-1, -1), 3128: (-1, -1), 3: (-1, -1), 211: (-1, -1), 9460: (-1, -1), 5999164: (-1, -1), 10892: (-1, -1), 881: (-1, -1), 40801: (-1, -1), 6962: (-1, -1), 7373: (-1, -1), 54: (-1, -1), 2171: (-1, -1), 197180: (-1, -1), 382: (-1, -1), 55: (-1, -1), 597: (-1, -1)}


In [9]:
# Group the stored k-core segments per k: those of anomalous nodes vs. those of all other nodes.
# Both results are needed by the plotting cell at the end of the notebook.
kcores_anomalies,kcores_without_anomalies = anomalies_in_kcores(anomalies,storage_kcores+"postprocess_kcores.scf")

Anomalous nodes :  {3015481: (-1, -1), 1830: (-1, -1), 286: (-1, -1), 4: (-1, -1), 535769: (22, 241), 23573: (-1, -1), 4143169: (218, 555), 24: (-1, -1), 349: (-1, -1), 577: (-1, -1), 214428: (-1, -1), 2143: (-1, -1), 698: (-1, -1), 664: (-1, -1), 85653: (-1, -1), 78767: (-1, -1), 162252: (-1, -1), 3128: (-1, -1), 3: (-1, -1), 211: (-1, -1), 9460: (-1, -1), 5999164: (-1, -1), 10892: (-1, -1), 881: (-1, -1), 40801: (-1, -1), 6962: (-1, -1), 7373: (-1, -1), 54: (-1, -1), 2171: (-1, -1), 197180: (-1, -1), 382: (-1, -1), 55: (-1, -1), 597: (-1, -1)}
nb anomalous nodes : 33
 k :  1
Current K :  1
Nb nodes anomalies :  6481
Nb nodes without anomalies :  6330530
 k :  2
Current K :  2
Nb nodes anomalies :  5302
Nb nodes without anomalies :  257313
 k :  3
Current K :  3
Nb nodes anomalies :  2163
Nb nodes without anomalies :  30721
 k :  4
Current K :  4
Nb nodes anomalies :  24
Nb nodes without anomalies :  304


In [3]:
def preprocess_kcores(storage_kcores):
    """Collect, for every node, the time bounds of its stay in each k-core.

    Streams ``storage_kcores + "postprocess_kcores.scf"`` with
    ``msgpack.Unpacker``; each record is read as ``(k, segments)`` and every
    segment unpacks to ``(t0, t1, u)``.

    Returns
    -------
    dict
        ``{u: {k: [t0, t1, t0', t1', ...]}}`` — a flat list in which bounds
        alternate (start, end), in the order the segments were read.
    """
    nodes_cores_stats = {}
    with open(storage_kcores+"postprocess_kcores.scf", 'rb') as ipt:
        for record in msgpack.Unpacker(ipt,use_list=False):
            k = record[0]
            for t0, t1, u in record[1]:
                # Create the per-node dict and the per-core list on first sight.
                bounds = nodes_cores_stats.setdefault(u, {}).setdefault(k, [])
                bounds.extend((t0, t1))
    return nodes_cores_stats

# Create a DataFrame indexed by node (anomalous and non-anomalous alike),
# columns: core membership, frequency per core, surface per core, and their totals.
def create_kcores_DataFrame(nodes_cores_stats, cores=(2, 3, 4)):
    """Summarise per-node k-core statistics as a DataFrame (one row per node).

    Parameters
    ----------
    nodes_cores_stats : dict
        ``{node: {k: [t0, t1, t0', t1', ...]}}`` where the flat list alternates
        start and end bounds of the node's segments in the k-core.
    cores : iterable of int, optional
        The values of k to report. Default ``(2, 3, 4)``.

    Returns
    -------
    pandas.DataFrame
        Indexed by node, with for every k in ``cores``:
        ``'<k>-core'`` (1 if the node has an entry for k, else 0),
        ``'frequency <k>-core'`` (number of segments, i.e. ``len(bounds) / 2``),
        ``'surface <k>-core'`` (sum of ``t1 - t0`` over the segments),
        plus ``'frequency'`` and ``'surface'``, the sums over the reported cores.
        Columns are ordered: memberships, frequencies, total frequency,
        surfaces, total surface.
    """
    cores = tuple(cores)
    columns = (["{}-core".format(k) for k in cores]
               + ["frequency {}-core".format(k) for k in cores] + ['frequency']
               + ["surface {}-core".format(k) for k in cores] + ['surface'])
    dict_data = {name: [] for name in columns}
    names = []
    for u, per_core in nodes_cores_stats.items():
        names.append(u)
        total_frequency = 0
        total_surface = 0
        for k in cores:
            if k in per_core:
                bounds = per_core[k]
                member = 1
                frequency = len(bounds)/2
                # Even positions hold starts, odd positions hold the matching ends.
                surface = sum(t1-t0 for t0, t1 in zip(bounds[::2], bounds[1::2]))
            else:
                member = 0
                frequency = 0
                surface = 0
            dict_data["{}-core".format(k)].append(member)
            dict_data["frequency {}-core".format(k)].append(frequency)
            dict_data["surface {}-core".format(k)].append(surface)
            total_frequency += frequency
            total_surface += surface
        dict_data['frequency'].append(total_frequency)
        dict_data['surface'].append(total_surface)
    Index = pd.Index(names)
    D = pd.DataFrame(dict_data, index=Index)
    return D


In [4]:
# Gather the per-node core intervals from disk, then summarise them as one DataFrame row per node.
nodes_cores_stats = preprocess_kcores(storage_kcores)
D = create_kcores_DataFrame(nodes_cores_stats)

In [14]:
# Split the summary frame into anomalous and non-anomalous nodes and describe the former.
# Requires `anomalous_nodes`, which is built by the earlier preprocess_anomalies_nodes cell.
index_anomalous = [i for i in D.index if i in anomalous_nodes]
index_non_anomalous = [i for i in D.index if i not in anomalous_nodes]
D_anomalous = D.loc[index_anomalous]
D_non_anomalous = D.loc[index_non_anomalous]
D_anomalous.describe()

Unnamed: 0,2-core,3-core,4-core,frequency 2-core,frequency 3-core,frequency 4-core,frequency,surface 2-core,surface 3-core,surface 4-core,surface
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,0.875,0.46875,0.15625,165.71875,67.59375,0.75,234.0625,95.202509,29.861054,0.154408,125.217971
std,0.336011,0.507007,0.368902,248.184665,229.886348,2.735666,460.394847,166.798843,111.96412,0.529875,245.639678
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,13.75,0.0,0.0,13.75,8.815037,0.0,0.0,9.368095
50%,1.0,0.0,0.0,59.5,0.0,0.0,75.0,27.958813,0.0,0.0,30.610172
75%,1.0,1.0,0.0,188.75,23.5,0.0,225.75,95.047867,5.308021,0.0,101.658057
max,1.0,1.0,1.0,955.0,955.0,15.0,1912.0,721.476902,559.173483,2.71428,900.707296


In [15]:
D_non_anomalous.describe()

Unnamed: 0,2-core,3-core,4-core,frequency 2-core,frequency 3-core,frequency 4-core,frequency,surface 2-core,surface 3-core,surface 4-core,surface
count,551397.0,551397.0,551397.0,551397.0,551397.0,551397.0,551397.0,551397.0,551397.0,551397.0,551397.0
mean,0.03446,0.003883,0.000292,0.466657,0.055715,0.000551,0.522923,0.315657,0.0181,0.000156,0.333913
std,0.182407,0.062192,0.017085,10.435907,3.6571,0.045186,13.045384,8.557699,1.437975,0.015115,9.334269
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1004.0,1004.0,10.0,2013.0,900.883764,472.929217,4.341626,900.883764


In [5]:
# For every column of the full frame, print the 15 rows with the largest values
# and the 10 rows with the smallest values.
for c in D.columns:
    print(c+" Largest Values :")
    print(D.nlargest(15,c))
    print(c+" Smallest Values :")
    print(D.nsmallest(10,c))
    print()

2-core Largest Values :
    2-core  3-core  4-core  frequency  frequency 2-core  frequency 3-core  \
29       1       1       1      819.0             198.0             298.0   
13       1       1       1      740.0             177.0             275.0   
16       1       1       1      762.0             152.0             278.0   
33       1       1       1      724.0             178.0             250.0   
22       1       1       1      816.0             227.0             300.0   
31       1       1       1      711.0             173.0             257.0   
44       1       1       1      838.0             224.0             306.0   
46       1       1       1      857.0             160.0             306.0   
34       1       1       1      854.0             176.0             317.0   
40       1       1       1      900.0             151.0             323.0   
45       1       1       1      873.0             167.0             329.0   
50       1       1       1      845.0             12

In [16]:
# Same largest/smallest listing as for the full frame, restricted to the anomalous nodes.
for c in D_anomalous.columns:
    print(c+" Largest Values :")
    print(D_anomalous.nlargest(15,c))
    print(c+" Smallest Values :")
    print(D_anomalous.nsmallest(10,c))
    print()

2-core Largest Values :
        2-core  3-core  4-core  frequency 2-core  frequency 3-core  \
214428       1       0       0              19.0               0.0   
6962         1       1       0             184.0              41.0   
40801        1       1       1             255.0              17.0   
9460         1       1       0              54.0              27.0   
7373         1       1       0             203.0              25.0   
2143         1       1       1             130.0              23.0   
597          1       1       0              40.0               6.0   
162252       1       1       0             180.0              34.0   
3128         1       1       0             426.0              45.0   
23573        1       1       0              65.0               4.0   
1830         1       0       0              10.0               0.0   
577          1       0       0             286.0               0.0   
881          1       0       0              93.0               0.0

In [17]:
# Same largest/smallest listing as for the full frame, restricted to the non-anomalous nodes.
for c in D_non_anomalous.columns:
    print(c+" Largest Values :")
    print(D_non_anomalous.nlargest(15,c))
    print(c+" Smallest Values :")
    print(D_non_anomalous.nsmallest(10,c))
    print()

2-core Largest Values :
       2-core  3-core  4-core  frequency 2-core  frequency 3-core  \
8638        1       0       0               1.0               0.0   
876         1       1       0              44.0               8.0   
4910        1       0       0               2.0               0.0   
894         1       1       1             218.0              41.0   
9221        1       0       0               3.0               0.0   
9428        1       0       0              51.0               0.0   
6258        1       0       0               1.0               0.0   
6081        1       0       0              64.0               0.0   
6278        1       0       0               5.0               0.0   
15010       1       0       0              38.0               0.0   
6596        1       0       0               6.0               0.0   
6597        1       0       0               6.0               0.0   
11041       1       1       0              13.0               1.0   
16463     

      2-core  3-core  4-core  frequency 2-core  frequency 3-core  \
4901       0       0       0               0.0               0.0   
4902       0       0       0               0.0               0.0   
6167       0       0       0               0.0               0.0   
2353       0       0       0               0.0               0.0   
1481       0       0       0               0.0               0.0   
504        0       0       0               0.0               0.0   
8306       0       0       0               0.0               0.0   
4909       0       0       0               0.0               0.0   
6476       0       0       0               0.0               0.0   
7685       0       0       0               0.0               0.0   

      frequency 4-core  frequency  surface 2-core  surface 3-core  \
4901               0.0        0.0             0.0             0.0   
4902               0.0        0.0             0.0             0.0   
6167               0.0        0.0           

      2-core  3-core  4-core  frequency 2-core  frequency 3-core  \
4901       0       0       0               0.0               0.0   
4902       0       0       0               0.0               0.0   
6167       0       0       0               0.0               0.0   
2353       0       0       0               0.0               0.0   
1481       0       0       0               0.0               0.0   
504        0       0       0               0.0               0.0   
8306       0       0       0               0.0               0.0   
4909       0       0       0               0.0               0.0   
6476       0       0       0               0.0               0.0   
7685       0       0       0               0.0               0.0   

      frequency 4-core  frequency  surface 2-core  surface 3-core  \
4901               0.0        0.0             0.0             0.0   
4902               0.0        0.0             0.0             0.0   
6167               0.0        0.0           

In [6]:
def plot_lines(node_core_info):
    """Turn ``(t0, t1, n)`` triples into horizontal segments.

    Each triple becomes ``((t0, n), (t1, n))``: a line at height ``n`` running
    from ``t0`` to ``t1``. Returns the segments as a list, in input order.
    """
    return [((start, node), (stop, node)) for start, stop, node in node_core_info]


def plot_kcores(kcores_with_anomalies,kcores_without_anomalies, title=None,legend=None, saving_path=None, format='pdf',
                t_min=0, t_max=900):
    """Draw the k-core segments (k >= 2) of anomalous and regular nodes over time.

    Every ``(t0, t1, node)`` triple becomes a horizontal line at height ``node``.
    Regular segments are drawn thin in blue shades, anomalous ones thicker in
    yellow/orange/red, the shade depending on k.

    Parameters
    ----------
    kcores_with_anomalies, kcores_without_anomalies : dict
        ``k -> iterable of (t0, t1, node)`` triples.
    title : str, optional
        Figure title.
    legend : bool, optional
        When truthy, add axis labels and a colour legend.
    saving_path : str, optional
        When given, the figure is saved to ``saving_path + "." + format``.
    format : str, optional
        File format handed to ``savefig``. Default ``'pdf'``.
    t_min, t_max : number, optional
        Limits of the time axis. Defaults 0 and 900.

    Notes
    -----
    Values of k above the largest one in a palette reuse that palette's last
    colour instead of raising ``KeyError``.
    """
    # Import the submodules explicitly rather than relying on pyplot having loaded them.
    from matplotlib.collections import LineCollection
    from matplotlib.patches import Patch

    nodes = {segment[2]
             for groups in (kcores_with_anomalies, kcores_without_anomalies)
             for members in groups.values()
             for segment in members}
    print("T min :",t_min)
    print("T max :",t_max)

    dict_color_anom = {2:"#ffcc00", # Yellow
                       3:"#e68a00", # Orange
                       4:"#cc0000"} # Red
    dict_color_normal = {2:"#8080ff", # Light blue
                         3:"#0033cc", # Medium blue
                         4:"#004466"} # Strong blue

    def _segments_and_colors(kcores, palette):
        # Flatten the k >= 2 groups into parallel lists of segments and colours.
        segs, colors = [], []
        fallback = palette[max(palette)]
        for k, members in kcores.items():
            if k >= 2:
                seg = plot_lines(members)
                segs += seg
                colors += [palette.get(k, fallback)]*len(seg)
        return segs, colors

    segs_anom, colors_anom = _segments_and_colors(kcores_with_anomalies, dict_color_anom)
    segs_normal, colors_normal = _segments_and_colors(kcores_without_anomalies, dict_color_normal)

    print("len colors normal : ",len(colors_normal))
    print("len segs normal : ",len(segs_normal))
    print("len colors anomalous : ",len(colors_anom))
    print("len segs anomalous : ",len(segs_anom))
    print("Segs calculated")
    fig,ax = plt.subplots(1,1)
    # Regular segments first, so the anomalous ones are drawn on top of them.
    if segs_normal:
        line_coll = LineCollection(np.array(segs_normal),
                                   colors=colors_normal,linewidths=[5*10**(-3)])
        line_coll.set_alpha(0.7)
        ax.add_collection(line_coll)

    if segs_anom:
        line_coll = LineCollection(np.array(segs_anom),
                                   colors=colors_anom,linewidths=[5*10**(-1)])
        line_coll.set_alpha(1)
        ax.add_collection(line_coll)
    print("Collections created and added")
    ax.set_xlim(t_min,t_max)
    # Collections do not autoscale the axes; fall back to 1 when there is nothing to plot.
    ax.set_ylim(0,max(nodes, default=1))
    if legend :
        ax.set_ylabel("Nodes", fontname='Ubuntu', fontsize=12, color='#666699')
        ax.set_xlabel("t", fontname='Ubuntu', fontsize=12, color='#476b6b')
        list_legend =[]
        for k,current_color in dict_color_anom.items():
            list_legend.append(Patch(color=current_color, label=str(k)+"-Cores Anomalous"))
        for k,current_color in dict_color_normal.items():
            list_legend.append(Patch(color=current_color, label=str(k)+"-Cores"))
        ax.legend(handles=list_legend, loc='upper left',fancybox =True,prop={'size': 6})
    if title:
        # ax.title is a Text object and cannot be called; set_title is the setter.
        ax.set_title(title, fontname='Ubuntu', fontsize=14)
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.tick_params(top=False, bottom=True, right=False, left=True, labelleft=True, labelbottom=True)
    fig.tight_layout()
    if saving_path:
        fig.savefig(saving_path + "." + format, format=format,dpi=2400)


In [7]:
# Requires `kcores_anomalies` and `kcores_without_anomalies` from the anomalies_in_kcores cell;
# the NameError saved below shows this cell was last run in a kernel where that cell had not been executed.
plot_kcores(kcores_anomalies,kcores_without_anomalies,saving_path=figures_data+'kcores_anomalies_viz',legend = True)

NameError: name 'kcores_anomalies' is not defined