In [9]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import numpy as np
from collections import Counter
from os import walk
from os import path
import random

from sklearn.preprocessing import LabelEncoder

### Bro/Zeek log parser and anonymiser

#### This file serves to parse Bro/Zeek logs and identify and anonymise Personally Identifiable Information (PII). As example data the bro logs of the UNSW-NB15 dataset [1] have been used.

The PII information is listed in the PII list and one line of each log file is displayed below with the PII highlighted in red.

The PII fields are then transformed. The IP addresses to a random IP addresses where the IP addresses in each subnet are grouped together. For example: 1.2.3.50 and 1.2.3.100 might be transformed to 9.222.51.2 and 9.222.51.244. The other PII fields are encoced with scikitlearn's LabelEncoder. Which means that the same inputs are mapped to the same number (label)

[1] https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/

In [13]:
# Helper functions

def get_headers_types(log_type,directories):
    headers = []
    types = []
    for log_dir in directories:
        if path.exists(dir_to_parse+str(log_dir)+"/"+log_type):
            with open(dir_to_parse+str(log_dir)+"/"+log_type) as file:
                for i in range(8):
                    line = next(file).strip()
                    if line.startswith("#fields"):
                        headers = line.split("\t")[1:]
                    if line.startswith("#types"):
                        types = line.split("\t")[1:]
            break
    return headers, types

def get_types(df_type,type_list):
    for header in df_type.columns:
        if type_list.get(header) == 'num':
            df_type[header] = pd.to_numeric(df_type[header],errors='coerce')
        else:
            df_type[header] = df_type[header].astype(str)
    return df_type 


def read_file(file_path,seperator,headers):
    if path.exists(file_path):
        return pd.read_csv(file_path,sep=seperator,names=headers,skiprows=8,
                           skipfooter=1,engine='python')

def get_dataframe(log_type,seperator,filepath):
    headers, types = get_headers_types(log_type,filepath)
    type_list = {}
    for col_header, col_type in zip(headers,types):
        type_list[col_header] = 'num' if col_type in num_list else 'string'
    df_combined = pd.concat(map(lambda x: read_file(dir_to_parse+ str(x)+'/'+log_type,
                                                    seperator,headers), filepath))
    return get_types(df_combined,type_list)

def get_files_and_dirs(dir_to_parse):
    
    def get_filenames(x): 
        _, _, filenames = next(walk(dir_to_parse+str(x)+'/'))
        return filenames
    
    _, directories, _ = next(walk(dir_to_parse))
    filenames = [get_filenames(x) for x in directories]
    flat_filenames = np.unique([item for sublist in filenames for item in sublist])
    return directories, flat_filenames

def highlight_col(df_to_style):
    r = 'background-color: red'
    df1 = pd.DataFrame('', index=df_to_style.index, columns=df_to_style.columns)
    for info in PII:
        if info in df_to_style.columns:
            df1.loc[:,info] = r
    return df1    

def get_subnet_perserving_ip(ip):

    def get_random_subnet(sub,sub_dict):
        random_sub = ""
        if sub in sub_dict:
            random_sub = sub_dict[sub]
        else:
            is_old = True
            while is_old:
                random_sub = str(int(random.random()*256))
                if random_sub not in sub_dict.values():
                    is_old = False
            sub_dict[sub] = random_sub
        return random_sub
    
    if ip in ip_dict:
        random_ip = ip_dict[ip]
    else:
        sub_splits = ip.split('.')
        if len(sub_splits) > 3:
            sub0 = get_random_subnet(sub_splits[0], sub0_dict)
            sub1 = get_random_subnet(sub_splits[1], sub1_dict)
            sub2 = get_random_subnet(sub_splits[2], sub2_dict)
            sub3 = get_random_subnet(sub_splits[3], sub3_dict)
            random_ip = sub0+"."+sub1+"."+sub2+"."+sub3
            ip_dict[ip] = random_ip
        else:
            random_ip = "-"
            ip_dict[ip] = random_ip
    return random_ip


In [14]:
dir_to_parse = "logs/"

seperator = "[\t]"

string_list = ['string','addr','string','enum','set[string]','vector[string]','bool']
num_list = ['time','port','interval','count']

# List with columns containing PII.
PII = ['id.orig_h','id.resp_h',
       'mac','assigned_ip',
       'query','answers',
       'tx_hosts','rx_hosts','md5','sha1','sha256',
       'user','password','arg','data_channel.orig_h','data_channel.resp_h',
       'host','uri','filename','username',
       'remote_ip',
       'helo','mailfrom','rcptto','date','from','to','reply_to','msg_id','in_reply_to',
           'server_name','subject','issuer','client_subject','client_issuer',
       'message',
       'certificate.serial','certificate.subject','certificate.issuer','certificate.not_valid_before',
           'certificate.not_valid_after','san.dns','san.uri','san.email','san.ip']

IP_fields = ['id.orig_h','id.resp_h','assigned_ip','answers','data_channel.orig_h','data_channel.resp_h','remote_ip']


In [15]:
directories, filenames = get_files_and_dirs(dir_to_parse)

orig_log_dataframes = {}

for filename in filenames:
    df_log = get_dataframe(filename,seperator,directories)
    orig_log_dataframes[filename] = df_log
    display(df_log.iloc[:1,:].style.apply(highlight_col, axis=None))
    

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,local_orig,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents
0,1424256987.747828,C8kfmaJvH8YSujzW5,59.166.0.2,57872,149.171.126.9,56104,tcp,-,0.027552,227.0,11587.0,SF,-,0,ShADadfF,42,2646,44,25470,(empty)


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,mac,assigned_ip,lease_time,trans_id
0,1424257224.537779,CU34Cu2zrLEPwadl52,175.45.176.3,68,149.171.126.13,67,02:1a:c5:00:00:00,175.45.176.3,1879.0,4288085767


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,trans_id,query,qclass,qclass_name,qtype,qtype_name,rcode,rcode_name,AA,TC,RD,RA,Z,answers,TTLs,rejected
0,1424256988.447831,CcCQqj3E6KlwtFnRja,59.166.0.4,7745,149.171.126.2,53,udp,48100,server-95ab7e07.int,1.0,C_INTERNET,1.0,A,0.0,NOERROR,F,F,T,T,0,149.171.126.7,60.0,F


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,analyzer,failure_reason
0,1424257073.512632,CgvWyM1wBPRuu6ULUe,175.45.176.2,64558,149.171.126.14,80,tcp,HTTP,not a http reply line


Unnamed: 0,ts,fuid,tx_hosts,rx_hosts,conn_uids,source,depth,analyzers,mime_type,filename,duration,local_orig,is_orig,seen_bytes,total_bytes,missing_bytes,overflow_bytes,timedout,parent_fuid,md5,sha1,sha256,extracted
0,1424256989.355818,F1YVMA4nWYBEIQkbDd,149.171.126.0,59.166.0.2,C9zC0k1eMFe6n5INSd,SMTP,2,(empty),text/plain,-,0.0,-,F,298,,0,0,F,-,-,-,-,-


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,user,password,command,arg,mime_type,file_size,reply_code,reply_msg,data_channel.passive,data_channel.orig_h,data_channel.resp_h,data_channel.resp_p,fuid
0,1424256989.952859,CsfksnMVMnOYhPiB7,59.166.0.8,5146,149.171.126.3,21,anonymous,jobs@server.com,EPSV,-,-,,229,Extended Passive Mode OK (|||24196|),T,59.166.0.8,149.171.126.3,24196.0,-


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,trans_depth,method,host,uri,referrer,user_agent,request_body_len,response_body_len,status_code,status_msg,info_code,info_msg,filename,tags,username,password,proxied,orig_fuids,orig_mime_types,resp_fuids,resp_mime_types
0,1424256990.350022,CwgILB4FJZI9P9o4z1,59.166.0.1,41195,149.171.126.1,80,1,GET,Tracker,/announce?peer_id=-AR2621-949883860326&port=15836&uploaded=0&downloaded=0&left=8388610&compact=1&numwant=0&event=started&info_hash=\x1d]\xfb\xcc\x9f\xeb\xfckTfW\xe3e\xe8\xed\xa9 \x0f5\xf0,-,-,0,83,200.0,OK,,-,-,(empty),-,-,-,-,-,FdHIAGPzf33uaIXE9,text/plain


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,nick,user,command,value,addl,dcc_file_name,dcc_file_size,dcc_mime_type,fuid
0,1424256999.141962,CTF2G91mpfymYtsHWk,175.45.176.1,60007,149.171.126.17,6667,-,-,USER,anonym,fbJnFHnHq 149.171.126.17 Anonym,-,,-,-


Unnamed: 0,ts,node,filter,init,success
0,1424429070.727596,bro,ip or not ip,T,T


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,trans_depth,helo,mailfrom,rcptto,date,from,to,reply_to,msg_id,in_reply_to,subject,x_originating_ip,first_received,second_received,last_reply,path,user_agent,tls,fuids
0,1424256989.350394,C9zC0k1eMFe6n5INSd,59.166.0.2,56249,149.171.126.0,25,1,client-3ba60005.example.int,,,Tue Feb 17 17:14:38 +1100 2015,,,-,<5DBPyXnpx044jJOf@example.com>,-,"The thing, you thought it perfectly----- And then",-,qmail 1309 by uid 2584; Fri Feb 28 10:07:07 +1100 2014,-,250 2.6.0 Queued mail for delivery,"149.171.126.0,59.166.0.2",Microsoft Outlook Express 6.00.2800.1158,F,"F1YVMA4nWYBEIQkbDd,F1WlkS0yj1Py35am9"


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,duration,version,community,get_requests,get_bulk_requests,get_responses,set_requests,display_string,up_since
0,1424257146.818946,CTYBRU1hBdyRYpDU5b,175.45.176.1,52657,149.171.126.11,161,9e-06,1,PUBLIC,2,0,0,0,-,


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,status,direction,client,server
0,1424256988.242291,ChBvzV2CEZnYgi9gaa,59.166.0.0,3778,149.171.126.2,22,success,INBOUND,SSH-2.0-PuTTY_Release_0.60,SSH-1.99-OpenSSH_4.3


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,version,cipher,curve,server_name,session_id,last_alert,established,cert_chain_fuids,client_cert_chain_fuids,subject,issuer,client_subject,client_issuer
0,1424257005.940392,CkIFoc3DTlsBeaFW66,175.45.176.1,17478,149.171.126.12,443,TLSv10,TLS_RSA_WITH_3DES_EDE_CBC_SHA,-,-,08a79d45b747f5c828da00000000000000000000000000000000000000000000,-,F,-,-,-,-,-,-


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,facility,severity,message
0,1424257272.50549,Cj3GI83Aak9eXVMxRj,175.45.176.2,59788,149.171.126.19,514,udp,KERN,EMERG,4294967296 FtOzu6fBJ Gqvq2fF: jiSd9fZKD93uuhXTlgnUYiJabrYEZ5ScNv9ZSWpYJ9ZU1VtphmvQtkLwMqZBFJaAANohXQqtNjtHwFh0teyFi2QpSb1JN9zkqRedhLE0znm8OyeGq0QUOxDCg1knyDaHXmtchaToM7AeHBDZSywDgVJSANxLahZy5vII1BLIyTape44OkitTxsqFwhQmadxiG0sqot6BL7T90mei9KXRWvSAjYLDwSOEYwmkJLqbbuBoYeUFlQlkvwOf0nCBP7kXqVz7ddmrSXPgK4qpdf7s9Ts5GEVGBrC6DQnoI9jggrwSN4XnMKV5ImStRzIQpnMc3c15iemhtTK3jvXu0JDhyt5GHf2P0DvlOKyJrfc6SZLLisGpkLHuKO9mWajcrYXi8zn27YkZiwA8HQsomh5Nxi3gjElvi3y3zLPkcXwWimPYkKbjGju9ZaGBkL4S0yAMNWAypTpFeSZtISOHyGMG5iA6NQ66ZSi3XcEmfcTNQQRGfLp3eTEd2B01V9690LY1TuV8QARbt2CSI3t1r04zwGzyk2Mz54D6cSX3DF2Abhie5drJa2zaPpPkYs3TsvQ0ntuIlXNZPbT86zUcDasMeiS9vW908jGF5Shj0Ncs3ZzN54jTjs0v3kzSsVvv9rAOBUn25vWd8bP2OB0z3g62Q5AXJdbjYAqYVPQISkNaeDBSRiRuaNJq70TA8n811AJtPCz34sGdMllwNTRgGscoZ1t2RtRG3MNnu1zmvhcbMdERVX80st7Y5oVudQGsai\x0a


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,name,addl,notice,peer
0,1424256989.752044,CBAuL33hjNTqQ8MTak,59.166.0.8,53225.0,149.171.126.5,80.0,above_hole_data_without_any_acks,-,F,bro


Unnamed: 0,ts,id,certificate.version,certificate.serial,certificate.subject,certificate.issuer,certificate.not_valid_before,certificate.not_valid_after,certificate.key_alg,certificate.sig_alg,certificate.key_type,certificate.key_length,certificate.exponent,certificate.curve,san.dns,san.uri,san.email,san.ip,basic_constraints.ca,basic_constraints.path_len
0,1424257134.200798,F1UOdB2xjikZlEkvu5,2,9A773E9868AE2F2142,"CN=uVEycH.com\00njxoOwVyuqbebclExlJ.com,OU=mhUEnXB,O=UzzTqUsqUT,L=aTmKVEcrCIvmGiAdQ,ST=Minnesota,C=US","CN=cqbaqTlF.com,OU=Ae,O=awM,L=jhMfjaJOiSRBeKiax,ST=Wisconsin,C=US",1398268772.0,1455728372.0,rsaEncryption,sha1WithRSAEncryption,rsa,1024,65537,-,-,-,-,-,-,


In [18]:
log_dataframes = orig_log_dataframes.copy()
sub0_dict = {}
sub1_dict = {}
sub2_dict = {}
sub3_dict = {}
ip_dict = {}
    
for key in log_dataframes:
    log_dataframe = log_dataframes[key]
    for column in log_dataframe.columns:
        if column in PII:
            if column in IP_fields:
                log_dataframe[column] = [get_subnet_perserving_ip(ip) for ip in log_dataframe[column]]
            else:
                log_dataframe[column] = LabelEncoder().fit_transform(log_dataframe[column])
    log_dataframes[key] = log_dataframe
    print(key)
    display(log_dataframes[key].iloc[:1,:].style.apply(highlight_col, axis=None))


conn.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,local_orig,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents
0,1424256987.747828,C8kfmaJvH8YSujzW5,192.194.119.95,57872,185.201.128.141,56104,tcp,-,0.027552,227.0,11587.0,SF,-,0,ShADadfF,42,2646,44,25470,(empty)


dhcp.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,mac,assigned_ip,lease_time,trans_id
0,1424257224.537779,CU34Cu2zrLEPwadl52,32.237.194.136,68,185.201.128.181,67,0,32.237.194.136,1879.0,4288085767


dns.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,trans_id,query,qclass,qclass_name,qtype,qtype_name,rcode,rcode_name,AA,TC,RD,RA,Z,answers,TTLs,rejected
0,1424256988.447831,CcCQqj3E6KlwtFnRja,192.194.119.89,7745,185.201.128.95,53,udp,48100,24,1.0,C_INTERNET,1.0,A,0.0,NOERROR,F,F,T,T,0,185.201.128.177,60.0,F


dpd.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,analyzer,failure_reason
0,1424257073.512632,CgvWyM1wBPRuu6ULUe,32.237.194.95,64558,185.201.128.112,80,tcp,HTTP,not a http reply line


files.log


Unnamed: 0,ts,fuid,tx_hosts,rx_hosts,conn_uids,source,depth,analyzers,mime_type,filename,duration,local_orig,is_orig,seen_bytes,total_bytes,missing_bytes,overflow_bytes,timedout,parent_fuid,md5,sha1,sha256,extracted
0,1424256989.355818,F1YVMA4nWYBEIQkbDd,0,26,C9zC0k1eMFe6n5INSd,SMTP,2,(empty),text/plain,0,0.0,-,F,298,,0,0,F,-,0,0,0,-


ftp.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,user,password,command,arg,mime_type,file_size,reply_code,reply_msg,data_channel.passive,data_channel.orig_h,data_channel.resp_h,data_channel.resp_p,fuid
0,1424256989.952859,CsfksnMVMnOYhPiB7,192.194.119.139,5146,185.201.128.136,21,1,2,EPSV,0,-,,229,Extended Passive Mode OK (|||24196|),T,192.194.119.139,185.201.128.136,24196.0,-


http.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,trans_depth,method,host,uri,referrer,user_agent,request_body_len,response_body_len,status_code,status_msg,info_code,info_msg,filename,tags,username,password,proxied,orig_fuids,orig_mime_types,resp_fuids,resp_mime_types
0,1424256990.350022,CwgILB4FJZI9P9o4z1,192.194.119.180,41195,185.201.128.180,80,1,GET,251,216,-,-,0,83,200.0,OK,,-,0,(empty),0,0,-,-,-,FdHIAGPzf33uaIXE9,text/plain


irc.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,nick,user,command,value,addl,dcc_file_name,dcc_file_size,dcc_mime_type,fuid
0,1424256999.141962,CTF2G91mpfymYtsHWk,32.237.194.180,60007,185.201.128.110,6667,-,0,USER,anonym,fbJnFHnHq 149.171.126.17 Anonym,-,,-,-


packet_filter.log


Unnamed: 0,ts,node,filter,init,success
0,1424429070.727596,bro,ip or not ip,T,T


smtp.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,trans_depth,helo,mailfrom,rcptto,date,from,to,reply_to,msg_id,in_reply_to,subject,x_originating_ip,first_received,second_received,last_reply,path,user_agent,tls,fuids
0,1424256989.350394,C9zC0k1eMFe6n5INSd,192.194.119.95,56249,185.201.128.236,25,1,105,7,5,14,5,2,0,22,0,88,-,qmail 1309 by uid 2584; Fri Feb 28 10:07:07 +1100 2014,-,250 2.6.0 Queued mail for delivery,"149.171.126.0,59.166.0.2",Microsoft Outlook Express 6.00.2800.1158,F,"F1YVMA4nWYBEIQkbDd,F1WlkS0yj1Py35am9"


snmp.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,duration,version,community,get_requests,get_bulk_requests,get_responses,set_requests,display_string,up_since
0,1424257146.818946,CTYBRU1hBdyRYpDU5b,32.237.194.180,52657,185.201.128.178,161,9e-06,1,PUBLIC,2,0,0,0,-,


ssh.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,status,direction,client,server
0,1424256988.242291,ChBvzV2CEZnYgi9gaa,192.194.119.236,3778,185.201.128.95,22,success,INBOUND,SSH-2.0-PuTTY_Release_0.60,SSH-1.99-OpenSSH_4.3


ssl.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,version,cipher,curve,server_name,session_id,last_alert,established,cert_chain_fuids,client_cert_chain_fuids,subject,issuer,client_subject,client_issuer
0,1424257005.940392,CkIFoc3DTlsBeaFW66,32.237.194.180,17478,185.201.128.100,443,TLSv10,TLS_RSA_WITH_3DES_EDE_CBC_SHA,-,0,08a79d45b747f5c828da00000000000000000000000000000000000000000000,-,F,-,-,0,0,0,0


syslog.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,facility,severity,message
0,1424257272.50549,Cj3GI83Aak9eXVMxRj,32.237.194.95,59788,185.201.128.195,514,udp,KERN,EMERG,71


weird.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,name,addl,notice,peer
0,1424256989.752044,CBAuL33hjNTqQ8MTak,192.194.119.139,53225.0,185.201.128.46,80.0,above_hole_data_without_any_acks,-,F,bro


x509.log


Unnamed: 0,ts,id,certificate.version,certificate.serial,certificate.subject,certificate.issuer,certificate.not_valid_before,certificate.not_valid_after,certificate.key_alg,certificate.sig_alg,certificate.key_type,certificate.key_length,certificate.exponent,certificate.curve,san.dns,san.uri,san.email,san.ip,basic_constraints.ca,basic_constraints.path_len
0,1424257134.200798,F1UOdB2xjikZlEkvu5,2,3,2,2,4,4,rsaEncryption,sha1WithRSAEncryption,rsa,1024,65537,-,0,0,0,0,-,
