In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import numpy as np
from collections import Counter
from os import walk

### Bro/Zeek log parser and anonymiser

#### This notebook parses Bro/Zeek logs and identifies and anonymises Personally Identifiable Information (PII). The Bro logs of the UNSW-NB15 dataset [1] are used as example data.

The columns that contain PII are listed in the `PII` list. The first record of each log file is displayed below, with the PII columns highlighted in red.

[1] https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/

In [2]:
# Helper functions

def get_headers_types(log_type, log_dir="24/"):
    """Return the column names declared on the ``#fields`` line of a Bro/Zeek log.

    Only the first 8 lines of the file are inspected, which matches the
    ``skiprows=8`` used when the log body is loaded in ``get_dataframe``.

    Parameters
    ----------
    log_type : str
        File name of the log inside ``log_dir`` (e.g. ``"conn.log"``).
    log_dir : str, optional
        Directory that holds the log files. Defaults to ``"24/"``.

    Returns
    -------
    list of str
        The tab-separated names that follow the ``#fields`` tag, or an empty
        list when no ``#fields`` line occurs within the first 8 lines.

    Raises
    ------
    OSError
        If the log file cannot be opened.
    """
    import os  # local import: the notebook's import cell only brings in os.walk

    with open(os.path.join(log_dir, log_type)) as file:
        # zip() ends quietly when the file has fewer than 8 lines, whereas
        # calling next(file) on an exhausted file would leak StopIteration.
        for _, raw_line in zip(range(8), file):
            line = raw_line.strip()
            if line.startswith("#fields"):
                # The first token is the "#fields" tag itself; drop it.
                return line.split("\t")[1:]
    return []


def get_dataframe(log_type,seperator):
    """Load one Bro/Zeek log from the ``24/`` directory into a DataFrame.

    Column names come from ``get_headers_types``. The first 8 lines and the
    last line of the file are skipped when reading the body.

    Parameters
    ----------
    log_type : str
        File name of the log inside ``24/``.
    seperator : str
        Field delimiter handed to ``pandas.read_csv`` as ``sep``.

    Returns
    -------
    pandas.DataFrame
        The parsed log records.
    """
    column_names = get_headers_types(log_type)
    log_path = '24/' + log_type
    # skipfooter is only supported by the python parser engine.
    frame = pd.read_csv(
        log_path,
        sep=seperator,
        names=column_names,
        skiprows=8,
        skipfooter=1,
        engine='python',
    )
    return frame

def highlight_col(x, pii_columns=None):
    """Build a per-cell CSS frame that paints PII columns red and the rest white.

    Written for ``DataFrame.style.apply(highlight_col, axis=None)``, which
    passes the whole frame and expects a frame of CSS strings with the same
    index and columns.

    Parameters
    ----------
    x : pandas.DataFrame
        The frame being styled.
    pii_columns : iterable of str, optional
        Column names to highlight. When omitted, the notebook-level ``PII``
        list is used, so existing ``style.apply(highlight_col, axis=None)``
        calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``x``; every cell holds
        ``'background-color: red'`` for listed columns and
        ``'background-color: white'`` otherwise.
    """
    if pii_columns is None:
        # Resolved at call time, so PII may be defined after this function.
        pii_columns = PII

    red = 'background-color: red'
    white = 'background-color: white'

    styles = pd.DataFrame(white, index=x.index, columns=x.columns)
    # A boolean mask ignores listed names that are absent from this frame.
    styles.loc[:, x.columns.isin(list(pii_columns))] = red
    return styles


In [3]:
# Field delimiter passed to read_csv as `sep`: a character class holding a tab.
seperator = "[\t]"

# Only the first walk() entry is needed: the files directly inside '24/'.
_, _, filenames = next(walk('24/'))
# Alphabetical order, so the logs are previewed in a stable sequence.
filenames = sorted(filenames)

# Names of the log columns that contain PII; highlight_col paints these red.
# Fix: the former entry 'issuer client_subject' was a missing-quote typo, so
# the 'client_subject' column of ssl.log was never flagged.
# The duplicated 'subject' entry is now listed once.
PII = [
    # connection endpoints
    'id.orig_h', 'id.resp_h',
    # dhcp.log
    'mac', 'assigned_ip',
    # dns.log
    'query', 'answers',
    # files.log
    'tx_hosts', 'rx_hosts', 'md5', 'sha1', 'sha256',
    # ftp.log
    'user', 'password', 'data_channel.orig_h', 'data_channel.resp_h',
    # http.log
    'host', 'uri', 'filename', 'username',
    # smtp.log
    'helo', 'mailfrom', 'rcptto', 'date', 'from', 'to', 'reply_to', 'msg_id',
    'in_reply_to', 'subject', 'x_originating_ip', 'first_received',
    'second_received', 'last_reply', 'path',
    # ssl.log ('subject' is already listed above)
    'issuer', 'server_name', 'client_subject', 'client_issuer',
    # syslog.log
    'message',
    # x509.log
    'certificate.serial', 'certificate.subject', 'certificate.issuer',
    'certificate.not_valid_before', 'certificate.not_valid_after',
    'san.dns', 'san.uri', 'san.email', 'san.ip',
]


In [4]:
# Show the first record of every log, with the PII columns highlighted.
for filename in filenames:
    print(filename)
    df_log = get_dataframe(filename, seperator).head(1)
    # axis=None hands the whole frame to highlight_col in a single call.
    styled_preview = df_log.style.apply(highlight_col, axis=None)
    display(styled_preview)

conn.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,local_orig,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents
0,1424256987.747828,C8kfmaJvH8YSujzW5,59.166.0.2,57872,149.171.126.9,56104,tcp,-,0.027552,227,11587,SF,-,0,ShADadfF,42,2646,44,25470,(empty)


dhcp.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,mac,assigned_ip,lease_time,trans_id
0,1424257224.537779,CU34Cu2zrLEPwadl52,175.45.176.3,68,149.171.126.13,67,02:1a:c5:00:00:00,175.45.176.3,1879.0,4288085767


dns.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,trans_id,query,qclass,qclass_name,qtype,qtype_name,rcode,rcode_name,AA,TC,RD,RA,Z,answers,TTLs,rejected
0,1424256988.447831,CcCQqj3E6KlwtFnRja,59.166.0.4,7745,149.171.126.2,53,udp,48100,server-95ab7e07.int,1,C_INTERNET,1,A,0,NOERROR,F,F,T,T,0,149.171.126.7,60.0,F


dpd.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,analyzer,failure_reason
0,1424257073.512632,CgvWyM1wBPRuu6ULUe,175.45.176.2,64558,149.171.126.14,80,tcp,HTTP,not a http reply line


files.log


Unnamed: 0,ts,fuid,tx_hosts,rx_hosts,conn_uids,source,depth,analyzers,mime_type,filename,duration,local_orig,is_orig,seen_bytes,total_bytes,missing_bytes,overflow_bytes,timedout,parent_fuid,md5,sha1,sha256,extracted
0,1424256989.355818,F1YVMA4nWYBEIQkbDd,149.171.126.0,59.166.0.2,C9zC0k1eMFe6n5INSd,SMTP,2,(empty),text/plain,-,0.0,-,F,298,-,0,0,F,-,-,-,-,-


ftp.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,user,password,command,arg,mime_type,file_size,reply_code,reply_msg,data_channel.passive,data_channel.orig_h,data_channel.resp_h,data_channel.resp_p,fuid
0,1424256989.952859,CsfksnMVMnOYhPiB7,59.166.0.8,5146,149.171.126.3,21,anonymous,jobs@server.com,EPSV,-,-,-,229,Extended Passive Mode OK (|||24196|),T,59.166.0.8,149.171.126.3,24196,-


http.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,trans_depth,method,host,uri,referrer,user_agent,request_body_len,response_body_len,status_code,status_msg,info_code,info_msg,filename,tags,username,password,proxied,orig_fuids,orig_mime_types,resp_fuids,resp_mime_types
0,1424256990.350022,CwgILB4FJZI9P9o4z1,59.166.0.1,41195,149.171.126.1,80,1,GET,Tracker,/announce?peer_id=-AR2621-949883860326&port=15836&uploaded=0&downloaded=0&left=8388610&compact=1&numwant=0&event=started&info_hash=\x1d]\xfb\xcc\x9f\xeb\xfckTfW\xe3e\xe8\xed\xa9 \x0f5\xf0,-,-,0,83,200,OK,-,-,-,(empty),-,-,-,-,-,FdHIAGPzf33uaIXE9,text/plain


irc.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,nick,user,command,value,addl,dcc_file_name,dcc_file_size,dcc_mime_type,fuid
0,1424256999.141962,CTF2G91mpfymYtsHWk,175.45.176.1,60007,149.171.126.17,6667,-,-,USER,anonym,fbJnFHnHq 149.171.126.17 Anonym,-,-,-,-


packet_filter.log


Unnamed: 0,ts,node,filter,init,success
0,1424429070.727596,bro,ip or not ip,T,T


smtp.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,trans_depth,helo,mailfrom,rcptto,date,from,to,reply_to,msg_id,in_reply_to,subject,x_originating_ip,first_received,second_received,last_reply,path,user_agent,tls,fuids
0,1424256989.350394,C9zC0k1eMFe6n5INSd,59.166.0.2,56249,149.171.126.0,25,1,client-3ba60005.example.int,,,Tue Feb 17 17:14:38 +1100 2015,,,-,<5DBPyXnpx044jJOf@example.com>,-,"The thing, you thought it perfectly----- And then",-,qmail 1309 by uid 2584; Fri Feb 28 10:07:07 +1100 2014,-,250 2.6.0 Queued mail for delivery,"149.171.126.0,59.166.0.2",Microsoft Outlook Express 6.00.2800.1158,F,"F1YVMA4nWYBEIQkbDd,F1WlkS0yj1Py35am9"


snmp.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,duration,version,community,get_requests,get_bulk_requests,get_responses,set_requests,display_string,up_since
0,1424257146.818946,CTYBRU1hBdyRYpDU5b,175.45.176.1,52657,149.171.126.11,161,9e-06,1,PUBLIC,2,0,0,0,-,-


ssh.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,status,direction,client,server
0,1424256988.242291,ChBvzV2CEZnYgi9gaa,59.166.0.0,3778,149.171.126.2,22,success,INBOUND,SSH-2.0-PuTTY_Release_0.60,SSH-1.99-OpenSSH_4.3


ssl.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,version,cipher,curve,server_name,session_id,last_alert,established,cert_chain_fuids,client_cert_chain_fuids,subject,issuer,client_subject,client_issuer
0,1424257005.940392,CkIFoc3DTlsBeaFW66,175.45.176.1,17478,149.171.126.12,443,TLSv10,TLS_RSA_WITH_3DES_EDE_CBC_SHA,-,-,08a79d45b747f5c828da00000000000000000000000000000000000000000000,-,F,-,-,-,-,-,-


syslog.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,facility,severity,message
0,1424257272.50549,Cj3GI83Aak9eXVMxRj,175.45.176.2,59788,149.171.126.19,514,udp,KERN,EMERG,4294967296 FtOzu6fBJ Gqvq2fF: jiSd9fZKD93uuhXTlgnUYiJabrYEZ5ScNv9ZSWpYJ9ZU1VtphmvQtkLwMqZBFJaAANohXQqtNjtHwFh0teyFi2QpSb1JN9zkqRedhLE0znm8OyeGq0QUOxDCg1knyDaHXmtchaToM7AeHBDZSywDgVJSANxLahZy5vII1BLIyTape44OkitTxsqFwhQmadxiG0sqot6BL7T90mei9KXRWvSAjYLDwSOEYwmkJLqbbuBoYeUFlQlkvwOf0nCBP7kXqVz7ddmrSXPgK4qpdf7s9Ts5GEVGBrC6DQnoI9jggrwSN4XnMKV5ImStRzIQpnMc3c15iemhtTK3jvXu0JDhyt5GHf2P0DvlOKyJrfc6SZLLisGpkLHuKO9mWajcrYXi8zn27YkZiwA8HQsomh5Nxi3gjElvi3y3zLPkcXwWimPYkKbjGju9ZaGBkL4S0yAMNWAypTpFeSZtISOHyGMG5iA6NQ66ZSi3XcEmfcTNQQRGfLp3eTEd2B01V9690LY1TuV8QARbt2CSI3t1r04zwGzyk2Mz54D6cSX3DF2Abhie5drJa2zaPpPkYs3TsvQ0ntuIlXNZPbT86zUcDasMeiS9vW908jGF5Shj0Ncs3ZzN54jTjs0v3kzSsVvv9rAOBUn25vWd8bP2OB0z3g62Q5AXJdbjYAqYVPQISkNaeDBSRiRuaNJq70TA8n811AJtPCz34sGdMllwNTRgGscoZ1t2RtRG3MNnu1zmvhcbMdERVX80st7Y5oVudQGsai\x0a


weird.log


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,name,addl,notice,peer
0,1424256989.752044,CBAuL33hjNTqQ8MTak,59.166.0.8,53225,149.171.126.5,80,above_hole_data_without_any_acks,-,F,bro


x509.log


Unnamed: 0,ts,id,certificate.version,certificate.serial,certificate.subject,certificate.issuer,certificate.not_valid_before,certificate.not_valid_after,certificate.key_alg,certificate.sig_alg,certificate.key_type,certificate.key_length,certificate.exponent,certificate.curve,san.dns,san.uri,san.email,san.ip,basic_constraints.ca,basic_constraints.path_len
0,1424257134.200798,F1UOdB2xjikZlEkvu5,2,9A773E9868AE2F2142,"CN=uVEycH.com\00njxoOwVyuqbebclExlJ.com,OU=mhUEnXB,O=UzzTqUsqUT,L=aTmKVEcrCIvmGiAdQ,ST=Minnesota,C=US","CN=cqbaqTlF.com,OU=Ae,O=awM,L=jhMfjaJOiSRBeKiax,ST=Wisconsin,C=US",1398268772.0,1455728372.0,rsaEncryption,sha1WithRSAEncryption,rsa,1024,65537,-,-,-,-,-,-,-
