In [1]:
import pandas as pd
# Raise pandas' display limits (up to 500 columns / 500 rows) so the wide
# Zeek log tables shown further down are rendered without being truncated.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

import numpy as np
from collections import Counter
from os import walk
from os import path
import random

from copy import deepcopy

from sklearn.preprocessing import LabelEncoder

from helper_functions import *

### Bro/Zeek log parser and anonymiser

#### This notebook parses Bro/Zeek logs and identifies and anonymises Personally Identifiable Information (PII). The Bro logs of the UNSW-NB15 dataset [1] are used as example data.

The PII fields are listed in the PII list, and one line of each log file is displayed below with the PII highlighted in red.

The PII fields are then transformed. IP addresses are mapped to random IP addresses in such a way that the addresses of each subnet remain grouped together. For example, 1.2.3.50 and 1.2.3.100 might be transformed to 9.222.51.2 and 9.222.51.244. The other PII fields are encoded with scikit-learn's LabelEncoder, which means that identical inputs are mapped to the same number (label).

[1] https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/

In [2]:
# Zeek log files to process; this list is handed to both helper calls below.
chosen_files = [
    'conn.log',
    'dns.log',
    'ftp.log',
    'http.log',
    'ssh.log',
    'ssl.log',
]

# Parse the selected logs. The result is indexed by file name further down
# (e.g. orig_log_dataframes['conn.log']).
orig_log_dataframes = build_dataframes(chosen_files)

# Anonymised counterpart of the parsed logs, indexed the same way.
# NOTE(review): presumably leaves `orig_log_dataframes` unmodified -- confirm
# in helper_functions.
log_dataframes = anonymise_dataframes(orig_log_dataframes, chosen_files)

In [3]:



# Show the first record of every log before and after anonymisation.
#
# `chosen_files` is deliberately reused from the loading cell instead of being
# redefined here, so the preview always covers exactly the logs that were
# parsed and anonymised and the two lists cannot drift apart.
#
# `highlight_col` comes from helper_functions; per the notebook text it marks
# the PII columns in red. `axis=None` passes the whole frame to the styling
# function at once.
for key in chosen_files:
    for label, frames in (("Normal", orig_log_dataframes),
                          ("Anonymised", log_dataframes)):
        print("---- " + key + " ---- " + label)
        display(frames[key].iloc[:1, :].style.apply(highlight_col, axis=None))

---- conn.log ---- Normal


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,local_orig,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents
0,1424256987.747828,C8kfmaJvH8YSujzW5,59.166.0.2,57872,149.171.126.9,56104,tcp,-,0.027552,227.0,11587.0,SF,-,0,ShADadfF,42,2646,44,25470,(empty)


---- conn.log ---- Anonymised


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,local_orig,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents
0,1424256987.747828,C8kfmaJvH8YSujzW5,145.44.174.81,57872,107.156.75.183,56104,tcp,-,0.027552,227.0,11587.0,SF,-,0,ShADadfF,42,2646,44,25470,(empty)


---- dns.log ---- Normal


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,trans_id,query,qclass,qclass_name,qtype,qtype_name,rcode,rcode_name,AA,TC,RD,RA,Z,answers,TTLs,rejected
0,1424256988.447831,CcCQqj3E6KlwtFnRja,59.166.0.4,7745,149.171.126.2,53,udp,48100,server-95ab7e07.int,1.0,C_INTERNET,1.0,A,0.0,NOERROR,F,F,T,T,0,149.171.126.7,60.0,F


---- dns.log ---- Anonymised


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,trans_id,query,qclass,qclass_name,qtype,qtype_name,rcode,rcode_name,AA,TC,RD,RA,Z,answers,TTLs,rejected
0,1424256988.447831,CcCQqj3E6KlwtFnRja,145.44.174.144,7745,107.156.75.81,53,udp,48100,24,1.0,C_INTERNET,1.0,A,0.0,NOERROR,F,F,T,T,0,107.156.75.83,60.0,F


---- ftp.log ---- Normal


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,user,password,command,arg,mime_type,file_size,reply_code,reply_msg,data_channel.passive,data_channel.orig_h,data_channel.resp_h,data_channel.resp_p,fuid
0,1424256989.952859,CsfksnMVMnOYhPiB7,59.166.0.8,5146,149.171.126.3,21,anonymous,jobs@server.com,EPSV,-,-,,229,Extended Passive Mode OK (|||24196|),T,59.166.0.8,149.171.126.3,24196.0,-


---- ftp.log ---- Anonymised


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,user,password,command,arg,mime_type,file_size,reply_code,reply_msg,data_channel.passive,data_channel.orig_h,data_channel.resp_h,data_channel.resp_p,fuid
0,1424256989.952859,CsfksnMVMnOYhPiB7,145.44.174.33,5146,107.156.75.65,21,1,2,EPSV,0,-,,229,Extended Passive Mode OK (|||24196|),T,145.44.174.33,107.156.75.65,24196.0,-


---- http.log ---- Normal


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,trans_depth,method,host,uri,referrer,user_agent,request_body_len,response_body_len,status_code,status_msg,info_code,info_msg,filename,tags,username,password,proxied,orig_fuids,orig_mime_types,resp_fuids,resp_mime_types
0,1424256990.350022,CwgILB4FJZI9P9o4z1,59.166.0.1,41195,149.171.126.1,80,1,GET,Tracker,/announce?peer_id=-AR2621-949883860326&port=15836&uploaded=0&downloaded=0&left=8388610&compact=1&numwant=0&event=started&info_hash=\x1d]\xfb\xcc\x9f\xeb\xfckTfW\xe3e\xe8\xed\xa9 \x0f5\xf0,-,-,0,83,200.0,OK,,-,-,(empty),-,-,-,-,-,FdHIAGPzf33uaIXE9,text/plain


---- http.log ---- Anonymised


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,trans_depth,method,host,uri,referrer,user_agent,request_body_len,response_body_len,status_code,status_msg,info_code,info_msg,filename,tags,username,password,proxied,orig_fuids,orig_mime_types,resp_fuids,resp_mime_types
0,1424256990.350022,CwgILB4FJZI9P9o4z1,145.44.174.69,41195,107.156.75.69,80,1,GET,251,216,-,-,0,83,200.0,OK,,-,0,(empty),0,0,-,-,-,FdHIAGPzf33uaIXE9,text/plain


---- ssh.log ---- Normal


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,status,direction,client,server
0,1424256988.242291,ChBvzV2CEZnYgi9gaa,59.166.0.0,3778,149.171.126.2,22,success,INBOUND,SSH-2.0-PuTTY_Release_0.60,SSH-1.99-OpenSSH_4.3


---- ssh.log ---- Anonymised


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,status,direction,client,server
0,1424256988.242291,ChBvzV2CEZnYgi9gaa,145.44.174.5,3778,107.156.75.81,22,success,INBOUND,SSH-2.0-PuTTY_Release_0.60,SSH-1.99-OpenSSH_4.3


---- ssl.log ---- Normal


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,version,cipher,curve,server_name,session_id,last_alert,established,cert_chain_fuids,client_cert_chain_fuids,subject,issuer,client_subject,client_issuer
0,1424257005.940392,CkIFoc3DTlsBeaFW66,175.45.176.1,17478,149.171.126.12,443,TLSv10,TLS_RSA_WITH_3DES_EDE_CBC_SHA,-,-,08a79d45b747f5c828da00000000000000000000000000000000000000000000,-,F,-,-,-,-,-,-


---- ssl.log ---- Anonymised


Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,version,cipher,curve,server_name,session_id,last_alert,established,cert_chain_fuids,client_cert_chain_fuids,subject,issuer,client_subject,client_issuer
0,1424257005.940392,CkIFoc3DTlsBeaFW66,208.162.205.69,17478,107.156.75.208,443,TLSv10,TLS_RSA_WITH_3DES_EDE_CBC_SHA,-,0,08a79d45b747f5c828da00000000000000000000000000000000000000000000,-,F,-,-,0,0,0,0
