## Auditd to transactions

In [4]:
# Helper functions
import re
# Integer code assigned to each auditd record type name.  Entries are listed
# in ascending code order; every code from 0 to 24 is used exactly once.
type_lookup_table = {
    u'ANOM_ABEND': 0,
    u'SYSTEM_RUNLEVEL': 1,
    u'USER_LOGIN': 2,
    u'USER_CMD': 3,
    u'ADD_GROUP': 4,
    u'SYSCALL': 5,
    u'CRYPTO_KEY_USER': 6,
    u'DAEMON_START': 7,
    u'DAEMON_END': 8,
    u'USER_ACCT': 9,
    u'USER_AUTH': 10,
    u'USER_ERR': 11,
    u'ADD_USER': 12,
    u'CRED_DISP': 13,
    u'CRYPTO_SESSION': 14,
    u'USER_LOGOUT': 15,
    u'USER_START': 16,
    u'CRED_REFR': 17,
    u'SYSTEM_SHUTDOWN': 18,
    u'LOGIN': 19,
    u'CRED_ACQ': 20,
    u'USER_CHAUTHTOK': 21,
    u'NETFILTER_CFG': 22,
    u'USER_END': 23,
    u'CONFIG_CHANGE': 24,
}
def get_data(line, window_size=10, start_time=1422496861, type_codes=None):
    """Turn one auditd log line into a ``(window, type_code)`` pair.

    The timestamp is taken from the ``audit(<digits>.<digits>`` stamp and the
    record type from the ``type=<NAME> `` field of the line.

    Args:
        line: One raw log line.
        window_size: Width of a time window, in the same unit as the
            timestamp (presumably seconds -- confirm against the log source).
        start_time: Timestamp at which window 0 begins.
        type_codes: Optional mapping from record-type name to integer code.
            Defaults to the module-level ``type_lookup_table``.

    Returns:
        A tuple ``(window, type_code)``.

    Raises:
        ValueError: If the line has no audit stamp or no type field.
        KeyError: If the record type is missing from the mapping.
    """
    if type_codes is None:
        type_codes = type_lookup_table

    # Raw strings keep ``\(`` and ``\.`` as regex escapes; the dot is escaped
    # so that only a real decimal point separates seconds from the fraction.
    stamp = re.search(r'audit\(([0-9]+\.[0-9]+)', line)
    record_type = re.search(r'type=([A-Z_]+) ', line)
    if stamp is None or record_type is None:
        raise ValueError('not a parsable auditd line: %r' % (line,))

    timestamp = float(stamp.group(1))
    type_code = type_codes[record_type.group(1)]
    # NOTE: int() truncates toward zero, so timestamps less than one window
    # before start_time also land in window 0.
    window = int((timestamp - start_time) / window_size)
    return (window, type_code)
from collections import defaultdict
def get_longest_sets_possible(input_sets):
    """Return the maximal sets: those not contained in any other input set.

    Args:
        input_sets: Iterable of hashable sets (e.g. ``frozenset``).  They
            must be hashable because they are stored inside a ``set``.

    Returns:
        A ``set`` holding every input set that is not a subset of another
        input set.  Duplicates collapse to one entry.  An empty input gives
        an empty set.
    """
    maximal_sets = set()
    # Visit the largest sets first.  A set can only be contained in one that
    # is at least as large, and two distinct sets of equal size never contain
    # each other, so every possible superset is already in ``maximal_sets``
    # by the time a candidate is examined.
    for candidate in sorted(set(input_sets), key=len, reverse=True):
        if not any(candidate.issubset(kept) for kept in maximal_sets):
            maximal_sets.add(candidate)
    return maximal_sets

In [5]:
# Load Data
# Read the auditd log from HDFS and spread it over 10 partitions.
logs = sc.textFile("hdfs:///user/ytesfaye/lab41_logs_small.log.gz").repartition(10)
# One transaction per time window: the distinct type codes seen in that
# window.  The (window, codes) pair is indexed instead of being unpacked in
# the lambda signature, because tuple parameters are Python-2-only syntax
# (removed by PEP 3113); indexing behaves the same on Python 2 and 3.
transactions = logs.map(get_data) \
                   .groupByKey() \
                   .map(lambda window_and_codes: list(set(window_and_codes[1])))

# tBird Logs

In [10]:
# Load Data
# Read the tBird log file from HDFS and repartition it into 10 partitions.
tbird_logs = sc.textFile(
    "hdfs:///user/ytesfaye/tbird.log.out.logCluster.processed.gz"
).repartition(10)
def get_tbird_data(line, window_size=10, start_time=1131523501):
    """Parse a comma-separated tBird record into ``(window, type_code)``.

    The first field is read as a float timestamp and the second as an
    integer type code; any further fields are ignored.  The window is the
    number of ``window_size`` intervals between ``start_time`` and the
    timestamp, truncated toward zero by ``int()``.
    """
    fields = line.split(',')
    seconds, code = float(fields[0]), int(fields[1])
    elapsed = seconds - start_time
    return (int(elapsed / window_size), code)
# One transaction per time window: the distinct type codes seen in that
# window.  NOTE: this rebinds ``transactions``, replacing the auditd
# transactions built in the earlier cell.
# The (window, codes) pair is indexed instead of being unpacked in the lambda
# signature, because tuple parameters are Python-2-only syntax (removed by
# PEP 3113); indexing behaves the same on Python 2 and 3.
transactions = tbird_logs.map(get_tbird_data) \
                   .groupByKey() \
                   .map(lambda window_and_codes: list(set(window_and_codes[1])))

# Using MLlib

In [13]:
from pyspark.mllib.fpm import FPGrowth
# Train FP-Growth on the per-window transactions, then bring the frequent
# itemsets it found back to the driver as a local list.
model = FPGrowth.train(
    transactions,
    minSupport=0.2,
    numPartitions=10,
)
result = model.freqItemsets().collect()

In [14]:

# Keep only the itemsets that are not contained in another itemset, then
# print each one as '|'-joined ',<item>,' tokens in ascending numeric order.
items = [frozenset(fi.items) for fi in result]
pruned_items = list(get_longest_sets_possible(items))
for item in pruned_items:
    # A parenthesised single-argument print produces the same output as the
    # Python 2 print statement and is also valid on Python 3.
    print('|'.join(',' + str(i) + ',' for i in sorted(item, key=int)))

,-1,|,133,|,243,|,383,|,464,|,918,|,1033,|,1177,
,-1,|,89,|,242,|,244,|,918,|,1033,
,244,|,323,|,918,
,-1,|,89,|,133,|,243,|,244,|,383,|,918,
,-1,|,241,|,242,|,243,|,244,|,1033,|,1110,
,89,|,241,|,244,|,918,|,1110,
,-1,|,241,|,244,|,1033,|,1110,|,1177,
,-1,|,133,|,241,|,242,|,383,|,918,|,1177,
,-1,|,242,|,243,|,383,|,918,|,1177,
,-1,|,89,|,241,|,242,|,243,|,244,|,1033,
,-1,|,242,|,243,|,918,|,1178,
,-1,|,89,|,241,|,244,|,1033,|,1177,
,-1,|,89,|,241,|,242,|,243,|,244,|,1110,
,89,|,241,|,242,|,243,|,244,|,1033,|,1110,
,-1,|,241,|,323,|,918,
,-1,|,133,|,241,|,242,|,383,|,1177,|,1178,
,-1,|,242,|,323,|,918,
,-1,|,133,|,241,|,242,|,243,|,244,|,383,|,464,|,1033,|,1177,
,-1,|,918,|,1110,|,1177,
,89,|,242,|,244,|,918,|,1110,
,-1,|,133,|,241,|,243,|,383,|,918,|,1177,
,-1,|,133,|,242,|,243,|,383,|,464,|,1033,|,1178,
,241,|,243,|,323,|,918,
,-1,|,133,|,241,|,243,|,383,|,1177,|,1178,
,-1,|,242,|,243,|,244,|,1110,|,1177,|,1178,
,241,|,1112,
,-1,|,241,|,242,|,243,|,1033,|,1110,|,1177,
,-1,|,133,|,24