## Auditd to transactions

In [1]:
# Helper functions
import re
# Integer code for each audit record type name that get_data can encounter.
# A type name missing from this table makes get_data raise KeyError.
type_lookup_table = {u'ADD_GROUP': 4,
 u'ADD_USER': 12,
 u'ANOM_ABEND': 0,
 u'CONFIG_CHANGE': 24,
 u'CRED_ACQ': 20,
 u'CRED_DISP': 13,
 u'CRED_REFR': 17,
 u'CRYPTO_KEY_USER': 6,
 u'CRYPTO_SESSION': 14,
 u'DAEMON_END': 8,
 u'DAEMON_START': 7,
 u'LOGIN': 19,
 u'NETFILTER_CFG': 22,
 u'SYSCALL': 5,
 u'SYSTEM_RUNLEVEL': 1,
 u'SYSTEM_SHUTDOWN': 18,
 u'USER_ACCT': 9,
 u'USER_AUTH': 10,
 u'USER_CHAUTHTOK': 21,
 u'USER_CMD': 3,
 u'USER_END': 23,
 u'USER_ERR': 11,
 u'USER_LOGIN': 2,
 u'USER_LOGOUT': 15,
 u'USER_START': 16}


def get_data(line, window_size=10, start_time=1422496861):
    """Map one audit log line to a ``(window, type_code)`` pair.

    Args:
        line: Log line containing ``type=<NAME> `` and
            ``audit(<seconds>.<fraction>``.
        window_size: Width of a time window, in the same unit as the
            timestamp.
        start_time: Timestamp that marks the start of window 0.

    Returns:
        Tuple ``(window, type_code)`` where ``window`` is the index of the
        time window the line falls in and ``type_code`` is the integer from
        ``type_lookup_table``.

    Raises:
        ValueError: If the line has no parseable timestamp or type field.
        KeyError: If the type name is not in ``type_lookup_table``.
    """
    # Raw strings keep the backslashes for the regex engine, and the escaped
    # dot only accepts a literal decimal point (a bare '.' matches any
    # character, e.g. the 'e' in "14e22", which float() would happily parse).
    timestamp_match = re.search(r'audit\(([0-9]+\.[0-9]+)', line)
    type_match = re.search(r'type=([A-Z_]+) ', line)
    if timestamp_match is None or type_match is None:
        raise ValueError('Not a parseable audit line: %r' % (line,))

    timestamp = float(timestamp_match.group(1))
    type_code = type_lookup_table[type_match.group(1)]
    # int() truncates toward zero, so the window index is a whole number.
    window = int((timestamp - start_time) / window_size)
    return (window, type_code)

from collections import defaultdict
def get_longest_sets_possible(input_sets):
    """Return only the maximal sets from ``input_sets``.

    A set is dropped when it is a subset of another set in the input, so the
    result contains just the sets that are not contained in any other.

    Args:
        input_sets: Iterable of hashable sets (e.g. ``frozenset``). May be
            empty.

    Returns:
        A ``set`` holding the maximal sets; empty when the input is empty.
    """
    maximal_sets = set()
    # Visit longest first: a set can only be contained in one that is at
    # least as long, so every possible superset is already in maximal_sets
    # by the time a candidate is checked. Duplicates are removed up front, so
    # two candidates of equal length can never be subsets of each other.
    for candidate in sorted(set(input_sets), key=len, reverse=True):
        if not any(candidate.issubset(kept) for kept in maximal_sets):
            maximal_sets.add(candidate)
    return maximal_sets

In [7]:
# Load Data
#logs = sc.textFile("hdfs:///user/ytesfaye/lab41_logs_small.log.gz").repartition(10)
#transactions = logs.map(get_data) \
#                   .groupByKey() \
#                   .map(lambda (key, iterator): list(set(iterator)))

In [7]:
# Read the preprocessed Thunderbird log as an RDD of text lines, spread over 10 partitions.
# NOTE(review): the "tBird Logs" cell further down repeats this exact load.
tbird_logs = sc.textFile("hdfs://l41-srv-mcdh01.b.internal/magichour/tbird.log.preProc.stringmatch").repartition(10)

In [8]:
# Sanity check: number of lines loaded.
tbird_logs.count()

3617951

In [9]:
# Peek at the first two lines to see the record layout.
tbird_logs.take(2)

[u'1131944192.0,3080,USER INT AFILE [ INT ]: [ FILEANDLINE ]: Failed discover node test, node INT ad INT b INT , KEYVALUE INT , LEVEL code INT',
 u'1131944334.0,3080,USER INT AFILE [ INT ]: [ FILEANDLINE ]: Failed discover node test, node INT ad INT b INT , KEYVALUE INT , LEVEL code INT']

# tBird Logs

In [11]:
# Load Data
# Read the preprocessed Thunderbird log as an RDD of text lines, spread over 10 partitions.
# get_tbird_data below reads the first comma-separated field of each line as a
# timestamp and the second as an integer code.
# NOTE(review): this repeats the load done in an earlier cell; one of the two is redundant.
# The commented-out lines are alternative inputs that are currently disabled.
#tbird_logs = sc.textFile("hdfs://l41-srv-mcdh01.b.internal/user/ytesfaye/tbird.log.out.logCluster.processed.gz").repartition(10)
tbird_logs = sc.textFile("hdfs://l41-srv-mcdh01.b.internal/magichour/tbird.log.preProc.stringmatch").repartition(10)
#transactionsNoFreqs = sc.textFile("hdfs://l41-srv-mcdh01.b.internal/magichour/tbird.log.out.logCluster.processed.10sec.50000freqnew")

def get_tbird_data(line, window_size=10):
    """Turn one comma-separated log line into a ``(window, type_code)`` pair.

    Args:
        line: Text whose first field is a numeric timestamp and whose second
            field is an integer code; anything after that is ignored.
        window_size: Width of a time window, in the timestamp's unit.

    Returns:
        Tuple ``(window, type_code)``, where ``window`` is the timestamp
        divided by ``window_size`` and truncated to an int.
    """
    # Only the first two fields are needed; the remainder stays unsplit.
    fields = line.split(',', 2)
    timestamp, type_code = float(fields[0]), int(fields[1])
    return (int(timestamp / window_size), type_code)

# Build one transaction per time window: the distinct codes seen in that
# window. Code -1 is dropped (presumably lines that matched no template --
# TODO confirm), and windows left with fewer than two distinct codes are
# discarded. The window key itself is not kept.
# The lambda takes the (key, values) pair as a single argument and indexes it,
# because tuple-unpacking parameters only exist in Python 2.
transactions = tbird_logs.map(get_tbird_data) \
                   .groupByKey() \
                   .map(lambda kv: list(set(item for item in kv[1] if item != -1))) \
                   .filter(lambda x: len(x) >= 2)

#transactions = transactionsNoFreqs.map(get_tbird_data)

# Load lookup table so that we can get back to raw strings.
# Each line is split on commas into at most three pieces: the first is used as
# the integer template id and the second as the text stored for that id.
# NOTE(review): with maxsplit=2, anything after a second comma lands in ls[2]
# and is not stored -- confirm the file never has commas inside the template text.
template_lookup = {}
for line in sc.textFile("hdfs://l41-srv-mcdh01.b.internal/user/ytesfaye/tmp.txt").collect():
    ls = line.split(',', 2)
    template_lookup[int(ls[0])] = ls[1]
# One more than the largest template id; make_vector uses it as its default vector size.
dimension = max(template_lookup.keys()) + 1

# MLlib LDA

In [24]:
from pyspark.mllib.linalg import Vectors, SparseVector
import numpy as np
from scipy.sparse import lil_matrix 
def make_vector(input_list, dimension=dimension):
    """Convert an ``(items, key)`` pair into ``[key, SparseVector]``.

    Args:
        input_list: Pair whose first element is a collection of integer
            indices and whose second element is the identifier to carry
            along (the shape ``zipWithUniqueId`` produces).
        dimension: Size of the sparse vector. The default is captured from
            the notebook-level ``dimension`` when this function is defined.

    Returns:
        Two-element list ``[key, vector]`` where ``vector`` has value 1.0 at
        every index listed in the items.
    """
    template_ids, doc_id = input_list
    indices = sorted(template_ids)
    weights = np.ones(len(template_ids))
    return [doc_id, SparseVector(dimension, indices, weights)]

In [25]:
# Pair every transaction with a unique id and turn it into [id, SparseVector] via make_vector.
# NOTE(review): the len(x) >= 2 filter repeats the one already applied where `transactions` is built.
vectorized_transactions = transactions.filter(lambda x: len(x) >= 2).zipWithUniqueId().map(make_vector)

In [30]:
from pyspark.mllib.clustering import LDA
# Fit LDA with 5 topics on the vectorized transactions; the fixed seed keeps runs repeatable.
# NOTE(review): the FP-Growth section assigns `model` as well, so which model
# this name holds depends on the order the cells were run.
model = LDA.train(vectorized_transactions, k=5, seed=1)

In [40]:
# Fetch the topics matrix from the trained model.
# NOTE(review): `topics` is not referenced by any later cell visible here.
topics = model.topicsMatrix()

In [41]:
# Print the highest-weighted templates of every topic, one block per topic.
max_topics = 20  # NOTE(review): not used anywhere in this cell
num_words_per_topic = 5
for topic_num, (ids, weights) in enumerate(model.describeTopics(num_words_per_topic)):
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('Topic %d' % topic_num)
    print('---------------------')
    for n, weight in zip(ids, weights):
        # Columns: template id, weight as a percentage, template text.
        print('%4d (%2.2f): %s' % (n, weight * 100.0, template_lookup[n]))

Topic 0
---------------------
 241 (7.46): tbird-admin INT FILEPATH data_thread got not answer from any thunderbird_a INT datasource
 242 (7.21): tbird-admin INT FILEPATH data_thread got not answer from any thunderbird_c INT datasource
 243 (7.20): tbird-admin INT FILEPATH data_thread got not answer from any thunderbird_d INT datasource
 244 (7.00): tbird-admin INT FILEPATH data_thread got not answer from any thunderbird_b INT datasource
1110 (5.55): dn INT ntpd INT synchronized to IPADDR stratum INT
Topic 1
---------------------
 241 (7.08): tbird-admin INT FILEPATH data_thread got not answer from any thunderbird_a INT datasource
 242 (6.88): tbird-admin INT FILEPATH data_thread got not answer from any thunderbird_c INT datasource
 243 (6.86): tbird-admin INT FILEPATH data_thread got not answer from any thunderbird_d INT datasource
 244 (6.61): tbird-admin INT FILEPATH data_thread got not answer from any thunderbird_b INT datasource
1110 (5.30): dn INT ntpd INT synchronized to IPADDR 

# Using MLlib FP-Growth

In [23]:
from pyspark.mllib.fpm import FPGrowth
# Mine frequent itemsets from the transactions with a minimum support of 0.01,
# then bring all of them back to the driver.
# NOTE(review): this rebinds `model`, which the LDA section also assigns.
model = FPGrowth.train(transactions, minSupport=0.01, numPartitions=10)
result = model.freqItemsets().collect()

In [24]:
# Freeze each frequent itemset so it is hashable, then keep only the itemsets
# that are not contained in another one.
items = [frozenset(fi.items) for fi in result]
pruned_items = list(get_longest_sets_possible(items))

In [25]:
# Number of itemsets left after pruning.
len(pruned_items)

389

In [26]:
# Print each pruned itemset on its own line, ids in ascending numeric order
# and each id padded with a space on both sides.
for item in pruned_items:
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(''.join(' ' + str(i) + ' ' for i in sorted(item, key=int)))
    

#for i in range(len(pruned_items)):
    #print '---------------------'
    #for template in pruned_items[i]:
        #print '%4d'%template, template_lookup[template]

 373  375  377  381  386  390  544  563 
 19  373  375  544  3049 
 27  373  375  377  381  2246  2373  2386  2388  2390  2392 
 36  48  94  373  375  2317  2350  3042 
 94  373  375  377  381  390 
 19  373  375  377  563  3049 
 10  373  834  1959  1962  4603 
 19  94  2248  2261  2317  2350  3049 
 19  36  48  375  3049  3066  3090  3095  5179 
 10  36  48  373  2317  3042 
 15  19  373  377  3049  7191 
 36  48  375  377  3051  5174 
 375  2127  2129 
 10  94  563 
 10  375  834  1959  1962  4603 
 8  94  373  375  381 
 36  48  377  381  3051  5174 
 27  94  373  381  2246  2373  2386  2388  2390  2392 
 8  19  381  2191  3049 
 19  373  386  390  544  3049 
 10  94  373  381  2317  2350  3042  3051  3076  3090  3095  5174  7211  7240  7242 
 10  19  563  3049 
 373  375  377  834  868  1959  1962  1966  1971  4603  4623 
 377  2127  2129 
 19  27  377  2246  2373  2386  2388  2390  2392 
 94  381  386  390  544 
 2  373  375  377 
 15  19  48  94  373  3049 
 381  1976 
 19  36  

In [7]:
# Rebuild the pruned itemsets and print each one on its own line, with every
# id wrapped in commas and the ids separated by '|', in ascending numeric order.
# NOTE(review): the first two lines repeat an earlier cell in this section.
items = [frozenset(fi.items) for fi in result]
pruned_items = list(get_longest_sets_possible(items))
for item in pruned_items:
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('|'.join(',' + str(i) + ',' for i in sorted(item, key=int)))

,918,|,1033,|,1110,|,1177,|,1178,
,78,|,89,|,92,|,241,|,242,|,243,|,383,|,411,|,918,
,241,|,243,|,244,|,1033,|,1112,
,241,|,242,|,244,|,1110,|,1111,
,78,|,92,|,101,|,226,|,383,|,411,|,918,
,412,|,1110,
,78,|,92,|,102,|,242,|,243,|,383,|,411,
,78,|,89,|,92,|,241,|,244,|,383,|,411,|,918,
,78,|,79,|,92,|,93,|,101,|,133,|,243,|,383,|,411,|,412,|,464,|,1033,
,89,|,241,|,1111,
,89,|,242,|,243,|,1033,|,1110,|,1177,|,1178,
,241,|,242,|,243,|,244,|,666,|,1033,|,1110,
,78,|,92,|,101,|,133,|,241,|,243,|,383,|,411,|,1177,
,78,|,89,|,92,|,241,|,242,|,244,|,411,|,918,
,78,|,79,|,92,|,93,|,411,|,1110,
,78,|,79,|,92,|,93,|,241,|,243,|,244,|,411,|,412,
,78,|,92,|,133,|,242,|,244,|,383,|,411,|,464,|,1033,|,1110,
,89,|,133,|,243,|,244,|,323,|,383,|,464,|,918,|,1177,
,133,|,383,|,918,|,1110,|,1177,|,1178,
,78,|,92,|,133,|,241,|,244,|,383,|,411,|,464,|,1033,|,1177,
,78,|,79,|,92,|,93,|,241,|,242,|,243,|,244,|,411,
,241,|,1033,|,1111,
,78,|,89,|,92,|,133,|,241,|,243,|,383,|,411,|,464,|,918,|,1033,
,244,|,32