In [12]:
import time

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import array, col, expr, first, monotonically_increasing_id
from pyspark.sql.functions import udf
from pyspark.sql.types import *
#from graphframes import GraphFrame

In [13]:
#function to generate a graphframe
def create_graph(df):
    """Build a GraphFrame from an event DataFrame and return its six-edge chains.

    Vertices are the distinct values of ``actorID`` and ``objectID``; every
    input row becomes one edge from ``actorID`` to ``objectID`` that carries
    the event attributes selected below.

    Args:
        df: Spark DataFrame holding the event columns referenced in the edge
            selection (``actorID``, ``objectID``, ``timestamp``, ...).

    Returns:
        A cached DataFrame of motif matches for a chain of six consecutive
        edges (``e1`` .. ``e6``) through vertices ``a`` .. ``g``.

    Side effects:
        Prints the number of matched chains; the ``count()`` call is an action,
        so it also materialises the cached result.
    """
    # Vertex set: every id that appears as an actor or as an object.
    actor_ids = df.selectExpr('actorID as id').distinct()
    object_ids = df.selectExpr('objectID as id').distinct()
    vertices = actor_ids.union(object_ids).distinct()

    # Edge set: one edge per event, keeping the event attributes on the edge.
    edge_exprs = [
        'actorID as src', 'objectID as dst', 'timestamp', 'object', 'action',
        'hostname', 'user_name', 'privileges', 'image_path',
        'parent_image_path', 'new_path', 'file_path', 'direction', 'logon_id',
        'requesting_domain', 'requesting_user', 'malicious',
    ]
    edges = df.selectExpr(*edge_exprs)

    graph = GraphFrame(vertices, edges)

    # Motif describing a path of six edges through seven named vertices.
    chain_pattern = "(a)-[e1]->(b); (b)-[e2]->(c); (c)-[e3]->(d); (d)-[e4]->(e); (e)-[e5]->(f); (f)-[e6]->(g)"
    chains = graph.find(chain_pattern)

    # Keep only chains in which each edge starts where the previous one ended.
    adjacency = "e1.dst = e2.src and e2.dst = e3.src and e3.dst = e4.src and e4.dst = e5.src and e5.dst = e6.src"
    connected_paths = chains.filter(adjacency).cache()

    trace_count = connected_paths.count()
    print("event traces: "+str(trace_count))

    return connected_paths

In [14]:
#function to accept an event trace data frame and then encode it. Ideally we run this twice, once on the malicious
#traces and again on benign traces. This implies that we run graphframes on both sets of events independently. 

#TODO: extract only the extensions, or declare file path an attribute 

def oneHotCol(df, colm, dict_mapping, cols_sparse):
    """One-hot encode a single string column and record how it was indexed.

    The column is first mapped to a numeric index with ``StringIndexer`` and
    that index is then expanded by ``OneHotEncoder`` into ``<colm>_sparse``.
    Both the source column and the intermediate ``<colm>_numeric`` column are
    removed from the returned DataFrame.

    Args:
        df: Spark DataFrame containing ``colm``.
        colm: Name of the column to encode.
        dict_mapping: Dict updated in place with ``colm -> fitted labels``.
        cols_sparse: List updated in place with the name of the new column.

    Returns:
        Tuple ``(encoded DataFrame, dict_mapping, cols_sparse)``; the last two
        are the same objects that were passed in.
    """
    index_col = colm+'_numeric'
    onehot_col = colm+'_sparse'

    # String -> numeric index; handleInvalid="keep" is passed so that values
    # the indexer treats as invalid are indexed instead of raising.
    index_model = StringIndexer(
        inputCol=colm, outputCol=index_col, handleInvalid="keep"
    ).fit(df)
    indexed = index_model.transform(df)

    # Numeric index -> one-hot vector, keeping a slot for every category.
    onehot_model = OneHotEncoder(
        inputCols=[index_col], outputCols=[onehot_col], dropLast=False
    ).fit(indexed)
    encoded = onehot_model.transform(indexed).drop(colm, index_col)

    # Remember the new column and the label order used by the indexer.
    cols_sparse.append(onehot_col)
    dict_mapping[colm] = index_model.labels

    return encoded, dict_mapping, cols_sparse


In [15]:
#udf functions

# define a user-defined function to convert binary int array to string array
def binary_to_string_array(binary_int_array):
    """Render an iterable of numbers as one string of their integer parts.

    Each element is truncated with ``int`` and converted with ``str``; the
    pieces are concatenated without a separator, so ``[1.0, 0.0, 1.0]``
    becomes ``'101'``. Despite the name, the result is a single string, not
    an array.
    """
    return ''.join(str(int(value)) for value in binary_int_array)

# Wrap binary_to_string_array as a Spark UDF declared to return StringType,
# so it can be applied to a DataFrame column.
binary_to_string_array_udf = udf(binary_to_string_array, StringType())

def int_cast(num):
    """Convert a value to a Python ``int``, passing ``None`` through.

    Args:
        num: Anything accepted by ``int()``, or ``None``.

    Returns:
        ``int(num)``, or ``None`` when ``num`` is ``None``.
    """
    # This function is wrapped as a Spark UDF; a SQL NULL arrives as None and
    # int(None) would raise TypeError, so map it to None instead.
    if num is None:
        return None
    return int(num)
# Wrap int_cast as a Spark UDF declared to return IntegerType.
int_cast_udf = udf(int_cast, IntegerType())


In [17]:
def _pivot_event_sequence(df_events, value_col, edge_names):
    """Pivot one-row-per-event data back to one row per trace.

    Args:
        df_events: DataFrame with columns ``Trace``, ``event``, ``malicious``
            and ``value_col``.
        value_col: Column whose per-event value forms the output sequence.
        edge_names: Ordered event names; they become the pivot values and fix
            the order of ``event_sequence``.

    Returns:
        DataFrame with columns ``Trace``, ``malicious`` (the value found on the
        first event of the trace) and ``event_sequence`` (array of
        ``value_col`` in ``edge_names`` order).
    """
    pivoted = (
        df_events
        .select('Trace', 'event', 'malicious', value_col)
        .groupBy('Trace')
        # Explicit pivot values avoid an extra job to discover them.
        .pivot('event', list(edge_names))
        # Aliased aggregates give predictable names: <event>_malicious, <event>_value.
        .agg(first('malicious').alias('malicious'),
             first(value_col).alias('value'))
    )
    return pivoted.select(
        'Trace',
        col(edge_names[0] + '_malicious').alias('malicious'),
        array(*[name + '_value' for name in edge_names]).alias('event_sequence'),
    )


def trace_encode(df, list_cols, output = 'ind'):
    """Encode each six-event trace as a sequence of event codes.

    Every row of ``df`` is treated as one trace. Its edge columns ``e1`` ..
    ``e6`` are unpivoted to one row per event, the columns in ``list_cols`` are
    one-hot encoded with ``oneHotCol``, the encodings are assembled into a
    single vector and rendered as a string, and the events are finally pivoted
    back so that each trace carries an ``event_sequence`` array.

    Args:
        df: DataFrame with struct columns ``e1`` .. ``e6`` (for example the
            result of ``create_graph``). Each struct must contain
            ``malicious`` and every field named in ``list_cols``.
        list_cols: Names of the edge fields to one-hot encode.
        output: ``'vec'`` puts the encoded string of each event in the
            sequence; any other value (default ``'ind'``) puts an integer
            index obtained by running ``StringIndexer`` over those strings.

    Returns:
        Tuple ``(traces, dict_mapping)``. ``traces`` has columns ``Trace``,
        ``malicious`` (taken from event ``e1`` only) and ``event_sequence``.
        ``dict_mapping`` maps each encoded column to the labels of its fitted
        indexer.

    Side effects:
        Prints elapsed wall-clock times. DataFrame transformations are lazy,
        so these mostly reflect the estimator ``fit`` calls made here.
    """
    edge_names = ['e1', 'e2', 'e3', 'e4', 'e5', 'e6']

    start_time = time.time()

    # Tag every trace with an id so its events can be regrouped after the explode.
    df_traces = df.withColumn("Trace", (monotonically_increasing_id() + 1))

    # Map the posexplode position back to the edge name (last name is the ELSE arm).
    when_arms = " ".join(
        "WHEN {} THEN '{}'".format(pos, name)
        for pos, name in enumerate(edge_names[:-1])
    )
    event_name = expr("CASE pos {} ELSE '{}' END".format(when_arms, edge_names[-1]))

    # One row per (trace, event), with the edge struct flattened into columns.
    # No ordering is applied: the later pivot regroups rows by Trace and event.
    df_onehot = (
        df_traces
        .selectExpr(
            "Trace",
            "posexplode(array({})) as (pos, col)".format(", ".join(edge_names)),
        )
        .select("Trace", event_name.alias("event"), "col.*")
    )

    dict_mapping = {}   # column name -> labels of its fitted StringIndexer
    list_sparse = []    # names of the one-hot columns produced below

    print("transposed explode: "+ str(time.time() - start_time))

    # One-hot encode every requested column, keeping the label mappings.
    for colm in list_cols:
        df_onehot, dict_mapping, list_sparse = oneHotCol(df_onehot, colm, dict_mapping, list_sparse)

    # Concatenate all one-hot vectors into a single feature vector per event.
    assembler = VectorAssembler(inputCols=list_sparse, outputCol="final_vec")
    df_onehot = assembler.transform(df_onehot)

    # Render the vector as a digit string so identical events compare equal.
    df_onehot = df_onehot.withColumn("vec2string", binary_to_string_array_udf("final_vec"))

    print("one-hot time: "+ str(time.time() - start_time))

    if output == 'vec':
        value_col = 'vec2string'
    else:
        # Replace each distinct event string by an integer index.
        indexer = StringIndexer(inputCol='vec2string', outputCol='event_ind')
        df_onehot = indexer.fit(df_onehot).transform(df_onehot)

        # StringIndexer emits doubles; a native cast avoids a Python UDF round trip.
        df_onehot = df_onehot.withColumn("event_index", col("event_ind").cast("int"))

        print("indexing time: "+ str(time.time() - start_time))
        value_col = 'event_index'

    df_onehot = _pivot_event_sequence(df_onehot, value_col, edge_names)

    print("total elapsed time: "+ str(time.time() - start_time))

    return df_onehot,dict_mapping