In [2]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

In [3]:
# Function to accept an event-trace DataFrame and then encode it. Ideally we run this twice: once on the malicious
# traces and again on the benign traces. This implies that we run GraphFrames on both sets of events independently.

# TODO: extract only the file extensions, or declare the file path as an attribute

def oneHotCol(df, colm, dict_mapping):
    """One-hot encode a single column of a Spark DataFrame.

    The column is first converted to numeric indices with a
    ``StringIndexer`` (``handleInvalid="keep"``) and then encoded with a
    ``OneHotEncoder`` (``dropLast=False``) into a new column named
    ``<colm>_sparse``. The original column and the intermediate
    ``<colm>_numeric`` column are dropped from the returned DataFrame.

    Args:
        df: DataFrame that contains the column ``colm``.
        colm: Name of the column to encode.
        dict_mapping: Dict that is updated in place; ``dict_mapping[colm]``
            is set to the ``labels`` of the fitted indexer.

    Returns:
        A tuple ``(encoded_df, dict_mapping)``.
    """
    index_col = colm + '_numeric'
    onehot_col = colm + '_sparse'

    # The values have to become numeric indices before they can be encoded.
    index_model = StringIndexer(
        inputCol=colm,
        outputCol=index_col,
        handleInvalid="keep",
    ).fit(df)
    indexed = index_model.transform(df)

    onehot_model = OneHotEncoder(
        inputCols=[index_col],
        outputCols=[onehot_col],
        dropLast=False,
    ).fit(indexed)

    # Keep only the encoded vector; the raw and indexed columns are removed.
    encoded = onehot_model.transform(indexed).drop(colm, index_col)

    # Record the indexer labels so the encoding can be mapped back later.
    dict_mapping[colm] = index_model.labels

    return encoded, dict_mapping




In [6]:
def trace_encode(df, list_cols):
    """Reshape an event-trace DataFrame to one row per event and one-hot encode it.

    Steps performed:
      1. Add a ``Trace`` column (``monotonically_increasing_id() + 1``) so each
         input row can be identified after reshaping, and move it to the front.
      2. Drop the vertex columns ``a`` .. ``g``.
      3. Turn the event columns ``e1`` .. ``e6`` into rows, labelled by an
         ``event`` column, ordered by ``Trace`` and ``event``.
      4. Expand the fields of each event (``col.*``) into top-level columns.
      5. One-hot encode every column named in ``list_cols`` via ``oneHotCol``.

    The elapsed wall-clock time is printed before returning.

    Args:
        df: Input DataFrame; the code references columns ``e1`` .. ``e6``
            (NOTE(review): presumably struct columns, since ``col.*`` is
            selected from them — confirm against the caller).
        list_cols: Iterable of column names to one-hot encode.

    Returns:
        A tuple ``(encoded_df, dict_mapping)`` where ``dict_mapping`` maps each
        encoded column name to the labels recorded by ``oneHotCol``.
    """
    started_at = time.time()

    # Tag every row with a trace id; needed to transpose back later.
    tagged = df.withColumn("Trace", (monotonically_increasing_id() + 1))
    remaining = [name for name in tagged.columns if name != "Trace"]

    # Put "Trace" first, then discard all vertex columns.
    events_only = tagged.select("Trace", *remaining).drop(
        'a', 'b', 'c', 'd', 'e', 'f', 'g'
    )

    # Translate the position produced by posexplode back into the event name.
    event_label = expr('''CASE pos 
        WHEN 0 THEN 'e1' 
        WHEN 1 THEN 'e2'
        WHEN 2 THEN 'e3'
        WHEN 3 THEN 'e4'
        WHEN 4 THEN 'e5'
        ELSE 'e6' END''').alias("event")

    # Transpose: one output row per (trace, event) pair.
    exploded = events_only.selectExpr(
        "Trace",
        "posexplode(array(e1, e2, e3, e4, e5, e6)) as (pos, col)",
    )
    long_df = exploded.select("Trace", event_label, "col")
    long_df = long_df.orderBy("Trace", "event")

    # Expand the fields of each event into their own columns.
    long_df = long_df.select(*long_df.columns, "col.*").drop('col')

    # Encode the requested columns one at a time, collecting the label mapping.
    label_maps = {}
    encoded = long_df
    for name in list_cols:
        encoded, label_maps = oneHotCol(encoded, name, label_maps)

    print(f"elapsed time: {time.time() - started_at}")

    return encoded, label_maps