In [1]:
%run ./etl_trusted_features.ipynb

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/19 21:54:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/03/19 21:54:51 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


                                                                                

In [2]:
%pip install tensorflow --quiet
%pip install tensorflow_gnn --quiet
%pip install tensorflow_io --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import boto3
import numpy as np
import tensorflow as tf
import tensorflow_io as tfio
from tensorflow import keras
import tensorflow_gnn as tfgnn
from tensorflow_gnn import runner
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

2023-03-19 21:55:03.854453: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
def getFirstEventFiles():
    s3 = boto3.client('s3')
    response = s3.list_objects_v2(
        Bucket = 'sapient-bucket-trusted',
        Prefix = f'prod/graph/first_events')
    all_files = []
    for content in response.get('Contents', []):
        all_files.append(content['Key'])
        # print(content['Key'])
    files = [f"s3://sapient-bucket-trusted/" + f for f in all_files if 'parquet' in f]

    return files

In [5]:
# https://towardsdatascience.com/how-to-split-a-tensorflow-dataset-into-train-validation-and-test-sets-526c8dd29438
def get_dataset_partitions_tf(ds, ds_size, train_split=0.8, test_split=0.1, val_split=0.1, shuffle=True, shuffle_size=10000):
    assert (train_split + test_split + val_split) == 1
    
    if shuffle:
        # Specify seed to always have the same split distribution between runs
        ds = ds.shuffle(shuffle_size, seed=12)
    
    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)
    
    train_ds = ds.take(train_size)    
    val_ds = ds.skip(train_size).take(val_size)
    test_ds = ds.skip(train_size).skip(val_size)
    
    return train_ds, val_ds, test_ds

In [6]:
# c_col = len(df.columns)

In [7]:
# import pyarrow.parquet as pq
# import s3fs
# s3 = s3fs.S3FileSystem()

In [8]:
# pandas_dataframe = pq.ParquetDataset('s3://sapient-bucket-trusted/prod/graph/first_events', filesystem=s3).read_pandas().to_pandas()

In [9]:
# TENSORFLOW CONFIGURATION
TRAIN_SIZE = 640
SHUFFLE_BUFFER = 500
BATCH_SIZE = 32
AUTO = tf.data.experimental.AUTOTUNE
tf.config.threading.set_inter_op_parallelism_threads(16)
pd.set_option('display.max_columns', None)

In [10]:
nonnull_ecar_cols = [
    'id', 'objectID','actorID','object','action','hostname', 'image_path', 
    'parent_image_path', 'new_path', 'file_path', 'malicious'
            ]

In [11]:
lb = LabelBinarizer()
mlb = MultiLabelBinarizer(sparse_output=True)

In [12]:
readFirstEvents().withColumn("malicious", when(col('malicious') == 1, "malicious")
                                            .otherwise("benign")) \
                .groupBy("malicious").count().show()

                                                                                

 9:55PM UTC on Mar 19, 2023 --- read time: 8.180023193359375 seconds ---




+---------+------+
|malicious| count|
+---------+------+
|   benign|981985|
|malicious|  3642|
+---------+------+



                                                                                

In [67]:
df = readFirstEvents().select(*nonnull_ecar_cols).cache() \
                    .withColumn("malicious", when(col('malicious') == 1, "malicious")
                                            .otherwise("benign")) \
                    .withColumn('order', row_number().over(Window.partitionBy(lit('1')).orderBy(lit('1')))) \
                    .withColumn('actorUID', dense_rank().over(Window.partitionBy().orderBy('actorID'))-1).orderBy('order') \
                    .withColumn('objectUID', dense_rank().over(Window.partitionBy().orderBy('objectID'))-1).orderBy('order') \
                    .withColumn('UID', dense_rank().over(Window.partitionBy().orderBy('id'))-1).orderBy('order') \
                    .drop('order') \
                    .toPandas()

                                                                                

10:48PM UTC on Mar 19, 2023 --- read time: 8.354093074798584 seconds ---


                                                                                

In [17]:
# df.isna().sum()

In [18]:
# df = df.fillna('unknown')

In [19]:
df = df.fillna(0)

In [20]:
df.columns

Index(['id', 'objectID', 'actorID', 'object', 'action', 'hostname',
       'image_path', 'parent_image_path', 'new_path', 'file_path', 'malicious',
       'actorUID', 'objectUID'],
      dtype='object')

In [68]:
# check number representation worked
df[['actorUID','objectUID', 'UID']].head()

Unnamed: 0,actorUID,objectUID,UID
0,0,377215,351593
1,0,0,851309
2,1,24,722057
3,2,213836,219043
4,2,40,37008


In [71]:
num_nodes = len(set(df['actorUID']).union(set(df['objectUID'])))

In [69]:
num_edges = len(set(df['UID']))

In [72]:
num_node_features = len(set(df['parent_image_path']))

In [70]:
# num_edge_features = len( set(df['base_address']).union(set(df['file_path'])).union(set(df['image_path'])) )
num_edge_features = len(set(df['UID']))

In [73]:
node_feature_shape = (num_nodes, num_node_features)
edge_feature_shape = (num_edges, num_edge_features)

In [74]:
# Create a dictionary to store the node and edge features
node_features = {}
for row in df.itertuples():
    source_node = row.actorID
    target_node = row.objectID
    source_node_features = row.UID
    target_node_features = row.objectID
    
    # Add node features for source node
    if source_node not in node_features:
        node_features[source_node] = source_node_features
    
    # Add node features for target node
    if target_node not in node_features:
        node_features[target_node] = target_node_features

In [75]:
edges = []
for row in df.itertuples():
    source_node = row.actorID
    target_node = row.objectID
    edge_features = row.UID
    
    edges.append((source_node, target_node, edge_features))

In [77]:
graph_tensor = tfgnn.GraphTensor.from_pieces(
    node_sets = {
        "actorUID": tfgnn.NodeSet.from_fields(
            sizes = [len(df)],
            features ={
                'parent_image_path': np.array(df['parent_image_path'],
                                dtype='str').reshape(len(df),1)
            }),
        "objectID": tfgnn.NodeSet.from_fields(
            sizes = [len(df)],
            features ={
                'file_path': np.array(df['file_path'],
                                   dtype='str').reshape(len(df),1),
                'image_path': np.array(df['image_path'],
                                   dtype='str').reshape(len(df),1),
            })
    },
    edge_sets ={
        "id": tfgnn.EdgeSet.from_fields(
            sizes = [len(df)],
            features = {
                'id': np.array(df['UID'],
                                  dtype='str').reshape(len(df),1)},
            adjacency = tfgnn.Adjacency.from_indices(
                source = ("actorUID", np.array(df['actorUID'], dtype='int32')),
                target = ("objectUID", np.array(df['objectUID'], dtype='int32')))),
})

In [78]:
graph_tensor

GraphTensor(
  context=Context(features={}, sizes=[1], shape=(), indices_dtype=tf.int32),
  node_set_names=['actorUID', 'objectID'],
  edge_set_names=['id'])

In [None]:
df = readFirstEvents().withColumn("malicious", when(col('malicious') == 1, "malicious")
                                            .otherwise("benign")) \
                    .withColumn('order', row_number().over(Window.partitionBy(lit('1')).orderBy(lit('1')))) \
                    .withColumn('actorUID', dense_rank().over(Window.partitionBy().orderBy('actorID'))-1).orderBy('order') \
                    .withColumn('objectUID', dense_rank().over(Window.partitionBy().orderBy('objectID'))-1).orderBy('order') \
                    .drop('order') \
                    .toPandas()

In [82]:
df.count()

id                   985627
objectID             985627
actorID              985627
object               985627
action               985627
hostname             985627
image_path           985627
parent_image_path    985627
new_path             985627
file_path            985627
malicious            985627
actorUID             985627
objectUID            985627
UID                  985627
dtype: int64

In [83]:
df.isna().sum()

id                   0
objectID             0
actorID              0
object               0
action               0
hostname             0
image_path           0
parent_image_path    0
new_path             0
file_path            0
malicious            0
actorUID             0
objectUID            0
UID                  0
dtype: int64

In [81]:
df = df.fillna(0)

In [34]:
# Extract the source and destination indices
src_idx = df['actorUID'].tolist()
dst_idx = df['objectUID'].tolist()

In [35]:
# Extract the node features
node_features = df[['parent_image_path', 'file_path']].values

In [36]:
# Extract the edge features
edge_features = df[['id']].values

In [37]:
# Create the sparse adjacency matrix
num_nodes = len(set(src_idx + dst_idx))
indices = list(zip(src_idx, dst_idx))
values = [1] * len(indices)
adj_matrix = tf.sparse.SparseTensor(
    indices=indices,
    values=values,
    dense_shape=[num_nodes, num_nodes]
)

In [38]:
# Create the sparse tensor for the edge features
edge_feature_tensor = tf.SparseTensor(
    indices=indices,
    values=tf.reshape(edge_features, [-1]),
    dense_shape=[num_nodes, edge_features.shape[1]]
)

In [None]:
sparse_graph = adj_matrix + edge_feature_tensor

In [39]:
nodes = dict()
nodes_list = list(set(df['actorUID'].unique()) | set(df['objectUID'].unique()))
for i, node in enumerate(nodes_list):
    nodes[node] = i

In [40]:
edges = []
for index, row in df.iterrows():
    source = nodes[row['actorUID']]
    destination = nodes[row['objectUID']]
    features = row.id
    edges.append((source, destination, features))

In [41]:
indices = [(edge[0], edge[1]) for edge in edges]
values = [edge[2] for edge in edges]
dense_shape = (len(nodes), len(nodes))

In [42]:
sparse_tensor = tf.SparseTensor(
    indices=indices,
    values=values,
    dense_shape=dense_shape
)

In [45]:
# Create input data
inputs = [[edge[0], edge[1]] for edge in edges]

In [46]:
inputs = tf.ragged.constant(inputs)

In [51]:
event_count = inputs.shape[0]

In [66]:
# Create output data
# outputs = np.array([label])

In [None]:
# Reshape input and output data for LSTM
inputs = tf.expand_dims(inputs, axis=0)
outputs = tf.expand_dims(outputs, axis=0)

In [None]:
# Define LSTM model
model = keras.Sequential([
    keras.layers.LSTM(64),
    keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# Compile and fit model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(inputs, outputs, epochs=10, batch_size=1)

Data -> Sparse or Ragged Tensor -> Split -> Input -> Model

In [None]:
x = tf.keras.Input(shape=(4,), sparse=True)
y = tf.keras.layers.Dense(4)(x)

In [None]:
model = tf.keras.Model(x, y)

In [None]:
sparse_data = tf.sparse.SparseTensor(
    indices = [(0,0),(0,1),(0,2),
               (4,3),(5,0),(5,1)],
    values = [1,1,1,1,1,1],
    dense_shape = (6,4)
)

In [None]:
model(sparse_data)

In [None]:
model.predict(sparse_data)

In [22]:
data = tf.ragged.constant([ [[940, 203, 668, 387, 790, 320, 939, 185],[315, 515, 791, 181, 939, 787]], 
                             [[564, 205], [820, 180, 993, 739]] ]) 

2023-03-19 21:57:03.394898: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-19 21:57:03.400801: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-19 21:57:03.402201: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-19 21:57:03.404324: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

In [103]:
# https://stackoverflow.com/questions/70399367/sparse-feature-vector-sequence-as-input-to-lstm
X = keras.Input(shape=[inputs.shape[0], inputs.shape[1]], dtype=tf.int64, ragged=True)
l1 = keras.layers.Embedding(inputs.shape[0], 16)(X)
l2 = tf.reduce_sum(l1, axis=2) #To calculate the dense feature vector for each timestep.
l3 = keras.layers.LSTM(32, use_bias=False)(l2)
l4 = keras.layers.Dense(32, activation='sigmoid')(l3)
l5 = keras.layers.Activation(tf.nn.relu)(l4)
output = tf.keras.layers.Dense(1)(l5)



In [104]:
model = tf.keras.Model(X, output)
print(model.summary())
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "model_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_14 (InputLayer)       [(None, 985627, None)]    0         
                                                                 
 embedding_11 (Embedding)    (None, 985627, None, 16)  15770032  
                                                                 
 tf.math.reduce_sum_11 (TFOp  (None, 985627, 16)       0         
 Lambda)                                                         
                                                                 
 lstm_9 (LSTM)               (1, 32)                   6144      
                                                                 
 dense_17 (Dense)            (1, 32)                   1056      
                                                                 
 activation_7 (Activation)   (1, 32)                   0         
                                                          

In [105]:
print(model(data))  

tf.Tensor(
[[-0.26043302]
 [-0.26907024]], shape=(2, 1), dtype=float32)


In [91]:
model.predict(data)



array([[-0.9903194],
       [-0.9857288]], dtype=float32)

In [93]:
inputs.shape

TensorShape([985627, None])