In [1]:
%run ./etl_trusted_features.ipynb

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/19 19:02:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/03/19 19:02:47 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/03/19 19:02:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


                                                                                

In [2]:
%pip install tensorflow --quiet
%pip install tensorflow_gnn --quiet
%pip install tensorflow_io --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import boto3
import numpy as np
import tensorflow as tf
import tensorflow_io as tfio
from tensorflow import keras
import tensorflow_gnn as tfgnn
from tensorflow_gnn import runner
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

2023-03-19 19:03:00.303302: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<span style="color:red">**<<<<<<< local**</span>

In [4]:
def getFirstEventFiles():
    s3 = boto3.client('s3')
    response = s3.list_objects_v2(
        Bucket = 'sapient-bucket-trusted',
        Prefix = f'prod/graph/first_events')
    all_files = []
    for content in response.get('Contents', []):
        all_files.append(content['Key'])
        # print(content['Key'])
    files = [f"s3://sapient-bucket-trusted/" + f for f in all_files if 'parquet' in f]

    return files

In [5]:
# https://towardsdatascience.com/how-to-split-a-tensorflow-dataset-into-train-validation-and-test-sets-526c8dd29438
def get_dataset_partitions_tf(ds, ds_size, train_split=0.8, test_split=0.1, val_split=0.1, shuffle=True, shuffle_size=10000):
    assert (train_split + test_split + val_split) == 1
    
    if shuffle:
        # Specify seed to always have the same split distribution between runs
        ds = ds.shuffle(shuffle_size, seed=12)
    
    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)
    
    train_ds = ds.take(train_size)    
    val_ds = ds.skip(train_size).take(val_size)
    test_ds = ds.skip(train_size).skip(val_size)
    
    return train_ds, val_ds, test_ds

In [6]:
# c_col = len(df.columns)

In [7]:
# import pyarrow.parquet as pq
# import s3fs
# s3 = s3fs.S3FileSystem()

In [8]:
# pandas_dataframe = pq.ParquetDataset('s3://sapient-bucket-trusted/prod/graph/first_events', filesystem=s3).read_pandas().to_pandas()

In [9]:
# TENSORFLOW CONFIGURATION
TRAIN_SIZE = 640
SHUFFLE_BUFFER = 500
BATCH_SIZE = 32
AUTO = tf.data.experimental.AUTOTUNE
tf.config.threading.set_inter_op_parallelism_threads(16)
pd.set_option('display.max_columns', None)

In [10]:
nonnull_ecar_cols = [
    'id', 'objectID','actorID','object','action','hostname', 'image_path', 
    'parent_image_path', 'new_path', 'file_path', 'malicious'
            ]

In [11]:
lb = LabelBinarizer()
mlb = MultiLabelBinarizer(sparse_output=True)

In [12]:
readFirstEvents().withColumn("malicious", when(col('malicious') == 1, "malicious")
                                            .otherwise("benign")) \
                .groupBy("malicious").count().show()

                                                                                

 7:03PM UTC on Mar 19, 2023 --- read time: 10.076992988586426 seconds ---




+---------+------+
|malicious| count|
+---------+------+
|   benign|981985|
|malicious|  3642|
+---------+------+



                                                                                

In [13]:
df = readFirstEvents().select(*nonnull_ecar_cols).cache() \
                    .withColumn("malicious", when(col('malicious') == 1, "malicious")
                                            .otherwise("benign")) \
                    .withColumn('order', row_number().over(Window.partitionBy(lit('1')).orderBy(lit('1')))) \
                    .withColumn('actorUID', dense_rank().over(Window.partitionBy().orderBy('actorID'))-1).orderBy('order') \
                    .withColumn('objectUID', dense_rank().over(Window.partitionBy().orderBy('objectID'))-1).orderBy('order') \
                    .withColumn('id', dense_rank().over(Window.partitionBy().orderBy('objectID'))-1).orderBy('order') \
                    .drop('order') \
                    .toPandas()

                                                                                

 7:03PM UTC on Mar 19, 2023 --- read time: 5.300315856933594 seconds ---


                                                                                

In [14]:
df.isna().sum()

id                        0
objectID                  0
actorID                   0
object                    0
action                    0
hostname                  0
image_path            95767
parent_image_path    739191
new_path             980621
file_path            345123
malicious                 0
actorUID                  0
objectUID                 0
dtype: int64

In [15]:
df = df.fillna(0)

In [16]:
df.columns

Index(['id', 'objectID', 'actorID', 'object', 'action', 'hostname',
       'image_path', 'parent_image_path', 'new_path', 'file_path', 'malicious',
       'actorUID', 'objectUID'],
      dtype='object')

In [17]:
# check number representation worked
df[['actorUID','objectUID']].head()

Unnamed: 0,actorUID,objectUID
0,0,377215
1,0,0
2,1,24
3,2,213836
4,2,40


In [18]:
# Extract the source and destination indices
src_idx = df['actorUID'].tolist()
dst_idx = df['objectUID'].tolist()

In [19]:
# Extract the node features
node_features = df[['parent_image_path', 'file_path']].values

In [20]:
# Extract the edge features
edge_features = df[['id']].values

In [21]:
# Create the sparse adjacency matrix
num_nodes = len(set(src_idx + dst_idx))
indices = list(zip(src_idx, dst_idx))
values = [1] * len(indices)
adj_matrix = tf.sparse.SparseTensor(
    indices=indices,
    values=values,
    dense_shape=[num_nodes, num_nodes]
)

2023-03-19 19:03:54.443199: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-19 19:03:54.446823: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-19 19:03:54.447443: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-03-19 19:03:54.448270: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

In [22]:
# Create the sparse tensor for the edge features
edge_feature_tensor = tf.SparseTensor(
    indices=indices,
    values=tf.reshape(edge_features, [-1]),
    dense_shape=[num_nodes, edge_features.shape[1]]
)

In [4]:
graph_schema = tfgnn.read_schema("./graph_schema.pbtxt")
gtspec = tfgnn.create_graph_spec_from_schema_pb(graph_schema)

In [5]:
gtspec

GraphTensorSpec({'context': ContextSpec({'features': {'malicious': RaggedTensorSpec(TensorShape([1, None]), tf.string, 1, tf.int32)}, 'sizes': TensorSpec(shape=(1,), dtype=tf.int32, name=None)}, TensorShape([]), tf.int32, None), 'node_sets': {'objectID': NodeSetSpec({'features': {'file_path': RaggedTensorSpec(TensorShape([None, None]), tf.string, 1, tf.int32), 'base_address': RaggedTensorSpec(TensorShape([None, None]), tf.string, 1, tf.int32), 'image_path': RaggedTensorSpec(TensorShape([None, None]), tf.string, 1, tf.int32)}, 'sizes': TensorSpec(shape=(1,), dtype=tf.int32, name=None)}, TensorShape([]), tf.int32, None), 'actorID': NodeSetSpec({'features': {'parent_image_path': RaggedTensorSpec(TensorShape([None, None]), tf.string, 1, tf.int32)}, 'sizes': TensorSpec(shape=(1,), dtype=tf.int32, name=None)}, TensorShape([]), tf.int32, None)}, 'edge_sets': {'id': EdgeSetSpec({'features': {'event_minute': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'event_hour': TensorSpec(shape=(N

In [33]:
gt = tfgnn.random_graph_tensor(gtspec)

2023-03-15 07:39:38.920187: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-03-15 07:39:38.920236: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-172-16-10-147.us-west-2.compute.internal): /proc/driver/nvidia/version does not exist
2023-03-15 07:39:38.921158: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<span style="color:red">**=======**</span>

In [None]:
def create_graph_tensor(df):
    graph_tensor = tfgnn.GraphTensor.from_pieces(
    context_spec = tfgnn.ContextSpec.from_field_specs(
        features_spec ={
            "malicious": np.array(df['malicious'],
                                dtype='string').reshape(len(df),1)
        }),
    node_sets = {
        "actorID": tfgnn.NodeSet.from_fields(
            sizes = [len(df)],
            features ={
                'parent_image_path': np.array(df['parent_image_path'],
                                dtype='string').reshape(len(df),1)
            }),
        "objectID": tfgnn.NodeSet.from_fields(
            sizes = [len(df)],
            features ={
                'base_address': np.array(df['base_address'],
                                 dtype='string').reshape(len(df),1),
                'file_path': np.array(df['file_path'],
                                   dtype='string').reshape(len(df),1),
                'image_path': np.array(df['image_path'],
                                   dtype='string').reshape(len(df),1),
            })
    },
    edge_sets ={
        "id": tfgnn.EdgeSet.from_fields(
            sizes = [len(df)],
            features = {
                'id': np.array(df['id'],
                                      dtype='string').reshape(len(df),1)},
            adjacency = tfgnn.Adjacency.from_indices(
                source = ("actorID", np.array(df['actorID'], dtype='string')),
                target = ("objectID", np.array(df['objectID'], dtype='string')))),
  })

    return graph_tensor

<span style="color:red">**>>>>>>> remote**</span>

In [23]:
nodes = dict()
nodes_list = list(set(df['actorUID']) | set(df['objectUID']))
for i, node in enumerate(nodes_list):
    nodes[node] = i

In [24]:
# create edges of source and destination and add them into the dictionary
edges = []
for index, row in df.iterrows():
    source = nodes[row['actorUID']]
    destination = nodes[row['objectUID']]
    features = row.id
    edges.append((source, destination, features))

In [28]:
indices = [(edge[0], edge[1]) for edge in edges]
values = [edge[2] for edge in edges]
dense_shape = (len(nodes), len(nodes))

In [29]:
# indices = np.array(df['actorUID'], dtype='str').reshape(len(df),1)
# values = np.array(df['actorUID'], dtype='str').reshape(len(df),1)
# dense_shape = (len(df['actorUID']), len(df['actorUID']))

In [30]:
sparse_tensor = tf.SparseTensor(
    indices=indices,
    values=values,
    dense_shape=dense_shape
)

In [None]:
# Create input data
inputs = []
for i in range(len(nodes)):
    inputs.append(sparse_tensor.indices[sparse_tensor.indices[:,0]==i][:,1])

In [None]:
len(inputs)

441854

In [None]:
# inputs = tf.ragged.constant(inputs)
inputs_ds = tf.random.normal(inputs)

InvalidArgumentError: {{function_node __wrapped__Pack_N_441854_device_/job:localhost/replica:0/task:0/device:GPU:0}} Shapes of all inputs must match: values[0].shape = [2] != values[1].shape = [1] [Op:Pack] name: shape

In [None]:
# Create output data
outputs = np.array([label])

In [None]:
# Reshape input and output data for LSTM
inputs_ds = tf.expand_dims(inputs_ds, axis=0)
outputs = tf.expand_dims(outputs, axis=0)