In [1]:
#setup based on this: https://t-redactyl.io/blog/2020/08/reading-s3-data-into-a-spark-dataframe-using-sagemaker.html
import os
import boto3
import json 
import time
import pandas as pd
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import sagemaker_pyspark
import botocore.session

## Set Spark Session Configuration

In [2]:
# Resolve AWS credentials through botocore's default session for this environment.
# NOTE(review): `credentials` is not referenced by any cell visible in this notebook
# section (the S3 keys passed to Spark come from Secrets Manager) — confirm whether
# this cell is still needed.
session = botocore.session.get_session()
credentials = session.get_credentials()

In [3]:
# Fetch the S3 key pair from AWS Secrets Manager instead of hard-coding it.
client = boto3.client('secretsmanager')
# The secret is stored as a JSON document under 'SecretString'; parse it once.
response = json.loads(
    client.get_secret_value(SecretId='sapient-s3-access')['SecretString']
)
access_key, secret_key = (
    response["aws_access_key_id"],
    response["aws_secret_access_key"],
)

In [4]:
# Ask spark-submit to pull the GraphFrames package when the JVM is launched.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages graphframes:graphframes:0.8.2-spark3.2-s_2.12 pyspark-shell'
)
# Put the jars shipped with sagemaker_pyspark on the driver classpath.
conf = SparkConf()
conf.set("spark.driver.extraClassPath", ":".join(sagemaker_pyspark.classpath_jars()))

In [5]:
# https://spark.apache.org/docs/latest/configuration.html#memory-management
# Options layered on top of `conf`; each entry becomes one builder.config(key, value) call.
# NOTE(review): Hadoop/S3A options are usually passed with a `spark.hadoop.` prefix —
# confirm the un-prefixed fs.s3a.* keys reach the S3A connector for every API used here.
_session_options = {
    'fs.s3a.access.key': access_key,
    'fs.s3a.secret.key': secret_key,
    'spark.network.timeout': 300,
    'spark.local.dir': '/home/ec2-user/SageMaker/tmp',
    "spark.executor.memory": "16g",
    "spark.driver.memory": "8g",
    "spark.memory.offHeap.enabled": "true",
    "spark.memory.offHeap.size": "20g",
}

_builder = SparkSession.builder.config(conf=conf)
for _key, _value in _session_options.items():
    _builder = _builder.config(_key, _value)

spark = _builder.appName("sapient").getOrCreate()
# Keep Spark's own logging down to errors for the rest of the notebook.
spark.sparkContext.setLogLevel("ERROR")



:: loading settings :: url = jar:file:/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ec2-user/.ivy2/cache
The jars for the packages stored in: /home/ec2-user/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-5b2fbcbe-e998-4296-937e-9d50b3240e76;1.0
	confs: [default]
	found graphframes#graphframes;0.8.2-spark3.2-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 178ms :: artifacts dl 3ms
	:: modules in use:
	graphframes#graphframes;0.8.2-spark3.2-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	-----------------------------

23/03/07 05:02:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/07 05:02:43 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/03/07 05:02:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/03/07 05:02:43 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Functions to Load and Read Data

In [18]:
# read from raw bucket + write to refined bucket + aggregate final to the trusted bucket
s3_url_raw = "s3a://sapient-bucket-raw/"
s3_url_refined = "s3a://sapient-bucket-refined/"
s3_url_trusted = "s3a://sapient-bucket-trusted/"

# Columns kept from the ecar dataset.
ecar_cols = ['id', 'timestamp', 'objectID', 'actorID', 'object', 'action', 'hostname', 'user_name']

# Bro/Zeek conn.log columns. The responder host field is 'id.resp_h' (it was
# misspelled 'id.resp_' before).
# NOTE(review): if data was already written using the misspelled name, migrate
# or rename that column before relying on this list.
bro_cols_conn = ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto', 'service',
                 'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 'local_orig', 'local_resp',
                 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
                 'tunnel_parents']

# Columns of the second Bro log layout used by this project (ts / level / message / location).
bro_cols_rep = ['ts', 'level', 'message', 'location']

In [45]:
def readDev(type='ecar', env='dev'):
    """
    Read one parquet dataset from the refined bucket into a Spark DataFrame.

    type: dataset folder under the environment prefix (ecar, ecar-bro, bro;
          'labels' resolves to the same layout).
    env:  environment prefix inside the refined bucket (default 'dev').

    Returns the DataFrame restricted to the columns listed in `ecar_cols`.

    NOTE(review): the ecar column list is applied whatever `type` is, so
    datasets with a different schema will presumably fail on the select —
    confirm before using this for non-ecar data.
    """
    # Every dataset type lives at <refined bucket>/<env>/<type>. The base URL
    # already ends with '/', so strip it to avoid a doubled separator.
    s3_parquet_loc = f"{s3_url_refined.rstrip('/')}/{env}/{type}"
    start_time = time.time()
    df = spark.read.parquet(s3_parquet_loc).select(*ecar_cols)
    # NOTE(review): nothing is cached inside this function (callers add
    # .cache() themselves), so the elapsed time only covers building the read.
    print(time.strftime('%l:%M%p %Z on %b %d, %Y') + " --- read and cache time: %s seconds ---" % (time.time() - start_time))
    return df

In [46]:
# Load the default dataset, derive minute / day-of-month / hour columns from
# `timestamp`, and mark the result for caching.
df = (
    readDev()
    .withColumn('event_minute', minute(col('timestamp')))
    .withColumn('event_day', dayofmonth(col('timestamp')))
    .withColumn('event_hour', hour(col('timestamp')))
    .cache()
)

 6:32AM UTC on Mar 07, 2023 --- read and cache time: 0.40653538703918457 seconds ---


In [52]:
# Distinct event_minute values present in df, collected to the driver.
ev_mins = [row["event_minute"] for row in df.select("event_minute").distinct().collect()]
len(ev_mins)
# Scratch notes on axis sizing:
#   x = max events per min
#   y = total minutes
#   x_axis = 2*(x-1) + 2
#   y_axis = 2*(y+2)

11

In [None]:
               .withColumn('x_axis', hour(col('timestamp'))) \
               .withColumn('y_axis', hour(col('timestamp'))) \

In [21]:
# Preview the first five rows of df as a pandas DataFrame.
df.limit(5).toPandas()

Unnamed: 0,id,timestamp,objectID,actorID,object,action,hostname,user_name
0,8fe97976-9e2d-4c1c-bd94-518fb45733c0,2019-09-23T09:11:20.548-04:00,b1b8b9bb-8d05-4be9-a9f6-61a571d6b175,aa240ddc-17cd-4f29-bb39-e40a51f088b2,FLOW,START,SysClient0004.systemia.com,
1,4df8406d-e254-4652-a46c-0d9991b09cf4,2019-09-23T09:11:20.552-04:00,b1b8b9bb-8d05-4be9-a9f6-61a571d6b175,aa240ddc-17cd-4f29-bb39-e40a51f088b2,FLOW,START,SysClient0004.systemia.com,
2,8f8797b4-809b-4fed-ad57-abfa51bbbe0a,2019-09-23T09:11:20.554-04:00,4f9297eb-314b-4188-b760-09d22bc786b4,aa240ddc-17cd-4f29-bb39-e40a51f088b2,FLOW,START,SysClient0004.systemia.com,
3,17ffc85d-7f24-44ba-8968-82ce6826fb0a,2019-09-23T09:11:20.557-04:00,b1b8b9bb-8d05-4be9-a9f6-61a571d6b175,aa240ddc-17cd-4f29-bb39-e40a51f088b2,FLOW,START,SysClient0004.systemia.com,
4,8c449f67-b36d-4caa-a354-4068108d8855,2019-09-23T09:11:20.56-04:00,4f9297eb-314b-4188-b760-09d22bc786b4,aa240ddc-17cd-4f29-bb39-e40a51f088b2,FLOW,START,SysClient0004.systemia.com,


In [23]:
# Distinct actorID values in df, collected to the driver as a Python list.
actIDs = [row["actorID"] for row in df.select("actorID").distinct().collect()]

In [24]:
# Number of distinct actorID values.
len(actIDs)

94

In [25]:
# Distinct objectID values in df, collected to the driver as a Python list.
objIDs = [row["objectID"] for row in df.select("objectID").distinct().collect()]

In [26]:
# Number of distinct objectID values.
len(objIDs)

1525

In [31]:
# How many IDs appear both as an actor and as an object.
len(set(actIDs) & set(objIDs))

52

In [39]:
# Copy of df in which every `id` is replaced by a null of string type.
df_dup = df.withColumn('id', lit(None).cast("string"))

In [40]:
# Preview the first five rows of df_dup to check that `id` is now empty.
df_dup.limit(5).toPandas()

Unnamed: 0,id,timestamp,objectID,actorID,object,action,hostname,user_name
0,,2019-09-23T09:11:20.548-04:00,b1b8b9bb-8d05-4be9-a9f6-61a571d6b175,aa240ddc-17cd-4f29-bb39-e40a51f088b2,FLOW,START,SysClient0004.systemia.com,
1,,2019-09-23T09:11:20.552-04:00,b1b8b9bb-8d05-4be9-a9f6-61a571d6b175,aa240ddc-17cd-4f29-bb39-e40a51f088b2,FLOW,START,SysClient0004.systemia.com,
2,,2019-09-23T09:11:20.554-04:00,4f9297eb-314b-4188-b760-09d22bc786b4,aa240ddc-17cd-4f29-bb39-e40a51f088b2,FLOW,START,SysClient0004.systemia.com,
3,,2019-09-23T09:11:20.557-04:00,b1b8b9bb-8d05-4be9-a9f6-61a571d6b175,aa240ddc-17cd-4f29-bb39-e40a51f088b2,FLOW,START,SysClient0004.systemia.com,
4,,2019-09-23T09:11:20.56-04:00,4f9297eb-314b-4188-b760-09d22bc786b4,aa240ddc-17cd-4f29-bb39-e40a51f088b2,FLOW,START,SysClient0004.systemia.com,


In [43]:
# Stack the original rows and their id-less copies, ordered by timestamp.
df_new = df.union(df_dup).orderBy("timestamp")

In [44]:
# Preview the first five rows of the combined, timestamp-ordered frame.
df_new.limit(5).toPandas()

Unnamed: 0,id,timestamp,objectID,actorID,object,action,hostname,user_name
0,,2019-09-23T09:06:23.238-04:00,febae102-2e7f-4c61-8ffa-6b1b6a66d209,a989aa42-152e-482b-b8aa-95b0dbf42b2f,PROCESS,OPEN,SysClient0021.systemia.com,
1,3d842b72-5c2d-45a3-ac61-4d6d2e4bc7cc,2019-09-23T09:06:23.238-04:00,febae102-2e7f-4c61-8ffa-6b1b6a66d209,a989aa42-152e-482b-b8aa-95b0dbf42b2f,PROCESS,OPEN,SysClient0021.systemia.com,
2,6703b5cb-ceb5-4279-a702-edaa90158c89,2019-09-23T09:06:23.238-04:00,febae102-2e7f-4c61-8ffa-6b1b6a66d209,a989aa42-152e-482b-b8aa-95b0dbf42b2f,PROCESS,OPEN,SysClient0021.systemia.com,
3,1d3ad0d1-2f8b-41e3-ae8a-ec32413cf288,2019-09-23T09:06:23.238-04:00,febae102-2e7f-4c61-8ffa-6b1b6a66d209,a989aa42-152e-482b-b8aa-95b0dbf42b2f,PROCESS,CREATE,SysClient0021.systemia.com,
4,,2019-09-23T09:06:23.238-04:00,febae102-2e7f-4c61-8ffa-6b1b6a66d209,a989aa42-152e-482b-b8aa-95b0dbf42b2f,PROCESS,CREATE,SysClient0021.systemia.com,
