In [1]:
#setup based on this: https://t-redactyl.io/blog/2020/08/reading-s3-data-into-a-spark-dataframe-using-sagemaker.html
import boto3
import json 
import time
import pandas as pd
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import matplotlib.pyplot as plt
import sagemaker_pyspark
import botocore.session

## Set Spark Session Configuration

In [2]:
session = botocore.session.get_session()
credentials = session.get_credentials()

In [3]:
client = boto3.client('secretsmanager')
response = client.get_secret_value(
    SecretId='sapient-s3-access'
)
response = json.loads(response['SecretString'])
access_key = response["aws_access_key_id"]
secret_key = response["aws_secret_access_key"]

In [4]:
conf = (SparkConf()
        .set("spark.driver.extraClassPath", ":".join(sagemaker_pyspark.classpath_jars()))
       )

In [5]:
# https://spark.apache.org/docs/latest/configuration.html#memory-management
spark = (
    SparkSession
    .builder
    .config(conf=conf) \
    .config('fs.s3a.access.key', access_key)
    .config('fs.s3a.secret.key', secret_key)
    .config('spark.network.timeout', 300)
    .config('spark.local.dir', '/home/ec2-user/SageMaker/tmp')
    .config("spark.executor.memory", "400g")
    .config("spark.driver.memory", "80g")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size","50g")
    .appName("sapient")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/11 07:26:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/03/11 07:26:07 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


## Functions to Load and Read Data

In [6]:
# read from raw bucket + write to refined bucket + aggregate final to the trusted bucket
s3_url_raw = "s3a://sapient-bucket-raw/"
s3_url_refined = "s3a://sapient-bucket-refined/"
s3_url_trusted = "s3a://sapient-bucket-trusted/"
ecar_cols = [
    'id','timestamp','objectID','actorID','object','action','hostname', 'user_name', 'privileges', 'image_path', 
    'parent_image_path', 'new_path', 'file_path', 'direction', 'logon_id', 'requesting_domain', 'requesting_user'
            ]
bro_cols_conn = ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_', 'id.resp_p', 'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 
                 'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents']
bro_cols_rep = ['ts', 'level', 'message', 'location']

In [7]:
# Create a new dataframe with distinct objectIDs to identify malcious ObjectIds
df_labels = spark.read.parquet(f"{s3_url_refined}/prod/labels").cache()
df_malcious_objectIDs = df_labels.select('id').distinct()
df_labels.unpersist()

                                                                                

DataFrame[hostname: string, id: string, objectID: string, actorID: string, timestamp: timestamp, object: string, action: string]

In [8]:
def get_count(type='ecar', env='prod'):
    """
    type: ecar, ecar-bro, bro
    """
    s3_read_loc = f"{s3_url_refined}/{env}/{type}"
    if type == 'labels':
        s3_write_loc = f"{s3_url_trusted}/{env}/{type}"
        df = spark.read.parquet(s3_read_loc).cache()
    else:
        # Create a new column called 'malicious' in df_ecar to label malicious records
        s3_write_loc = f"{s3_url_trusted}/{env}/{type}/{size}"
        start_time = time.time()
        df = spark.read.parquet(s3_read_loc)\
                    .select(*ecar_cols)
        df = df.withColumn('event_minute', minute(col('timestamp'))) \
               .withColumn('event_day', dayofmonth(col('timestamp'))) \
               .withColumn('event_hour', hour(col('timestamp')))  \
               .cache()
    print(time.strftime('%l:%M%p %Z on %b %d, %Y') + " --- read and cache time: %s seconds ---" % (time.time() - start_time))
    row_count = df.count()
    print(time.strftime('%l:%M%p %Z on %b %d, %Y') + f" -- Your new dataframe has {row_count:,} rows.")
    df.unpersist()
    return row_count

In [9]:
def refined_to_trusted(type='ecar-bro', env='prod', size='test', read_lim=10000000, limit=False):
    """
    type: ecar, ecar-bro, bro
    """
    s3_read_loc = f"{s3_url_refined}/{env}/{type}"
    if type == 'labels':
        s3_write_loc = f"{s3_url_trusted}/{env}/{type}"
        df = spark.read.parquet(s3_read_loc).cache()
    else:       
        s3_write_loc = f"{s3_url_trusted}/{env}/{type}/{size}"
        start_time = time.time()
        df = spark.read.parquet(s3_read_loc)\
                    .select(*ecar_cols) \
                    .filter((dayofmonth(col('timestamp')) == 23)) 
        df = df.withColumn('event_minute', minute(col('timestamp'))) \
               .withColumn('event_day', dayofmonth(col('timestamp'))) \
               .withColumn('event_hour', hour(col('timestamp'))) \
               .cache()
        df = df.withColumn('malicious', when(col('id').isin(df_malcious_objectIDs.rdd.flatMap(lambda x: x).collect()), 1).otherwise(0))
        if limit:
            df = df.limit(read_lim)
        else:
            pass
    print(time.strftime('%l:%M%p %Z on %b %d, %Y') + " --- read and cache time: %s seconds ---" % (time.time() - start_time))
    start_time = time.time()
    df.write.option("maxRecordsPerFile", 300000).mode("overwrite").parquet(s3_write_loc)
    print(time.strftime('%l:%M%p %Z on %b %d, %Y') + " --- write time: %s seconds ---" % (time.time() - start_time))

In [10]:
refined_to_trusted('ecar', size='test',   read_lim=1000)

                                                                                

 7:30AM UTC on Mar 11, 2023 --- read and cache time: 248.64191460609436 seconds ---


                                                                                

 7:43AM UTC on Mar 11, 2023 --- write time: 801.289222240448 seconds ---


In [11]:
refined_to_trusted('ecar', size='small',  read_lim=10000000)

 7:47AM UTC on Mar 11, 2023 --- read and cache time: 247.1960678100586 seconds ---


                                                                                

 7:57AM UTC on Mar 11, 2023 --- write time: 587.4563958644867 seconds ---


In [12]:
refined_to_trusted('ecar', size='medium', read_lim=100000000)

 8:01AM UTC on Mar 11, 2023 --- read and cache time: 251.94494605064392 seconds ---


                                                                                

 8:11AM UTC on Mar 11, 2023 --- write time: 604.237820148468 seconds ---


In [13]:
refined_to_trusted('ecar', size='all', limit=False)

 8:16AM UTC on Mar 11, 2023 --- read and cache time: 245.52330088615417 seconds ---


                                                                                

 8:26AM UTC on Mar 11, 2023 --- write time: 610.7315983772278 seconds ---


In [14]:
# s3_read_loc = f"{s3_url_refined}/prod/ecar"
# df = spark.read.parquet(s3_read_loc)\
#                     .cache()

In [15]:
# df.count()