In [1]:
#setup based on this: https://t-redactyl.io/blog/2020/08/reading-s3-data-into-a-spark-dataframe-using-sagemaker.html
import os
import boto3
import json 
import time
import ntpath
import pandas as pd
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import matplotlib.pyplot as plt
import sagemaker_pyspark
import botocore.session
from pyspark.sql import Window
from pyspark.sql.functions import rank

## Set Spark Session Configuration

In [2]:
# Create a botocore session and ask it for the credentials it resolves.
# NOTE(review): `credentials` is not referenced in the cells visible here (the S3 key
# pair used for Spark is read from Secrets Manager instead) — confirm it is still needed.
session = botocore.session.get_session()
credentials = session.get_credentials()

In [3]:
# Fetch the S3 key pair from AWS Secrets Manager. The secret's 'SecretString'
# is parsed as JSON and the two key fields are pulled out of the resulting dict.
client = boto3.client('secretsmanager')
response = json.loads(
    client.get_secret_value(SecretId='sapient-s3-access')['SecretString']
)
access_key = response["aws_access_key_id"]
secret_key = response["aws_secret_access_key"]

In [4]:
# Submit arguments for the PySpark shell: pull in the graphframes package.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages graphframes:graphframes:0.8.2-spark3.2-s_2.12 pyspark-shell'
)

# Put the jars shipped with sagemaker_pyspark on the driver class path (":"-separated).
conf = SparkConf().set(
    "spark.driver.extraClassPath",
    ":".join(sagemaker_pyspark.classpath_jars()),
)

In [4]:
# Memory settings reference:
# https://spark.apache.org/docs/latest/configuration.html#memory-management
spark = (
    SparkSession.builder
    .config(conf=conf)
    # S3A key pair read from Secrets Manager
    .config("fs.s3a.access.key", access_key)
    .config("fs.s3a.secret.key", secret_key)
    .config("spark.network.timeout", 300)
    # Spark local directory on the notebook instance
    .config("spark.local.dir", "/home/ec2-user/SageMaker/tmp")
    # executor / driver heap, off-heap memory and driver result-size limits
    .config("spark.executor.memory", "90g")
    .config("spark.driver.memory", "50g")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "20g")
    .config("spark.driver.maxResultSize", "5g")
    .appName("sapient")
    .getOrCreate()
)

# Restrict Spark logging to errors.
spark.sparkContext.setLogLevel("ERROR")

NameError: name 'SparkSession' is not defined

## Functions to Load and Read Data

In [7]:
# Data flow: read from the raw bucket, write to the refined bucket,
# aggregate the final results into the trusted bucket.
# Each base URL ends with "/"; the readers in this notebook append "/<env>/..." to it.
s3_url_raw = "s3a://sapient-bucket-raw/"
s3_url_refined = "s3a://sapient-bucket-refined/"
s3_url_trusted = "s3a://sapient-bucket-trusted/"

# Column names for the Bro connection records.
# NOTE(review): 'id.resp_' looks like a truncated 'id.resp_h' (compare 'id.orig_h');
# left unchanged because code outside this cell may rely on the current name — confirm.
bro_cols_conn = [
    'ts', 'uid',
    'id.orig_h', 'id.orig_p', 'id.resp_', 'id.resp_p',
    'proto', 'service', 'duration',
    'orig_bytes', 'resp_bytes', 'conn_state',
    'local_orig', 'local_resp', 'missed_bytes', 'history',
    'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
    'tunnel_parents',
]

# Column names for the Bro records with level/message/location fields.
bro_cols_rep = ['ts', 'level', 'message', 'location']

In [8]:
def readCheckpoint(type='ecar', env='prod', size='small'):
    """
    Read a parquet checkpoint from the trusted bucket and mark it for caching.

    type: dataset folder - ecar, ecar-bro, bro or labels ('labels' has no size sub-folder)
    env:  environment folder directly under the bucket
    size: size sub-folder; ignored when type is 'labels'

    returns the dataframe produced by spark.read.parquet(...).cache()

    Note: cache() is lazy, so the printed time covers setting up the read,
    not materialising the data.
    """
    # Only 'labels' is stored without a trailing /<size> component.
    size_suffix = "" if type == 'labels' else f"/{size}"
    s3_parquet_loc = f"{s3_url_trusted}/{env}/{type}{size_suffix}"

    started = time.time()
    checkpoint_df = spark.read.parquet(s3_parquet_loc).cache()

    stamp = time.strftime('%l:%M%p %Z on %b %d, %Y')
    elapsed = time.time() - started
    print(stamp + " --- read and cache time: %s seconds ---" % elapsed)
    return checkpoint_df

In [9]:
def readCheckpoint_bcm(type='ecar-bro', env='dev', size='small'):
    """
    Read a parquet checkpoint from the refined bucket and mark it for caching.

    type: dataset folder - ecar, ecar-bro, bro or labels
          ('labels' and 'ecar' are stored without a size sub-folder)
    env:  environment folder directly under the bucket
    size: size sub-folder; ignored when type is 'labels' or 'ecar'

    returns the dataframe produced by spark.read.parquet(...).cache()

    Note: cache() is lazy, so the printed time covers setting up the read,
    not materialising the data.
    """
    if type == 'labels' or type == 'ecar':
        s3_parquet_loc = f"{s3_url_refined}/{env}/{type}"
    else:
        s3_parquet_loc = f"{s3_url_refined}/{env}/{type}/{size}"

    # Start the timer before the read, and read for every type: previously the
    # sized types (including the default 'ecar-bro') never assigned `df`, so the
    # function failed with UnboundLocalError at `return df`.
    start_time = time.time()
    df = spark.read.parquet(s3_parquet_loc).cache()
    print(time.strftime('%l:%M%p %Z on %b %d, %Y') + " --- read and cache time: %s seconds ---" % (time.time() - start_time))
    return df

In [10]:
def get_firsts(df):
    """
    Keep the first appearance (by timestamp) of every actorID -> objectID relationship.

    input  - dataframe with columns actorID, objectID and timestamp
    output - cached dataframe with an added 'relationship' column
             ("<actorID>-><objectID>") containing only the rank-1 rows per relationship

    Notes:
    - rank() gives tied rows the same rank, so several rows that share the
      earliest timestamp of a relationship are all kept.
    - the input dataframe is unpersisted before returning.
    """
    relationship_key = concat(df.actorID, lit('->'), df.objectID)
    earliest_first = Window.partitionBy("relationship").orderBy("timestamp")

    keyed = df.withColumn('relationship', relationship_key)
    ranked = keyed.withColumn('rank', rank().over(earliest_first))
    # Keep the rank-1 rows, then discard the helper column.
    firsts = ranked.filter(col('rank') == 1).drop('rank').cache()

    df.unpersist()
    return firsts

In [11]:
def getFile(str):
    """
    Return the file name (last path component) of a full file path; used as a udf.

    ntpath is used so that both backslash and forward-slash separators are split on.
    similar udf (non-windows): https://stackoverflow.com/questions/40848681/udf-to-extract-only-the-file-name-from-path-in-spark-sql

    str: full file path, or None
    returns: the base name of the path, or None when the input is None
    """
    # The parameter name shadows the builtin `str`; it is kept so existing
    # keyword callers continue to work.
    if str is None:
        return None
    return ntpath.basename(str)
# Spark udf wrapper around getFile; the result is declared as StringType.
getFileUDF = udf(lambda path_value: getFile(path_value), StringType())

In [12]:
def darpaColFeatures(df):
    """
    Load prod/graph/first_events from the trusted bucket and reduce its path
    columns (image_path, parent_image_path, new_path, file_path) to bare file
    names via getFileUDF.

    df: dataframe to release; it is only unpersisted here
    returns the cached dataframe read from the bucket with the four columns rewritten

    NOTE(review): the `df` argument is never used as the input of the
    transformation - the data always comes from the parquet read. Confirm
    this is intended.
    """
    features = spark.read.parquet(f"{s3_url_trusted}/prod/graph/first_events")
    for path_column in ("image_path", "parent_image_path", "new_path", "file_path"):
        features = features.withColumn(path_column, getFileUDF(col(path_column)))
    features = features.cache()

    df.unpersist()
    return features

In [13]:
def readFirstEvents():
    """
    Read prod/final_features from the trusted bucket and print how long the read call took.

    returns the dataframe produced by spark.read.parquet(...)

    NOTE(review): despite the function name, the path read is prod/final_features,
    not prod/graph/first_events - confirm the path is the intended one.
    """
    started = time.time()
    features_df = spark.read.parquet(f"{s3_url_trusted}/prod/final_features")

    stamp = time.strftime('%l:%M%p %Z on %b %d, %Y')
    elapsed = time.time() - started
    print(stamp + " --- read time: %s seconds ---" % elapsed)
    return features_df

In [15]:
def checkFirstEventRead(day = 23):
    """
    Read prod/graph/first_events from the trusted bucket, keep only the rows whose
    event_day equals `day`, and print how long building the read took.

    day: value compared against the event_day column (default 23)
    returns the filtered dataframe
    """
    started = time.time()
    first_events = spark.read.parquet(f"{s3_url_trusted}/prod/graph/first_events")
    day_events = first_events.filter(col('event_day') == day)

    stamp = time.strftime('%l:%M%p %Z on %b %d, %Y')
    elapsed = time.time() - started
    print(stamp + " --- read time: %s seconds ---" % elapsed)
    return day_events