In [1]:
#setup based on this: https://t-redactyl.io/blog/2020/08/reading-s3-data-into-a-spark-dataframe-using-sagemaker.html
import boto3
import json 
import time
import pandas as pd
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, isnan, when, count, col
import matplotlib.pyplot as plt
import sagemaker_pyspark
import botocore.session

## Set Spark Session Configuration

In [2]:
# Create a botocore session and fetch the credentials it resolves.
# NOTE(review): `credentials` is not referenced by any other cell visible in
# this notebook — confirm whether it is still needed.
session = botocore.session.get_session()
credentials = session.get_credentials()

In [3]:
# Fetch the 'sapient-s3-access' secret from AWS Secrets Manager and decode its
# SecretString, which is parsed as JSON and holds both halves of the key pair.
client = boto3.client('secretsmanager')
response = json.loads(
    client.get_secret_value(SecretId='sapient-s3-access')['SecretString']
)

# Key pair handed to the Spark session for S3A access.
access_key = response["aws_access_key_id"]
secret_key = response["aws_secret_access_key"]

In [4]:
# Put the jars shipped with sagemaker_pyspark on the driver classpath,
# joined into a single colon-separated string.
conf = SparkConf().set(
    "spark.driver.extraClassPath",
    ":".join(sagemaker_pyspark.classpath_jars()),
)

In [5]:
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    # Base configuration: driver classpath with the SageMaker Spark jars.
    .config(conf=conf)
    # S3A key pair retrieved from Secrets Manager.
    .config('fs.s3a.access.key', access_key)
    .config('fs.s3a.secret.key', secret_key)
    .config('spark.network.timeout', 300)
    # NOTE(review): spark.memory.offHeap.size is set here, but
    # spark.memory.offHeap.enabled is not set in this cell — confirm off-heap
    # memory is enabled elsewhere, otherwise this size may have no effect.
    .config('spark.memory.offHeap.size','4g')
    .config('spark.executor.memory', '16g')
    .appName("sapient")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/22 07:02:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/02/22 07:02:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Functions to Load and Read Data

In [6]:
# Data flow: read from the raw bucket, write checkpoints to the refined bucket,
# and aggregate final results to the trusted bucket.
s3_url_raw = "s3a://sapient-bucket-raw/"
s3_url_refined = "s3a://sapient-bucket-refined/"
s3_url_trusted = "s3a://sapient-bucket-trusted/"

# Column names applied positionally to the headerless Bro/Zeek conn logs.
# The fifth field is the responder host ('id.resp_h'); it was previously
# truncated to 'id.resp_', which produced a misnamed column in the parquet output.
bro_cols_conn = ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 
                 'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents']
# Column names for the Bro reporter-style logs (not used by the cells shown here).
bro_cols_rep = ['ts', 'level', 'message', 'location']

In [7]:
def loadAndCheckpoint(type='ecar-bro', env='dev', size='small'):
    """
    Read one raw dataset from the raw bucket and checkpoint it as parquet in
    the refined bucket, overwriting any previous checkpoint at that path.

    type: which dataset to load
        'ecar' / 'car'         -> JSON events, written under the "ecar" folder
        'ecar-bro' / 'car-bro' -> JSON events, written under the "ecar-bro" folder
        'bro'                  -> tab-separated conn*.log files, written under "bro"
        'labels'               -> CSV files with a header row, written under "labels"
    env: environment folder used in both the raw and the refined path.
    size: 'small' (1 million rows), 'medium' (1 billion rows) or 'large'
        (15 billion rows). Selects the row cap and the refined output folder.
        'labels' is never row-capped, so for that type size only names the
        output folder.

    Raises ValueError for an unknown type, or for an unknown size on a
    row-capped type. Prints the elapsed wall-clock time and returns None.
    """
    read_limits = {
        'small': 1000000,        # 1 million
        'medium': 1000000000,    # 1 billion
        'large': 15000000000,    # 15 billion
    }
    # Raw folder name -> refined folder name for the JSON event datasets.
    json_targets = {
        'ecar': 'ecar',
        'car': 'ecar',
        'ecar-bro': 'ecar-bro',
        'car-bro': 'ecar-bro',
    }

    # Fail before any Spark work instead of hitting an unbound local later.
    if type not in json_targets and type not in ('bro', 'labels'):
        raise ValueError(
            f"unknown type {type!r}; expected one of "
            f"{sorted(json_targets) + ['bro', 'labels']}"
        )
    if type != 'labels' and size not in read_limits:
        raise ValueError(
            f"unknown size {size!r}; expected one of {sorted(read_limits)}"
        )

    def _cap_rows(frame):
        # DataFrame.limit() maps to a JVM method taking a 32-bit int. A cap
        # that does not fit (the 'large' tier) would be sent as a long and the
        # call would fail, so such a cap is treated as "no limit".
        row_cap = read_limits[size]
        if row_cap > 2 ** 31 - 1:
            return frame
        return frame.limit(row_cap)

    start_time = time.time()
    if type in json_targets:
        df = spark.read.json(f"{s3_url_raw}/{env}/{type}/**/**/**/*.json")
        df = _cap_rows(df)
        # this will extract and flatten nested properties column
        df = df.select(*df.columns, "properties.*").drop('properties')
        out_name = json_targets[type]
    elif type == 'bro':
        df = spark.read.csv(f"{s3_url_raw}/{env}/**/**/conn*.log", sep="\t", comment="#", header=False)
        df = _cap_rows(df)
        # The logs carry no header row, so name the columns positionally.
        df = df.toDF(*bro_cols_conn)
        out_name = 'bro'
    else:  # 'labels'
        df = spark.read.csv(f"{s3_url_raw}/{env}/{type}/*.csv", sep=",", header=True)
        out_name = 'labels'

    df.write.option("maxRecordsPerFile", 300000).mode("overwrite").parquet(f"{s3_url_refined}/{size}/{env}/{out_name}")
    print("--- %s seconds ---" % (time.time() - start_time))

In [8]:
# Run parameters passed to the loadAndCheckpoint calls in the following cells.
env='prod'
size='small'

In [9]:
# Checkpoint the Bro conn logs for the current env/size.
loadAndCheckpoint('bro', env=env, size=size)

23/02/22 07:02:06 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

--- 54.75096869468689 seconds ---


In [10]:
# Checkpoint the 'car-bro' JSON events (written under the ecar-bro folder).
loadAndCheckpoint('car-bro', env=env, size=size)

                                                                                

--- 367.31977105140686 seconds ---


In [None]:
# Checkpoint the 'car' JSON events (written under the ecar folder).
loadAndCheckpoint('car', env=env, size=size)

[Stage 14:=>                                                (1046 + 16) / 50536]

In [None]:
# Switch the following runs to the 'medium' size tier.
size='medium'

In [None]:
# Checkpoint the Bro conn logs for the current env/size.
loadAndCheckpoint('bro', env=env, size=size)

In [None]:
# Checkpoint the 'car-bro' JSON events (written under the ecar-bro folder).
loadAndCheckpoint('car-bro', env=env, size=size)

In [None]:
# Checkpoint the 'car' JSON events (written under the ecar folder).
loadAndCheckpoint('car', env=env, size=size)

In [None]:
# Switch the following runs to the 'large' size tier.
size='large'

In [None]:
# Checkpoint the Bro conn logs for the current env/size.
loadAndCheckpoint('bro', env=env, size=size)

In [None]:
# Checkpoint the 'car-bro' JSON events (written under the ecar-bro folder).
loadAndCheckpoint('car-bro', env=env, size=size)

In [None]:
# Checkpoint the 'car' JSON events (written under the ecar folder).
loadAndCheckpoint('car', env=env, size=size)