In [1]:
#setup based on this: https://t-redactyl.io/blog/2020/08/reading-s3-data-into-a-spark-dataframe-using-sagemaker.html
import boto3
import json 
import time
import pandas as pd
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, isnan, when, count, col
import matplotlib.pyplot as plt
import sagemaker_pyspark
import botocore.session

## Set Spark Session Configuration

In [2]:
session = botocore.session.get_session()
credentials = session.get_credentials()

In [3]:
client = boto3.client('secretsmanager')
response = client.get_secret_value(
    SecretId='sapient-s3-access'
)
response = json.loads(response['SecretString'])
access_key = response["aws_access_key_id"]
secret_key = response["aws_secret_access_key"]

In [4]:
conf = (SparkConf()
        .set("spark.driver.extraClassPath", ":".join(sagemaker_pyspark.classpath_jars())))

In [5]:
spark = (
    SparkSession
    .builder
    .config(conf=conf) \
    .config('fs.s3a.access.key', access_key)
    .config('fs.s3a.secret.key', secret_key)
    .config('spark.network.timeout', 300)
    .config('spark.memory.offHeap.size','4g')
    .config('spark.executor.memory', '16g')
    .appName("sapient")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/16 02:18:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/02/16 02:18:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/02/16 02:18:47 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Functions to Load and Read Data

In [6]:
# read from raw bucket + write to refined bucket + aggregate final to the trusted bucket
s3_url_raw = "s3a://sapient-bucket-raw/"
s3_url_refined = "s3a://sapient-bucket-refined/"
s3_url_trusted = "s3a://sapient-bucket-trusted/"
bro_cols_conn = ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_', 'id.resp_p', 'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state', 
                 'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents']
bro_cols_rep = ['ts', 'level', 'message', 'location']

In [7]:
# ecarbro.json, AIA-1-25.ecar.json, conn.09_00_00-10_00_00.log
def s3_file(file):
    dev_file = f"{s3_url_raw}/{env}/{file}"
    prod_file = ""
    if env == "prod":
      filename = prod_file
    else:
      filename = dev_file
    return filename

In [8]:
def loadAndCheckpoint(type):
    """
    type: ecar, ecar-bro, bro, labels
    This function reads a file from json or log text and writes it as a parquet.
    """
    start_time = time.time()
    if type == 'ecar':
        df = spark.read.json(f"{s3_url_raw}/{env}/{type}/**/**/**/*.json")
        df = df.limit(1000)
        df = df.select(*df.columns, "properties.*").drop('properties')
        df.write.option("maxRecordsPerFile", 100000).mode("overwrite").parquet(f"{s3_url_refined}/{env}/{type}")
    elif type == 'ecar-bro':
        df = spark.read.json(f"{s3_url_raw}/{env}/{type}/**/**/**/*.json")
        # this will extract and flatten nested properties column
        df = df.limit(1000)
        df = df.select(*df.columns, "properties.*").drop('properties')
        df.write.option("maxRecordsPerFile", 100000).mode("overwrite").parquet(f"{s3_url_refined}/{env}/{type}")
    elif type == 'bro':
        df = spark.read.csv(f"{s3_url_raw}/{env}/**/**/*.log", sep="\t", comment="#", header=False)
        df = df.limit(1000)
        df = df.toDF(*bro_cols_conn)
        df.write.option("maxRecordsPerFile", 100000).mode("overwrite").parquet(f"{s3_url_refined}/{env}/{type}")
    elif type == 'labels':
        df = spark.read.csv(f"{s3_url_raw}/{env}/{type}/*.csv", sep=",", header=True)
        df.write.option("maxRecordsPerFile", 100000).mode("overwrite").parquet(f"{s3_url_refined}/{env}/{type}")
    print("--- %s seconds ---" % (time.time() - start_time))
    df.unpersist()

In [9]:
start_time = time.time()
df = spark.read.json(f"{s3_url_raw}/dev/ecar/evaluation/23Sep19-red/AIA-1-25/AIA-1-25.ecar.json")
print("--- %s seconds ---" % (time.time() - start_time))

23/02/16 02:18:49 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties




23/02/16 02:20:19 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
--- 91.13822054862976 seconds ---


                                                                                

In [10]:
start_time = time.time()
df = spark.read.format("json").load(f"{s3_url_raw}/dev/ecar/evaluation/23Sep19-red/AIA-1-25/AIA-1-25.ecar.json")
print("--- %s seconds ---" % (time.time() - start_time))



--- 84.66692614555359 seconds ---


                                                                                

In [11]:
import boto3
s3 = boto3.resource('s3')

In [13]:
bucket = s3.Bucket(s3_url_raw)
prefix="dev"

In [64]:
# https://medium.com/towards-data-engineering/get-keys-inside-an-s3-bucket-at-the-subfolder-level-7be42d858372
def get_matching_s3_objects(bucket, prefix="", suffix=""):
    """
    Generate objects in an S3 bucket.
    :param bucket: Name of the S3 bucket.
    :type bucket: str
    :param prefix: Only fetch objects whose key starts with this prefix (optional).
    :type prefix: tuple, list, str
    :param suffix: Only fetch objects whose keys end with this suffix (optional).
    :type suffix: str
    :return: None
    :rtype:
    """

    if isinstance(prefix, str):
        prefixes = (prefix, )
    else:
        prefixes = prefix

    s3 = boto3.resource('s3')
    my_bucket = s3.Bucket(bucket)
    
    count = 0
    files_list = []
    
    for key_prefix in prefixes:
        for object_summary in my_bucket.objects.filter(Prefix=key_prefix):
            key = object_summary.key
            if key.endswith(suffix):
                count += 1
                files_list.append(key)
    print(f"count of total objects is {count}.")
    print(f"guesstimated time is " + str(round(2*count/60, 0)) + " hours.")
    # print(files_list)

In [65]:
get_matching_s3_objects("sapient-bucket-raw", "prod")

count of total objects is 527.
guesstimated time is 18.0 hours.


In [67]:
import io
import gzip

In [69]:
def download_json_gz(s3client, bucket, key):
    ''' download gzipped json file from s3 and convert to dict '''
    response = s3client.get_object(Bucket=bucket, Key=key)
    content = response['Body'].read()
    with gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') as fh:
        return json.load(fh)# at the end of the file
# spark.stop()

In [70]:
bucketname = 'sapient-bucket-raw'      # input for your bucketname
key = 's3://sapient-bucket-raw/prod/ecar/evaluation/23Sep-night/AIA-501-525/AIA-501-525.ecar-2019-11-16T23-22-29.234.json.gz'  # input for your key on S3 (means S3 object fullpath)
actual = download_json_gz(s3, bucketname, key)

AttributeError: 's3.ServiceResource' object has no attribute 'get_object'

In [69]:
def download_json_gz(s3client, bucket, key):
    ''' download gzipped json file from s3 and convert to dict '''
    response = s3client.get_object(Bucket=bucket, Key=key)
    content = response['Body'].read()
    with gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') as fh:
        return json.load(fh)# at the end of the file
# spark.stop()