In [None]:
#setup based on this: https://t-redactyl.io/blog/2020/08/reading-s3-data-into-a-spark-dataframe-using-sagemaker.html
import json
import shutil
import sys
import threading
import time

import boto3
import botocore.session
import matplotlib.pyplot as plt
import pandas as pd
import sagemaker_pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, isnan, when, count, col

## Set Spark Session Configuration

In [None]:
# Look up AWS credentials through a default botocore session.
# NOTE(review): `credentials` is not read by any cell visible in this notebook;
# the keys handed to Spark come from the Secrets Manager lookup instead.
# Confirm whether this cell is still needed.
session = botocore.session.get_session()
credentials = session.get_credentials()

In [None]:
# Fetch the S3 access key pair from AWS Secrets Manager. The secret's
# `SecretString` is a JSON document holding both halves of the key pair.
client = boto3.client('secretsmanager')
response = json.loads(
    client.get_secret_value(SecretId='sapient-s3-access')['SecretString']
)
access_key, secret_key = (
    response["aws_access_key_id"],
    response["aws_secret_access_key"],
)

In [None]:
# Put the JARs reported by sagemaker_pyspark on the Spark driver classpath
# (joined with ':' into a single classpath string).
conf = SparkConf().setAll([
    ("spark.driver.extraClassPath", ":".join(sagemaker_pyspark.classpath_jars())),
])

In [None]:
# Settings layered on top of `conf` when the session is built.
# NOTE(review): per the Spark configuration docs, spark.memory.offHeap.size is
# only honoured when spark.memory.offHeap.enabled is true, which is not set
# here. Confirm whether off-heap memory is actually intended.
_spark_options = {
    'fs.s3a.access.key': access_key,
    'fs.s3a.secret.key': secret_key,
    'spark.network.timeout': 300,
    'spark.memory.offHeap.size': '4g',
    'spark.executor.memory': '16g',
}

_builder = SparkSession.builder.config(conf=conf)
for _option, _value in _spark_options.items():
    _builder = _builder.config(_option, _value)

# getOrCreate() reuses an already-running session if the kernel has one.
spark = _builder.appName("sapient").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/17 23:02:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/02/17 23:02:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Functions to Load and Read Data

In [None]:
# Data flow: read from the raw bucket, write parquet to the refined bucket,
# and aggregate the final result into the trusted bucket.
s3_url_raw = "s3a://sapient-bucket-raw/"
s3_url_refined = "s3a://sapient-bucket-refined/"
s3_url_trusted = "s3a://sapient-bucket-trusted/"

# Column names applied to the header-less bro/Zeek conn logs, in file order.
# The responder host column is `id.resp_h` (it was previously misspelled
# `id.resp_`, which did not pair up with `id.orig_h`).
bro_cols_conn = ['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state',
                 'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents']
# Column names for the bro reporter-style logs.
bro_cols_rep = ['ts', 'level', 'message', 'location']

In [None]:
def loadAndCheckpoint(type):
    """
    Read one raw dataset from S3 and checkpoint it to the refined bucket as parquet.

    type: one of 'ecar', 'ecar-bro', 'bro', 'labels'
        - 'ecar' / 'ecar-bro': JSON files; the nested ``properties`` struct is
          flattened into top-level columns.
        - 'bro': tab-separated ``.log`` files without a header row; lines
          starting with '#' are skipped and columns are named from
          ``bro_cols_conn``.
        - 'labels': comma-separated CSV files with a header row.

    The JSON and bro inputs are truncated to their first 1000 rows before being
    written. Output goes to ``{s3_url_refined}/{env}/{type}`` in overwrite mode.

    Relies on the notebook globals ``spark``, ``env``, ``s3_url_raw``,
    ``s3_url_refined`` and ``bro_cols_conn``. ``env`` is not assigned in the
    cells visible alongside this function, so make sure it is set before calling.

    Raises ValueError if ``type`` is not one of the supported values.
    """
    supported = ('ecar', 'ecar-bro', 'bro', 'labels')
    if type not in supported:
        # Previously an unknown type fell through every branch and failed later
        # with an unbound `df`; fail early with a message that names the problem.
        raise ValueError(f"unknown type {type!r}; expected one of {supported}")

    start_time = time.time()
    if type in ('ecar', 'ecar-bro'):
        df = spark.read.json(f"{s3_url_raw}/{env}/{type}/**/**/**/*.json")
        df = df.limit(1000)
        # this will extract and flatten nested properties column
        df = df.select(*df.columns, "properties.*").drop('properties')
    elif type == 'bro':
        # NOTE(review): unlike the other branches this glob does not include
        # `{type}` in the path; confirm that is the intended layout.
        df = spark.read.csv(f"{s3_url_raw}/{env}/**/**/*.log", sep="\t", comment="#", header=False)
        df = df.limit(1000)
        df = df.toDF(*bro_cols_conn)
    else:  # 'labels'
        df = spark.read.csv(f"{s3_url_raw}/{env}/{type}/*.csv", sep=",", header=True)

    # Every type is written the same way; nothing is cached, so there is
    # nothing to unpersist afterwards.
    df.write.option("maxRecordsPerFile", 100000).mode("overwrite").parquet(f"{s3_url_refined}/{env}/{type}")
    print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Timing harness for reading a single ecar JSON file with spark.read.json.
# The read itself is commented out, so the elapsed time printed by this cell
# only measures the harness overhead, not a Spark read.
start_time = time.time()
# df = spark.read.json(f"{s3_url_raw}/dev/ecar/evaluation/23Sep19-red/AIA-1-25/AIA-1-25.ecar.json")
print("--- %s seconds ---" % (time.time() - start_time))

--- 4.00543212890625e-05 seconds ---


In [None]:
# Same timing harness, for the spark.read.format("json").load(...) spelling of
# the read. The read is commented out here as well, so the printed time does
# not reflect any Spark work.
start_time = time.time()
# df = spark.read.format("json").load(f"{s3_url_raw}/dev/ecar/evaluation/23Sep19-red/AIA-1-25/AIA-1-25.ecar.json")
print("--- %s seconds ---" % (time.time() - start_time))

--- 3.4332275390625e-05 seconds ---


In [None]:
# boto3 is already imported in the first cell; this repeat is harmless.
import boto3
# Resource-style S3 handle, used by the bucket lookup in the following cell.
s3 = boto3.resource('s3')

In [None]:
# boto3 wants the bare bucket name, while `s3_url_raw` is the Spark-style
# "s3a://<bucket>/" URL. Drop the scheme and surrounding slashes before
# building the Bucket resource.
bucket = s3.Bucket(s3_url_raw.split("://", 1)[-1].strip("/"))
prefix = "dev"

In [4]:
# https://medium.com/towards-data-engineering/get-keys-inside-an-s3-bucket-at-the-subfolder-level-7be42d858372
def get_matching_s3_objects(bucket, prefix="", suffix=""):
    """
    List the keys in an S3 bucket that match the given prefix(es) and suffix.

    :param bucket: Name of the S3 bucket.
    :type bucket: str
    :param prefix: Only keys starting with this prefix are listed; a tuple or
        list of prefixes is searched one prefix at a time (optional).
    :type prefix: tuple, list, str
    :param suffix: Only keys ending with this suffix are kept (optional).
    :type suffix: str
    :return: The matching keys, in listing order per prefix. Also prints the
        number of matches and a rough duration estimate.
    :rtype: list
    """
    # Normalise a single prefix to a 1-tuple so both cases share one code path.
    key_prefixes = (prefix, ) if isinstance(prefix, str) else prefix

    target_bucket = boto3.resource('s3').Bucket(bucket)

    files_list = [
        summary.key
        for key_prefix in key_prefixes
        for summary in target_bucket.objects.filter(Prefix=key_prefix)
        if summary.key.endswith(suffix)
    ]

    count = len(files_list)
    print(f"count of total objects is {count}.")
    # The estimate is 2 * count / 60; presumably ~2 minutes per object.
    print("guesstimated time is " + str(round(2 * count / 60, 0)) + " hours.")
    return files_list

In [27]:
import io
import os
import boto3
import gzip
from boto3.s3.transfer import TransferConfig

In [62]:
# Transfer settings handed to the S3 upload in expand_json_gz: the multipart
# threshold and the chunk size are both 300 * 1024, with up to 10 worker threads.
config = TransferConfig(
    multipart_threshold=300 * 1024,
    multipart_chunksize=300 * 1024,
    max_concurrency=10,
    use_threads=True,
)

In [73]:
class ProgressPercentage(object):
    """Progress callback that prints a running byte count and percentage for one file.

    An instance is callable with the number of bytes transferred since the
    previous call. It has the shape of the callable that boto3 transfer methods
    accept through ``Callback``; nothing in the visible cells passes it yet.

    Requires ``os``, ``sys`` and ``threading`` to be imported by the notebook.
    """

    def __init__(self, filename):
        """Record the file's name and its size on disk (the file must exist)."""
        self._filename = filename
        self._size = float(os.path.getsize(filename))
        self._seen_so_far = 0
        # Calls may arrive from several transfer threads; serialise the updates.
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        """Add ``bytes_amount`` to the running total and redraw the progress line."""
        # To simplify we'll assume this is hooked up
        # to a single filename.
        with self._lock:
            self._seen_so_far += bytes_amount
            if self._size:
                percentage = (self._seen_so_far / self._size) * 100
            else:
                # An empty file has nothing left to transfer; avoid dividing by zero.
                percentage = 100.0
            # '\r' rewrites the same console line on every update.
            sys.stdout.write(
                "\r%s  %s / %s  (%.2f%%)" % (
                    self._filename, self._seen_so_far, self._size,
                    percentage))
            sys.stdout.flush()

In [86]:
# https://medium.com/analytics-vidhya/aws-s3-multipart-upload-download-using-boto3-python-sdk-2dedb0945f11
# https://stackoverflow.com/questions/48466421/python-how-to-decompress-a-gzip-file-to-an-uncompressed-file-on-disk
def expand_json_gz(bucket, key, tmp_root='/home/ec2-user/SageMaker/tmp'):
    """Download a gzipped object from S3, decompress it, and upload the result.

    The object is downloaded to ``{tmp_root}/{key}``, decompressed next to it
    (same path without the trailing ``.gz``) and uploaded back to the same
    bucket under ``'prod/' + <key with the leading file_pre removed>``, also
    without the ``.gz``. Both temporary files are removed afterwards, whether
    or not the transfer succeeded.

    Relies on the notebook globals ``file_pre`` (source key prefix to drop) and
    ``config`` (the transfer configuration passed to the upload).

    :param bucket: Name of the S3 bucket holding the object (also the upload target).
    :param key: Full key of the gzipped object; must end in ``.gz``.
    :param tmp_root: Local directory used for the temporary files.
    :return: The key the decompressed object was uploaded to.
    :raises ValueError: If ``key`` does not end in ``.gz``.
    """
    gz_suffix = '.gz'
    if not key.endswith(gz_suffix):
        # Without the suffix the compressed and expanded paths would coincide
        # and opening the output would truncate the download.
        raise ValueError(f"expected a key ending in {gz_suffix!r}, got {key!r}")

    client = boto3.client('s3')
    tmp_loc = f'{tmp_root}/{key}'
    # Slice the suffix off: str.rstrip('.gz') strips any trailing '.', 'g' or
    # 'z' characters, which turned e.g. 'conn.log.gz' into 'conn.lo'.
    exp_filename = tmp_loc[:-len(gz_suffix)]
    os.makedirs(os.path.dirname(tmp_loc), exist_ok=True)

    # Same character-set pitfall with str.lstrip(file_pre): it also ate the
    # leading 'e' of 'ecar'. Remove the prefix only when it is really there.
    relative_key = key[len(file_pre):] if key.startswith(file_pre) else key
    dest_key = 'prod/' + relative_key[:-len(gz_suffix)]

    try:
        # download gz file
        client.download_file(Bucket=bucket, Key=key, Filename=tmp_loc)
        print(exp_filename)
        # Stream the decompressed bytes to disk, and close the output before
        # uploading so the uploaded file is complete.
        with gzip.open(tmp_loc, 'rb') as f_in, open(exp_filename, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        client.upload_file(
            Filename=exp_filename,
            Bucket=bucket,
            Key=dest_key,
            Config=config,
        )
        print("file upload complete")
    finally:
        for leftover in (tmp_loc, exp_filename):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                # Nothing was written for this path (e.g. the download failed).
                pass
            except OSError as e:
                print(e)
    return dest_key


In [87]:
# Source/target key prefixes and the log family to list.
file_pre = "pre_prod/"
file_prod = "prod"
log_type = "bro"
bucket = "sapient-bucket-raw"
# All keys under "<file_pre><log_type>" in the raw bucket.
s3_files = get_matching_s3_objects(bucket, prefix=f"{file_pre}{log_type}")

count of total objects is 233.
guesstimated time is 8.0 hours.


In [88]:
# Single-object trial run of expand_json_gz.
bucketname = 'sapient-bucket-raw'  # bucket that holds the gzipped object
# Full S3 key (object path) of the file to expand.
key = (
    'pre_prod/ecar/evaluation/23Sep-night/AIA-501-525/'
    'AIA-501-525.ecar-2019-11-16T23-22-29.234.json.gz'
)
actual = expand_json_gz(bucket=bucketname, key=key)

/home/ec2-user/SageMaker/tmp/pre_prod/ecar/evaluation/23Sep-night/AIA-501-525/AIA-501-525.ecar-2019-11-16T23-22-29.234.json
file upload complete


In [None]:
# Preview each key with the leading `file_pre` removed.
# str.lstrip(file_pre) would strip any leading characters found in the prefix
# (so 'pre_prod/ecar/...' would also lose the 'e' of 'ecar'); slice the exact
# prefix off instead, and leave keys without it untouched.
for f in s3_files:
    print(f[len(file_pre):] if f.startswith(file_pre) else f)