In [2]:
#setup based on this: https://t-redactyl.io/blog/2020/08/reading-s3-data-into-a-spark-dataframe-using-sagemaker.html
import boto3
import json 
import time
import pandas as pd
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, isnan, when, count, col
import matplotlib.pyplot as plt
import sagemaker_pyspark
import botocore.session

## Set Spark Session Configuration

In [3]:
# Open a botocore session and ask it for the AWS credentials it can resolve.
# NOTE(review): `credentials` is not referenced again in the visible cells -- the keys
# handed to Spark come from Secrets Manager instead. Confirm this cell is still needed.
session = botocore.session.get_session()
credentials = session.get_credentials()

In [4]:
# Fetch the 'sapient-s3-access' secret and parse its SecretString as JSON;
# the resulting dict carries the S3 access key pair used for the Spark session.
client = boto3.client('secretsmanager')
response = json.loads(
    client.get_secret_value(SecretId='sapient-s3-access')['SecretString']
)
access_key = response["aws_access_key_id"]
secret_key = response["aws_secret_access_key"]

In [5]:
# Put every jar reported by sagemaker_pyspark.classpath_jars() on the Spark driver
# classpath, joined with ":" into a single classpath string.
conf = (SparkConf()
        .set("spark.driver.extraClassPath", ":".join(sagemaker_pyspark.classpath_jars())))

In [6]:
# Build (or reuse, via getOrCreate) the notebook's Spark session named "sapient".
# The S3A key pair comes from the Secrets Manager lookup; the remaining entries
# are timeout and memory settings.
# NOTE(review): spark.memory.offHeap.size is set without spark.memory.offHeap.enabled --
# confirm whether off-heap memory is actually meant to be switched on.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .config('fs.s3a.access.key', access_key)
    .config('fs.s3a.secret.key', secret_key)
    .config('spark.network.timeout', 300)
    .config('spark.memory.offHeap.size', '4g')
    .config('spark.executor.memory', '16g')
    .appName("sapient")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/19 16:13:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Functions to Load and Read Data

In [7]:
# read from raw bucket + write to refined bucket + aggregate final to the trusted bucket
s3_url_raw = "s3a://sapient-bucket-raw/"
s3_url_refined = "s3a://sapient-bucket-refined/"
s3_url_trusted = "s3a://sapient-bucket-trusted/"

# Column names applied positionally to the header-less Bro/Zeek conn logs.
# The responder host column is 'id.resp_h' (it was previously truncated to 'id.resp_').
bro_cols_conn = [
    'ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p', 'proto',
    'service', 'duration', 'orig_bytes', 'resp_bytes', 'conn_state',
    'local_orig', 'local_resp', 'missed_bytes', 'history', 'orig_pkts',
    'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'tunnel_parents',
]
# Column names for the Bro/Zeek reporter log.
bro_cols_rep = ['ts', 'level', 'message', 'location']

In [8]:
def loadAndCheckpoint(type):
    """
    Read one family of raw logs from the raw bucket and write it to the refined
    bucket as parquet (overwriting any previous output for that type).

    type: one of 'ecar', 'ecar-bro', 'bro', 'labels'
        - 'ecar' / 'ecar-bro': JSON files; the nested `properties` struct is
          flattened into top-level columns.
        - 'bro': tab-separated, header-less .log files; columns are named from
          `bro_cols_conn`.
        - 'labels': CSV files with a header row.

    The json and bro inputs are capped at 1000 rows before writing; labels are
    written in full.

    Relies on the notebook globals `spark`, `env`, `s3_url_raw`, `s3_url_refined`
    and `bro_cols_conn`. NOTE(review): `env` is not assigned in the visible cells --
    make sure it is defined before calling.

    Raises ValueError for an unsupported `type` (before any Spark work is started).
    Prints the elapsed wall-clock time; returns None.
    """
    supported_types = ('ecar', 'ecar-bro', 'bro', 'labels')
    if type not in supported_types:
        # Fail early with a clear message instead of an unbound `df` later on.
        raise ValueError(
            f"unsupported type {type!r}; expected one of {', '.join(supported_types)}"
        )

    start_time = time.time()
    if type in ('ecar', 'ecar-bro'):
        df = spark.read.json(f"{s3_url_raw}/{env}/{type}/**/**/**/*.json")
        df = df.limit(1000)
        # this will extract and flatten nested properties column
        df = df.select(*df.columns, "properties.*").drop('properties')
    elif type == 'bro':
        # The bro path does not include the type segment; '#' lines are metadata.
        df = spark.read.csv(f"{s3_url_raw}/{env}/**/**/*.log", sep="\t", comment="#", header=False)
        df = df.limit(1000)
        df = df.toDF(*bro_cols_conn)
    else:  # 'labels'
        df = spark.read.csv(f"{s3_url_raw}/{env}/{type}/*.csv", sep=",", header=True)

    # Single write path shared by every type.
    df.write.option("maxRecordsPerFile", 100000).mode("overwrite").parquet(f"{s3_url_refined}/{env}/{type}")
    print("--- %s seconds ---" % (time.time() - start_time))

In [9]:
start_time = time.time()
# The read is commented out, so this cell currently times nothing but the two clock calls.
# df = spark.read.json(f"{s3_url_raw}/dev/ecar/evaluation/23Sep19-red/AIA-1-25/AIA-1-25.ecar.json")
print("--- %s seconds ---" % (time.time() - start_time))

--- 3.814697265625e-05 seconds ---


In [10]:
start_time = time.time()
# Same file read through the format("json").load(...) form; it is commented out, so
# the elapsed time printed here only covers the two clock calls.
# df = spark.read.format("json").load(f"{s3_url_raw}/dev/ecar/evaluation/23Sep19-red/AIA-1-25/AIA-1-25.ecar.json")
print("--- %s seconds ---" % (time.time() - start_time))

--- 3.123283386230469e-05 seconds ---


In [11]:
import boto3
# boto3 S3 resource; the following cell uses it to build a Bucket object.
s3 = boto3.resource('s3')

In [12]:
# s3.Bucket() takes a bare bucket name, whereas s3_url_raw is an "s3a://<bucket>/" URL,
# so drop the scheme and the trailing slash before building the resource.
bucket = s3.Bucket(s3_url_raw.split("://", 1)[-1].strip("/"))
prefix = "dev"

In [13]:
# https://medium.com/towards-data-engineering/get-keys-inside-an-s3-bucket-at-the-subfolder-level-7be42d858372
def get_matching_s3_objects(bucket, prefix="", suffix="", seconds_per_file=178):
    """
    List the keys in an S3 bucket that match a prefix and a suffix.

    :param bucket: Name of the S3 bucket.
    :type bucket: str
    :param prefix: Only fetch objects whose key starts with this prefix (optional).
        A tuple or list of prefixes is listed one prefix at a time; a key that
        matches more than one of them is reported once per matching prefix.
    :type prefix: tuple, list, str
    :param suffix: Only fetch objects whose keys end with this suffix (optional).
    :type suffix: str
    :param seconds_per_file: Per-file processing time, in seconds, used only for
        the printed duration estimate.
    :type seconds_per_file: int, float
    :return: The number of matching keys and the matching keys themselves.
    :rtype: tuple(int, list[str])
    """
    prefixes = (prefix, ) if isinstance(prefix, str) else prefix

    my_bucket = boto3.resource('s3').Bucket(bucket)

    files_list = [
        object_summary.key
        for key_prefix in prefixes
        for object_summary in my_bucket.objects.filter(Prefix=key_prefix)
        if object_summary.key.endswith(suffix)
    ]
    # Named `total` rather than `count` so the pyspark `count` import is not shadowed.
    total = len(files_list)

    print(f"count of total objects is {total}.")
    print("guesstimated time is " + str(round(seconds_per_file*total/60/60, 0)) + " hours.")
    return total, files_list

In [14]:
import io
import os
import gzip
import boto3
import shutil
from boto3.s3.transfer import TransferConfig

In [15]:
# Transfer settings passed to upload_file in expand_json_gz: files larger than
# 1024 * 300 bytes (300 KiB) go through multipart transfer in 300 KiB chunks,
# using up to 10 threads.
# NOTE(review): S3 multipart parts other than the last must be at least 5 MiB --
# confirm that boto3 raises this chunk size for uploads, or pick a larger value.
config = TransferConfig(multipart_threshold=1024 * 300, 
                        max_concurrency=10,
                        multipart_chunksize=1024 * 300,
                        use_threads=True)

In [31]:
# https://medium.com/analytics-vidhya/aws-s3-multipart-upload-download-using-boto3-python-sdk-2dedb0945f11
# https://stackoverflow.com/questions/48466421/python-how-to-decompress-a-gzip-file-to-an-uncompressed-file-on-disk
def expand_json_gz(bucket, key):
    ''' download gzipped json file from s3, expand, and send to s3

    bucket: name of the S3 bucket that is both read from and written to.
    key: key of the gzipped object; must end in ".gz".

    The archive is downloaded to /home/ec2-user/SageMaker/tmp/<key>, decompressed
    next to it, and the decompressed file is uploaded to the same bucket under
    'prod/' + <key without the leading `file_pre` and without the trailing ".gz">.
    Both local files are removed afterwards.

    Reads the notebook globals `file_pre` (source key prefix) and `config`
    (the TransferConfig used for the upload).

    Download errors propagate to the caller; decompression and upload errors are
    printed together with the failing key. Prints the elapsed time. Returns None.
    '''
    gz_ext = ".gz"
    if not key.endswith(gz_ext):
        # Without the extension the expanded path would equal the download path
        # and the file would be truncated while it is being read.
        print("skipping " + key + ": key does not end with " + gz_ext)
        return

    client = boto3.client('s3')
    tmp_loc = f'/home/ec2-user/SageMaker/tmp/{key}'
    exp_loc = tmp_loc[:-len(gz_ext)]

    # Strip the source prefix as a prefix. str.lstrip() removes a *set of characters*,
    # which used to turn "ecar/..." into "car/...".
    rel_key = key[len(file_pre):] if key.startswith(file_pre) else key
    dest_key = 'prod/' + rel_key[:-len(gz_ext)]

    start_time = time.time()
    os.makedirs(os.path.dirname(tmp_loc), exist_ok=True)

    # download gz file
    client.download_file(Bucket=bucket, Key=key, Filename=tmp_loc)
    # This line is what the resume cells parse out of expand_files.log.
    print(exp_loc)
    try:
        with gzip.open(tmp_loc, 'rb') as f_in, open(exp_loc, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        # Upload only after the output file is closed, so all buffered bytes are on disk.
        client.upload_file(
            Filename=exp_loc,
            Bucket=bucket,
            Key=dest_key,
            Config=config,
        )
        print("file upload complete")
    except Exception as e:
        print(e)
        print("failed file: " + key)
    finally:
        # Remove each temp file independently so one failure does not leave the other behind.
        for path in (tmp_loc, exp_loc):
            try:
                os.remove(path)
            except OSError as e:
                print(e)
    print("--- %s seconds ---" % (time.time() - start_time))

In [32]:
# Key prefix of the gzipped source objects; expand_json_gz reads this as a global.
file_pre = "pre_prod/"
# NOTE(review): file_prod and log_type are not referenced in the visible cells.
file_prod = "prod"
log_type = "bro"
# Bare bucket name (string). This rebinds `bucket`, which an earlier cell set to a
# boto3 Bucket resource.
bucket = "sapient-bucket-raw"

In [33]:
# Single file test expansion
bucketname = 'sapient-bucket-raw'      # input for your bucketname
key = 'pre_prod/ecar/benign/18-19Sep19/AIA-451-475/AIA-451-475.ecar-2019-12-07T01-28-46.139.json.gz'  # input for your key on S3 (means S3 object fullpath)
actual = expand_json_gz(bucketname, key)

/home/ec2-user/SageMaker/tmp/pre_prod/ecar/benign/18-19Sep19/AIA-451-475/AIA-451-475.ecar-2019-12-07T01-28-46.139.json
Not a gzipped file (b'<!')
failed file: pre_prod/
--- 0.12856626510620117 seconds ---


In [53]:
print("start time at" + time.strftime('%l:%M%p %Z on %b %d, %Y'))
# ignore completed files previously logged
infile = r"/home/ec2-user/SageMaker/sapient/expand_files.log"
fs = ".json"

# Keep the file handle, the log lines and the S3 keys under separate names.
with open(infile) as log_file:
    log_lines = log_file.readlines()

# Each logged local path is turned back into an S3 key with the ".gz" already removed.
completed = [x.replace("/home/ec2-user/SageMaker/tmp/", "").replace("\n", "") for x in log_lines if ".json" in x]
# Set copy for constant-time membership tests in the loop below.
completed_lookup = set(completed)

start_time = time.time()
s3_count, s3_files = get_matching_s3_objects(bucket = bucket, prefix = file_pre + "ecar/", suffix="gz")
for s3_key in s3_files:
    if s3_key.replace(".gz", "") not in completed_lookup:
        print(s3_key)
        # NOTE(review): the counter is decremented for keys that are NOT yet in the log,
        # so it ends up holding the number of already-logged keys -- confirm intent.
        s3_count -= 1
    # expand_json_gz(bucketname, s3_key)
    # print("There are " + str(s3_count) + " files remaining to convert")
print("total time was  --- %s seconds ---" % (time.time() - start_time))

start time at 4:52AM UTC on Feb 20, 2023


NameError: name 'get_matching_s3_objects' is not defined

In [2]:
# Gather the lines of every expansion log, then reduce them to the S3-style keys
# (local tmp prefix and newline removed) of the files that were logged.
infiles = [
    r"/home/ec2-user/SageMaker/sapient/expand_files.log",
    r"/home/ec2-user/SageMaker/sapient/expand_files.log1",
    r"/home/ec2-user/SageMaker/sapient/expand_files.log2",
]
corpus = []

for infile in infiles:
    with open(infile) as handle:
        f = handle.readlines()
    corpus.extend(f)

# print(corpus)

# corpus = [*set(corpus)]
completed = [
    line.replace("/home/ec2-user/SageMaker/tmp/", "").replace("\n", "")
    for line in corpus
    if ".json" in line
]

print(len(completed))
# print(completed)

473
