In [1]:
#setup based on this: https://t-redactyl.io/blog/2020/08/reading-s3-data-into-a-spark-dataframe-using-sagemaker.html
import boto3
import json 
import time
from pyspark import SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import sagemaker_pyspark
import botocore.session

## Set Spark Session Configuration

In [2]:
session = botocore.session.get_session()
credentials = session.get_credentials()

In [3]:
client = boto3.client('secretsmanager')
response = client.get_secret_value(
    SecretId='sapient-s3-access'
)
response = json.loads(response['SecretString'])
access_key = response["aws_access_key_id"]
secret_key = response["aws_secret_access_key"]

In [4]:
conf = (SparkConf()
        .set("spark.driver.extraClassPath", ":".join(sagemaker_pyspark.classpath_jars())))

In [5]:
spark = (
    SparkSession
    .builder
    .config(conf=conf) \
    .config('fs.s3a.access.key', access_key)
    .config('fs.s3a.secret.key', secret_key)
    .appName("sapient")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/10 20:18:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Environment configuration

In [6]:
env = "dev"

In [7]:
# read from raw bucket + write to refined bucket + aggregate final to the trusted bucket
s3_url_raw = "s3a://sapient-bucket-raw/"
s3_url_refined = "s3a://sapient-bucket-refined/"
s3_url_trusted = "s3a://sapient-bucket-trusted/"

In [8]:
# ecarbro.json, AIA-1-25.ecar.json, conn.09_00_00-10_00_00.log
def s3_file(file):
    dev_file = f"{s3_url_raw}/{file}"
    prod_file = ""
    if env == "prod":
      filename = prod_file
    else:
      filename = dev_file
    return filename

In [9]:
def loadAndCheckpoint(type, filename, format):
    """
    type: ecar, ecar_bro, bro
    filename: name of file. only does one at a time
    format: 'json' or 'log'
    """
    filename = s3_file(filename)
    start_time = time.time()
    if format == 'json':
        df = spark.read.json(filename).cache()
    elif format == 'log':
        df = spark.read.text(filename).cache()
    df.write.mode("overwrite").parquet(f"{s3_url_refined}/{type}")
    # df.write.saveAsTable("df_ecar_bro")
    print("--- %s seconds ---" % (time.time() - start_time))
    # df.unpersist()

In [10]:
def readCheckpoint(file_type):
    """
    type: ecar, ecar_bro, bro
    """
    s3_parquet_loc = f"{s3_url_refined}/{file_type}"
    start_time = time.time()
    df = spark.read.parquet(s3_parquet_loc).cache()
    print("--- %s seconds ---" % (time.time() - start_time))
    return df

In [11]:
loadAndCheckpoint('ecar_bro', 'ecarbro.json', 'json')

23/02/10 20:18:05 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

--- 19.770634412765503 seconds ---


In [14]:
df_ecarbro = readCheckpoint('ecar_bro')

23/02/10 20:19:10 WARN CacheManager: Asked to cache already cached data.
--- 0.27623867988586426 seconds ---


In [15]:
df_ecarbro.limit(5).toPandas()

Unnamed: 0,action,actorID,hostname,id,object,objectID,pid,ppid,principal,properties,tid,timestamp
0,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,2ea7d45b-0a28-4d92-8ca7-58502d1ba631,FLOW,dfaf05b2-4d7c-408d-bb0f-d56f52b07f12,5920,-1,,"(1, C46Z8E9qAFrZ8Iks4, 195.219.101.2, 80, outb...",-1,2019-09-23T09:10:15.593-04:00
1,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,264c42c4-7914-4e76-809f-982fe0b241d0,FLOW,6ea8ef0c-6450-4e39-b473-c7d3d53578f3,5920,-1,,"(1, Chrr5B4nTDEYBgYjme, 195.219.101.5, 80, out...",-1,2019-09-23T09:10:15.623-04:00
2,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,3f057bc7-a39e-487e-8490-eb77f5bb6ed6,FLOW,6558c991-e5a0-409d-b5b9-96a3e44c747a,5920,-1,,"(1, CvZy4E919XjWUGVD4, 195.219.101.5, 443, out...",-1,2019-09-23T09:10:15.635-04:00
3,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,71978192-53c8-4cfa-a79f-193d760c3b01,FLOW,d57caaa4-66ec-400a-b305-049fa912b1dd,5920,-1,,"(1, CXUYLX9hxem2or1bh, 195.219.101.5, 443, out...",-1,2019-09-23T09:10:15.789-04:00
4,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,d02151e6-c183-46e2-82e0-0ed9eded3c92,FLOW,54530c7c-0b65-4ba9-a0c1-891d3e4e4efa,5920,-1,,"(1, Cv9qwV2jFSOU4wpBu9, 195.219.101.5, 443, ou...",-1,2019-09-23T09:10:15.79-04:00


In [16]:
loadAndCheckpoint('ecar', 'AIA-1-25.ecar.json', 'json')

                                                                                

23/02/10 20:22:44 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 7:>                                                        (0 + 4) / 120]

23/02/10 20:22:51 WARN MemoryStore: Not enough space to cache rdd_34_1 in memory! (computed 106.2 MiB so far)
23/02/10 20:22:51 WARN BlockManager: Persisting block rdd_34_1 to disk instead.
23/02/10 20:22:51 WARN MemoryStore: Not enough space to cache rdd_34_3 in memory! (computed 105.0 MiB so far)
23/02/10 20:22:51 WARN BlockManager: Persisting block rdd_34_3 to disk instead.
23/02/10 20:22:52 WARN MemoryStore: Not enough space to cache rdd_34_2 in memory! (computed 106.2 MiB so far)
23/02/10 20:22:52 WARN BlockManager: Persisting block rdd_34_2 to disk instead.
23/02/10 20:22:56 WARN MemoryStore: Not enough space to cache rdd_34_1 in memory! (computed 54.7 MiB so far)
23/02/10 20:22:56 WARN MemoryStore: Not enough space to cache rdd_34_2 in memory! (computed 54.8 MiB so far)


[Stage 7:=>                                                       (4 + 4) / 120]

23/02/10 20:23:05 WARN MemoryStore: Not enough space to cache rdd_34_4 in memory! (computed 54.4 MiB so far)
23/02/10 20:23:05 WARN BlockManager: Persisting block rdd_34_4 to disk instead.
23/02/10 20:23:06 WARN MemoryStore: Not enough space to cache rdd_34_5 in memory! (computed 54.2 MiB so far)
23/02/10 20:23:06 WARN BlockManager: Persisting block rdd_34_5 to disk instead.
23/02/10 20:23:06 WARN MemoryStore: Not enough space to cache rdd_34_7 in memory! (computed 54.0 MiB so far)
23/02/10 20:23:06 WARN BlockManager: Persisting block rdd_34_7 to disk instead.
23/02/10 20:23:06 WARN MemoryStore: Not enough space to cache rdd_34_6 in memory! (computed 53.7 MiB so far)
23/02/10 20:23:06 WARN BlockManager: Persisting block rdd_34_6 to disk instead.
23/02/10 20:23:27 WARN BlockManager: Block rdd_34_7 could not be removed as it was not found on disk or in memory
23/02/10 20:23:27 ERROR Executor: Exception in task 7.0 in stage 7.0 (TID 139)
java.lang.OutOfMemoryError: GC overhead limit excee

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/pyspark/sql/utils.py", line 190, in deco
    return f(*a, **kw)
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: <unprintable Py4JJavaError object>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in

Py4JError: py4j does not exist in the JVM

In [None]:
df_ecar = readCheckpoint('ecar')

In [None]:
df_ecar.limit(5).toPandas()