In [1]:
#setup based on this: https://t-redactyl.io/blog/2020/08/reading-s3-data-into-a-spark-dataframe-using-sagemaker.html
import boto3
import json 
import time
from pyspark import SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import sagemaker_pyspark
import botocore.session

## Set Spark Session Configuration

In [2]:
# Resolve the default botocore session and ask it for the AWS credentials it can find.
# NOTE(review): `credentials` is not referenced by any cell visible here (the S3 keys
# passed to Spark come from Secrets Manager) — confirm whether it is still needed.
session = botocore.session.get_session()
credentials = session.get_credentials()

In [3]:
# Fetch the "sapient-s3-access" secret from AWS Secrets Manager.
client = boto3.client("secretsmanager")
response = client.get_secret_value(SecretId="sapient-s3-access")

# The payload under "SecretString" is JSON; after parsing, `response` holds the decoded dict.
response = json.loads(response["SecretString"])

# Key pair that is handed to the Spark session for S3A access.
access_key = response["aws_access_key_id"]
secret_key = response["aws_secret_access_key"]

In [4]:
# Put the jars shipped with sagemaker_pyspark on the Spark driver classpath
# (joined with ":" as the classpath separator).
conf = SparkConf().set(
    "spark.driver.extraClassPath",
    ":".join(sagemaker_pyspark.classpath_jars()),
)

In [5]:
# Build (or reuse) the "sapient" Spark session: start from `conf`, then add the
# S3A key pair read from Secrets Manager. The option order is kept as written.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .config("fs.s3a.access.key", access_key)
    .config("fs.s3a.secret.key", secret_key)
    .appName("sapient")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/10 18:19:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Environment configuration

In [6]:
# Bucket URLs: input is read from the raw bucket, output is written to the refined bucket.
# Both values end with a trailing "/".
s3_url_raw = "s3a://sapient-bucket-raw/"
s3_url_refined = "s3a://sapient-bucket-refined/"

# Environment selector used to pick the input file.
env = "dev"

In [7]:
# Candidate input files in the raw bucket:
#   ecarbro.json, AIA-1-25.ecar.json, conn.09_00_00-10_00_00.log
# s3_url_raw already ends with "/", so strip it before joining; otherwise the
# result is "s3a://sapient-bucket-raw//ecarbro.json" with a doubled slash.
dev_file = f"{s3_url_raw.rstrip('/')}/ecarbro.json"
prod_file = ""  # TODO: no production input file has been configured yet

# Any env other than "prod" uses the dev file.
if env == "prod":
    filename = prod_file
else:
    filename = dev_file

# Fail here with a clear message rather than passing an empty path to Spark later.
if not filename:
    raise ValueError(f"No input file configured for env={env!r}")

In [8]:
# Read the selected JSON file into a DataFrame, mark it for caching, and
# report the wall-clock time spent in this cell.
start_time = time.time()
df_ecar_bro = (
    spark.read
    .json(filename)
    .cache()
)
print("--- %s seconds ---" % (time.time() - start_time))

23/02/10 18:19:27 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

--- 11.702203750610352 seconds ---


In [9]:
# Preview: pull the first five rows to the driver as a pandas DataFrame for display.
(
    df_ecar_bro
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,action,actorID,hostname,id,object,objectID,pid,ppid,principal,properties,tid,timestamp
0,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,2ea7d45b-0a28-4d92-8ca7-58502d1ba631,FLOW,dfaf05b2-4d7c-408d-bb0f-d56f52b07f12,5920,-1,,"(1, C46Z8E9qAFrZ8Iks4, 195.219.101.2, 80, outb...",-1,2019-09-23T09:10:15.593-04:00
1,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,264c42c4-7914-4e76-809f-982fe0b241d0,FLOW,6ea8ef0c-6450-4e39-b473-c7d3d53578f3,5920,-1,,"(1, Chrr5B4nTDEYBgYjme, 195.219.101.5, 80, out...",-1,2019-09-23T09:10:15.623-04:00
2,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,3f057bc7-a39e-487e-8490-eb77f5bb6ed6,FLOW,6558c991-e5a0-409d-b5b9-96a3e44c747a,5920,-1,,"(1, CvZy4E919XjWUGVD4, 195.219.101.5, 443, out...",-1,2019-09-23T09:10:15.635-04:00
3,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,71978192-53c8-4cfa-a79f-193d760c3b01,FLOW,d57caaa4-66ec-400a-b305-049fa912b1dd,5920,-1,,"(1, CXUYLX9hxem2or1bh, 195.219.101.5, 443, out...",-1,2019-09-23T09:10:15.789-04:00
4,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,d02151e6-c183-46e2-82e0-0ed9eded3c92,FLOW,54530c7c-0b65-4ba9-a0c1-891d3e4e4efa,5920,-1,,"(1, Cv9qwV2jFSOU4wpBu9, 195.219.101.5, 443, ou...",-1,2019-09-23T09:10:15.79-04:00


In [10]:
# Checkpoint the loaded JSON: write it as parquet under the refined bucket and
# register it as the table "df_ecar_bro", timing both writes together.
start_time = time.time()
df_ecar_bro.write.mode("overwrite").parquet(f"{s3_url_refined}/df_ecar_bro")
# Without an explicit mode, saveAsTable errors when the table already exists, so
# re-running this cell would fail. Overwrite keeps the cell re-runnable and
# matches the parquet write above.
df_ecar_bro.write.mode("overwrite").saveAsTable("df_ecar_bro")
print("--- %s seconds ---" % (time.time() - start_time))

[Stage 3:>                                                          (0 + 2) / 2]

--- 10.557655572891235 seconds ---


                                                                                

In [11]:
# Reload the parquet checkpoint from the refined bucket and rebind `df_ecar_bro`
# to it, so later cells use the parquet-backed frame instead of the JSON-backed one.
start_time = time.time()
df_ecar_bro = (
    spark.read
    .parquet(f"{s3_url_refined}/df_ecar_bro")
    .cache()
)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.40232110023498535 seconds ---


In [12]:
# Preview the first five rows of the current `df_ecar_bro` as a pandas DataFrame.
(
    df_ecar_bro
    .limit(5)
    .toPandas()
)

                                                                                

Unnamed: 0,action,actorID,hostname,id,object,objectID,pid,ppid,principal,properties,tid,timestamp
0,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,2ea7d45b-0a28-4d92-8ca7-58502d1ba631,FLOW,dfaf05b2-4d7c-408d-bb0f-d56f52b07f12,5920,-1,,"(1, C46Z8E9qAFrZ8Iks4, 195.219.101.2, 80, outb...",-1,2019-09-23T09:10:15.593-04:00
1,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,264c42c4-7914-4e76-809f-982fe0b241d0,FLOW,6ea8ef0c-6450-4e39-b473-c7d3d53578f3,5920,-1,,"(1, Chrr5B4nTDEYBgYjme, 195.219.101.5, 80, out...",-1,2019-09-23T09:10:15.623-04:00
2,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,3f057bc7-a39e-487e-8490-eb77f5bb6ed6,FLOW,6558c991-e5a0-409d-b5b9-96a3e44c747a,5920,-1,,"(1, CvZy4E919XjWUGVD4, 195.219.101.5, 443, out...",-1,2019-09-23T09:10:15.635-04:00
3,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,71978192-53c8-4cfa-a79f-193d760c3b01,FLOW,d57caaa4-66ec-400a-b305-049fa912b1dd,5920,-1,,"(1, CXUYLX9hxem2or1bh, 195.219.101.5, 443, out...",-1,2019-09-23T09:10:15.789-04:00
4,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,d02151e6-c183-46e2-82e0-0ed9eded3c92,FLOW,54530c7c-0b65-4ba9-a0c1-891d3e4e4efa,5920,-1,,"(1, Cv9qwV2jFSOU4wpBu9, 195.219.101.5, 443, ou...",-1,2019-09-23T09:10:15.79-04:00
