In [1]:
#setup based on this: https://t-redactyl.io/blog/2020/08/reading-s3-data-into-a-spark-dataframe-using-sagemaker.html
import boto3
import json 
import time
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import countDistinct
import matplotlib.pyplot as plt
import sagemaker_pyspark
import botocore.session

## Set Spark Session Configuration

In [2]:
# Open a botocore session and ask it for the AWS credentials it resolves.
# NOTE(review): `credentials` is not referenced by any later cell visible here
# (the S3 keys used below come from Secrets Manager) — confirm it is still needed.
session = botocore.session.get_session()
credentials = session.get_credentials()

In [3]:
# Fetch the S3 key pair from AWS Secrets Manager. The secret's string payload
# is a JSON document holding both halves of the key pair.
client = boto3.client('secretsmanager')
secret_value = client.get_secret_value(SecretId='sapient-s3-access')
response = json.loads(secret_value['SecretString'])
access_key, secret_key = (
    response["aws_access_key_id"],
    response["aws_secret_access_key"],
)

In [4]:
# Put the jars shipped with sagemaker_pyspark on the Spark driver classpath.
sagemaker_jars = sagemaker_pyspark.classpath_jars()
conf = SparkConf()
conf.set("spark.driver.extraClassPath", ":".join(sagemaker_jars))

In [5]:
# Session-level options applied on top of `conf`, in this order.
# NOTE(review): per the Spark configuration docs, spark.memory.offHeap.size is
# only honoured when spark.memory.offHeap.enabled is true, which is not set
# here — confirm whether off-heap memory is actually intended.
session_options = {
    'fs.s3a.access.key': access_key,
    'fs.s3a.secret.key': secret_key,
    'spark.network.timeout': 300,
    'spark.memory.offHeap.size': '4g',
    'spark.executor.memory': '16g',
}

builder = SparkSession.builder.config(conf=conf)
for option_name, option_value in session_options.items():
    builder = builder.config(option_name, option_value)

# The app name is applied last, exactly as in the original chain.
spark = builder.appName("sapient").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/11 18:14:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Environment configuration

In [56]:
# Deployment environment: "dev" or "prod".
# The value is used as the folder name directly under each bucket when the
# helper functions in this notebook build their S3 paths.
env = "dev"

In [7]:
# Base S3A URLs for the three pipeline stages:
#   raw     - source files are read from here
#   refined - parquet checkpoints are written here
#   trusted - destination for the final aggregates
# Each URL already ends with a trailing "/".
s3_url_raw = "s3a://sapient-bucket-raw/"
s3_url_refined = "s3a://sapient-bucket-refined/"
s3_url_trusted = "s3a://sapient-bucket-trusted/"

## Functions to Load and Read Data

In [9]:
# ecarbro.json, AIA-1-25.ecar.json, conn.09_00_00-10_00_00.log
def s3_file(file):
    """
    Build the raw-bucket location of a single file for the current environment.

    file: object name relative to the environment folder, e.g. 'ecarbro.json'

    Returns '<s3_url_raw>/<env>/<file>' with exactly one '/' between the bucket
    URL and the environment folder. When env == "prod" an empty string is
    returned, because no prod location has been defined yet.
    """
    if env == "prod":
        # TODO: no prod path has been defined; the empty placeholder is kept as-is.
        return ""
    # s3_url_raw may already end with '/', so strip it before joining to avoid
    # producing 's3a://bucket//dev/...'.
    return f"{s3_url_raw.rstrip('/')}/{env}/{file}"

In [46]:
def loadAndCheckpoint(type, filename, format):
    """
    Read raw files from the raw bucket and checkpoint them as parquet in the
    refined bucket, printing the elapsed time.

    type: dataset folder name, e.g. 'ecar', 'ecar-bro', 'bro'
    filename: currently unused; every file matching the glob for `format` is
        loaded, not just this one
    format: 'json' or 'log'
        'json' reads <raw>/<env>/<type>/**/**/**/*.json and writes parquet
               partitioned by the 'hostname' column
        'log'  reads <raw>/<env>/**/**/*.log as lines of text (this glob is not
               scoped by `type`) and writes unpartitioned parquet

    Any existing output under <refined>/<env>/<type> is overwritten.
    Raises ValueError if `format` is not 'json' or 'log'.
    Returns None.
    """
    # Reject unknown formats before doing any work; previously this fell
    # through both branches and failed later on an unbound `df`.
    if format not in ('json', 'log'):
        raise ValueError(f"Unsupported format {format!r}; expected 'json' or 'log'.")

    # The bucket URLs may already end with '/', so strip it before joining to
    # avoid 's3a://bucket//dev/...' paths.
    raw_root = s3_url_raw.rstrip('/')
    target = f"{s3_url_refined.rstrip('/')}/{env}/{type}"

    start_time = time.time()
    if format == 'json':
        df = spark.read.json(f"{raw_root}/{env}/{type}/**/**/**/*.json")
        df.write.mode("overwrite").partitionBy("hostname").parquet(target)
    else:
        df = spark.read.text(f"{raw_root}/{env}/**/**/*.log")
        df.write.mode("overwrite").parquet(target)
    print("--- %s seconds ---" % (time.time() - start_time))
    # `df` is never cached in this function; the call is kept from the original.
    df.unpersist()

In [47]:
def readCheckpoint(file_type):
    """
    Load a parquet checkpoint from the refined bucket into a cached DataFrame.

    file_type: dataset folder name under <refined>/<env>/, e.g. 'ecar',
        'ecar-bro', 'bro'

    Prints the row count and the elapsed time.
    Returns the cached DataFrame.
    """
    # s3_url_refined may already end with '/', so strip it before joining to
    # avoid an 's3a://bucket//dev/...' path.
    s3_parquet_loc = f"{s3_url_refined.rstrip('/')}/{env}/{file_type}"
    start_time = time.time()
    df = spark.read.parquet(s3_parquet_loc).cache()
    # count() is an action, so the read (and the caching) is triggered here and
    # is included in the reported time.
    print(f"Your dataframe has {df.count():,} rows.")
    print("--- %s seconds ---" % (time.time() - start_time))
    return df

## Data Processing

In [48]:
# Checkpoint the raw 'ecar-bro' JSON files as parquet in the refined bucket.
# The filename argument is not used by loadAndCheckpoint.
loadAndCheckpoint('ecar-bro', 'ecarbro.json', 'json')

                                                                                

--- 45.427204608917236 seconds ---


In [49]:
# Load the 'ecar-bro' parquet checkpoint back as a cached DataFrame.
df_ecarbro = readCheckpoint('ecar-bro')

23/02/11 19:11:04 WARN CacheManager: Asked to cache already cached data.




Your dataframe has 175,840 rows.
--- 4.731751918792725 seconds ---


                                                                                

In [50]:
# Preview five rows, converted to pandas for display.
df_ecarbro.limit(5).toPandas()

Unnamed: 0,action,actorID,id,object,objectID,pid,ppid,principal,properties,tid,timestamp,hostname
0,INFO,9e2dcfd0-7c69-4955-b3bc-0ba80d19e54f,88336986-8b3c-4b2a-95d2-322bd942ebc6,FLOW,5b568c6b-7923-4b22-afb1-2e4b934f0b55,5928,5188,SYSTEMIACOM\syamamori,"(1, C7ZGVe3gZhlaDh9w85, 165.101.35.5, 443, out...",-1,2019-09-23T14:58:40.808-04:00,SysClient0016.systemia.com
1,INFO,9e2dcfd0-7c69-4955-b3bc-0ba80d19e54f,b197e093-5c0a-4ead-8692-98d73f42c4e3,FLOW,62317d1b-e0f3-4d7b-ab55-24c625c23950,5928,5188,SYSTEMIACOM\syamamori,"(1, C9dFtb4uitSXUzcdWc, 165.101.35.5, 443, out...",-1,2019-09-23T14:58:40.819-04:00,SysClient0016.systemia.com
2,INFO,9e2dcfd0-7c69-4955-b3bc-0ba80d19e54f,3520c984-ac3d-46a4-a29e-3a956f3f5289,FLOW,0d734c8c-b93f-4a6c-a242-76fa4d9ca282,5928,5188,SYSTEMIACOM\syamamori,"(1, CzwkjJ3Crv3HFrugrd, 165.101.35.5, 443, out...",-1,2019-09-23T14:58:40.833-04:00,SysClient0016.systemia.com
3,INFO,9e2dcfd0-7c69-4955-b3bc-0ba80d19e54f,9dd35179-cff2-49ef-87e5-e74a9c40f96f,FLOW,ae868dc1-05aa-4108-8f0b-95688c3d987c,5928,5188,SYSTEMIACOM\syamamori,"(1, CAJ3urNkIIlXGf3Fk, 165.101.35.5, 443, outb...",-1,2019-09-23T14:58:40.843-04:00,SysClient0016.systemia.com
4,INFO,9e2dcfd0-7c69-4955-b3bc-0ba80d19e54f,9ed02891-3b5b-4de3-9bd5-7d1fe68ac63b,FLOW,84477621-a0bb-4252-8a5c-2cf4f9171ac9,5928,5188,SYSTEMIACOM\syamamori,"(1, CeG0vk1NgaAAUOh1c, 165.101.35.5, 443, outb...",-1,2019-09-23T14:58:41.084-04:00,SysClient0016.systemia.com


In [51]:
# Count the distinct values of the hostname column.
df_ecarbro.select(countDistinct("hostname")).show()

+------------------------+
|count(DISTINCT hostname)|
+------------------------+
|                      23|
+------------------------+



In [52]:
# Collect the distinct values of the action column to the driver as Row objects.
df_ecarbro.select('action').distinct().collect()

[Row(action='INFO')]

In [53]:
# Checkpoint the raw .log files as parquet under the 'bro' folder of the
# refined bucket. The filename argument is not used by loadAndCheckpoint.
loadAndCheckpoint('bro', 'conn.09_00_00-10_00_00.log', 'log')

                                                                                

--- 7.9849817752838135 seconds ---


In [54]:
# Load the 'bro' parquet checkpoint back as a cached DataFrame.
df_bro = readCheckpoint('bro')

23/02/11 19:11:26 WARN CacheManager: Asked to cache already cached data.


[Stage 59:>                                                         (0 + 4) / 4]

Your dataframe has 1,638,521 rows.
--- 2.1504507064819336 seconds ---


                                                                                

In [55]:
# Preview five rows, converted to pandas for display.
df_bro.limit(5).toPandas()

Unnamed: 0,value
0,1569230193.550518\tCKPrPt3L4VSAVVODHi\t46.143....
1,1569230193.614852\tCU16Ab2bJSshPdlVka\t142.20....
2,1569230193.667477\tCQoLEJ1M8wo0u1kpxf\t142.20....
3,1569230192.827132\tCoUN297UhM6izGwLc\t142.20.5...
4,1569230187.517345\tCTk5ld2wT6l3uNjPOb\t142.20....
