In [1]:
#setup based on this: https://t-redactyl.io/blog/2020/08/reading-s3-data-into-a-spark-dataframe-using-sagemaker.html
import boto3
import json 
import time
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col
import matplotlib.pyplot as plt
import sagemaker_pyspark
import botocore.session

## Set Spark Session Configuration

In [2]:
# Look up AWS credentials through a botocore session.
# NOTE(review): `credentials` is not referenced by any later cell visible here — the
# S3 keys handed to Spark come from Secrets Manager instead. Confirm this cell is still needed.
session = botocore.session.get_session()
credentials = session.get_credentials()

In [3]:
# Fetch the S3 access key pair from AWS Secrets Manager.
# The secret's SecretString is a JSON document; after this cell `response` holds the
# parsed dict (not the raw API response).
client = boto3.client("secretsmanager")
response = json.loads(
    client.get_secret_value(SecretId="sapient-s3-access")["SecretString"]
)
access_key, secret_key = (
    response["aws_access_key_id"],
    response["aws_secret_access_key"],
)

In [4]:
# Spark configuration whose driver classpath is the ":"-joined list of jars
# reported by sagemaker_pyspark.
conf = SparkConf()
conf = conf.set(
    "spark.driver.extraClassPath",
    ":".join(sagemaker_pyspark.classpath_jars()),
)

In [5]:
# Create the Spark session (or reuse a running one — getOrCreate) named "sapient".
spark = (
    SparkSession.builder
    .config(conf=conf)
    # S3A credentials obtained from Secrets Manager.
    .config("fs.s3a.access.key", access_key)
    .config("fs.s3a.secret.key", secret_key)
    .config("spark.network.timeout", 300)
    # NOTE(review): per the Spark configuration docs the off-heap size only applies when
    # spark.memory.offHeap.enabled is true, which is not set here — confirm the intent.
    .config("spark.memory.offHeap.size", "4g")
    .config("spark.executor.memory", "16g")
    .appName("sapient")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/13 07:36:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Functions to Load and Read Data

In [6]:
# Pipeline buckets: read from raw, write parquet checkpoints to refined,
# aggregate the final result into trusted.
# Each URL already ends with "/", so code that appends "/{env}/..." produces "//".
s3_url_raw = "s3a://sapient-bucket-raw/"
s3_url_refined = "s3a://sapient-bucket-refined/"
s3_url_trusted = "s3a://sapient-bucket-trusted/"

# Column names applied (via toDF) to the header-less bro .log files.
# NOTE(review): Zeek's conn.log calls the fifth field `id.resp_h`; "id.resp_" looks like a
# typo, but renaming it also requires updating the later select on that column and
# rewriting the bro parquet checkpoint — confirm before changing.
bro_cols_conn = [
    "ts", "uid",
    "id.orig_h", "id.orig_p", "id.resp_", "id.resp_p",
    "proto", "service", "duration",
    "orig_bytes", "resp_bytes", "conn_state",
    "local_orig", "local_resp", "missed_bytes", "history",
    "orig_pkts", "orig_ip_bytes", "resp_pkts", "resp_ip_bytes",
    "tunnel_parents",
]

# Four-column layout; not used by the cells visible here (presumably a bro reporter log — confirm).
bro_cols_rep = ["ts", "level", "message", "location"]

In [7]:
# ecarbro.json, AIA-1-25.ecar.json, conn.09_00_00-10_00_00.log
def s3_file(file):
    """
    Build the S3 URL of a raw input file for the current environment.

    file: object name below the environment prefix, e.g. "ecarbro.json".

    Returns "<s3_url_raw>/<env>/<file>" with exactly one "/" between the bucket URL
    and the environment, whether or not s3_url_raw ends with a slash.
    When env == "prod" an empty string is returned: no prod location is defined yet.
    """
    if env == "prod":
        # Placeholder kept from the original code: the prod path is not configured.
        return ""
    # s3_url_raw is declared with a trailing "/", so strip it to avoid "bucket//env".
    return f"{s3_url_raw.rstrip('/')}/{env}/{file}"

In [8]:
def loadAndCheckpoint(type, limit=1000):
    """
    Read one raw dataset from the raw bucket and checkpoint it as parquet in the refined bucket.

    type:  'ecar' or 'ecar-bro' (JSON files under <raw>/<env>/<type>/) or
           'bro' (tab-separated .log files under <raw>/<env>/).
           The name shadows the builtin; it is kept so existing calls keep working.
    limit: maximum number of rows kept before writing. Defaults to 1000, the sample
           size this function always used; pass None to checkpoint every row.
           NOTE(review): sampling each source independently may leave no common keys
           for the later bro_uid/uid join — confirm against the empty join output.

    The parquet output at <refined>/<env>/<type> is overwritten.
    Prints the elapsed wall-clock time and returns None.
    Raises ValueError for an unknown type (previously this surfaced as an unbound
    local `df` after the timing line was printed).
    """
    if type not in ('ecar', 'ecar-bro', 'bro'):
        raise ValueError(f"unknown type {type!r}; expected 'ecar', 'ecar-bro' or 'bro'")

    start_time = time.time()
    # The bucket URLs are declared with a trailing "/"; strip it to avoid "bucket//env".
    raw_root = f"{s3_url_raw.rstrip('/')}/{env}"
    target = f"{s3_url_refined.rstrip('/')}/{env}/{type}"

    if type == 'bro':
        # Lines starting with "#" are skipped; there is no header row, names are set below.
        df = spark.read.csv(f"{raw_root}/**/**/*.log", sep="\t", comment="#", header=False)
    else:
        df = spark.read.json(f"{raw_root}/{type}/**/**/**/*.json")

    if limit is not None:
        df = df.limit(limit)

    if type == 'bro':
        df = df.toDF(*bro_cols_conn)
    else:
        # this will extract and flatten nested properties column
        df = df.select(*df.columns, "properties.*").drop('properties')

    df.write.option("maxRecordsPerFile", 100000).mode("overwrite").parquet(target)
    print("--- %s seconds ---" % (time.time() - start_time))
    # No unpersist(): the DataFrame is never cached or persisted in this function.

In [9]:
def readCheckpoint(file_type):
    """
    Open the parquet checkpoint of one dataset from the refined bucket.

    file_type: ecar, ecar-bro, bro

    Prints how long the spark.read.parquet call took and returns the DataFrame.
    """
    checkpoint_path = f"{s3_url_refined}/{env}/{file_type}"
    started = time.time()
    checkpoint_df = spark.read.parquet(checkpoint_path)
    elapsed = time.time() - started
    print("--- %s seconds ---" % elapsed)
    return checkpoint_df

## Environment configuration

In [10]:
# env can be dev or prod
# It is the "<env>/" path segment the load/read helpers insert after each bucket URL.
# Only "dev" is wired up in the visible code: s3_file returns "" when env == "prod".
env = "dev"

## Data Processing

In [11]:
# Checkpoint the bro .log files (sampled to 1000 rows) as parquet in the refined bucket.
loadAndCheckpoint('bro')

23/02/13 07:36:07 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

--- 12.691768407821655 seconds ---


In [12]:
# Load the bro parquet checkpoint from the refined bucket.
df_bro = readCheckpoint('bro')

--- 0.38132190704345703 seconds ---


In [22]:
# Every column is a string: the raw logs are read with spark.read.csv without a schema
# or inferSchema, so numeric fields (ports, byte counts, ...) need casting before use.
df_bro.printSchema()

root
 |-- ts: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- id.orig_h: string (nullable = true)
 |-- id.orig_p: string (nullable = true)
 |-- id.resp_: string (nullable = true)
 |-- id.resp_p: string (nullable = true)
 |-- proto: string (nullable = true)
 |-- service: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- orig_bytes: string (nullable = true)
 |-- resp_bytes: string (nullable = true)
 |-- conn_state: string (nullable = true)
 |-- local_orig: string (nullable = true)
 |-- local_resp: string (nullable = true)
 |-- missed_bytes: string (nullable = true)
 |-- history: string (nullable = true)
 |-- orig_pkts: string (nullable = true)
 |-- orig_ip_bytes: string (nullable = true)
 |-- resp_pkts: string (nullable = true)
 |-- resp_ip_bytes: string (nullable = true)
 |-- tunnel_parents: string (nullable = true)



In [13]:
# Preview the connection endpoints. Backticks are needed because the column names contain
# dots, which Spark would otherwise parse as struct-field access.
df_bro.select("uid", "`id.orig_h`", "`id.orig_p`", "`id.resp_`", "`id.resp_p`").limit(5).toPandas()

Unnamed: 0,uid,id.orig_h,id.orig_p,id.resp_,id.resp_p
0,CXjvkk4Cn2Z9gtezOk,142.20.58.111,52559,165.101.35.5,443
1,CJtLsd15U6DBxokNYk,142.20.57.118,52465,153.129.45.5,443
2,CadbW9GsF5vphwxGl,142.20.57.62,50181,86.62.223.5,443
3,C2GVgr1qQ7xf9uE2Aj,142.20.57.62,50185,86.62.223.5,443
4,CagHOu3mFWXRh2gr47,142.20.58.38,62734,165.101.35.5,443


In [14]:
# Checkpoint the ecar-bro JSON events (sampled to 1000 rows, properties flattened) as parquet.
loadAndCheckpoint('ecar-bro')

                                                                                

--- 5.213835716247559 seconds ---


In [15]:
# Load the ecar-bro parquet checkpoint from the refined bucket.
df_ecarbro = readCheckpoint('ecar-bro')

--- 0.24833250045776367 seconds ---


In [16]:
# Preview five rows; limit(5) before toPandas() keeps the data collected to the driver small.
df_ecarbro.limit(5).toPandas()

Unnamed: 0,action,actorID,hostname,id,object,objectID,pid,ppid,principal,tid,timestamp,acuity_level,bro_uid,dest_ip,dest_port,direction,image_path,l4protocol,src_ip,src_port
0,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,2ea7d45b-0a28-4d92-8ca7-58502d1ba631,FLOW,dfaf05b2-4d7c-408d-bb0f-d56f52b07f12,5920,-1,,-1,2019-09-23T09:10:15.593-04:00,1,C46Z8E9qAFrZ8Iks4,195.219.101.2,80,outbound,,6,142.20.56.25,63394
1,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,264c42c4-7914-4e76-809f-982fe0b241d0,FLOW,6ea8ef0c-6450-4e39-b473-c7d3d53578f3,5920,-1,,-1,2019-09-23T09:10:15.623-04:00,1,Chrr5B4nTDEYBgYjme,195.219.101.5,80,outbound,,6,142.20.56.25,63395
2,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,3f057bc7-a39e-487e-8490-eb77f5bb6ed6,FLOW,6558c991-e5a0-409d-b5b9-96a3e44c747a,5920,-1,,-1,2019-09-23T09:10:15.635-04:00,1,CvZy4E919XjWUGVD4,195.219.101.5,443,outbound,,6,142.20.56.25,63396
3,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,71978192-53c8-4cfa-a79f-193d760c3b01,FLOW,d57caaa4-66ec-400a-b305-049fa912b1dd,5920,-1,,-1,2019-09-23T09:10:15.789-04:00,1,CXUYLX9hxem2or1bh,195.219.101.5,443,outbound,,6,142.20.56.25,63397
4,INFO,4767c80e-1e6b-412e-9432-fad5898fa7db,SysClient0024.systemia.com,d02151e6-c183-46e2-82e0-0ed9eded3c92,FLOW,54530c7c-0b65-4ba9-a0c1-891d3e4e4efa,5920,-1,,-1,2019-09-23T09:10:15.79-04:00,1,Cv9qwV2jFSOU4wpBu9,195.219.101.5,443,outbound,,6,142.20.56.25,63398


In [17]:
# Checkpoint the ecar JSON events (sampled to 1000 rows, properties flattened) as parquet.
# The recorded run took about 284 seconds, far longer than the other two loads.
# NOTE(review): presumably dominated by JSON schema inference over all input files — confirm.
loadAndCheckpoint('ecar')

                                                                                

23/02/13 07:41:00 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

--- 283.63865780830383 seconds ---


In [18]:
# Load the ecar parquet checkpoint from the refined bucket.
df_ecar = readCheckpoint('ecar')

--- 0.22626137733459473 seconds ---


In [19]:
# Inspect the flattened ecar schema (top-level fields followed by the former `properties` fields).
df_ecar.printSchema()

root
 |-- action: string (nullable = true)
 |-- actorID: string (nullable = true)
 |-- hostname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- object: string (nullable = true)
 |-- objectID: string (nullable = true)
 |-- pid: long (nullable = true)
 |-- ppid: long (nullable = true)
 |-- principal: string (nullable = true)
 |-- tid: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- acuity_level: string (nullable = true)
 |-- base_address: string (nullable = true)
 |-- command_line: string (nullable = true)
 |-- context_info: string (nullable = true)
 |-- data: string (nullable = true)
 |-- dest_ip: string (nullable = true)
 |-- dest_port: string (nullable = true)
 |-- direction: string (nullable = true)
 |-- end_time: string (nullable = true)
 |-- file_path: string (nullable = true)
 |-- image_path: string (nullable = true)
 |-- info_class: string (nullable = true)
 |-- key: string (nullable = true)
 |-- l4protocol: string (nullable = true)
 |-- logo

In [20]:
# Preview five rows; limit(5) before toPandas() keeps the data collected to the driver small.
df_ecar.limit(5).toPandas()

Unnamed: 0,action,actorID,hostname,id,object,objectID,pid,ppid,principal,tid,...,task_process_uuid,tgt_pid,tgt_pid_uuid,tgt_tid,type,user,user_name,user_stack_base,user_stack_limit,value
0,CREATE,89f91b70-9613-4e70-9473-23bb029e3889,SysClient0004.systemia.com,97e0d110-cc53-4baf-a293-4646e6d967a9,THREAD,9e902e20-df97-4120-8ed5-01b61980bab8,312,-1,,4548,...,,312,,3332,,,,3631f00000,3631ef8000,
1,CREATE,89f91b70-9613-4e70-9473-23bb029e3889,SysClient0004.systemia.com,9adb7fc5-73f3-4567-8f6b-adbbc4a9ba3d,THREAD,bafe59cd-d8af-4be1-96b1-e7c727c1aec8,312,-1,,4548,...,,312,,3188,,,,3632000000,3631ff8000,
2,CREATE,89f91b70-9613-4e70-9473-23bb029e3889,SysClient0004.systemia.com,4fc209b1-c5e4-49a2-94ec-d2310f211a02,THREAD,99e6ee71-6469-421e-b0c4-198a8e508488,312,-1,,1640,...,,312,,2140,,,,3632100000,36320f8000,
3,REMOTE_CREATE,ff8a4d62-eb4a-4cfc-aea2-1870d4dba3f1,SysClient0004.systemia.com,a7624d42-57d8-4c31-bc71-191f8be09a49,THREAD,2d085a9f-c563-4fad-b86c-b035ba199f0b,1880,-1,,2776,...,,4,aa240ddc-17cd-4f29-bb39-e40a51f088b2,3676,,,,0,0,
4,CREATE,88984aa1-d004-41d9-a498-8e9eab111f62,SysClient0022.systemia.com,77bb2643-59a8-45e0-ad50-b198a4d8e5f0,THREAD,0ef70c6e-6aa3-4d5c-b74a-e902b630b25d,364,-1,,2612,...,,364,,4352,,,,6b08500000,6b084f8000,


In [23]:
# Attach the bro connection fields to each ecar-bro event by matching bro_uid to uid,
# then drop the redundant uid column.
# NOTE(review): this overwrites df_ecarbro with the joined frame, so re-running the cell
# joins the already-joined frame again — re-run the readCheckpoint('ecar-bro') cell first.
df_ecarbro = (
    df_ecarbro
    .join(df_bro, on=df_ecarbro["bro_uid"] == df_bro["uid"], how="inner")
    .drop(col("uid"))
)

In [24]:
# Show the joined frame's column names and types (no Spark job is triggered by the repr).
df_ecarbro

DataFrame[action: string, actorID: string, hostname: string, id: string, object: string, objectID: string, pid: bigint, ppid: bigint, principal: string, tid: bigint, timestamp: string, acuity_level: string, bro_uid: string, dest_ip: string, dest_port: string, direction: string, image_path: string, l4protocol: string, src_ip: string, src_port: string, ts: string, id.orig_h: string, id.orig_p: string, id.resp_: string, id.resp_p: string, proto: string, service: string, duration: string, orig_bytes: string, resp_bytes: string, conn_state: string, local_orig: string, local_resp: string, missed_bytes: string, history: string, orig_pkts: string, orig_ip_bytes: string, resp_pkts: string, resp_ip_bytes: string, tunnel_parents: string]

In [26]:
# Sanity-check the join. The recorded output is empty, i.e. the inner join matched no rows.
# NOTE(review): likely because each source was sampled independently with limit(1000) in
# loadAndCheckpoint, leaving no shared uids — confirm with unsampled data.
df_ecarbro.select('bro_uid').show(2)

+-------+
|bro_uid|
+-------+
+-------+

