In [None]:
import pyspark as ps

# Build a SparkSession, or reuse one that is already running in this kernel.
# "local[4]" runs Spark in-process with four worker threads.
spark = (
    ps.sql.SparkSession.builder
    .master("local[4]")
    .appName("nathanscope")
    .getOrCreate()
)

In [None]:
# Handle to the SparkContext behind `spark`; used below for the RDD-level
# `textFile` reads.
sc = spark.sparkContext

# Abstract

# Create Spark Data Frames
- Auth: Authentication events between networked computers
- Proc: Start and Stop Times for Processes run on machines from AV Software
- Flow: Duration, Byte and Packet Count for Data Moving Between Machines
- DNS: DNS Entries and Lookup Resolution (not used in IP-literal transactions)

## Define Casting Functions for RDDs

In [None]:
def auth_casting_function(row):
    """Turn one split auth record into a tuple with an integer timestamp.

    Parameters
    ----------
    row : sequence of 9 items
        (time, source user, destination user, source computer,
        destination computer, auth type, logon type, auth orientation,
        success), in that order.

    Returns
    -------
    tuple
        The same nine fields with ``time`` converted to ``int``; all other
        fields are passed through unchanged.

    Raises
    ------
    ValueError
        If ``row`` does not unpack into exactly nine items, or the time
        string is neither ``'?'`` nor parseable by ``int``.
    """
    (raw_time, src_user, dst_user, src_comp, dst_comp,
     auth_type, logon_type, auth_orientation, success) = row
    # A '?' timestamp (presumably "unknown") is replaced by a fixed sentinel.
    timestamp = 666999666 if raw_time == '?' else int(raw_time)
    return (timestamp, src_user, dst_user, src_comp, dst_comp,
            auth_type, logon_type, auth_orientation, success)

In [None]:
def proc_casting_function(row):
    """Turn one split process record into a tuple with an integer timestamp.

    Parameters
    ----------
    row : sequence of 5 items
        (time, user@domain, computer, process name, start/end flag),
        in that order.

    Returns
    -------
    tuple
        The same five fields with ``time`` converted to ``int``; all other
        fields are passed through unchanged.

    Raises
    ------
    ValueError
        If ``row`` does not unpack into exactly five items, or the time
        string is neither ``'?'`` nor parseable by ``int``.
    """
    raw_time, user_domain, comp, proc_name, start = row
    # A '?' timestamp (presumably "unknown") is replaced by a fixed sentinel.
    timestamp = 666999666 if raw_time == '?' else int(raw_time)
    return (timestamp, user_domain, comp, proc_name, start)

In [None]:
def flow_casting_function(row):
    """Turn one split flow record into a tuple with integer numeric fields.

    Parameters
    ----------
    row : sequence of 9 items
        (time, duration, source computer, source port, destination computer,
        destination port, protocol, packet count, byte count), in that order.

    Returns
    -------
    tuple
        The same nine fields with ``time``, ``duration``, packet count and
        byte count converted to ``int``; the remaining fields are passed
        through unchanged.

    Raises
    ------
    ValueError
        If ``row`` does not unpack into exactly nine items, or any of the
        numeric strings cannot be parsed by ``int`` (only ``time`` gets the
        ``'?'`` substitution).
    """
    (raw_time, raw_duration, src_comp, src_port, dst_comp,
     dst_port, protocol, raw_packets, raw_bytes) = row
    # A '?' timestamp (presumably "unknown") is replaced by a fixed sentinel.
    timestamp = 666999666 if raw_time == '?' else int(raw_time)
    return (
        timestamp,
        int(raw_duration),
        src_comp,
        src_port,
        dst_comp,
        dst_port,
        protocol,
        int(raw_packets),
        int(raw_bytes),
    )

In [None]:
def DNS_casting_function(row):
    """Turn one split DNS record into a tuple with an integer timestamp.

    Parameters
    ----------
    row : sequence of 3 items
        (time, source computer, computer resolved), in that order.

    Returns
    -------
    tuple
        The same three fields with ``time`` converted to ``int``; the other
        two fields are passed through unchanged.

    Raises
    ------
    ValueError
        If ``row`` does not unpack into exactly three items, or the time
        string is neither ``'?'`` nor parseable by ``int``.
    """
    raw_time, src_comp, comp_resolved = row
    # A '?' timestamp (presumably "unknown") is replaced by a fixed sentinel.
    timestamp = 666999666 if raw_time == '?' else int(raw_time)
    return (timestamp, src_comp, comp_resolved)

## Define Schema for DataFrames

In [None]:
# import the many data types
from pyspark.sql.types import *

# Schema for the auth DataFrame: column order matches the tuple returned by
# auth_casting_function. Every column is nullable; only `time` is numeric.
auth_schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ('time', LongType()),
        ('src_user_domain', StringType()),
        ('dst_user_domain', StringType()),
        ('src_comp', StringType()),
        ('dst_comp', StringType()),
        ('auth_type', StringType()),
        ('logon_type', StringType()),
        ('auth_orientation', StringType()),
        ('Success', StringType()),
    )
])

In [None]:
# Schema for the proc DataFrame: column order matches the tuple returned by
# proc_casting_function. Every column is nullable; only `time` is numeric.
proc_schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ('time', LongType()),
        ('user_domain', StringType()),
        ('comp', StringType()),
        ('proc_name', StringType()),
        ('start', StringType()),
    )
])

In [None]:
# Schema for the flow DataFrame: column order matches the tuple returned by
# flow_casting_function. Every column is nullable; the four fields that the
# casting function converts with int() are typed as LongType.
flow_schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ('time', LongType()),
        ('duration', LongType()),
        ('src_comp', StringType()),
        ('src_port', StringType()),
        ('dst_comp', StringType()),
        ('dst_port', StringType()),
        ('protocol', StringType()),
        ('pk_count', LongType()),
        ('byte_count', LongType()),
    )
])

In [None]:
# Schema for the DNS DataFrame: column order matches the tuple returned by
# DNS_casting_function. Every column is nullable; only `time` is numeric.
DNS_schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ('time', LongType()),
        ('src_comp', StringType()),
        ('comp_resolved', StringType()),
    )
])

## Create RDDs from Text Input

In [None]:
# Read the auth log line by line; each line is split on commas and handed to
# auth_casting_function, giving an RDD of typed tuples.
rdd_auth = sc.textFile('data/auth.txt').map(
    lambda line: auth_casting_function(line.split(","))
)

In [None]:
# Read the process log line by line; each line is split on commas and handed
# to proc_casting_function, giving an RDD of typed tuples.
rdd_proc = sc.textFile('data/proc.txt').map(
    lambda line: proc_casting_function(line.split(","))
)

In [None]:
# Read the flow log line by line; each line is split on commas and handed to
# flow_casting_function, giving an RDD of typed tuples.
rdd_flow = sc.textFile('data/flows.txt').map(
    lambda line: flow_casting_function(line.split(","))
)

In [None]:
# Read the DNS log line by line; each line is split on commas and handed to
# DNS_casting_function, giving an RDD of typed tuples.
rdd_DNS = sc.textFile('data/dns.txt').map(
    lambda line: DNS_casting_function(line.split(","))
)

## Create DataFrames from RDDs and Schemas

In [None]:
# Attach the explicit auth schema to the tuple RDD.
auth_df = spark.createDataFrame(rdd_auth, schema=auth_schema)

In [None]:
# Attach the explicit proc schema to the tuple RDD.
proc_df = spark.createDataFrame(rdd_proc, schema=proc_schema)

In [None]:
# Attach the explicit flow schema to the tuple RDD.
flow_df = spark.createDataFrame(rdd_flow, schema=flow_schema)

In [None]:
# Attach the explicit DNS schema to the tuple RDD.
DNS_df = spark.createDataFrame(rdd_DNS, schema=DNS_schema)

Auth record fields: time, source user@domain, destination user@domain, source computer, destination computer, authentication type, logon type, authentication orientation, success/failure

# Sample First Day

## Filter by items in first day and save to local files

In [None]:
# Keep auth rows with time below 2592000 (= 30 * 24 * 60 * 60).
# NOTE(review): presumably 30 days if `time` is in seconds — confirm.
# Rows whose '?' time was replaced by 666999666 fall outside this window.
auth_df_1 = auth_df.filter(auth_df["time"] < 30 * 24 * 60 * 60)

In [None]:
# Write the filtered auth rows as a pickled RDD under Data/Day30/auth.
# NOTE(review): inputs are read from 'data/...' (lower-case) but this writes
# under 'Data/...'; on a case-sensitive filesystem those are different
# directories — confirm this is intended.
# NOTE(review): Spark normally refuses to write to an output path that already
# exists, so re-running this cell may fail until the directory is removed —
# confirm.
auth_df_1.rdd.saveAsPickleFile('Data/Day30/auth')

In [None]:
# Keep proc rows with time below 2592000 (= 30 * 24 * 60 * 60).
# NOTE(review): presumably 30 days if `time` is in seconds — confirm.
# Rows whose '?' time was replaced by 666999666 fall outside this window.
proc_df_1 = proc_df.filter(proc_df["time"] < 30 * 24 * 60 * 60)

In [None]:
# Write the filtered proc rows as a pickled RDD under Data/Day30/proc.
# NOTE(review): inputs are read from 'data/...' (lower-case) but this writes
# under 'Data/...'; on a case-sensitive filesystem those are different
# directories — confirm this is intended.
# NOTE(review): Spark normally refuses to write to an output path that already
# exists, so re-running this cell may fail until the directory is removed —
# confirm.
proc_df_1.rdd.saveAsPickleFile('Data/Day30/proc')

In [None]:
# Keep flow rows with time below 2592000 (= 30 * 24 * 60 * 60).
# NOTE(review): presumably 30 days if `time` is in seconds — confirm.
# Rows whose '?' time was replaced by 666999666 fall outside this window.
flow_df_1 = flow_df.filter(flow_df["time"] < 30 * 24 * 60 * 60)

In [None]:
# Write the filtered flow rows as a pickled RDD under Data/Day30/flow.
# NOTE(review): inputs are read from 'data/...' (lower-case) but this writes
# under 'Data/...'; on a case-sensitive filesystem those are different
# directories — confirm this is intended.
# NOTE(review): Spark normally refuses to write to an output path that already
# exists, so re-running this cell may fail until the directory is removed —
# confirm.
flow_df_1.rdd.saveAsPickleFile('Data/Day30/flow')

In [None]:
# Keep DNS rows with time below 2592000 (= 30 * 24 * 60 * 60).
# NOTE(review): presumably 30 days if `time` is in seconds — confirm.
# Rows whose '?' time was replaced by 666999666 fall outside this window.
DNS_df_1 = DNS_df.filter(DNS_df["time"] < 30 * 24 * 60 * 60)

In [None]:
# Write the filtered DNS rows as a pickled RDD under Data/Day30/DNS.
# NOTE(review): inputs are read from 'data/...' (lower-case) but this writes
# under 'Data/...'; on a case-sensitive filesystem those are different
# directories — confirm this is intended.
# NOTE(review): Spark normally refuses to write to an output path that already
# exists, so re-running this cell may fail until the directory is removed —
# confirm.
DNS_df_1.rdd.saveAsPickleFile('Data/Day30/DNS')

In [None]:
# Number of DNS rows that passed the time filter. count() is a Spark action,
# so this line triggers the actual read of data/dns.txt.
DNS_1_len = DNS_df_1.count()

In [None]:
# Display the filtered DNS row count as the cell output.
DNS_1_len

# NOTE(review): this schema (id/date/store/state/product/amount) does not match
# any of the four datasets above and is not referenced elsewhere in the visible
# notebook — looks like leftover example code; confirm before relying on it.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ('id', IntegerType()),
        ('date', StringType()),
        ('store', IntegerType()),
        ('state', StringType()),
        ('product', IntegerType()),
        ('amount', FloatType()),
    )
])

# Count every record in the unfiltered auth RDD. count() is a Spark action,
# so this reads and parses all of data/auth.txt.
len_auth = rdd_auth.count()

# Display the total as the cell output.
len_auth