In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import pyspark as ps
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, array
from pyspark.sql.functions import col
from pyspark.sql.functions import countDistinct
from pyspark.sql.types import StringType

# Configure a Spark session on a local master with 4 worker threads, then
# create it (or reuse one that already exists).
session_builder = ps.sql.SparkSession.builder.master("local[4]").appName("nathanscope")
spark = session_builder.getOrCreate()

In [2]:
# SparkContext behind the session; later cells call sc.pickleFile on it to load the RDDs.
sc = spark.sparkContext

In [3]:
# import the many data types
from pyspark.sql.types import *

def _nullable_schema(*columns):
    """Build a StructType from (column name, Spark type class) pairs.

    Every field is created with nullable=True.
    """
    return StructType([StructField(name, type_cls(), True) for name, type_cls in columns])


# Schema applied to the auth RDD.
auth_schema = _nullable_schema(
    ('time', LongType),
    ('src_user_domain', StringType),
    ('dst_user_domain', StringType),
    ('src_comp', StringType),
    ('dst_comp', StringType),
    ('auth_type', StringType),
    ('logon_type', StringType),
    ('auth_orientation', StringType),
    ('Success', StringType),
)

# Schema applied to the proc RDD.
proc_schema = _nullable_schema(
    ('time', LongType),
    ('user_domain', StringType),
    ('comp', StringType),
    ('proc_name', StringType),
    ('start', StringType),
)

# Schema applied to the flow RDD; ports are kept as strings, counters as longs.
flow_schema = _nullable_schema(
    ('time', LongType),
    ('duration', LongType),
    ('src_comp', StringType),
    ('src_port', StringType),
    ('dst_comp', StringType),
    ('dst_port', StringType),
    ('protocol', StringType),
    ('pk_count', LongType),
    ('byte_count', LongType),
)

# Schema applied to the DNS RDD.
DNS_schema = _nullable_schema(
    ('time', LongType),
    ('src_comp', StringType),
    ('comp_resolved', StringType),
)


In [4]:
# Load the four Day 1 event logs from their pickled part files, one RDD per log.
# NOTE(review): pickleFile unpickles whatever is on disk -- only point it at trusted data.
rdd_auth = sc.pickleFile("Data/Day1/auth/part*")
rdd_proc = sc.pickleFile("Data/Day1/proc/part*")
rdd_flow = sc.pickleFile('Data/Day1/flow/part*')
rdd_DNS = sc.pickleFile("Data/Day1/DNS/part*")

In [5]:
# Turn each RDD into a Spark DataFrame using its matching explicit schema.
auth_df = spark.createDataFrame(rdd_auth, schema=auth_schema)
proc_df = spark.createDataFrame(rdd_proc, schema=proc_schema)
flow_df = spark.createDataFrame(rdd_flow, schema=flow_schema)
DNS_df = spark.createDataFrame(rdd_DNS, schema=DNS_schema)

In [6]:
# Read the three Day 1 summary CSVs into pandas DataFrames, in this order:
# active computers, active users, red-team events.
active_comps_df, active_user_df, active_red_team = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Summary/Day1/active_comps_df.csv',
        'Summary/Day1/active_users_df.csv',
        'Summary/Day1/active_red_team.csv',
    )
)

In [7]:
# Relabel the three columns of the active-computers frame; 'comp' is the column
# later cells read. NOTE(review): 'ignore' is presumably the index column written
# out with the CSV -- confirm.
active_comps_df.columns = ['ignore','comp','count_user']

In [8]:
# Materialise the active-computer names once. Previously each UDF call rebuilt
# list(active_comps_df['comp']) and scanned it linearly for every row; a
# frozenset gives constant-time membership and is all the UDF needs to capture.
active_comp_names = frozenset(active_comps_df['comp'])


def _is_active_comp(comp):
    """Return 1 when `comp` is one of the active computers, otherwise 0."""
    return 1 if comp in active_comp_names else 0


# No returnType is passed, so udf() falls back to its default (StringType) and
# the flags land in the DataFrame as the strings '1' / '0'. add_count compares
# its flag against '1', so this default is kept on purpose.
# Both names wrap the same membership test; which endpoint is checked is decided
# by the column each one is applied to.
add_download = udf(_is_active_comp)
add_upload = udf(_is_active_comp)

In [9]:
# Flag each flow: 'Download' is derived from the destination computer,
# 'Upload' from the source computer.
flow_df = (
    flow_df
    .withColumn('Download', add_download(col('dst_comp')))
    .withColumn('Upload', add_upload(col('src_comp')))
)

In [10]:
def _value_if_flagged(pair):
    """Given a two-element sequence [flag, value], return value when flag is '1', else 0."""
    if pair[0] == '1':
        return pair[1]
    return 0


# Wrapped without an explicit return type, exactly like the other UDFs in this notebook.
add_count = udf(_value_if_flagged)


In [11]:
# (new column, flag column, metric column) -- each new column carries the metric
# only on rows where the flag is set, and 0 elsewhere (see add_count).
directional_columns = (
    ('upload_bytes', 'Upload', 'byte_count'),
    ('download_bytes', 'Download', 'byte_count'),
    ('upload_pk', 'Upload', 'pk_count'),
    ('download_pk', 'Download', 'pk_count'),
)
for new_name, flag_col, metric_col in directional_columns:
    flow_df = flow_df.withColumn(new_name, add_count(array(flag_col, metric_col)))

In [12]:
# Total upload/download bytes per source computer, pulled back to the driver as a list of Rows.
bytes_by_source = flow_df.groupBy('src_comp').agg({'upload_bytes': 'sum', 'download_bytes': 'sum'})
all_comps = bytes_by_source.collect()

In [14]:
# Rebind all_comps from the collected rows to a pandas DataFrame built from them.
# No column names are supplied; a later cell reads the first column as all_comps[0].
all_comps = pd.DataFrame(all_comps)

In [20]:
# Does computer C5693 appear in the first column of the per-source summary?
flow_sources = all_comps[0].tolist()
'C5693' in flow_sources

False

In [22]:
# Show the red-team events in ascending 'time' order (returns a sorted copy; the frame itself is untouched).
active_red_team.sort_values(by='time')

Unnamed: 0.1,Unnamed: 0,time,src_domain,src_comp,dst_comp
0,4,153792,U636@DOM1,C17693,C294
1,5,155219,U748@DOM1,C17693,C5693
2,39,483455,U1723@DOM1,C17693,C294
3,40,483981,U1723@DOM1,C17693,C294
4,41,485925,U1723@DOM1,C17693,C294
5,42,486443,U636@DOM1,C17693,C294
6,45,491747,U1723@DOM1,C17693,C294
7,79,736894,U2837@DOM1,C17693,C1484
8,166,755904,U162@DOM1,C17693,C798
9,168,757235,U737@DOM1,C17693,C1125


In [None]:
# Bucket width used by make_interval when flooring flow 'time' values.
# Expressed in the same units as the 'time' column (presumably seconds -- TODO confirm).
interval_size = 600

In [None]:
def _interval_label(t):
    """Return the start of the interval_size-wide bucket containing `t`, as a string.

    The bucket start is zero-padded to at least 7 characters so that labels of
    up to 7 digits sort lexicographically in numeric order.

    The 'time' field is declared nullable in the flow schema, so a missing
    timestamp is passed through as None instead of raising TypeError inside the
    UDF (which would abort the whole Spark job).
    """
    if t is None:
        return None
    bucket_start = (t // interval_size) * interval_size
    return str(bucket_start).zfill(7)


# Default (string) return type, matching the other UDFs in this notebook.
make_interval = udf(_interval_label)

In [None]:
# Tag every flow with the label of the time bucket it falls into.
flow_df = flow_df.withColumn('interval', make_interval(col('time')))

In [None]:
# Computers whose flows are examined below; these are the dst_comp values of the
# two earliest rows in the sorted red-team table displayed above.
reds = ['C294','C5693']

In [None]:
# Keep only flows that touch one of the `reds` computers, as either endpoint.
red_flow_df = flow_df.where(col("src_comp").isin(reds) | col("dst_comp").isin(reds))

# Every aggregate is aliased explicitly. Dict-style agg({...}) gives no guarantee
# about the order of its output columns, so relabelling that result by position
# can silently attach the wrong name to a metric.
red_interval_aggs = [
    F.avg('duration').alias('mean_duration'),
    F.sum('pk_count').alias('pk_count'),
    F.sum('byte_count').alias('byte_count'),
    F.sum('upload_bytes').alias('upload_bytes'),
    F.sum('download_bytes').alias('download_bytes'),
    F.sum('upload_pk').alias('upload_pk'),
    F.sum('download_pk').alias('download_pk'),
]

# One row per interval. groupBy output is otherwise unordered, so sort by the
# zero-padded interval label to get the rows in chronological order.
red_time_series = (
    red_flow_df
    .groupBy('interval')
    .agg(*red_interval_aggs)
    .orderBy('interval')
    .collect()
)

# Same column names as before ('interval' is exposed as 'time'), now in an order
# that is fixed by the aggregate list above. Passing columns= to the constructor
# also yields a correctly-labelled empty frame when no flow matched, where
# assigning .columns afterwards would raise on the length mismatch.
red_time_series_columns = [
    'time', 'mean_duration', 'pk_count', 'byte_count',
    'upload_bytes', 'download_bytes', 'upload_pk', 'download_pk',
]
red_time_series_df = pd.DataFrame(red_time_series, columns=red_time_series_columns)



In [None]:
# Display the per-interval summary of flows involving the `reds` computers.
red_time_series_df