In [1]:
import pyspark as ps
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import col
from pyspark.sql.functions import countDistinct

# Get (or reuse) the Spark session: local master with 4 worker threads,
# registered under the application name "nathanscope".
spark = ps.sql.SparkSession.builder.master("local[4]").appName("nathanscope").getOrCreate()

In [2]:
# SparkContext behind the session; used to load the pickled RDDs.
sc = spark.sparkContext

In [3]:
# import the many data types
from pyspark.sql.types import *

# Schema applied to the auth RDD: a long `time` column followed by eight
# string columns, all nullable.
auth_schema = (
    StructType()
    .add('time', LongType(), True)
    .add('src_user_domain', StringType(), True)
    .add('dst_user_domain', StringType(), True)
    .add('src_comp', StringType(), True)
    .add('dst_comp', StringType(), True)
    .add('auth_type', StringType(), True)
    .add('logon_type', StringType(), True)
    .add('auth_orientation', StringType(), True)
    .add('Success', StringType(), True)
)

# Schema applied to the proc RDD: a long `time` column followed by four
# string columns, all nullable.
proc_schema = (
    StructType()
    .add('time', LongType(), True)
    .add('user_domain', StringType(), True)
    .add('comp', StringType(), True)
    .add('proc_name', StringType(), True)
    .add('start', StringType(), True)
)

# Schema applied to the flow RDD. `time`, `duration`, `pk_count` and
# `byte_count` are longs; endpoints, ports and protocol are strings.
# Every column is nullable.
flow_schema = (
    StructType()
    .add('time', LongType(), True)
    .add('duration', LongType(), True)
    .add('src_comp', StringType(), True)
    .add('src_port', StringType(), True)
    .add('dst_comp', StringType(), True)
    .add('dst_port', StringType(), True)
    .add('protocol', StringType(), True)
    .add('pk_count', LongType(), True)
    .add('byte_count', LongType(), True)
)

# Schema applied to the DNS RDD: a long `time` column and two nullable
# string columns.
DNS_schema = (
    StructType()
    .add('time', LongType(), True)
    .add('src_comp', StringType(), True)
    .add('comp_resolved', StringType(), True)
)


In [4]:
# Load each Day1 data set from its pickled part files into an RDD.
# NOTE: pickleFile unpickles the file contents, so only point it at trusted data.
rdd_auth = sc.pickleFile("Data/Day1/auth/part*")
rdd_proc = sc.pickleFile("Data/Day1/proc/part*")
rdd_flow = sc.pickleFile('Data/Day1/flow/part*')
rdd_DNS = sc.pickleFile("Data/Day1/DNS/part*")

In [5]:
# Turn each RDD into a DataFrame using its explicit schema.
auth_df = spark.createDataFrame(rdd_auth, schema=auth_schema)
proc_df = spark.createDataFrame(rdd_proc, schema=proc_schema)
flow_df = spark.createDataFrame(rdd_flow, schema=flow_schema)
DNS_df = spark.createDataFrame(rdd_DNS, schema=DNS_schema)

In [6]:
# Read the Day1 summary CSVs into pandas DataFrames.
active_comps_df = pd.read_csv('Summary/Day1/active_comps_df.csv')
active_user_df = pd.read_csv('Summary/Day1/active_users_df.csv')
active_red_team = pd.read_csv('Summary/Day1/active_red_team.csv')

In [7]:
# Relabel the three CSV columns. The first one is named 'ignore' --
# presumably the index column written out with the CSV; TODO confirm.
active_comps_df.columns = ['ignore','comp','count_user']

In [8]:
# Show the `comp` column as a plain Python list.
active_comps_df['comp'].tolist()

['C15210',
 'C1804',
 'C14040',
 'C9587',
 'C9082',
 'C12087',
 'C6791',
 'C13762',
 'C8230',
 'C5934',
 'C13070',
 'C9224',
 'C5974',
 'C10206',
 'C14798',
 'C7810',
 'C10328',
 'C10269',
 'C1665',
 'C824',
 'C11506',
 'C8666',
 'C12029',
 'C1130',
 'C8675',
 'C14874',
 'C3202',
 'C7164',
 'C12042',
 'C7826',
 'C10721',
 'C4909',
 'C13996',
 'C14809',
 'C14876',
 'C3642',
 'C4942',
 'C22',
 'C9008',
 'C14122',
 'C12480',
 'C14057',
 'C8508',
 'C13285',
 'C13130',
 'C15145',
 'C6951',
 'C5855',
 'C10472',
 'C13615',
 'C10095',
 'C13419',
 'C15146',
 'C6005',
 'C9438',
 'C8893',
 'C7579',
 'C15112',
 'C14112',
 'C10705',
 'C1798',
 'C12766',
 'C11380',
 'C4305',
 'C10067',
 'C15228',
 'C8579',
 'C459',
 'C317',
 'C935',
 'C5874',
 'C360',
 'C14141',
 'C9096',
 'C13099',
 'C15914',
 'C13707',
 'C6861',
 'C9666',
 'C6119',
 'C1542',
 'C8679',
 'C1934',
 'C10713',
 'C3835',
 'C1608',
 'C3960',
 'C10944',
 'C3459',
 'C7497',
 'C1817',
 'C12630',
 'C9079',
 'C9464',
 'C15464',
 'C13162',
 'C

In [9]:
# Keep only the flows where at least one endpoint (source or destination)
# appears in the `comp` column of active_comps_df.
active_flow_df = flow_df.filter(
    col("src_comp").isin(list(active_comps_df['comp']))
    | col("dst_comp").isin(list(active_comps_df['comp']))
)

In [10]:
# Build the lookup once, as a set. The original lambdas rebuilt
# list(active_comps_df['comp']) for every row and pulled the whole pandas
# DataFrame into the UDF closure.
active_comp_ids = frozenset(active_comps_df['comp'])


def _is_active_comp(comp):
    """Return 1 if `comp` is one of the active computers, otherwise 0."""
    return 1 if comp in active_comp_ids else 0


# Declare the return type explicitly: without it `udf` defaults to StringType
# and the 0/1 flags are stored as strings. IntegerType comes from the
# notebook's `from pyspark.sql.types import *`.
# With integer flags, sum() returns integers instead of floats.
add_download = udf(_is_active_comp, IntegerType())
add_upload = udf(_is_active_comp, IntegerType())

# Download: the destination is an active computer.
# Upload: the source is an active computer.
active_flow_df = active_flow_df.withColumn('Download', add_download(active_flow_df['dst_comp']))
active_flow_df = active_flow_df.withColumn('Upload', add_upload(active_flow_df['src_comp']))

In [11]:
# Sum of the Download flag, i.e. the number of flows whose dst_comp is in the
# active computer list.
active_flow_df.agg({"Download":"sum"}).collect()

[Row(sum(Download)=869528.0)]

In [12]:
# Sum of the Upload flag, i.e. the number of flows whose src_comp is in the
# active computer list.
active_flow_df.agg({"Upload":"sum"}).collect()

[Row(sum(Upload)=915599.0)]

In [13]:
# Number of flows flagged as both Download and Upload, i.e. both endpoints
# are in the active computer list.
active_flow_df.where((col("Download") == 1) & (col("Upload") == 1)).count()

494

In [14]:
# Total number of rows in the filtered flow table.
active_flow_df.count()

1784633

In [15]:
# Sanity check with the Download and Upload totals typed in by hand from the
# recorded outputs. The result exceeds the row count (1784633) by 494, which
# equals the number of rows flagged as both, since those rows are counted twice.
869528+915599

1785127

In [16]:
# Number of distinct source computers among the filtered flows.
active_flow_df.agg(countDistinct("src_comp")).collect()

[Row(count(DISTINCT src_comp)=1684)]

In [17]:
# Number of distinct destination computers among the filtered flows.
active_flow_df.agg(countDistinct("dst_comp")).collect()

[Row(count(DISTINCT dst_comp)=1157)]