In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import pyspark as ps
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Start (or reuse) a Spark session named "nathanscope" on the local[4] master.
spark = (
    ps.sql.SparkSession.builder
    .appName("nathanscope")
    .master("local[4]")
    .getOrCreate()
)

In [2]:
# SparkContext of the session; the RDD loading cell below calls sc.pickleFile on it.
sc = spark.sparkContext

In [3]:
# import the many data types
from pyspark.sql.types import *

# create a schema of your own
# Schema applied to the authentication records when auth_df is created.
# Every field is declared nullable.
auth_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ('time', LongType()),
        ('src_user_domain', StringType()),
        ('dst_user_domain', StringType()),
        ('src_comp', StringType()),
        ('dst_comp', StringType()),
        ('auth_type', StringType()),
        ('logon_type', StringType()),
        ('auth_orientation', StringType()),
        ('Success', StringType()),
    ]
])

# Schema applied to the process records when proc_df is created.
# Every field is declared nullable.
proc_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ('time', LongType()),
        ('user_domain', StringType()),
        ('comp', StringType()),
        ('proc_name', StringType()),
        ('start', StringType()),
    ]
])

# Schema applied to the flow records when flow_df is created.
# Every field is declared nullable; ports are kept as strings.
flow_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ('time', LongType()),
        ('duration', LongType()),
        ('src_comp', StringType()),
        ('src_port', StringType()),
        ('dst_comp', StringType()),
        ('dst_port', StringType()),
        ('protocol', StringType()),
        ('pk_count', LongType()),
        ('byte_count', LongType()),
    ]
])

# Schema applied to the DNS records when DNS_df is created.
# Every field is declared nullable.
DNS_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ('time', LongType()),
        ('src_comp', StringType()),
        ('comp_resolved', StringType()),
    ]
])


In [4]:
# Load the four Day1 record sets as RDDs, one per part-file glob.
# NOTE(review): pickleFile unpickles its input, so only point these paths at
# files from a trusted source.
rdd_auth = sc.pickleFile("Data/Day1/auth/part*")
rdd_proc = sc.pickleFile("Data/Day1/proc/part*")
rdd_flow = sc.pickleFile('Data/Day1/flow/part*')
rdd_DNS = sc.pickleFile("Data/Day1/DNS/part*")


In [5]:
# Turn each RDD into a Spark DataFrame using its explicit schema.
auth_df = spark.createDataFrame(rdd_auth, schema=auth_schema)
proc_df = spark.createDataFrame(rdd_proc, schema=proc_schema)
flow_df = spark.createDataFrame(rdd_flow, schema=flow_schema)
DNS_df = spark.createDataFrame(rdd_DNS, schema=DNS_schema)

In [6]:
# Read the red-team event list; the file has no header row, so columns are
# numbered until they are named in the next cell.
red_team = pd.read_csv(
    filepath_or_buffer='Data/redteam.txt',
    header=None,
)

In [7]:
# Name the four red-team columns (assigned in place on the existing frame).
red_team.columns = [
    'time',
    'src_domain',
    'src_comp',
    'dst_comp',
]

In [8]:
def add_success_value(outcome):
    """Return an integer Column: 1 where `outcome` equals 'Success', else 0.

    Built from native Spark expressions rather than a Python UDF. The previous
    `udf(lambda ...)` declared no return type, so Spark stored the flag as a
    string ('1'/'0') and ran a Python call per row.

    Parameters
    ----------
    outcome : pyspark.sql.Column or str
        The column (or column name) holding the authentication outcome.

    Returns
    -------
    pyspark.sql.Column
        1 for rows equal to 'Success'; 0 otherwise, including null outcomes.
    """
    # Accept a bare column name as well, as a UDF call would.
    if isinstance(outcome, str):
        outcome = F.col(outcome)
    # A null comparison does not match `when`, so nulls fall through to 0,
    # matching the original lambda's `else 0` branch.
    return F.when(outcome == 'Success', 1).otherwise(0)


# withColumn replaces an existing 'Success_Value' column, so re-running this
# cell is safe.
auth_df = auth_df.withColumn('Success_Value', add_success_value(auth_df['Success']))

In [10]:
def summarize_interactive_logons(auth_events, user_col):
    """Aggregate interactive logons per user into a pandas DataFrame.

    Parameters
    ----------
    auth_events : pyspark.sql.DataFrame
        Authentication events; the columns `logon_type`, `Success`,
        `Success_Value` and `user_col` are read.
    user_col : str
        Name of the column to group by (e.g. 'src_user_domain').

    Returns
    -------
    pandas.DataFrame
        Columns `user`, `avg_success`, `count_success`, sorted by
        `count_success` in descending order.
    """
    # Explicit aliases fix each aggregate's name, so the labels no longer
    # depend on the column order that dict-style agg() happens to produce.
    per_user = (
        auth_events
        .filter("logon_type = 'Interactive'")
        .groupBy(user_col)
        .agg(
            F.avg('Success_Value').alias('avg_success'),
            F.count('Success').alias('count_success'),
        )
        .withColumnRenamed(user_col, 'user')
        # toPandas() keeps the column names even when no rows match.
        .toPandas()
    )
    return per_user.sort_values('count_success', ascending=False)


def keep_domain_users(logon_stats):
    """Keep rows whose `user` contains 'DOM' and does not contain '$'.

    Null users are dropped instead of raising, as the former
    `'DOM' in x` comprehension did.
    """
    users = logon_stats['user']
    # regex=False: '$' must be matched literally, not as an end-of-string anchor.
    has_dom = users.str.contains('DOM', regex=False, na=False)
    has_dollar = users.str.contains('$', regex=False, na=False)
    return logon_stats[has_dom & ~has_dollar]


src_df = summarize_interactive_logons(auth_df, 'src_user_domain')
dst_df = summarize_interactive_logons(auth_df, 'dst_user_domain')

src_DOM_df = keep_domain_users(src_df)
dst_DOM_df = keep_domain_users(dst_df)


In [11]:
# Plain Python list of the users kept in src_DOM_df.
active_users = src_DOM_df['user'].tolist()

In [12]:
# Interactive logon events whose source user is one of the active users.
active_auth_df = (
    auth_df
    .where(col("logon_type") == "Interactive")
    .where(col("src_user_domain").isin(active_users))
)

In [15]:
# Count distinct source users per source computer and pull the rows to the driver.
# The frame is built from collected rows, so its columns are positional:
# 0 = src_comp, 1 = distinct user count.
active_comps_df = pd.DataFrame(
    active_auth_df
    .groupBy('src_comp')
    .agg(countDistinct('src_user_domain'))
    .collect()
)

In [21]:
# Unique source computers, taken from positional column 0 of active_comps_df.
active_comps = {comp for comp in active_comps_df[0]}

In [26]:
# Number of distinct computers in active_comps.
len(active_comps)

1773

In [19]:
# Preview the first five red-team rows.
red_team.head(n=5)

Unnamed: 0,time,src_domain,src_comp,dst_comp
0,150885,U620@DOM1,C17693,C1003
1,151036,U748@DOM1,C17693,C305
2,151648,U748@DOM1,C17693,C728
3,151993,U6115@DOM1,C17693,C1173
4,153792,U636@DOM1,C17693,C294


In [29]:
# Unique destination computers that appear in the red-team events.
red_team_comps = set(red_team['dst_comp'].tolist())

In [31]:
# Red-team destination computers that are also in active_comps.
known_infected = red_team_comps & active_comps

In [33]:
# Red-team events whose destination computer is in known_infected.
targets_known = red_team['dst_comp'].isin(known_infected)
active_red_team = red_team[targets_known]

In [34]:
# Display the filtered red-team events.
active_red_team

Unnamed: 0,time,src_domain,src_comp,dst_comp
4,153792,U636@DOM1,C17693,C294
5,155219,U748@DOM1,C17693,C5693
39,483455,U1723@DOM1,C17693,C294
40,483981,U1723@DOM1,C17693,C294
41,485925,U1723@DOM1,C17693,C294
42,486443,U636@DOM1,C17693,C294
45,491747,U1723@DOM1,C17693,C294
79,736894,U2837@DOM1,C17693,C1484
166,755904,U162@DOM1,C17693,C798
168,757235,U737@DOM1,C17693,C1125
