In [1]:
# First check the processing speed and memory available to ensure there is enough
# grunt to run the code (at least 8GB memory & 4 cores @ 2.4Ghz)

import platform
import psutil

print("-"*20 + "Platform Information" + "-"*20)

print(platform.processor())
cores = psutil.cpu_count(logical=True)
print("Physical cores:", psutil.cpu_count(logical=False))
print("Total cores:", cores)

cpufreq = psutil.cpu_freq()
print(f"Max Frequency: {cpufreq.max:.2f}Mhz")
print(f"Current Frequency: {cpufreq.current:.2f}Mhz\n")

if(cores<4 or int(cpufreq.current)<2400):
    print("Warning: Processor speed limit! - This notebook requires at least 4 core @ 2.4Ghz")

print("CPU Usage Per Core:")
for i, percentage in enumerate(psutil.cpu_percent(percpu=True)):
    print(f"Core {i}: {percentage}%")
print(f"Total CPU Usage: {psutil.cpu_percent()}%\n")

svmem = psutil.virtual_memory()
print("Total Memory: " + str(svmem.total))
memAvailable = svmem.available
print ("Memory Available: " + str(memAvailable))
if (memAvailable<8e9):
    print("Warning: Not enough memory available! - This notebook requires at least 8GB of free RAM")

--------------------Platform Information--------------------
Intel64 Family 6 Model 78 Stepping 3, GenuineIntel
Physical cores: 2
Total cores: 4
Max Frequency: 2808.00Mhz
Current Frequency: 2607.00Mhz

CPU Usage Per Core:
Core 0: 0.0%
Core 1: 0.0%
Core 2: 0.0%
Core 3: 0.0%
Total CPU Usage: 0.0%

Total Memory: 17101529088
Memory Available: 8700911616


In [2]:
# Import all required libraries

import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

# Base location of all data
dataDirectory = 'D:\\LANL\\2015\\'

# timestsamp to enable runtime measurement
start = datetime.now()

# Set the timeSlice to 10 minutes for feature generation
timeSlice = 600 

# Variables for calculating the number of seconds in an hour and day
secHour = 3600
secDay = 86400

In [14]:
# Define DataFrame columns (as per descriptions in Cell 1)

procCols = ['Time', 'User', 'Computer', 'ProcessName', 'Status']

authCols = ['Time', 'User', 'DestUser', 'SourceComputer', 'DestComputer', 'AuthType', \
          'LogonType', 'AuthOrient', 'SuccessFail']

redCols = ['Time', 'User', 'SourceComputer', 'DestComputer']

# Open raw files as pandas DataFrames (opening proc and auth data in chunks due to their size)

chunkSize = 1e6

procDataChunks = pd.read_csv(dataDirectory + 'Raw\\proc.txt.gz', compression='gzip', \
                           chunksize=chunkSize, names=procCols)

authDataChunks = pd.read_csv(dataDirectory + 'Raw\\auth.txt.gz', compression='gzip', \
                           chunksize=chunkSize, names=authCols)

redDf = pd.read_csv(dataDirectory + 'Raw\\redteam.txt.gz', names=redCols)

In [5]:
# Cycle through proccess events and generate counts and feature vectors for analysis and modelling

lastHourDf = pd.DataFrame(columns = procCols)
dayFeatureDf = pd.DataFrame()
currentDay=1

for chunk in procDataChunks:
    
    # Add new Day, Hour and Timeslice columns
    chunk['Day'] = chunk['Time'].apply(lambda x : int((x/secDay)+1))
    chunk['Hour'] = chunk['Time'].apply(lambda x : int((x/secHour)%24))
    chunk['Slice'] = chunk['Time'].apply(lambda x : int(x/timeSlice))
    chunk = chunk.astype({'Day': 'int', 'Hour': 'int', 'Slice': 'int'})
    
    print('Processing Process Events Day: ' + str(int(chunk['Day'].max())) + ', \
    Hour: ' + str(int(chunk['Hour'].min())))
    
    # Get incomplete final day/hour from last chunk and append all complete hours in this chunk 
    lastHour=chunk[chunk['Day'] == chunk.Day.max()].Hour.max()
    procChunk = lastHourDf.append(chunk[chunk['Hour'] != lastHour]) 
        
    # Generate features for User activity by time slice (computers used and processes launched)
    featureVector = procChunk.groupby(['Day', 'Hour', 'Slice', 'User']).agg({ \
                                       'ProcessName': [lambda x: set(x), 'count'], \
                                       'Computer': [lambda x: set(x), 'count']})
    
    featureVector = featureVector.reset_index()
    featureVector.columns = ['Day', 'Hour', 'Slice', 'User', 'Processes', 'ProcCnt',\
                             'Computers', 'CompCnt']
   
    featureVector = featureVector.astype({'Day': 'int', 'Hour': 'int', 'Slice': 'int', 'ProcCnt': 'int', \
                             'CompCnt': 'int'})
    featureVector['UniqueProcCnt'] = featureVector['Processes'].apply(lambda x: len(x))
    featureVector['UniqueCompCnt'] = featureVector['Computers'].apply(lambda x: len(x))
    
    dayFeatureDf = dayFeatureDf.append(featureVector[featureVector['Day']==currentDay])
    if featureVector.Day.max() > currentDay:
        meanValues = dayFeatureDf.groupby(['User']).agg({'UniqueProcCnt': [np.nanmean], \
                                                            'UniqueCompCnt': [np.nanmean], \
                                                            'ProcCnt': [np.nanmean], \
                                                            'CompCnt': [np.nanmean]})
        meanValues = meanValues.reset_index()
        meanValues.columns = ['User', 'UniqueProcCntMean', 'UniqueCompCntMean', 'ProcCntMean', 'CompCntMean']
        meanValues = meanValues.astype({'UniqueProcCntMean': 'float', 'UniqueCompCntMean': 'float', \
                                        'ProcCntMean': 'float', 'CompCntMean': 'float'})
        
        dayFeatureDf = dayFeatureDf.merge(meanValues, how='outer', left_on=['User'], right_on=['User'])
        
        dayFeatureDf['ProcCntNorm'] = dayFeatureDf['ProcCnt'] / dayFeatureDf['ProcCntMean']
        dayFeatureDf['CompCntNorm'] = dayFeatureDf['CompCnt'] / dayFeatureDf['CompCntMean']
        dayFeatureDf['UniqueProcCntNorm'] = dayFeatureDf['UniqueProcCnt'] / dayFeatureDf['UniqueProcCntMean']
        dayFeatureDf['UniqueCompCntNorm'] = dayFeatureDf['UniqueCompCnt'] / dayFeatureDf['UniqueCompCntMean']
        
        dayFeatureDf.to_pickle(dataDirectory + 'Analysis\\Day_' + str(currentDay).zfill(2) + '_procFeat.pkl', \
                               compression='gzip')
        currentDay+=1
        dayFeatureDf = featureVector[featureVector['Day']==currentDay]
    
    # Get last incomplete hour of this chunk to append to the next chunk
    lastHourDf = chunk[chunk['Hour'] == lastHour]
    
    
print('Process Logs Complete...')

Processing Process Events Day: 1,     Hour: 0


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Processing Process Events Day: 1,     Hour: 7
Processing Process Events Day: 1,     Hour: 9
Processing Process Events Day: 1,     Hour: 11
Processing Process Events Day: 1,     Hour: 14
Processing Process Events Day: 1,     Hour: 16
Processing Process Events Day: 2,     Hour: 0
Processing Process Events Day: 2,     Hour: 1
Processing Process Events Day: 2,     Hour: 6
Processing Process Events Day: 2,     Hour: 9
Processing Process Events Day: 2,     Hour: 11
Processing Process Events Day: 2,     Hour: 14
Processing Process Events Day: 2,     Hour: 16
Processing Process Events Day: 3,     Hour: 0
Processing Process Events Day: 3,     Hour: 2
Processing Process Events Day: 3,     Hour: 6
Processing Process Events Day: 3,     Hour: 10
Processing Process Events Day: 3,     Hour: 13
Processing Process Events Day: 3,     Hour: 16
Processing Process Events Day: 4,     Hour: 0
Processing Process Events Day: 4,     Hour: 2
Processing Process Events Day: 4,     Hour: 8
Processing Process Events

Processing Process Events Day: 27,     Hour: 7
Processing Process Events Day: 27,     Hour: 9
Processing Process Events Day: 27,     Hour: 11
Processing Process Events Day: 27,     Hour: 14
Processing Process Events Day: 27,     Hour: 16
Processing Process Events Day: 28,     Hour: 0
Processing Process Events Day: 28,     Hour: 1
Processing Process Events Day: 28,     Hour: 6
Processing Process Events Day: 28,     Hour: 8
Processing Process Events Day: 28,     Hour: 11
Processing Process Events Day: 28,     Hour: 13
Processing Process Events Day: 28,     Hour: 16
Processing Process Events Day: 28,     Hour: 19
Processing Process Events Day: 29,     Hour: 0
Processing Process Events Day: 29,     Hour: 3
Processing Process Events Day: 29,     Hour: 7
Processing Process Events Day: 29,     Hour: 9
Processing Process Events Day: 29,     Hour: 11
Processing Process Events Day: 29,     Hour: 14
Processing Process Events Day: 29,     Hour: 16
Processing Process Events Day: 29,     Hour: 19
Pr

Processing Process Events Day: 49,     Hour: 14
Processing Process Events Day: 49,     Hour: 17
Processing Process Events Day: 50,     Hour: 0
Processing Process Events Day: 50,     Hour: 0
Processing Process Events Day: 50,     Hour: 4
Processing Process Events Day: 50,     Hour: 8
Processing Process Events Day: 50,     Hour: 10
Processing Process Events Day: 50,     Hour: 11
Processing Process Events Day: 50,     Hour: 13
Processing Process Events Day: 50,     Hour: 15
Processing Process Events Day: 50,     Hour: 17
Processing Process Events Day: 51,     Hour: 0
Processing Process Events Day: 51,     Hour: 0
Processing Process Events Day: 51,     Hour: 4
Processing Process Events Day: 51,     Hour: 5
Processing Process Events Day: 51,     Hour: 6
Processing Process Events Day: 51,     Hour: 7
Processing Process Events Day: 51,     Hour: 7
Processing Process Events Day: 51,     Hour: 8
Processing Process Events Day: 51,     Hour: 8
Processing Process Events Day: 51,     Hour: 9
Proces

In [6]:
# Cycle through authentication events and generate counts and feature vector data for analysis and modelling

lastHourDf = pd.DataFrame(columns = authCols)
dayFeatureDf = pd.DataFrame()
currentDay=1

for chunk in authDataChunks:
    
    # Add new Day, Hour and Timeslice columns and limit AuthType field length
    chunk['Day'] = chunk['Time'].apply(lambda x : int((x/secDay)+1))
    chunk['Hour'] = chunk['Time'].apply(lambda x : int((x/secHour)%24))
    chunk['Slice'] = chunk['Time'].apply(lambda x : int(x/timeSlice))
    chunk = chunk.astype({'Day': 'int', 'Hour': 'int', 'Slice': 'int'})
    chunk['AuthType'] = chunk['AuthType'].apply(lambda x: x[:24] if len(x) > 24 else x)
    
    print('Processing Authentication Events Day: ' + str(int(chunk['Day'].max())) + ', \
    Hour: ' + str(int(chunk['Hour'].min())))
    
    # Get incomplete final day/hour from last chunk and append all complete hours in this chunk 
    lastHour=chunk[chunk['Day'] == chunk.Day.max()].Hour.max()
    authChunk = lastHourDf.append(chunk[chunk['Hour'] != lastHour])  
    
    # Generate features for User activity by time slice
    featureVector = authChunk.groupby(['Day', 'Hour', 'Slice', 'User']).agg({ \
                                        'DestUser': [lambda x: set(x),'count'], \
                                        'DestComputer': [lambda x: set(x), 'count'], \
                                        'SourceComputer': [lambda x: set(x), 'count'],
                                        'AuthType': [lambda x: set(x)], \
                                        'LogonType': [lambda x: set(x)], \
                                        'AuthOrient': [lambda x: set(x)], \
                                        'SuccessFail': [ \
                                        lambda x: x[x.values=='Success'].count(), \
                                        lambda x: x[x.values=='Fail'].count()]})
    
    featureVector = featureVector.reset_index()
    featureVector.columns = ['Day', 'Hour', 'Slice', 'User', 'DestUsers', 'DestUserCnt', \
                   'DestComputers', 'DestCompCnt', 'SourceComputers', 'SourceCompCnt', \
                   'AuthTypes', 'LogonTypes', 'AuthOrients', 'SuccessfulLogonCnt', 'FailedLogonCnt'] 
    
    featureVector = featureVector.astype({'Day': 'int', 'Hour': 'int', 'Slice': 'int', 'DestUserCnt': 'int', \
                             'DestCompCnt': 'int', 'SourceCompCnt': 'int', 'SuccessfulLogonCnt': 'int', \
                                'FailedLogonCnt': 'int'})
    
    featureVector['UniqueDestUserCnt'] = featureVector['DestUsers'].apply(lambda x: len(x))
    featureVector['UniqueSourceCompCnt'] = featureVector['SourceComputers'].apply(lambda x: len(x))
    featureVector['UniqueDestCompCnt'] = featureVector['DestComputers'].apply(lambda x: len(x))
    featureVector['AuthKerberos'] = featureVector['AuthTypes'].apply(lambda x: 'Kerberos' in str(x))
    featureVector['AuthNTLM'] = featureVector['AuthTypes'].apply(lambda x: 'NTLM' in str(x))
    featureVector['AuthMicrosoft'] = featureVector['AuthTypes'].apply(lambda x: 'MICROSOFT' in str(x))
    featureVector['AuthNegotiate'] = featureVector['AuthTypes'].apply(lambda x: 'Negotiate' in str(x))
    featureVector['AuthUnknown'] = featureVector['AuthTypes'].apply(lambda x: '?' in str(x))
    featureVector['LogonNet'] = featureVector['LogonTypes'].apply(lambda x: 'Network' in str(x))
    featureVector['LogonService'] = featureVector['LogonTypes'].apply(lambda x: 'Service' in str(x))
    featureVector['LogonClearText'] = featureVector['LogonTypes'].apply(lambda x: 'Cleartext' in str(x))
    featureVector['LogonBatch'] = featureVector['LogonTypes'].apply(lambda x: 'Batch' in str(x))
    featureVector['LogonInteractive'] = featureVector['LogonTypes'].apply(lambda x: 'Interactive' in str(x))
    featureVector['LogonUnlock'] = featureVector['LogonTypes'].apply(lambda x: 'Unlock' in str(x))
    featureVector['LogonCreds'] = featureVector['LogonTypes'].apply(lambda x: 'Credentials' in str(x))
    featureVector['LogonUnknown'] = featureVector['LogonTypes'].apply(lambda x: '?' in str(x))
    featureVector['OrientLogon'] = featureVector['AuthOrients'].apply(lambda x: 'LogOn' in str(x))
    featureVector['OrientLogoff'] = featureVector['AuthOrients'].apply(lambda x: 'LogOff' in str(x))
    featureVector['OrientTGS'] = featureVector['AuthOrients'].apply(lambda x: 'TGS' in str(x))
    featureVector['OrientTGT'] = featureVector['AuthOrients'].apply(lambda x: 'TGT' in str(x))
    featureVector['OrientAuthMap'] = featureVector['AuthOrients'].apply(lambda x: 'AuthMap' in str(x))
    featureVector['Machine'] = featureVector['User'].str.contains('^[^U]')
     
    dayFeatureDf = dayFeatureDf.append(featureVector[featureVector['Day']==currentDay])
    if featureVector.Day.max() > currentDay:
        meanValues = dayFeatureDf.groupby(['User']).agg({'UniqueDestUserCnt': [np.nanmean], \
                                                            'UniqueSourceCompCnt': [np.nanmean], \
                                                            'UniqueDestCompCnt': [np.nanmean]})
        meanValues = meanValues.reset_index()
        meanValues.columns = ['User', 'UniqueDestUserMean', 'UniqueSourceCompMean', 'UniqueDestCompMean']
        meanValues = meanValues.astype({'UniqueDestUserMean': 'float', 'UniqueSourceCompMean': 'float', \
                                        'UniqueDestCompMean': 'float'})
        
        dayFeatureDf = dayFeatureDf.merge(meanValues, how='outer', left_on=['User'], right_on=['User'])
        
        dayFeatureDf['UniqueDestUserNorm'] = dayFeatureDf['UniqueDestUserCnt']/ \
        dayFeatureDf['UniqueDestUserMean']
        dayFeatureDf['UniqueSourceCompNorm'] = dayFeatureDf['UniqueSourceCompCnt'] / \
        dayFeatureDf['UniqueSourceCompMean']
        dayFeatureDf['UniqueDestCompNorm'] = dayFeatureDf['UniqueDestCompCnt'] / \
        dayFeatureDf['UniqueDestCompMean']
        
        
        dayFeatureDf.to_pickle(dataDirectory + 'Analysis\\Day_' + str(currentDay).zfill(2) + '_authFeat.pkl', \
                              compression='gzip')
        currentDay+=1
        dayFeatureDf = featureVector[featureVector['Day']==currentDay]
        
    # Get last incomplete hour of this chunk to append to the next chunk
    lastHourDf = chunk[chunk['Hour'] == lastHour]
    
print('Authentication Logs Complete...')

Processing Authentication Events Day: 1,     Hour: 0
Processing Authentication Events Day: 1,     Hour: 2
Processing Authentication Events Day: 1,     Hour: 5
Processing Authentication Events Day: 1,     Hour: 7
Processing Authentication Events Day: 1,     Hour: 8
Processing Authentication Events Day: 1,     Hour: 9
Processing Authentication Events Day: 1,     Hour: 10
Processing Authentication Events Day: 1,     Hour: 11
Processing Authentication Events Day: 1,     Hour: 12
Processing Authentication Events Day: 1,     Hour: 14
Processing Authentication Events Day: 1,     Hour: 15
Processing Authentication Events Day: 1,     Hour: 16
Processing Authentication Events Day: 1,     Hour: 17
Processing Authentication Events Day: 1,     Hour: 19
Processing Authentication Events Day: 1,     Hour: 20
Processing Authentication Events Day: 2,     Hour: 0
Processing Authentication Events Day: 2,     Hour: 0
Processing Authentication Events Day: 2,     Hour: 2
Processing Authentication Events Day:

Processing Authentication Events Day: 10,     Hour: 0
Processing Authentication Events Day: 10,     Hour: 2
Processing Authentication Events Day: 10,     Hour: 3
Processing Authentication Events Day: 10,     Hour: 5
Processing Authentication Events Day: 10,     Hour: 7
Processing Authentication Events Day: 10,     Hour: 8
Processing Authentication Events Day: 10,     Hour: 9
Processing Authentication Events Day: 10,     Hour: 10
Processing Authentication Events Day: 10,     Hour: 11
Processing Authentication Events Day: 10,     Hour: 11
Processing Authentication Events Day: 10,     Hour: 12
Processing Authentication Events Day: 10,     Hour: 14
Processing Authentication Events Day: 10,     Hour: 15
Processing Authentication Events Day: 10,     Hour: 16
Processing Authentication Events Day: 10,     Hour: 17
Processing Authentication Events Day: 10,     Hour: 19
Processing Authentication Events Day: 10,     Hour: 21
Processing Authentication Events Day: 11,     Hour: 0
Processing Authent

Processing Authentication Events Day: 18,     Hour: 18
Processing Authentication Events Day: 18,     Hour: 19
Processing Authentication Events Day: 18,     Hour: 21
Processing Authentication Events Day: 19,     Hour: 0
Processing Authentication Events Day: 19,     Hour: 1
Processing Authentication Events Day: 19,     Hour: 2
Processing Authentication Events Day: 19,     Hour: 4
Processing Authentication Events Day: 19,     Hour: 6
Processing Authentication Events Day: 19,     Hour: 8
Processing Authentication Events Day: 19,     Hour: 10
Processing Authentication Events Day: 19,     Hour: 12
Processing Authentication Events Day: 19,     Hour: 13
Processing Authentication Events Day: 19,     Hour: 15
Processing Authentication Events Day: 19,     Hour: 17
Processing Authentication Events Day: 19,     Hour: 19
Processing Authentication Events Day: 19,     Hour: 21
Processing Authentication Events Day: 20,     Hour: 0
Processing Authentication Events Day: 20,     Hour: 0
Processing Authent

Processing Authentication Events Day: 28,     Hour: 5
Processing Authentication Events Day: 28,     Hour: 7
Processing Authentication Events Day: 28,     Hour: 8
Processing Authentication Events Day: 28,     Hour: 9
Processing Authentication Events Day: 28,     Hour: 10
Processing Authentication Events Day: 28,     Hour: 11
Processing Authentication Events Day: 28,     Hour: 12
Processing Authentication Events Day: 28,     Hour: 13
Processing Authentication Events Day: 28,     Hour: 14
Processing Authentication Events Day: 28,     Hour: 15
Processing Authentication Events Day: 28,     Hour: 15
Processing Authentication Events Day: 28,     Hour: 17
Processing Authentication Events Day: 28,     Hour: 18
Processing Authentication Events Day: 28,     Hour: 19
Processing Authentication Events Day: 28,     Hour: 21
Processing Authentication Events Day: 29,     Hour: 0
Processing Authentication Events Day: 29,     Hour: 0
Processing Authentication Events Day: 29,     Hour: 1
Processing Authen

Processing Authentication Events Day: 36,     Hour: 13
Processing Authentication Events Day: 36,     Hour: 14
Processing Authentication Events Day: 36,     Hour: 15
Processing Authentication Events Day: 36,     Hour: 16
Processing Authentication Events Day: 36,     Hour: 18
Processing Authentication Events Day: 36,     Hour: 20
Processing Authentication Events Day: 36,     Hour: 21
Processing Authentication Events Day: 37,     Hour: 0
Processing Authentication Events Day: 37,     Hour: 0
Processing Authentication Events Day: 37,     Hour: 1
Processing Authentication Events Day: 37,     Hour: 3
Processing Authentication Events Day: 37,     Hour: 5
Processing Authentication Events Day: 37,     Hour: 6
Processing Authentication Events Day: 37,     Hour: 7
Processing Authentication Events Day: 37,     Hour: 8
Processing Authentication Events Day: 37,     Hour: 9
Processing Authentication Events Day: 37,     Hour: 10
Processing Authentication Events Day: 37,     Hour: 12
Processing Authenti

Processing Authentication Events Day: 44,     Hour: 8
Processing Authentication Events Day: 44,     Hour: 9
Processing Authentication Events Day: 44,     Hour: 10
Processing Authentication Events Day: 44,     Hour: 10
Processing Authentication Events Day: 44,     Hour: 11
Processing Authentication Events Day: 44,     Hour: 12
Processing Authentication Events Day: 44,     Hour: 13
Processing Authentication Events Day: 44,     Hour: 14
Processing Authentication Events Day: 44,     Hour: 15
Processing Authentication Events Day: 44,     Hour: 16
Processing Authentication Events Day: 44,     Hour: 17
Processing Authentication Events Day: 44,     Hour: 18
Processing Authentication Events Day: 44,     Hour: 19
Processing Authentication Events Day: 44,     Hour: 20
Processing Authentication Events Day: 44,     Hour: 22
Processing Authentication Events Day: 45,     Hour: 0
Processing Authentication Events Day: 45,     Hour: 0
Processing Authentication Events Day: 45,     Hour: 1
Processing Auth

Processing Authentication Events Day: 51,     Hour: 14
Processing Authentication Events Day: 51,     Hour: 15
Processing Authentication Events Day: 51,     Hour: 16
Processing Authentication Events Day: 51,     Hour: 17
Processing Authentication Events Day: 51,     Hour: 18
Processing Authentication Events Day: 51,     Hour: 19
Processing Authentication Events Day: 51,     Hour: 21
Processing Authentication Events Day: 52,     Hour: 0
Processing Authentication Events Day: 52,     Hour: 0
Processing Authentication Events Day: 52,     Hour: 1
Processing Authentication Events Day: 52,     Hour: 2
Processing Authentication Events Day: 52,     Hour: 4
Processing Authentication Events Day: 52,     Hour: 5
Processing Authentication Events Day: 52,     Hour: 7
Processing Authentication Events Day: 52,     Hour: 8
Processing Authentication Events Day: 52,     Hour: 9
Processing Authentication Events Day: 52,     Hour: 10
Processing Authentication Events Day: 52,     Hour: 11
Processing Authenti

In [31]:
timeOffset = datetime(2015,1,1,0,0).timestamp()
lastHourDf = pd.DataFrame(columns = ['Time', 'Human', 'User', 'Computer', 'ProcessName', 'Status'])
currentDay=1

countDf = pd.DataFrame(columns = ['Time', 'Humans', 'Users', 'Computers', 'Processes'])
countDf.to_csv(dataDirectory + 'Analysis\\procEntityCounts.csv', index=False, header=True)

for chunk in procDataChunks:
    
    # Add new Day, Hour and Timeslice columns
    chunk['Day'] = chunk['Time'].apply(lambda x : int((x/secDay)+1))
    chunk['Hour'] = chunk['Time'].apply(lambda x : int((x/secHour)%24))
    chunk['Slice'] = chunk['Time'].apply(lambda x : int(x/timeSlice))
    chunk = chunk.astype({'Day': 'int', 'Hour': 'int', 'Slice': 'int'})
    chunk['Human'] = chunk['User'].apply(returnUser)
    
    print('Processing Process Events Day: ' + str(int(chunk['Day'].max())) + ', \
    Hour: ' + str(int(chunk['Hour'].min())))
    
    # Get incomplete final day/hour from last chunk and append all complete hours in this chunk 
    lastHour=chunk[chunk['Day'] == chunk.Day.max()].Hour.max()
    procChunk = lastHourDf.append(chunk[chunk['Hour'] != lastHour]) 
    
    # Generate unique User, Computer and Process counts for each hour
    countDf = procChunk.groupby(['Day', 'Hour']).agg({
        'Time': [lambda x: str(pd.Timestamp(datetime.utcfromtimestamp(timeOffset+x.min()-(x.min()%secHour))))], \
        'Human': ['nunique'], 'User': ['nunique'], 'Computer': ['nunique'], 'ProcessName': ['nunique']})
    
    countDf.to_csv(dataDirectory + 'Analysis\\procEntityCounts.csv', mode='a', index=False, header=False)
    
    lastHourDf = chunk[chunk['Hour'] == lastHour]

Processing Process Events Day: 2,     Hour: 0
Processing Process Events Day: 4,     Hour: 0
Processing Process Events Day: 6,     Hour: 0
Processing Process Events Day: 7,     Hour: 0
Processing Process Events Day: 8,     Hour: 0
Processing Process Events Day: 10,     Hour: 0
Processing Process Events Day: 12,     Hour: 0
Processing Process Events Day: 13,     Hour: 0
Processing Process Events Day: 14,     Hour: 0
Processing Process Events Day: 15,     Hour: 0
Processing Process Events Day: 16,     Hour: 0
Processing Process Events Day: 17,     Hour: 0
Processing Process Events Day: 19,     Hour: 0
Processing Process Events Day: 21,     Hour: 0
Processing Process Events Day: 22,     Hour: 0
Processing Process Events Day: 24,     Hour: 0
Processing Process Events Day: 26,     Hour: 0
Processing Process Events Day: 27,     Hour: 0
Processing Process Events Day: 29,     Hour: 0
Processing Process Events Day: 30,     Hour: 0
Processing Process Events Day: 31,     Hour: 0
Processing Process

In [34]:
# Just the authorisation entity counts
timeOffset = datetime(2015,1,1,0,0).timestamp()
lastHourDf = pd.DataFrame(columns = ['Time', 'Human', 'User', 'DestUser', 'SourceComputer', 'DestComputer', 'AuthType', \
          'LogonType', 'AuthOrient', 'SuccessFail'])
currentDay=1

countDf = pd.DataFrame(columns = ['Time', 'Humans', 'Users', 'Dest Users', 'Source Computers', 'Dest Computers'])
countDf.to_csv(dataDirectory + 'Analysis\\authEntityCounts.csv', index=False, header=True)


for chunk in authDataChunks:
    
    # Add new Day, Hour and Timeslice columns and limit AuthType field length
    chunk['Day'] = chunk['Time'].apply(lambda x : int((x/secDay)+1))
    chunk['Hour'] = chunk['Time'].apply(lambda x : int((x/secHour)%24))
    chunk['Slice'] = chunk['Time'].apply(lambda x : int(x/timeSlice))
    chunk = chunk.astype({'Day': 'int', 'Hour': 'int', 'Slice': 'int'})
    chunk['AuthType'] = chunk['AuthType'].apply(lambda x: x[:24] if len(x) > 24 else x)
    chunk['Human'] = chunk['User'].apply(returnUser)
    
    print('Processing Authentication Events Day: ' + str(int(chunk['Day'].max())) + ', \
    Hour: ' + str(int(chunk['Hour'].min())))
    
    # Get incomplete final day/hour from last chunk and append all complete hours in this chunk 
    lastHour=chunk[chunk['Day'] == chunk.Day.max()].Hour.max()
    authChunk = lastHourDf.append(chunk[chunk['Hour'] != lastHour])  
    
     # Generate unique User, Computer and Process counts for each hour
    countDf = authChunk.groupby(['Day', 'Hour']).agg({ \
            'Time': [lambda x: str(pd.Timestamp(datetime.utcfromtimestamp(timeOffset+x.min()-(x.min()%secHour))))], \
            'Human': ['nunique'], 'User': ['nunique'], 'DestUser': ['nunique'], 'SourceComputer': ['nunique'], \
            'DestComputer': ['nunique']})
    
    countDf.to_csv(dataDirectory + 'Analysis\\authEntityCounts.csv', mode='a', index=False, header=False)
    
    lastHourDf = chunk[chunk['Hour'] == lastHour]

Processing Authentication Events Day: 4,     Hour: 0
Processing Authentication Events Day: 7,     Hour: 0
Processing Authentication Events Day: 9,     Hour: 0
Processing Authentication Events Day: 13,     Hour: 0
Processing Authentication Events Day: 15,     Hour: 0
Processing Authentication Events Day: 18,     Hour: 0
Processing Authentication Events Day: 21,     Hour: 0
Processing Authentication Events Day: 24,     Hour: 0
Processing Authentication Events Day: 27,     Hour: 0
Processing Authentication Events Day: 30,     Hour: 0
Processing Authentication Events Day: 33,     Hour: 0
Processing Authentication Events Day: 36,     Hour: 0
Processing Authentication Events Day: 39,     Hour: 0
Processing Authentication Events Day: 41,     Hour: 0
Processing Authentication Events Day: 44,     Hour: 0
Processing Authentication Events Day: 46,     Hour: 0
Processing Authentication Events Day: 49,     Hour: 0
Processing Authentication Events Day: 51,     Hour: 0
Processing Authentication Event

In [22]:
# Just the authorisation counts
timeOffset = datetime(2015,1,1,0,0).timestamp()
lastHourDf = pd.DataFrame(columns = authCols)
currentDay=1

countDf = pd.DataFrame(columns = ['Time', 'Users', 'Dest Users', 'Source Computers', 'Dest Computers'])
countDf.to_csv(dataDirectory + 'Analysis\\authEntityCounts.csv', index=False, header=True)

countDf = pd.DataFrame(columns = ['Time', 'Logon Type', 'Count'])
countDf.to_csv(dataDirectory + 'Analysis\\logonDensityCounts.csv', index=False, header=True)

countDf = pd.DataFrame(columns = ['Time', 'Authorisation Type', 'Count'])
countDf.to_csv(dataDirectory + 'Analysis\\authDensityCounts.csv', index=False, header=True)

countDf = pd.DataFrame(columns = ['Time', 'Auth Orient Type', 'Count'])
countDf.to_csv(dataDirectory + 'Analysis\\orientDensityCounts.csv', index=False, header=True)

for chunk in authDataChunks:
    
    # Add new Day, Hour and Timeslice columns and limit AuthType field length
    chunk['Day'] = chunk['Time'].apply(lambda x : int((x/secDay)+1))
    chunk['Hour'] = chunk['Time'].apply(lambda x : int((x/secHour)%24))
    chunk['Slice'] = chunk['Time'].apply(lambda x : int(x/timeSlice))
    chunk = chunk.astype({'Day': 'int', 'Hour': 'int', 'Slice': 'int'})
    chunk['AuthType'] = chunk['AuthType'].apply(lambda x: x[:24] if len(x) > 24 else x)
    
    print('Processing Authentication Events Day: ' + str(int(chunk['Day'].max())) + ', \
    Hour: ' + str(int(chunk['Hour'].min())))
    
    # Get incomplete final day/hour from last chunk and append all complete hours in this chunk 
    lastHour=chunk[chunk['Day'] == chunk.Day.max()].Hour.max()
    authChunk = lastHourDf.append(chunk[chunk['Hour'] != lastHour])  
    
     # Generate unique User, Computer and Process counts for each hour
    countDf = authChunk.groupby(['Day', 'Hour']).agg({ \
            'Time': [lambda x: str(pd.Timestamp(datetime.utcfromtimestamp(timeOffset+x.min()-(x.min()%secHour))))], \
            'User': ['nunique'], 'DestUser': ['nunique'], 'SourceComputer': ['nunique'], \
            'DestComputer': ['nunique']})
    
    countDf.to_csv(dataDirectory + 'Analysis\\authEntityCounts.csv', mode='a', index=False, header=False)
    
    countDf = authChunk.groupby(['Day', 'Hour', 'LogonType']).agg({
        'Time': [lambda x: str(pd.Timestamp(datetime.utcfromtimestamp(timeOffset+x.min()-(x.min()%secHour))))], \
        'LogonType': [lambda x: x.unique(), 'count']})
    countDf.to_csv(dataDirectory + 'Analysis\\logonDensityCounts.csv', mode='a', index=False, header=False)
    
    countDf = authChunk.groupby(['Day', 'Hour', 'AuthType']).agg({
        'Time': [lambda x: str(pd.Timestamp(datetime.utcfromtimestamp(timeOffset+x.min()-(x.min()%secHour))))], \
        'AuthType': [lambda x: x.unique(), 'count']})
    countDf.to_csv(dataDirectory + 'Analysis\\authDensityCounts.csv', mode='a', index=False, header=False)
    
    countDf = authChunk.groupby(['Day', 'Hour', 'AuthOrient']).agg({
        'Time': [lambda x: str(pd.Timestamp(datetime.utcfromtimestamp(timeOffset+x.min()-(x.min()%secHour))))], \
        'AuthOrient': [lambda x: x.unique(), 'count']})
    countDf.to_csv(dataDirectory + 'Analysis\\orientDensityCounts.csv', mode='a', index=False, header=False)
    
    lastHourDf = chunk[chunk['Hour'] == lastHour]

Processing Authentication Events Day: 1,     Hour: 0
Processing Authentication Events Day: 1,     Hour: 2
Processing Authentication Events Day: 1,     Hour: 5
Processing Authentication Events Day: 1,     Hour: 7
Processing Authentication Events Day: 1,     Hour: 8
Processing Authentication Events Day: 1,     Hour: 9
Processing Authentication Events Day: 1,     Hour: 10
Processing Authentication Events Day: 1,     Hour: 11
Processing Authentication Events Day: 1,     Hour: 12
Processing Authentication Events Day: 1,     Hour: 14
Processing Authentication Events Day: 1,     Hour: 15
Processing Authentication Events Day: 1,     Hour: 16
Processing Authentication Events Day: 1,     Hour: 17
Processing Authentication Events Day: 1,     Hour: 19
Processing Authentication Events Day: 1,     Hour: 20
Processing Authentication Events Day: 2,     Hour: 0
Processing Authentication Events Day: 2,     Hour: 0
Processing Authentication Events Day: 2,     Hour: 2
Processing Authentication Events Day:

Processing Authentication Events Day: 10,     Hour: 0
Processing Authentication Events Day: 10,     Hour: 2
Processing Authentication Events Day: 10,     Hour: 3
Processing Authentication Events Day: 10,     Hour: 5
Processing Authentication Events Day: 10,     Hour: 7
Processing Authentication Events Day: 10,     Hour: 8
Processing Authentication Events Day: 10,     Hour: 9
Processing Authentication Events Day: 10,     Hour: 10
Processing Authentication Events Day: 10,     Hour: 11
Processing Authentication Events Day: 10,     Hour: 11
Processing Authentication Events Day: 10,     Hour: 12
Processing Authentication Events Day: 10,     Hour: 14
Processing Authentication Events Day: 10,     Hour: 15
Processing Authentication Events Day: 10,     Hour: 16
Processing Authentication Events Day: 10,     Hour: 17
Processing Authentication Events Day: 10,     Hour: 19
Processing Authentication Events Day: 10,     Hour: 21
Processing Authentication Events Day: 11,     Hour: 0
Processing Authent

Processing Authentication Events Day: 18,     Hour: 18
Processing Authentication Events Day: 18,     Hour: 19
Processing Authentication Events Day: 18,     Hour: 21
Processing Authentication Events Day: 19,     Hour: 0
Processing Authentication Events Day: 19,     Hour: 1
Processing Authentication Events Day: 19,     Hour: 2
Processing Authentication Events Day: 19,     Hour: 4
Processing Authentication Events Day: 19,     Hour: 6
Processing Authentication Events Day: 19,     Hour: 8
Processing Authentication Events Day: 19,     Hour: 10
Processing Authentication Events Day: 19,     Hour: 12
Processing Authentication Events Day: 19,     Hour: 13
Processing Authentication Events Day: 19,     Hour: 15
Processing Authentication Events Day: 19,     Hour: 17
Processing Authentication Events Day: 19,     Hour: 19
Processing Authentication Events Day: 19,     Hour: 21
Processing Authentication Events Day: 20,     Hour: 0
Processing Authentication Events Day: 20,     Hour: 0
Processing Authent

Processing Authentication Events Day: 28,     Hour: 5
Processing Authentication Events Day: 28,     Hour: 7
Processing Authentication Events Day: 28,     Hour: 8
Processing Authentication Events Day: 28,     Hour: 9
Processing Authentication Events Day: 28,     Hour: 10
Processing Authentication Events Day: 28,     Hour: 11
Processing Authentication Events Day: 28,     Hour: 12
Processing Authentication Events Day: 28,     Hour: 13
Processing Authentication Events Day: 28,     Hour: 14
Processing Authentication Events Day: 28,     Hour: 15
Processing Authentication Events Day: 28,     Hour: 15
Processing Authentication Events Day: 28,     Hour: 17
Processing Authentication Events Day: 28,     Hour: 18
Processing Authentication Events Day: 28,     Hour: 19
Processing Authentication Events Day: 28,     Hour: 21
Processing Authentication Events Day: 29,     Hour: 0
Processing Authentication Events Day: 29,     Hour: 0
Processing Authentication Events Day: 29,     Hour: 1
Processing Authen

Processing Authentication Events Day: 36,     Hour: 13
Processing Authentication Events Day: 36,     Hour: 14
Processing Authentication Events Day: 36,     Hour: 15
Processing Authentication Events Day: 36,     Hour: 16
Processing Authentication Events Day: 36,     Hour: 18
Processing Authentication Events Day: 36,     Hour: 20
Processing Authentication Events Day: 36,     Hour: 21
Processing Authentication Events Day: 37,     Hour: 0
Processing Authentication Events Day: 37,     Hour: 0
Processing Authentication Events Day: 37,     Hour: 1
Processing Authentication Events Day: 37,     Hour: 3
Processing Authentication Events Day: 37,     Hour: 5
Processing Authentication Events Day: 37,     Hour: 6
Processing Authentication Events Day: 37,     Hour: 7
Processing Authentication Events Day: 37,     Hour: 8
Processing Authentication Events Day: 37,     Hour: 9
Processing Authentication Events Day: 37,     Hour: 10
Processing Authentication Events Day: 37,     Hour: 12
Processing Authenti

Processing Authentication Events Day: 44,     Hour: 8
Processing Authentication Events Day: 44,     Hour: 9
Processing Authentication Events Day: 44,     Hour: 10
Processing Authentication Events Day: 44,     Hour: 10
Processing Authentication Events Day: 44,     Hour: 11
Processing Authentication Events Day: 44,     Hour: 12
Processing Authentication Events Day: 44,     Hour: 13
Processing Authentication Events Day: 44,     Hour: 14
Processing Authentication Events Day: 44,     Hour: 15
Processing Authentication Events Day: 44,     Hour: 16
Processing Authentication Events Day: 44,     Hour: 17
Processing Authentication Events Day: 44,     Hour: 18
Processing Authentication Events Day: 44,     Hour: 19
Processing Authentication Events Day: 44,     Hour: 20
Processing Authentication Events Day: 44,     Hour: 22
Processing Authentication Events Day: 45,     Hour: 0
Processing Authentication Events Day: 45,     Hour: 0
Processing Authentication Events Day: 45,     Hour: 1
Processing Auth

Processing Authentication Events Day: 51,     Hour: 14
Processing Authentication Events Day: 51,     Hour: 15
Processing Authentication Events Day: 51,     Hour: 16
Processing Authentication Events Day: 51,     Hour: 17
Processing Authentication Events Day: 51,     Hour: 18
Processing Authentication Events Day: 51,     Hour: 19
Processing Authentication Events Day: 51,     Hour: 21
Processing Authentication Events Day: 52,     Hour: 0
Processing Authentication Events Day: 52,     Hour: 0
Processing Authentication Events Day: 52,     Hour: 1
Processing Authentication Events Day: 52,     Hour: 2
Processing Authentication Events Day: 52,     Hour: 4
Processing Authentication Events Day: 52,     Hour: 5
Processing Authentication Events Day: 52,     Hour: 7
Processing Authentication Events Day: 52,     Hour: 8
Processing Authentication Events Day: 52,     Hour: 9
Processing Authentication Events Day: 52,     Hour: 10
Processing Authentication Events Day: 52,     Hour: 11
Processing Authenti

In [11]:
procRows=0
authRows=0
for chunk in procDataChunks:
    procRows+=len(chunk)
    print('.')
for chunk in authDataChunks:
    authRows+=len(chunk)
print('Process Log Rows: ' + str(procRows))
print('Authorisation Log Rows: ' + str(authRows))

.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.
.


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [12]:
procRows

426045096

In [13]:
authRows

279000000

In [15]:
for chunk in authDataChunks:
    authRows+=len(chunk)
print('Process Log Rows: ' + str(procRows))
print('Authorisation Log Rows: ' + str(authRows))

Process Log Rows: 426045096
Authorisation Log Rows: 1330430459
