In [None]:
# Import all required libraries

import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

# Root folder that holds every input and output file
dataDirectory = 'D:\\LANL\\2015\\'

# Timestamp taken at start-up to enable runtime measurement
start = datetime.now()

# Number of seconds in an hour and in a day
secHour = 60 * 60
secDay = 24 * secHour

# Features are generated per 10-minute time slice (value is in seconds)
timeSlice = 10 * 60

In [None]:
# Define DataFrame columns (as per descriptions in Cell 1)

procCols = ['Time', 'User', 'Computer', 'ProcessName', 'Status']

authCols = ['Time', 'User', 'DestUser', 'SourceComputer', 'DestComputer', 'AuthType',
            'LogonType', 'AuthOrient', 'SuccessFail']

redCols = ['Time', 'User', 'SourceComputer', 'DestComputer']

# Open raw files as pandas DataFrames (opening proc and auth data in chunks due to their size).
# With chunksize set, read_csv returns a reader that yields DataFrames and can only be
# iterated once; a cell that needs the data again has to open a fresh reader.

# Rows per chunk. Kept as an int because read_csv documents chunksize as an integer.
chunkSize = int(1e6)

procDataChunks = pd.read_csv(dataDirectory + 'Raw\\proc.txt.gz', compression='gzip',
                             chunksize=chunkSize, names=procCols)

authDataChunks = pd.read_csv(dataDirectory + 'Raw\\auth.txt.gz', compression='gzip',
                             chunksize=chunkSize, names=authCols)

# The red team file is read in one go (no chunksize)
redDf = pd.read_csv(dataDirectory + 'Raw\\redteam.txt.gz', compression='gzip', names=redCols)

In [None]:
# Cycle through process events and generate counts and feature vectors for analysis and modelling.
#
# Events are read chunk by chunk. The last (Day, Hour) present in a chunk may continue in the
# next chunk, so its rows are held back in lastHourDf until a later hour shows up. Features are
# built only from complete hours, collected per day, and each finished day is normalised by the
# per-user daily mean and written to Analysis\Day_NN_procFeat.pkl. The hour and day still pending
# when the input ends are flushed after the loop.
# Assumes events arrive in non-decreasing Time order - TODO confirm against the raw file.


def buildProcFeatures(events):
    """Return one feature row per (Day, Hour, Slice, User) for the given process events.

    events must already carry the Day, Hour and Slice columns added in the chunk loop.
    The result holds the set and the row count of process names and computers, plus the
    size of each set (UniqueProcCnt, UniqueCompCnt).
    """
    features = events.groupby(['Day', 'Hour', 'Slice', 'User']).agg({
        'ProcessName': [lambda x: set(x), 'count'],
        'Computer': [lambda x: set(x), 'count']})

    features = features.reset_index()
    features.columns = ['Day', 'Hour', 'Slice', 'User', 'Processes', 'ProcCnt',
                        'Computers', 'CompCnt']
    features = features.astype({'Day': 'int', 'Hour': 'int', 'Slice': 'int',
                                'ProcCnt': 'int', 'CompCnt': 'int'})
    features['UniqueProcCnt'] = features['Processes'].apply(len)
    features['UniqueCompCnt'] = features['Computers'].apply(len)
    return features


def saveProcDay(dayParts, day):
    """Normalise one day's feature rows by each user's daily mean and pickle the result.

    dayParts is a list of feature DataFrames belonging to the same day.
    Returns the saved DataFrame, or None (nothing is written) when the day has no rows.
    """
    dayParts = [part for part in dayParts if not part.empty]
    if not dayParts:
        return None
    dayDf = pd.concat(dayParts, ignore_index=True)

    countCols = ['UniqueProcCnt', 'UniqueCompCnt', 'ProcCnt', 'CompCnt']
    meanValues = dayDf.groupby('User')[countCols].mean()
    meanValues.columns = [col + 'Mean' for col in countCols]
    meanValues = meanValues.reset_index()

    dayDf = dayDf.merge(meanValues, how='outer', on='User')
    for col in ['ProcCnt', 'CompCnt', 'UniqueProcCnt', 'UniqueCompCnt']:
        dayDf[col + 'Norm'] = dayDf[col] / dayDf[col + 'Mean']

    dayDf.to_pickle(dataDirectory + 'Analysis\\Day_' + str(day).zfill(2) + '_procFeat.pkl',
                    compression='gzip')
    return dayDf


def fileProcFeatures(featureVector, dayParts, currentDay):
    """Add featureVector rows to the day being collected, saving every day that has ended.

    A day counts as ended once featureVector contains rows of a later day.
    Returns the updated (dayParts, currentDay) pair.
    """
    while featureVector['Day'].max() > currentDay:
        dayParts.append(featureVector[featureVector['Day'] == currentDay])
        saveProcDay(dayParts, currentDay)
        dayParts = []
        currentDay += 1
    dayParts.append(featureVector[featureVector['Day'] == currentDay])
    return dayParts, currentDay


lastHourDf = pd.DataFrame(columns=procCols)
dayFeatureDf = pd.DataFrame()
dayParts = []
currentDay = 1

for chunk in procDataChunks:

    # Add new Day (1-based), Hour (0-23) and Timeslice columns
    chunk['Day'] = chunk['Time'] // secDay + 1
    chunk['Hour'] = (chunk['Time'] // secHour) % 24
    chunk['Slice'] = chunk['Time'] // timeSlice
    chunk = chunk.astype({'Day': 'int', 'Hour': 'int', 'Slice': 'int'})

    print('Processing Process Events Day: ' + str(int(chunk['Day'].max())) +
          ', Hour: ' + str(int(chunk['Hour'].min())))

    # Put the rows held back from earlier chunks in front of this chunk. The empty starting
    # frame is skipped so that it cannot change the column dtypes.
    if lastHourDf.empty:
        pending = chunk
    else:
        pending = pd.concat([lastHourDf, chunk], ignore_index=True)

    # Hold back the last (Day, Hour): it may continue in the next chunk. Matching on Day as
    # well as Hour keeps the same hour of an earlier day from being held back too.
    lastDay = pending['Day'].max()
    lastHour = pending.loc[pending['Day'] == lastDay, 'Hour'].max()
    inLastHour = (pending['Day'] == lastDay) & (pending['Hour'] == lastHour)
    procChunk = pending[~inLastHour]
    lastHourDf = pending[inLastHour]

    # Everything seen so far belongs to one unfinished hour - keep collecting
    if procChunk.empty:
        continue

    # Generate features for User activity by time slice (computers used and processes launched)
    featureVector = buildProcFeatures(procChunk)
    dayParts, currentDay = fileProcFeatures(featureVector, dayParts, currentDay)

# The input has ended, so the hour that was held back is complete as well
if not lastHourDf.empty:
    featureVector = buildProcFeatures(lastHourDf)
    dayParts, currentDay = fileProcFeatures(featureVector, dayParts, currentDay)

# Save the day that was still being collected when the input ended
finalDayDf = saveProcDay(dayParts, currentDay)
if finalDayDf is not None:
    dayFeatureDf = finalDayDf

print('Process Logs Complete...')

In [None]:
# Cycle through authentication events and generate counts and feature vector data for analysis
# and modelling.
#
# Events are read chunk by chunk. The last (Day, Hour) present in a chunk may continue in the
# next chunk, so its rows are held back in lastHourDf until a later hour shows up. Features are
# built only from complete hours, collected per day, and each finished day is normalised by the
# per-user daily mean and written to Analysis\Day_NN_authFeat.pkl. The hour and day still pending
# when the input ends are flushed after the loop.
# Assumes events arrive in non-decreasing Time order - TODO confirm against the raw file.


def buildAuthFeatures(events):
    """Return one feature row per (Day, Hour, Slice, User) for the given authentication events.

    events must already carry the Day, Hour and Slice columns added in the chunk loop.
    """
    features = events.groupby(['Day', 'Hour', 'Slice', 'User']).agg({
        'DestUser': [lambda x: set(x), 'count'],
        'DestComputer': [lambda x: set(x), 'count'],
        'SourceComputer': [lambda x: set(x), 'count'],
        'AuthType': [lambda x: set(x)],
        'LogonType': [lambda x: set(x)],
        'AuthOrient': [lambda x: set(x)],
        'SuccessFail': [lambda x: x[x.values == 'Success'].count(),
                        lambda x: x[x.values == 'Fail'].count()]})

    features = features.reset_index()
    features.columns = ['Day', 'Hour', 'Slice', 'User', 'DestUsers', 'DestUserCnt',
                        'DestComputers', 'DestCompCnt', 'SourceComputers', 'SourceCompCnt',
                        'AuthTypes', 'LogonTypes', 'AuthOrients', 'SuccessfulLogonCnt',
                        'FailedLogonCnt']

    features = features.astype({'Day': 'int', 'Hour': 'int', 'Slice': 'int',
                                'DestUserCnt': 'int', 'DestCompCnt': 'int',
                                'SourceCompCnt': 'int', 'SuccessfulLogonCnt': 'int',
                                'FailedLogonCnt': 'int'})

    features['UniqueDestUserCnt'] = features['DestUsers'].apply(len)
    features['UniqueSourceCompCnt'] = features['SourceComputers'].apply(len)
    features['UniqueDestCompCnt'] = features['DestComputers'].apply(len)

    # Boolean flags: True when the token occurs anywhere in the text form of the value set.
    # This is a plain substring test, so e.g. 'Network' also matches a longer value that
    # contains it. Listed in the order the columns are created.
    flagSpecs = [
        ('AuthTypes', [('AuthKerberos', 'Kerberos'), ('AuthNTLM', 'NTLM'),
                       ('AuthMicrosoft', 'MICROSOFT'), ('AuthNegotiate', 'Negotiate'),
                       ('AuthUnknown', '?')]),
        ('LogonTypes', [('LogonNet', 'Network'), ('LogonService', 'Service'),
                        ('LogonClearText', 'Cleartext'), ('LogonBatch', 'Batch'),
                        ('LogonInteractive', 'Interactive'), ('LogonUnlock', 'Unlock'),
                        ('LogonCreds', 'Credentials'), ('LogonUnknown', '?')]),
        ('AuthOrients', [('OrientLogon', 'LogOn'), ('OrientLogoff', 'LogOff'),
                         ('OrientTGS', 'TGS'), ('OrientTGT', 'TGT'),
                         ('OrientAuthMap', 'AuthMap')]),
    ]
    for sourceCol, flags in flagSpecs:
        asText = features[sourceCol].map(str)
        for flagCol, token in flags:
            # regex=False: '?' must be matched literally
            features[flagCol] = asText.str.contains(token, regex=False)

    # True for every non-empty User value that does not start with 'U'
    features['Machine'] = features['User'].str.contains('^[^U]')
    return features


def saveAuthDay(dayParts, day):
    """Normalise one day's feature rows by each user's daily mean and pickle the result.

    dayParts is a list of feature DataFrames belonging to the same day.
    Returns the saved DataFrame, or None (nothing is written) when the day has no rows.
    """
    dayParts = [part for part in dayParts if not part.empty]
    if not dayParts:
        return None
    dayDf = pd.concat(dayParts, ignore_index=True)

    stems = ['UniqueDestUser', 'UniqueSourceComp', 'UniqueDestComp']
    meanValues = dayDf.groupby('User')[[stem + 'Cnt' for stem in stems]].mean()
    meanValues.columns = [stem + 'Mean' for stem in stems]
    meanValues = meanValues.reset_index()

    dayDf = dayDf.merge(meanValues, how='outer', on='User')
    for stem in stems:
        dayDf[stem + 'Norm'] = dayDf[stem + 'Cnt'] / dayDf[stem + 'Mean']

    dayDf.to_pickle(dataDirectory + 'Analysis\\Day_' + str(day).zfill(2) + '_authFeat.pkl',
                    compression='gzip')
    return dayDf


def fileAuthFeatures(featureVector, dayParts, currentDay):
    """Add featureVector rows to the day being collected, saving every day that has ended.

    A day counts as ended once featureVector contains rows of a later day.
    Returns the updated (dayParts, currentDay) pair.
    """
    while featureVector['Day'].max() > currentDay:
        dayParts.append(featureVector[featureVector['Day'] == currentDay])
        saveAuthDay(dayParts, currentDay)
        dayParts = []
        currentDay += 1
    dayParts.append(featureVector[featureVector['Day'] == currentDay])
    return dayParts, currentDay


lastHourDf = pd.DataFrame(columns=authCols)
dayFeatureDf = pd.DataFrame()
dayParts = []
currentDay = 1

for chunk in authDataChunks:

    # Add new Day (1-based), Hour (0-23) and Timeslice columns and limit AuthType field length
    chunk['Day'] = chunk['Time'] // secDay + 1
    chunk['Hour'] = (chunk['Time'] // secHour) % 24
    chunk['Slice'] = chunk['Time'] // timeSlice
    chunk = chunk.astype({'Day': 'int', 'Hour': 'int', 'Slice': 'int'})
    # .str slicing leaves missing values untouched instead of failing on len()
    chunk['AuthType'] = chunk['AuthType'].str[:24]

    print('Processing Authentication Events Day: ' + str(int(chunk['Day'].max())) +
          ', Hour: ' + str(int(chunk['Hour'].min())))

    # Put the rows held back from earlier chunks in front of this chunk. The empty starting
    # frame is skipped so that it cannot change the column dtypes.
    if lastHourDf.empty:
        pending = chunk
    else:
        pending = pd.concat([lastHourDf, chunk], ignore_index=True)

    # Hold back the last (Day, Hour): it may continue in the next chunk. Matching on Day as
    # well as Hour keeps the same hour of an earlier day from being held back too.
    lastDay = pending['Day'].max()
    lastHour = pending.loc[pending['Day'] == lastDay, 'Hour'].max()
    inLastHour = (pending['Day'] == lastDay) & (pending['Hour'] == lastHour)
    authChunk = pending[~inLastHour]
    lastHourDf = pending[inLastHour]

    # Everything seen so far belongs to one unfinished hour - keep collecting
    if authChunk.empty:
        continue

    # Generate features for User activity by time slice
    featureVector = buildAuthFeatures(authChunk)
    dayParts, currentDay = fileAuthFeatures(featureVector, dayParts, currentDay)

# The input has ended, so the hour that was held back is complete as well
if not lastHourDf.empty:
    featureVector = buildAuthFeatures(lastHourDf)
    dayParts, currentDay = fileAuthFeatures(featureVector, dayParts, currentDay)

# Save the day that was still being collected when the input ended
finalDayDf = saveAuthDay(dayParts, currentDay)
if finalDayDf is not None:
    dayFeatureDf = finalDayDf

print('Authentication Logs Complete...')

In [None]:
# Generate the process entity counts: for every hour, the number of distinct humans, users,
# computers and process names, appended to Analysis\procEntityCounts.csv.
# Assumes events arrive in non-decreasing Time order - TODO confirm against the raw file.

# Label given to Time == 0. Hour labels are built by adding seconds to this timestamp, which
# keeps them independent of the time zone of the machine running the notebook.
timeOrigin = pd.Timestamp(2015, 1, 1)
lastHourDf = pd.DataFrame(columns=['Time', 'Human', 'User', 'Computer', 'ProcessName', 'Status'])
currentDay = 1

countsPath = dataDirectory + 'Analysis\\procEntityCounts.csv'

# Start the output file with just the header row; count rows are appended below
countDf = pd.DataFrame(columns=['Time', 'Humans', 'Users', 'Computers', 'Processes'])
countDf.to_csv(countsPath, index=False, header=True)


def countProcEntities(events):
    """Return one row per (Day, Hour) with the hour label and distinct entity counts.

    Columns, in order: Time (label of the start of the hour), Human, User, Computer,
    ProcessName - matching the header written to the CSV file.
    """
    counts = events.groupby(['Day', 'Hour']).agg({
        'Human': 'nunique', 'User': 'nunique', 'Computer': 'nunique',
        'ProcessName': 'nunique'}).reset_index()

    # Seconds from Time == 0 to the start of each hour
    hourStart = (counts['Day'] - 1) * secDay + counts['Hour'] * secHour
    labels = (timeOrigin + pd.to_timedelta(hourStart, unit='s')).dt.strftime('%Y-%m-%d %H:%M:%S')
    counts.insert(0, 'Time', labels)
    return counts.drop(columns=['Day', 'Hour'])


# The reader opened earlier has already been iterated to the end by the feature generation
# cell and cannot be rewound, so open a fresh one for this pass.
procDataChunks = pd.read_csv(dataDirectory + 'Raw\\proc.txt.gz', compression='gzip',
                             chunksize=int(chunkSize), names=procCols)

for chunk in procDataChunks:

    # Add new Day (1-based), Hour (0-23) and Timeslice columns
    chunk['Day'] = chunk['Time'] // secDay + 1
    chunk['Hour'] = (chunk['Time'] // secHour) % 24
    chunk['Slice'] = chunk['Time'] // timeSlice
    chunk = chunk.astype({'Day': 'int', 'Hour': 'int', 'Slice': 'int'})
    # NOTE(review): returnUser is not defined in the cells shown here; it has to be defined
    # before this cell runs.
    chunk['Human'] = chunk['User'].apply(returnUser)

    print('Processing Process Events Day: ' + str(int(chunk['Day'].max())) +
          ', Hour: ' + str(int(chunk['Hour'].min())))

    # Put the rows held back from earlier chunks in front of this chunk. The empty starting
    # frame is skipped so that it cannot change the column dtypes.
    if lastHourDf.empty:
        pending = chunk
    else:
        pending = pd.concat([lastHourDf, chunk], ignore_index=True)

    # Hold back the last (Day, Hour): it may continue in the next chunk
    lastDay = pending['Day'].max()
    lastHour = pending.loc[pending['Day'] == lastDay, 'Hour'].max()
    inLastHour = (pending['Day'] == lastDay) & (pending['Hour'] == lastHour)
    procChunk = pending[~inLastHour]
    lastHourDf = pending[inLastHour]

    # Everything seen so far belongs to one unfinished hour - keep collecting
    if procChunk.empty:
        continue

    # Generate unique User, Computer and Process counts for each complete hour
    countDf = countProcEntities(procChunk)
    countDf.to_csv(countsPath, mode='a', index=False, header=False)

# The input has ended, so the hour that was held back is complete as well
if not lastHourDf.empty:
    countDf = countProcEntities(lastHourDf)
    countDf.to_csv(countsPath, mode='a', index=False, header=False)

In [None]:
# Generate the authorisation entity counts: for every hour, the number of distinct humans,
# users, destination users, source computers and destination computers, appended to
# Analysis\authEntityCounts.csv.
# Assumes events arrive in non-decreasing Time order - TODO confirm against the raw file.

# Label given to Time == 0. Hour labels are built by adding seconds to this timestamp, which
# keeps them independent of the time zone of the machine running the notebook.
timeOrigin = pd.Timestamp(2015, 1, 1)
lastHourDf = pd.DataFrame(columns=['Time', 'Human', 'User', 'DestUser', 'SourceComputer',
                                   'DestComputer', 'AuthType', 'LogonType', 'AuthOrient',
                                   'SuccessFail'])
currentDay = 1

countsPath = dataDirectory + 'Analysis\\authEntityCounts.csv'

# Start the output file with just the header row; count rows are appended below
countDf = pd.DataFrame(columns=['Time', 'Humans', 'Users', 'Dest Users', 'Source Computers',
                                'Dest Computers'])
countDf.to_csv(countsPath, index=False, header=True)


def countAuthEntities(events):
    """Return one row per (Day, Hour) with the hour label and distinct entity counts.

    Columns, in order: Time (label of the start of the hour), Human, User, DestUser,
    SourceComputer, DestComputer - matching the header written to the CSV file.
    """
    counts = events.groupby(['Day', 'Hour']).agg({
        'Human': 'nunique', 'User': 'nunique', 'DestUser': 'nunique',
        'SourceComputer': 'nunique', 'DestComputer': 'nunique'}).reset_index()

    # Seconds from Time == 0 to the start of each hour
    hourStart = (counts['Day'] - 1) * secDay + counts['Hour'] * secHour
    labels = (timeOrigin + pd.to_timedelta(hourStart, unit='s')).dt.strftime('%Y-%m-%d %H:%M:%S')
    counts.insert(0, 'Time', labels)
    return counts.drop(columns=['Day', 'Hour'])


# The reader opened earlier has already been iterated to the end by the feature generation
# cell and cannot be rewound, so open a fresh one for this pass.
authDataChunks = pd.read_csv(dataDirectory + 'Raw\\auth.txt.gz', compression='gzip',
                             chunksize=int(chunkSize), names=authCols)

for chunk in authDataChunks:

    # Add new Day (1-based), Hour (0-23) and Timeslice columns and limit AuthType field length
    chunk['Day'] = chunk['Time'] // secDay + 1
    chunk['Hour'] = (chunk['Time'] // secHour) % 24
    chunk['Slice'] = chunk['Time'] // timeSlice
    chunk = chunk.astype({'Day': 'int', 'Hour': 'int', 'Slice': 'int'})
    # .str slicing leaves missing values untouched instead of failing on len()
    chunk['AuthType'] = chunk['AuthType'].str[:24]
    # NOTE(review): returnUser is not defined in the cells shown here; it has to be defined
    # before this cell runs.
    chunk['Human'] = chunk['User'].apply(returnUser)

    print('Processing Authentication Events Day: ' + str(int(chunk['Day'].max())) +
          ', Hour: ' + str(int(chunk['Hour'].min())))

    # Put the rows held back from earlier chunks in front of this chunk. The empty starting
    # frame is skipped so that it cannot change the column dtypes.
    if lastHourDf.empty:
        pending = chunk
    else:
        pending = pd.concat([lastHourDf, chunk], ignore_index=True)

    # Hold back the last (Day, Hour): it may continue in the next chunk
    lastDay = pending['Day'].max()
    lastHour = pending.loc[pending['Day'] == lastDay, 'Hour'].max()
    inLastHour = (pending['Day'] == lastDay) & (pending['Hour'] == lastHour)
    authChunk = pending[~inLastHour]
    lastHourDf = pending[inLastHour]

    # Everything seen so far belongs to one unfinished hour - keep collecting
    if authChunk.empty:
        continue

    # Generate unique User, DestUser and Computer counts for each complete hour
    countDf = countAuthEntities(authChunk)
    countDf.to_csv(countsPath, mode='a', index=False, header=False)

# The input has ended, so the hour that was held back is complete as well
if not lastHourDf.empty:
    countDf = countAuthEntities(lastHourDf)
    countDf.to_csv(countsPath, mode='a', index=False, header=False)