<table style="border:1px solid black;border-collapse:collapse;" align="left">
  <td style="border:1px solid red;">
    <a target="_blank" href="https://colab.research.google.com/github/Grrtzm/word2vec/blob/main/tensorflow_word2vec_windows_eventlog_anomaly_detection.ipynb">Run this notebook in Google Colab</a>
  </td>
  <td style="border:1px solid red;">
    <a target="_blank" href="https://github.com/Grrtzm/word2vec/blob/main/tensorflow_word2vec_windows_eventlog_anomaly_detection.ipynb">View source on GitHub</a>
  </td>
  <td style="border:1px solid red;">
    <a target="_blank" href="https://www.tensorflow.org/tutorials/text/word2vec">View original Word2Vec tutorial on TensorFlow.org</a>
  </td>
  <td style="border:1px solid red;">
    <a href="https://storage.googleapis.com/tensorflow_docs/docs/site/en/tutorials/text/word2vec.ipynb">Download original Tensorflow Word2Vec tutorial notebook</a>
  </td>
</table>

# Tensorflow Word2Vec on Windows Eventlogs
This notebook is part of a project that uses Word2Vec for anomaly detection in Windows 10 event logs.<br>
It uses a dataset which consists of all events derived from the System event log from my own PC.<br>
The data was read using Powershell and Get-WinEvent. You can find the Powershell script and the python script for parsing in my github repository.<br>
This version uses the Tensorflow implementation of Word2Vec, <a target="_blank" href="https://colab.research.google.com/github/Grrtzm/word2vec/blob/main/gensim_word2vec_windows_eventlog_anomaly_detection.ipynb">this version uses a Gensim implementation of Word2Vec</a>.

## Setup

In [None]:
import tensorflow as tf
print(tf.__version__)
!pip install -q tensorflow==2.5.0
if tf.__version__!='2.5.0': wait = input("Tensorflow 2.5.0 was installed. Please restart the runtime and re-run the cells.")

In [1]:
from os.path import exists as file_exists
# Uncomment the type of event log file you would like to use.
# When you want to download a new file, change 'False' to 'True' in the 'if' line.
if file_exists('dataset.csv') == False:
    # !pip install gdown      # uncomment if gdown is missing
    import gdown
    url = 'https://drive.google.com/uc?id=1Kt1FsUwVVTRkpxt7urykgOgDbeOpYRVn' # System events, original dataset, 12-3-2021 to 2-12-2021
    # url = 'https://drive.google.com/uc?id=1-pmm8IR8ninf_ArRgG2rv8P0Lmi4w04b' # System events, dataset 12-3-2021 to 10-1-2022
    # url = 'https://drive.google.com/uc?id=1cLb2GArKPStaggTH5RtYt6aOgCiEif38' # Application events, dataset 12-3-2021 to 10-1-2022
    # url = 'https://drive.google.com/uc?id=1jq6LVEJbOqedzbZYS14njz_N5PKE-aXc' # NTFS operational events, dataset 12-3-2021 to 10-1-2022
    # I didn't supply a Security event log, it's size is 143MB and it only contains 30 different Event ID's.
    output = 'dataset.csv'
    gdown.download(url, output, quiet=True)

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import io
import re
import string
import tqdm
from tensorflow.keras import Model
from tensorflow.keras.layers import Dot, Embedding, Flatten
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

from time import time  # To time our operations
# from collections import defaultdict  # For word frequency
from datetime import datetime # Voor DateTime -> datum bewerkingen

SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

# Set the number of negative samples per positive context.
# Key point: num_ns (number of negative samples per positive context word) 
# between [5, 20] is shown to work best for smaller datasets, while num_ns between [2,5] suffices for larger datasets.
num_ns = 2 # was 5
window_size = 5
embedding_dim = 20 #  20 seems to be sufficient, 128 is the default value from the tutorial   # Dimension of the dense embedding.
vocab_size = 20000 # inital size of the vocabulary. We will resize it later before defining the model.
sequence_length = 40 # Number of words in a sentence.
epochs = 3000 # Number of training epochs for Word2Vec
use_trained_model = False

df = pd.read_csv('dataset.csv', parse_dates=["TimeCreated"]) 
df.head()

Unnamed: 0,TimeCreated,EventRecordID,EventID,Level,Provider,Message
0,2021-12-02 20:05:22.768441+00:00,37593,System_1014,Warning,Microsoft-Windows-DNS-Client,Name resolution for the name config.teams.micr...
1,2021-12-02 20:05:22.255612+00:00,37592,System_32,Information,e1dexpress,
2,2021-12-02 20:05:19.153883+00:00,37591,System_6062,Warning,Netwtw08,6062 - Lso was triggered
3,2021-12-02 20:05:18.362357+00:00,37590,System_27,Warning,e1dexpress,
4,2021-12-02 20:05:18.278408+00:00,37589,System_32,Information,e1dexpress,


## Create Event "word" from multiple columns
This creates a new column containing the words Word2Vec will be trained on.

In [3]:
import string
punct = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{}~'   # `|` is not present here
transtab = str.maketrans(dict.fromkeys(punct, ''))

# Define a function to remove spaces
# https://iqcode.com/code/python/pandas-series-remove-punctuation
# and https://stackoverflow.com/questions/50444346/fast-punctuation-removal-with-pandas
def remove_spaces(text):
    for punctuation in string.punctuation:
        text = text.replace(' ', '')
    return text

# Create a new column. Concatenate the columns, remove unwanted characters and convert to lowercase
df['Event'] = df['EventID'].map(str) + df['Level'].map(str) + df['Provider'].apply(remove_spaces).map(str)
df['Event'] = '|'.join(df['Event'].tolist()).translate(transtab).split('|') # remove all other unwanted characters
df['Event'] = df['Event'].str.lower()

# Delete the redundant columns
df = df.drop(['EventID','Level','Provider'], axis=1)

# Show a preview
df.head(8)

Unnamed: 0,TimeCreated,EventRecordID,Message,Event
0,2021-12-02 20:05:22.768441+00:00,37593,Name resolution for the name config.teams.micr...,system1014warningmicrosoftwindowsdnsclient
1,2021-12-02 20:05:22.255612+00:00,37592,,system32informatione1dexpress
2,2021-12-02 20:05:19.153883+00:00,37591,6062 - Lso was triggered,system6062warningnetwtw08
3,2021-12-02 20:05:18.362357+00:00,37590,,system27warninge1dexpress
4,2021-12-02 20:05:18.278408+00:00,37589,,system32informatione1dexpress
5,2021-12-02 20:05:17.638712+00:00,37588,The system has returned from a low power state...,system1informationmicrosoftwindowspowertrouble...
6,2021-12-02 20:05:17.544184+00:00,37587,7021 - Connection telemetry fields and analysi...,system7021informationnetwtw08
7,2021-12-02 20:05:15.997321+00:00,37586,Windows cannot store Bluetooth authentication ...,system18informationbthusb


In [4]:
num_rows = len(df.axes[0])
print(f"Number of lines/events in the dataset: {num_rows}\n")

Number of lines/events in the dataset: 37593



## Create csv file corpus

Create the text dataset "eventlist", a list containing "eventrow" lists. An "eventrow" list contains all events for that day.

In [5]:
minlength = 1e6
maxlength = 0
eventlist = []
eventrow = []
previous_date = None # datetime.now().date()
for idx, row in df.iterrows():
    date = row['TimeCreated'].date()
    eventrow.append(row['Event'])
    if date != previous_date:
        length = len(eventrow) + 1
        if length > maxlength: maxlength = length
        if length < minlength: minlength = length
        eventrow = []
        eventlist.append(eventrow)
        previous_date = date
        
print(f"Number of lines = {len(eventlist)}, shortest line = {minlength} words, longest line = {maxlength} words\n")

# Ugly, but the easiest way. Save the list to a csv file and read that as a text file.
# Save the dataset as a csv file using the numpy module
events = np.array(eventlist, dtype=object)
np.savetxt('eventlist.csv', events, delimiter=',', fmt ='% s')

Number of lines = 263, shortest line = 2 words, longest line = 1011 words



In [6]:
print(f"`sequence_length` is changed now from {sequence_length} to {maxlength}")
sequence_length = maxlength

`sequence_length` is changed now from 40 to 1011


### Vectorize sentences from the corpus

In [None]:
path_to_file = "eventlist.csv"

print(f"File containing log events: {path_to_file}\n")

# Use the non empty lines to construct a tf.data.TextLineDataset object for next steps.
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))

'''
# Uncomment this block to see how the data (tensors) looks like.
for line in text_ds.take(10):
    print(line)
print(f"Those were the tensors\n")

print(text_ds)
print()
'''

# Now, create a custom standardization function to lowercase the text and remove punctuation.
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    return tf.strings.regex_replace(lowercase,
    '[%s]' % re.escape(string.punctuation), '')

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length length to pad all samples to same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size, # None
    output_mode='int',
    output_sequence_length=sequence_length)

# Call adapt on the text dataset to create vocabulary.
vectorize_layer.adapt(text_ds.batch(1024))

# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(f"Number of words in vocabulary (limit is {vocab_size}): {len(inverse_vocab)-1}")

# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

# Obtain number of sequences from the dataset
sequences = list(text_vector_ds.as_numpy_iterator())

print(f"First 10 sequences:\n")
idx = 0
for seq in sequences:
    print(seq)
    idx = idx + 1
    if idx > 10: break

## Show the vocabulary

In [None]:
vocab_len = len(inverse_vocab)
print(f"Number of words in vocabulary (limit is {vocab_size}): {vocab_len - 1}\n")
v=0
for i in range(vocab_len):
    if i>0: 
        # I needed to shift the weight index in order to match the values from u with the inverse_vocab
        # v = vectorize_layer.get_weights()[0][i-2]
        v = vectorize_layer.get_weights()[1][i-2]  # in Tensorflow 2.5.0
    w = inverse_vocab[i]
    print(f"Index {i}:\tWeight:{v}\t{w}")
    #if i > 5: break # only show the first 5

### Helper function to generate training data
Generates skip-gram pairs with negative sampling for a list of sequences (int-encoded sentences) 
based on window size, number of negative samples and vocabulary size.

In [None]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Build the sampling table for vocab_size tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # Iterate over all sequences (sentences) in dataset.
    for sequence in tqdm.tqdm(sequences):
        # Generate positive skip-gram pairs for a sequence (sentence).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
              sequence,
              vocabulary_size=vocab_size,
              sampling_table=sampling_table,
              window_size=window_size,
              negative_samples=0)

        # Iterate over each positive skip-gram pair to produce training examples
        # with positive context word and negative samples.
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(
                tf.constant([context_word], dtype="int64"), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=SEED,
                name="negative_sampling")

            # Build context and label vectors (for one target word)
            negative_sampling_candidates = tf.expand_dims(
                negative_sampling_candidates, 1)

            context = tf.concat([context_class, negative_sampling_candidates], 0)
            label = tf.constant([1] + [0]*num_ns, dtype="int64")

            # Append each element from the training example to global lists.
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

### Generate training examples from sequences
`sequences` is now a list of int encoded sentences.

In [None]:
print(f"Genererate training examples from sequences, using: window_size={window_size}, num_ns={num_ns}, vocab_size={vocab_len}, seed={SEED}")
import time
time.sleep(1) # Nodig om tekst op de juiste plaats terecht te laten komen (i.v.m. multi-threading)
targets, contexts, labels = generate_training_data(
    # sequences=sequences,
    sequences=sequences,
    window_size=window_size,
    num_ns=num_ns,
    vocab_size=vocab_len,
    seed=SEED)
time.sleep(1) # Nodig om tekst op de juiste plaats terecht te laten komen (i.v.m. multi-threading)
print(f"Number of training examples; targets:{len(targets)}, contexts:{len(contexts)}, labels:{len(labels)}")

# Configure the dataset for performance
BATCH_SIZE = len(targets)
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Add cache() and prefetch() to improve performance.
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)

### Model and Training

In [None]:
class trainingCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get('accuracy')>0.9):
            print(f"\nReached 90% accuracy in {epoch+1} epochs. Ended training...")
            self.model.stop_training = True

class Word2Vec(Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
        self.context_embedding = Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)
        self.dots = Dot(axes=(3, 2))
        self.flatten = Flatten()

    def call(self, pair):
        target, context = pair
        word_emb = self.target_embedding(target)
        context_emb = self.context_embedding(context)
        dots = self.dots([context_emb, word_emb])
        
        return self.flatten(dots)

if use_trained_model == False:
    # Define loss function and compile model
    model = Word2Vec(vocab_len, embedding_dim)
    model.compile(optimizer='adam',
                     loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                     metrics=['accuracy'])

    # Train the model with dataset prepared above for some number of epochs.
    print("Training Word2Vec...")
    history = model.fit(dataset, epochs=epochs, verbose=0, callbacks=[trainingCallback()]) # verbose=0 means no output, verbose=1 shows logging

## Save trained model

In [None]:
if use_trained_model == False:
    model.save('tensorflow_word2vec_model')

### Show some training plots

In [None]:
if use_trained_model == False:

    import matplotlib.image  as mpimg
    import matplotlib.pyplot as plt

    acc=history.history['accuracy']
    loss=history.history['loss']

    epochs=range(len(acc)) # Get number of epochs

    plt.title("Tensorflow Word2Vec Training loss and accuracy")
    plt.xlabel("Epochs")
    plt.ylabel("Magnitude")
    plt.plot(epochs, acc, color='r', label='Accuracy')
    plt.plot(epochs, loss, color='b', label='Loss')
    plt.legend()
    plt.savefig("tensorflow_word2vec_training_loss_accuracy.png", format="png")
    plt.show()

## Load trained model (optional)

In [None]:
model = tf.keras.models.load_model('tensorflow_word2vec_model')

In [None]:
model.summary()

### Embedding lookup and analysis
Generate the vectors.tsv and metadata.tsv to analyze the obtained embeddings in the Embedding Projector.

In [None]:
weights = model.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

# Create and save the vectors and metadata file
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
    if index < 1:
        continue  # skip 0, it's padding
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")
out_v.close()
out_m.close()

Run the cell below to download the `vectors.tsv` and `metadata.tsv` to analyze the obtained embeddings in the [Embedding Projector](https://projector.tensorflow.org/).

In [None]:
try:
    from google.colab import files
    files.download('vectors.tsv')
    files.download('metadata.tsv')
except Exception:
    pass

## Calculate `cosine_similarity` for each event
And add these as a a column to the dataset.

In [None]:
from numpy import dot
from numpy.linalg import norm

def vocab_lookup(eventstr):
    for index, word in enumerate(vocab):
        if index < 1:
            continue  # skip 0, it's padding
        if word == eventstr:
            vec = weights[index]
            break
    return vec

num_events = len(df.axes[0])
# add empty columns
cos_sim = []
for idx, row in df.iterrows():
    current_event = row['Event']
    ce = vocab_lookup(current_event)
    if idx == 0:
        cos_sim.append(float(1))
        previous_event = current_event
        pe = ce
    if idx > 0:
        if idx < num_events + 1:
            cs = dot(pe, ce)/(norm(pe)*norm(ce)) # https://www.statology.org/cosine-similarity-python/
            cos_sim.append(cs)
            previous_event = current_event
            pe = ce

df['cos_sim'] = cos_sim

# Change order of columns by name, so we can display it orderly
df = df[['TimeCreated', 'EventRecordID', 'Event', 'cos_sim', 'Message']]
df.head()

# saving the dataframe
df.to_csv('System-Events-similarity-Tensorflow.csv')

## Plot all anomalies
The lower the line spikes, the more unique it is.

In [None]:
import plotly.graph_objects as go

print("Please note the pan/zoom controls in the ModeBar on the right...")

# Create figure
fig = go.Figure()
config = {'displayModeBar': True}

fig.add_trace(
    go.Scatter(x=list(df.TimeCreated), y=list(df.cos_sim)))

# Set title
fig.update_layout(
    title_text="Time series with range slider and selectors"
)

# Add range slider
fig.update_layout(
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1,
                     label="1d",
                     step="day",
                     stepmode="todate"),
                dict(count=7,
                     label="7d",
                     step="day",
                     stepmode="todate"),
                dict(count=1,
                     label="1m",
                     step="month",
                     stepmode="todate"),
                dict(step="all")
            ])
        ),

        rangeslider=dict(
            visible=True
        ),
        type="date"
    )
)

# fig.update_traces(mode='markers+lines') # displays dots (markers) on each event. Very slow!
fig.show(config=config)

## Show Top 10 of all 'anomalies'

In [None]:
# Pandas display settings
pd.set_option('display.max_columns', None) # Set it to None to display all columns in the dataframe
pd.set_option('display.width',200)
pd.set_option('display.max_colwidth', None)

print("Top 10 of all 'anomalies':\n")
df.nsmallest(n=10, columns=['cos_sim'])

As you can see above, those anomalies are not always not what you expected. We are not interested in 'Information' events. 
<br> Now lets take a look at some more interesting events.
## Show Top 10 of 'critical' anomalies

In [None]:
dfcritical = df[df['Event'].str.contains('critical')]
print(f"Top 10 of 'critical' anomalies from a total of {len(dfcritical)} 'critical' events:\n")
dfcritical.nsmallest(n=10, columns=['cos_sim'])

## Show Top 10 of 'error' anomalies

In [None]:
dferror = df[df['Event'].str.contains('error')]
print(f"Top 10 of 'error' anomalies from a total of {len(dferror)} 'error' events:\n")
dferror.nsmallest(n=10, columns=['cos_sim'])