*New, tidied-up notebook for faster and easier testing of the models.*

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from scipy.stats import truncnorm
import matplotlib.pyplot as plt
from collections import OrderedDict
import os
import json
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import dask.dataframe as dd
import pandas as pd
from argparse import Namespace
from tqdm import tqdm # Comment out if tqdm is not installed

import train_model
from log_analyzer.model.auxiliary import EarlyStopping
import log_analyzer.data.data_loader as data_utils

# Record whether PyTorch can see a usable CUDA device.
# NOTE(review): `cuda` is not referenced by any other cell visible in this notebook — confirm it is still needed.
cuda = torch.cuda.is_available()
# Uncomment the next line to fix PyTorch's random seed.
# torch.manual_seed(1)

# Parameter setting

In [None]:
def set_args(bidir, tiered, token_level):
    """Build a Namespace shaped like the one argparse produces for train_model.

    This lets the notebook call the train_model functions without going
    through the command line.

    Args:
        bidir: Stored as ``args.bidirectional``.
        tiered: Stored as ``args.tiered``.
        token_level: Either ``'word'`` or ``'char'``; selects the data folder,
            the ``jagged`` flag and the config file.

    Returns:
        The populated Namespace, or ``None`` (after printing an error message)
        when ``token_level`` is neither ``'word'`` nor ``'char'``.
    """
    is_word = token_level == 'word'
    if not (is_word or token_level == 'char'):
        print("Error: unexpected token_level, args not prepared.")
        return

    # Both tokenizations live under the same roots, relative to the current
    # working directory; only the leaf folder / file name differs.
    root = os.getcwd()
    if is_word:
        data_leaf, config_leaf = 'word_day_split', 'lanl_word_config.json'
    else:
        data_leaf, config_leaf = 'raw_day_split', 'lanl_char_config.json'

    return Namespace(
        # Common args (defaults, can be changed)
        batch_size=64,
        lstm_layers=[128],
        context_layers=[128],
        embed_dim=128,
        model_dir='runs',
        load_from_checkpoint='',
        # Model variant
        bidirectional=bidir,
        tiered=tiered,
        # Tokenization-specific settings
        data_folder=root + '/notebooks/data_examples/lanl/lm_feats/' + data_leaf,
        jagged=not is_word,
        config=root + '/notebooks/safekit/features/specs/lm/' + config_leaf,
        # skipsos is only enabled for forward (non-bidirectional) word models
        skipsos=is_word and not bidir,
    )
    
# Note: hyperparameters, such as learning rate and patience, are read in from the config file.
#       (Change them there if you want to try other values)

# Specify the configuration of the model
# All combinations currently supported except bidir + tiered
bidir = False
tiered = True
token_level = 'word' # 'word' or 'char'

# Get the args for the model version defined above
args = set_args(bidir, tiered, token_level)
# set_args returns None for an unknown token_level; fail here with a clear
# message instead of an AttributeError on `args.config` below.
if args is None:
    raise ValueError(f"Unsupported token_level {token_level!r}; expected 'word' or 'char'.")

# Load the JSON config and derive the values the data loaders need.
# The encoding is given explicitly so the result does not depend on the platform default.
with open(args.config, 'r', encoding='utf-8') as f:
    conf = json.load(f)
# The configured sentence length is reduced by one, reduced by one more when
# skipsos is set, and increased by one for bidirectional models.
# NOTE(review): presumably this accounts for start/end-of-sentence tokens — confirm against the data loader.
sentence_length = conf["sentence_length"] - 1 - int(args.skipsos) + int(args.bidirectional)
train_days = conf['train_files']
test_days = conf['test_files']


# Train model
### (One epoch, full dataset)


In [None]:
%%time
# Create and train the model
# (%%time reports how long the whole cell took to run.)
# create_model builds the trainer from the prepared args; train returns two
# values that are kept as train_losses and test_losses for the cells below.
trainer = train_model.create_model(args)
train_losses, test_losses = train_model.train(args, trainer)

In [None]:
# Plot the train loss over time.
# The explicit Axes interface is used so the cell does not depend on pyplot's
# implicit "current figure" state.
fig, ax = plt.subplots()
ax.plot(train_losses)

# Name the x axis, the y axis and the figure
ax.set_xlabel('x - Number of batch')
ax.set_ylabel('y - Average loss')
ax.set_title("Training losses")

# Ending on plt.show() keeps the `Text(...)` repr of the last call out of the cell output.
plt.show()

In [None]:
# Report the mean over all entries of test_losses
# (the second value returned by train_model.train).
print(f"Final test loss (avg): {np.mean(test_losses)}")

# Spot check model performance

In [None]:
# Use a batch size of 1 so the batch holds a single entry.
# NOTE: this changes the shared `args` object in place.
args.batch_size = 1
_, test_loader = data_utils.load_data(train_days, test_days, args, sentence_length)

# Grab the first batch from the test_loader.
# next(iter(...)) is used instead of a for/break loop so that an empty loader
# is reported explicitly rather than leaving `batch` undefined.
batch = next(iter(test_loader), None)
if batch is None:
    raise RuntimeError("test_loader produced no batches; check the configured test files.")

loss, output = trainer.eval_step(batch)

# Highest-scoring index along the last dimension of the first output entry
preds = torch.argmax(output[0], dim=-1)
# Ground truth ('t') of the first entry in the batch extracted above
gt = batch['t'][0]
if args.tiered:
    # The input/output of tiered models have an additional dimension, which we index away
    preds = preds[0]
    gt = gt[0]
# Trim the ground truth to the number of predictions
gt = gt[:len(preds)]

print(f'Ground truth: {gt}')
print(f'Model prediction: {preds}')
print(f'Loss: {loss}')


# Overfit model

### (100+ epochs, 1-10 log lines, train=test set)

### (By overfitting the LSTM model on a small dataset, we can check whether the model is able to learn the relation between input and output)


In [None]:
%%time
# Set batch_size to 1 so we train on a single line only
args.batch_size = 1
train_loader, _ = data_utils.load_data(train_days, test_days, args, sentence_length)

trainer = train_model.create_model(args)

for batch in train_loader:
    break

# Disable verbose of the early_stopping object to avoid large amounts of output
trainer.early_stopping.verbose = False
train_losses = []
epochs = 500
# tqdm provides a nice and minimal progress bar + time estimate for the loop
# If tqdm is not installed, simply comment the line out and use the alternatives below instead
#for i in range(epochs):
#    if i % 250 == 0:
#        print('Epoch: {i}')
for i in tqdm(range(epochs)):
    loss, _ = trainer.train_step(batch) # We ignore the early_stopping flag
    train_losses.append(loss.item())

trainer.early_stopping.verbose = True

In [None]:
# Plot the train loss over time
plt.plot(train_losses)

# naming the x axis
plt.xlabel('x - Number of epoch')
# naming the y axis
plt.ylabel('y - Average loss')


In [None]:
# Evaluate the trained model on the batch it was trained on
loss, output = trainer.eval_step(batch)

# Highest-scoring index along the last dimension of the first output entry
preds = torch.argmax(output[0], dim=-1)
# Ground truth ('t') of the first entry in the batch
gt = batch['t'][0]
if args.tiered:
    # Tiered models carry one additional leading dimension; index it away
    preds = preds[0]
    gt = gt[0]
# Trim the ground truth to the number of predictions
gt = gt[:len(preds)]

print(f'Ground truth: {gt}')
print(f'Model prediction: {preds}')
print(f'Loss: {loss}')