In [852]:
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

import sqlite3
import pandas as pd

In [853]:
# Base name of the event log; every input and output path below is derived from it.
filename = "log05"
# filename = "log04"

# Inputs: the SQLite event log and the text file holding one track per line.
sqlite_log_filename = "data/{}.sq3".format(filename)
tracks_filename = "data/{}_tracks.txt".format(filename)

# Outputs: the graph as JSON and as Graphviz DOT.
json_filename = "output/{}.json".format(filename)
dot_filename = "output/{}.dot".format(filename)

In [854]:
# Preprocess sqlite logs: flatten the Events table into one line of activities per trace.

conn = sqlite3.connect(sqlite_log_filename)
try:
    df = pd.read_sql_query('SELECT * FROM Events', conn)
finally:
    # Release the database handle even when the query fails.
    conn.close()

with open(tracks_filename, "w") as f:
    for track_id, track in df.groupby(df.trace):
        # Order the events of this trace by timestamp and concatenate their activity values.
        print("".join(track.sort_values(by=["timest"], ascending=True).activity.values), file=f)
print("Tracks preprocessed")

Tracks preprocessed


In [855]:
# NOTE(review): start_token is not referenced by any cell visible in this notebook.
start_token = "_"

# Read the preprocessed tracks back, one string per line, without line terminators.
with open(tracks_filename) as f:
    file_text = f.read()
tracks = file_text.splitlines()

tracks

['abcdef', 'abcdeg', 'abcdfe', 'abcdfg', 'abd', 'abdg', 'abdef', 'abdeg']

In [856]:
# Report how many tracks were loaded, then echo each one on its own line.
print('n samples = ', len(tracks))
for sample in tracks:
    print(sample)

n samples =  8
abcdef
abcdeg
abcdfe
abcdfg
abd
abdg
abdef
abdeg


In [857]:
def unique_tokens(tracks):
    """Return the sorted vocabulary of `tracks` plus the padding token ' '.

    Args:
        tracks: iterable of strings; every character is treated as one token.

    Returns:
        Sorted list of the distinct characters, containing ' ' exactly once.
    """
    # Seed the set with the padding token so it cannot appear twice when a
    # track already contains a space (a duplicate would inflate the vocabulary).
    token_set = {' '}
    for track in tracks:
        token_set.update(track)

    return sorted(token_set)
  
# Build the vocabulary (sorted unique symbols plus the ' ' padding token) from the loaded tracks.
tokens = unique_tokens(tracks)


# Print the vocabulary size, then show the vocabulary itself as the cell output.
print ('n_tokens = ', len(tokens))
tokens

n_tokens =  8


[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g']

In [858]:
def token2id(tokens):
    """Build the lookup from each symbol to its identifier (its index in `tokens`)."""
    symbol_to_index = {}
    for index, symbol in enumerate(tokens):
        symbol_to_index[symbol] = index
    return symbol_to_index

# Symbol -> identifier lookup used to encode tracks as integer sequences.
token_to_id = token2id(tokens)

def id2token(tokens):
    """Build the lookup from a symbol identifier (index in `tokens`) to the symbol itself."""
    return dict(enumerate(tokens))

# Identifier -> symbol lookup used to decode integer sequences back into tracks.
id_to_token = id2token(tokens)

In [859]:
# Length of the longest track; every encoded row ends up with MAX_LEN + 1 ids.
MAX_LEN = max(len(track) for track in tracks)

# Encode each symbol as its id, then right-pad with the id of ' ': short tracks
# are filled up to MAX_LEN and every track receives one extra trailing pad.
tracks_ix = np.array([
    [token_to_id.get(symbol) for symbol in track]
    + [token_to_id[" "]] * (MAX_LEN - len(track) + 1)
    for track in tracks
])
tracks_ix

array([[1, 2, 3, 4, 5, 6, 0],
       [1, 2, 3, 4, 5, 7, 0],
       [1, 2, 3, 4, 6, 5, 0],
       [1, 2, 3, 4, 6, 7, 0],
       [1, 2, 4, 0, 0, 0, 0],
       [1, 2, 4, 7, 0, 0, 0],
       [1, 2, 4, 5, 6, 0, 0],
       [1, 2, 4, 5, 7, 0, 0]])

In [860]:
# Turn on autograd anomaly detection for the whole session. PyTorch documents this
# as a debugging aid that slows execution, so consider disabling it for real runs.
torch.autograd.set_detect_anomaly(True)
# Prefer the GPU when one is available.
# NOTE(review): `device` is not referenced by any cell visible in this notebook,
# so the models and tensors below stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [861]:
# Transformed version
class CustomRNN_2(nn.Module):
    """Single-step recurrent cell used for training.

    Each call to `forward` consumes one token id per sequence together with the
    previous hidden state and returns the next-token probabilities plus a new
    hidden state sampled with Gumbel-softmax.

    Args:
        input_size: vocabulary size (embedding rows and output probabilities).
        emb_size: embedding dimension.
        hidden_size: width of the hidden state concatenated to the embedding.
        n_neurons: width of the state layer. The new hidden state has this many
            entries and is fed back on the next step, so it has to match
            `hidden_size`.
    """

    def __init__(self, input_size, emb_size, hidden_size, n_neurons):
        super().__init__()

        self.hidden_size = hidden_size

        # Embedding: do we receive one event as input at each step?
        self.embedding = nn.Embedding(input_size, emb_size)
        # First layer with ReLU activation.
        self.linear1 = nn.Linear(hidden_size + emb_size, 8)
        self.relu1 = nn.ReLU()
        # Second layer, linear: produces the state logits.
        self.linear2 = nn.Linear(8, n_neurons)

        self.sigmoid = nn.Sigmoid()

        # Output layer, returns probabilities for the next token.
        self.linear3 = nn.Linear(in_features=n_neurons, out_features=input_size)
        # Inputs are (batch, n_tokens); normalise over the token axis explicitly
        # instead of relying on the deprecated implicit dimension.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, hidden_state, t=0.1):
        """Run one time step.

        Args:
            x: 1-D tensor of token ids, one per sequence in the batch.
            hidden_state: tensor of shape (batch, hidden_size).
            t: Gumbel-softmax temperature.

        Returns:
            Tuple `(out, hidden_state)`: next-token probabilities of shape
            (batch, input_size) and the new hidden state of shape (batch, n_neurons).
        """
        x = self.embedding(x)
        x = torch.cat([x, hidden_state], 1)
        x = self.relu1(self.linear1(x))
        x = self.linear2(x)

        # Explicit detached copy: same values as `torch.tensor(x)` but without
        # the copy-construct warning.
        # NOTE(review): because of the detach, no gradient flows back through
        # the recurrent state; confirm this truncation is intended.
        hidden_state = F.gumbel_softmax(logits=x.detach().clone(), tau=t)

        out = self.sigmoid(x)
        out = self.softmax(self.linear3(out))

        return out, hidden_state

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape (batch_size, hidden_size)."""
        return torch.zeros(batch_size, self.hidden_size)


In [862]:
# Vocabulary size: one embedding row and one output probability per token.
input_size = len(tokens)
emb_size = 30
# The state returned by forward() has n_neurons entries and is fed back as the
# hidden state on the next step, so both sizes are kept equal.
hidden_size = n_neurons = 2
# NOTE(review): `t` is not passed to forward() by the training cell visible in
# this notebook, so forward()'s own default temperature is what gets used.
t = 0.1
learning_rate = 0.001

model_try2 = CustomRNN_2(input_size=input_size, emb_size=emb_size, hidden_size=hidden_size, n_neurons=n_neurons)
# forward() returns probabilities; the training loop takes their log before applying NLLLoss.
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model_try2.parameters(), lr=learning_rate)

In [863]:
# List every entry of the model's state dict together with its shape.
for entry_name, tensor in model_try2.state_dict().items():
    print(entry_name, tensor.size())

embedding.weight torch.Size([8, 30])
linear1.weight torch.Size([8, 32])
linear1.bias torch.Size([8])
linear2.weight torch.Size([2, 8])
linear2.bias torch.Size([2])
linear3.weight torch.Size([8, 2])
linear3.bias torch.Size([8])


In [864]:
def sample_batch(data, batch_size):
    """Draw `batch_size` rows of `data` at random (indices may repeat).

    Row indices come from `np.random.randint(0, len(data), size=batch_size)`.
    """
    row_indices = np.random.randint(0, len(data), size=batch_size)
    return data[row_indices]

#total N iterations
n_epochs = 25

# how many minibatches are there in the epoch
batches_per_epoch = 100

#how many training sequences are processed in a single function call
batch_size = 10

# Convert the encoded tracks to a LongTensor once instead of on every minibatch.
tracks_tensor = torch.tensor(tracks_ix).type(torch.LongTensor)
pad_id = token_to_id[" "]

for epoch in range(n_epochs):
    # Accumulated as a plain float so that no autograd graph is kept alive
    # between minibatches.
    avg_cost = 0.0
    for _ in range(batches_per_epoch):
        train_try = sample_batch(tracks_tensor, batch_size)
        # We want to skip padded tokens in the track when computing the loss.
        mask = torch.ne(train_try[:, 1:], pad_id)
        hidden_state = model_try2.init_hidden(batch_size)

        loss = 0
        for j in range(train_try.shape[1] - 1):
            output, hidden_state = model_try2(train_try[:, j], hidden_state)
            # Per-sequence negative log-likelihood of the next token, computed
            # for the whole batch at once; masked positions contribute zero.
            step_nll = F.nll_loss(torch.log(output), train_try[:, j + 1], reduction="none")
            loss = loss + (step_nll * mask[:, j]).sum()

        optimizer.zero_grad()
        # Every minibatch builds a fresh graph that is backpropagated exactly
        # once, so the graph does not need to be retained.
        loss.backward()
        nn.utils.clip_grad_norm_(model_try2.parameters(), 1)
        optimizer.step()
        avg_cost += loss.item()
    print("\n\nEpoch {} average loss = {}".format(epoch, avg_cost / batches_per_epoch))





Epoch 0 average loss = 90.25003814697266


Epoch 1 average loss = 79.5781021118164


Epoch 2 average loss = 73.15046691894531


Epoch 3 average loss = 66.95634460449219


Epoch 4 average loss = 62.955543518066406


Epoch 5 average loss = 58.364654541015625


Epoch 6 average loss = 56.30702209472656


Epoch 7 average loss = 54.152156829833984


Epoch 8 average loss = 52.020137786865234


Epoch 9 average loss = 51.537513732910156


Epoch 10 average loss = 50.26982879638672


Epoch 11 average loss = 48.533016204833984


Epoch 12 average loss = 47.342559814453125


Epoch 13 average loss = 46.301734924316406


Epoch 14 average loss = 45.99565505981445


Epoch 15 average loss = 45.06922912597656


Epoch 16 average loss = 44.70333480834961


Epoch 17 average loss = 43.43256759643555


Epoch 18 average loss = 42.8145866394043


Epoch 19 average loss = 42.198795318603516


Epoch 20 average loss = 39.860443115234375


Epoch 21 average loss = 38.8641471862793


Epoch 22 average loss = 37.483196

In [865]:
# Version for output
class CustomRNN_2_output(nn.Module):
    """Inference variant of the recurrent cell with a thresholded hidden state.

    The layers mirror the training model so its parameters can be assigned
    directly; the only difference is that the new hidden state is the boolean
    result of `state_logits > 0.5` instead of a Gumbel-softmax sample.

    Args:
        input_size: vocabulary size (embedding rows and output probabilities).
        emb_size: embedding dimension.
        hidden_size: width of the hidden state concatenated to the embedding.
        n_neurons: width of the state layer; the returned hidden state has this
            many entries and is fed back on the next step.
    """

    def __init__(self, input_size, emb_size, hidden_size, n_neurons):
        super().__init__()

        self.hidden_size = hidden_size

        # Embedding
        self.embedding = nn.Embedding(input_size, emb_size)

        # First layer with ReLU activation.
        self.linear1 = nn.Linear(hidden_size + emb_size, 8)
        self.relu1 = nn.ReLU()
        # Second layer, linear: produces the state logits.
        self.linear2 = nn.Linear(8, n_neurons)

        self.sigmoid = nn.Sigmoid()

        # Output layer, returns probabilities for the next token.
        self.linear3 = nn.Linear(in_features=n_neurons, out_features=input_size)
        # Inputs are (batch, n_tokens); normalise over the token axis explicitly
        # instead of relying on the deprecated implicit dimension.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, hidden_state, t=0.1):
        """Run one time step.

        Args:
            x: 1-D tensor of token ids, one per sequence in the batch.
            hidden_state: tensor of shape (batch, hidden_size).
            t: unused here; kept so the signature matches the training model.

        Returns:
            Tuple `(out, hidden_state)`: next-token probabilities of shape
            (batch, input_size) and a boolean state of shape (batch, n_neurons).
        """
        x = self.embedding(x)
        x = torch.cat([x, hidden_state], 1)
        x = self.relu1(self.linear1(x))
        x = self.linear2(x)

        # NOTE(review): the threshold is applied to the raw logits, not to
        # sigmoid(x); confirm 0.5 on the pre-activation is the intended cut.
        hidden_state = torch.gt(x, 0.5)

        out = self.sigmoid(x)
        out = self.softmax(self.linear3(out))

        return out, hidden_state

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape (batch_size, hidden_size)."""
        return torch.zeros(batch_size, self.hidden_size)

In [866]:
# Same hyperparameters as the training model so the layer shapes line up and
# the trained parameters can be assigned to this model afterwards.
input_size = len(tokens)
emb_size = 30
hidden_size = n_neurons = 2
t = 0.1
learning_rate = 0.001

model_output = CustomRNN_2_output(input_size=input_size, emb_size=emb_size, hidden_size=hidden_size, n_neurons=n_neurons)
# NOTE(review): this rebinds the global `criterion` that the training cell uses;
# re-running training after this cell would pick up CrossEntropyLoss instead.
criterion = nn.CrossEntropyLoss() # Make a choice
# NOTE(review): the optimizer is built over model_try2's parameters, not
# model_output's, and it rebinds the global `optimizer`. No cell visible after
# this one uses it; confirm whether this line is a copy-paste leftover.
optimizer = torch.optim.Adam(model_try2.parameters(), lr=learning_rate)

In [867]:
# Reuse the trained parameters in the output model. Assigning the Parameter
# objects makes both models reference the same tensors; nothing is copied.
model_output.embedding.weight = model_try2.embedding.weight

for layer_name in ("linear1", "linear2", "linear3"):
    trained_layer = getattr(model_try2, layer_name)
    output_layer = getattr(model_output, layer_name)
    output_layer.weight = trained_layer.weight
    output_layer.bias = trained_layer.bias

In [868]:
# TEST: replay every encoded track through the output model and record, for
# each step, the most probable next token and the binarised hidden state.

n_traces = len(tracks_ix)
hidden_state = model_output.init_hidden(n_traces)
train_try = torch.tensor(tracks_ix)
seq_len = train_try.shape[1]

final_res = np.empty(shape=(n_traces, seq_len))
hidden_final = np.zeros(shape=(n_traces, seq_len - 1, n_neurons))

# Column 0 is fixed to 1; predictions fill columns 1 .. seq_len - 1.
final_res[:, 0] = 1

# Pure inference: no autograd graph is needed.
with torch.no_grad():
    for j in range(seq_len - 1):
        output, hidden_state = model_output(train_try[:, j], hidden_state)
        # argmax returns the first maximal index, matching list.index(max(...)).
        final_res[:, j + 1] = output.argmax(dim=1).numpy()
        hidden_final[:, j, :] = hidden_state.numpy()



In [869]:
# hidden_final

In [870]:
# final_res[:, 1:]

In [871]:
# train_try[:, 1:]

In [872]:
# tracks_ix

In [873]:
def binary_state_to_id(binary_state):
    """Encode a vector of near-binary activations as a decimal string.

    Every entry is rounded and entry k is weighted by 2**k (the first entry is
    the least-significant bit); the integer value of the sum is returned as a
    string.
    """
    weighted_bits = [
        round(bit) * 2 ** position
        for position, bit in enumerate(binary_state)
    ]
    return str(int(sum(weighted_bits)))

def num2state(states):
    """Map every item of `states` to the label "s" followed by its position."""
    labels = {}
    for position, raw_id in enumerate(states):
        labels[raw_id] = "s" + str(position)
    return labels

In [874]:
import json

def build_json(filename, states):
    """Build the state-transition graph of the recorded hidden states and save it as JSON.

    Besides its arguments the function reads the notebook globals `tracks_ix`
    (encoded tracks), `id_to_token` (id -> symbol) and `tracks` (for the count).

    Args:
        filename: path of the JSON file to write.
        states: one 2-D array per trace, shaped (steps, state_width), holding
            the binary hidden state reached after each consumed symbol.

    Returns:
        The graph dict: {"states": [...], "transitions": [{"from", "to", "track"}],
        "meta": {"isAccepting": [...], "tracksNum": int}}.
    """
    # Raw (decimal-string) state ids visited by each trace, initial state first.
    raw_paths = []
    for binary_states in states:
        binary_states = np.asarray(binary_states)
        # The all-zero initial state takes its width from the data itself
        # rather than from a global.
        initial_state = np.zeros((1, binary_states.shape[1]))
        visited = np.vstack((initial_state, binary_states))
        raw_paths.append([binary_state_to_id(row) for row in visited])

    # Rename the raw ids to compact labels s0, s1, ... in sorted order of the raw id.
    distinct_ids = sorted({raw_id for path in raw_paths for raw_id in path})
    num_to_state = num2state(distinct_ids)
    named_paths = [[num_to_state[raw_id] for raw_id in path] for path in raw_paths]

    transitions = []
    seen_transitions = set()  # O(1) duplicate check; `transitions` keeps first-seen order
    used_states = set()
    accepting_states = set()

    for path, track_ids in zip(named_paths, tracks_ix):
        # Decode the ids back to symbols and drop the surrounding padding spaces.
        track = "".join(list(map(id_to_token.get, track_ids))).strip()
        for index, symbol in enumerate(track):
            key = (path[index], path[index + 1], symbol)
            used_states.update(key[:2])
            if key not in seen_transitions:
                seen_transitions.add(key)
                transitions.append({"from": key[0], "to": key[1], "track": symbol})
        # The state reached after the last real symbol accepts the trace. It is
        # also recorded as used so that an empty trace still lists its state.
        final_state = path[len(track)]
        accepting_states.add(final_state)
        used_states.add(final_state)

    def label_order(label):
        # Labels look like "s12"; sort by the numeric part for a stable output.
        return int(label[1:])

    graph = {
        "states": sorted(used_states, key=label_order),
        "transitions": transitions,
        "meta": {
            "isAccepting": sorted(accepting_states, key=label_order),
            "tracksNum": len(tracks),
        },
    }
    with open(filename, "w") as json_file:
        json.dump(graph, json_file)
    return graph

In [875]:
# # TEST
# graph = {"states":set(), "transitions":list(), "meta":{"isAccepting":set()}}
# all_states = []
# all_used_states = set()

# inferred_states = hidden_final

# for binary_states in inferred_states:
#     binary_states = np.vstack((np.zeros(n_neurons), binary_states)) # add initial state
#     states = list(map(binary_state_to_id, binary_states))
#     graph["states"].update(set(states))
#     all_states.append(states)
    
# print(all_states)

# def num2state(states):
#     return {j:"s"+str(i) for i,j in enumerate(states)}

# num_to_state = num2state(np.unique(all_states))
# print(num_to_state)
# all_states_enc = list(map(lambda states: list(map(num_to_state.get,states)),all_states))
# print(all_states_enc)

In [876]:
# Turn the recorded hidden states into a transition graph and write it to json_filename.
graph = build_json(json_filename, hidden_final)

In [877]:
def build_dot(json_graph, filename):
    """Render the graph as Graphviz DOT, write it to `filename` and return the text.

    Args:
        json_graph: dict with "states" (list of names), "transitions" (list of
            {"from", "to", "track"} dicts) and "meta"["isAccepting"] (names of
            accepting states, drawn as double circles).
        filename: path of the DOT file to write.

    Returns:
        The DOT source as a string (no trailing newline).
    """
    accepting = set(json_graph["meta"]["isAccepting"])

    lines = ["digraph test {"]
    for state in json_graph["states"]:
        if state in accepting:
            lines.append("\t" + state + " [shape=doublecircle];")
        else:
            lines.append("\t" + state + ";")
    for transition in json_graph["transitions"]:
        # Escape backslashes and double quotes so that any activity symbol
        # yields a valid quoted DOT label.
        label = transition["track"].replace("\\", "\\\\").replace('"', '\\"')
        lines.append(
            "\t" + transition["from"] + " -> " + transition["to"]
            + " [label=\"" + label + "\"];"
        )
    lines.append("}")
    graph = "\n".join(lines)

    with open(filename, "w") as graph_file:
        graph_file.write(graph)

    return graph

In [878]:
# Convert the graph to Graphviz DOT text and write it to dot_filename.
dot_graph = build_dot(graph, dot_filename)

In [879]:
!dot output/log05.dot -Tpng -o output/log05_2neurons.png

# !dot output/log04.dot -Tpng -o output/log04.png