<a href="https://colab.research.google.com/github/McKnightA/ExperienceMachine/blob/Transformer-Architecture/Functionalized_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer Exploration

Based on https://github.com/JanSchm/CapMarket/blob/master/bot_experiments/IBM_Transformer%2BTimeEmbedding.ipynb
and on
"Attention Is All You Need": https://arxiv.org/pdf/1706.03762.pdf

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Attention, Concatenate, Dense, Dropout, Embedding, LayerNormalization
print(tf.__version__)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
!pip install wandb
import wandb
from wandb.keras import WandbCallback

2.4.0


In [None]:
# Identify this run: the number goes into the run name, the note records what changed.
experimentNumber = 13
note = "setting encoders to 6, setting heads to 6"
wandb.init(
    name="transformerEncoderTry{}".format(experimentNumber),
    notes=note,
    project="Experience Machine",
    group="Transformer Encoder",
)

[34m[1mwandb[0m: Currently logged in as: [33mmcniz[0m (use `wandb login --relogin` to force relogin)


# Time Functions

good

In [None]:
#sinusoidal positional encoding from Attention Is All You Need (section 3.5)
def encode_position(data): #data shape(batch, seq, feat)
  """Return `data` with a sinusoidal positional encoding added along the sequence axis.

  For position `pos` on axis 1 and feature index `j` on axis 2 (with d = number of
  features) the value added is

      sin(pos / 10000 ** (2 * (j // 2) / d))   when j is even
      cos(pos / 10000 ** (2 * (j // 2) / d))   when j is odd

  so each sin/cos pair of neighbouring features shares one wavelength. The same
  (seq, feat) table is broadcast over the batch axis.

  Args:
    data: array-like of shape (batch, seq, feat).

  Returns:
    A new numpy array with the same shape as `data`. The argument is not modified,
    so calling this twice on the same loaded array does not stack the encoding.
    Floating-point inputs keep their dtype.
  """
  data = np.asarray(data)
  seq_len, n_feat = data.shape[1], data.shape[2]

  positions = np.arange(seq_len, dtype=np.float64)[:, np.newaxis]  #(seq, 1)
  channels = np.arange(n_feat)[np.newaxis, :]                      #(1, feat)

  #position is divided by the wavelength term; it is not raised to a power
  angles = positions / np.power(10000.0, 2 * (channels // 2) / n_feat)  #(seq, feat)
  encoding = np.where(channels % 2 == 0, np.sin(angles), np.cos(angles))

  if np.issubdtype(data.dtype, np.floating):
    #avoid silently promoting e.g. float32 data to float64
    encoding = encoding.astype(data.dtype)

  return data + encoding

meh

In [None]:
# Disabled experiment: the whole cell is a single triple-quoted string, so time2vec is
# never defined -- the cell only evaluates to (and echoes) the text below.
# NOTE(review): as written, time_linear is built but never used; only time_periodic is
# concatenated onto data.
'''
#recreation of linear periodic method
def time2vec(data): #data comes in form of (batch, seq, feat)
  x = tf.math.reduce_mean(data, axis=-1) #from (batch, seq, feat) to (batch, seq)
  
  time_linear = []
  for i in range(x.shape[1]):
    time_linear.append(x[:,i] * i)
  time_linear = tf.expand_dims(time_linear, axis = -1) #from (batch, seq) to (batch, seq, 1)
  

  time_periodic = tf.math.sin(x)
  time_periodic = tf.expand_dims(time_periodic, axis = -1) #from (batch, seq) to (batch, seq, 1)
  return Concatenate(axis=-1)([data, time_periodic]) 
  '''

'\n#recreation of linear periodic method\ndef time2vec(data): #data comes in form of (batch, seq, feat)\n  x = tf.math.reduce_mean(data, axis=-1) #from (batch, seq, feat) to (batch, seq)\n  \n  time_linear = []\n  for i in range(x.shape[1]):\n    time_linear.append(x[:,i] * i)\n  time_linear = tf.expand_dims(time_linear, axis = -1) #from (batch, seq) to (batch, seq, 1)\n  \n\n  time_periodic = tf.math.sin(x)\n  time_periodic = tf.expand_dims(time_periodic, axis = -1) #from (batch, seq) to (batch, seq, 1)\n  return Concatenate(axis=-1)([data, time_periodic]) \n  '

In [None]:
# Disabled experiment: the whole cell is a single triple-quoted string, so
# encode_position2 is never defined -- the cell only evaluates to the text below.
# The sketch builds a nested (batch, position, feature) list of
# sin(pow(i/10000, 2*j/n_features)) values and concatenates it onto data along the
# last axis rather than adding it.
'''
#another interpretation of attention is all you need positional encoding
def encode_position2(data): #data shape(batch, seq, feat)
  kth = []
  for k in range(data.shape[0]):
    ith = []
    for i in range(data.shape[1]):
      jth = []
      for j in range(data.shape[2]):
        jth.append(tf.math.sin(pow(i/10000, 2*j/data.shape[2])))
      ith.append(jth)
    kth.append(ith)
  
  return Concatenate(axis=-1)([data, kth]) 
  '''

'\n#another interpretation of attention is all you need positional encoding\ndef encode_position2(data): #data shape(batch, seq, feat)\n  kth = []\n  for k in range(data.shape[0]):\n    ith = []\n    for i in range(data.shape[1]):\n      jth = []\n      for j in range(data.shape[2]):\n        jth.append(tf.math.sin(pow(i/10000, 2*j/data.shape[2])))\n      ith.append(jth)\n    kth.append(ith)\n  \n  return Concatenate(axis=-1)([data, kth]) \n  '

# Transformer Functions

In [None]:
def pay_attention(input, d_k, d_v): #inputs = (in_seq, in_seq, in_seq)
  """Build a single attention head from three source tensors.

  Args:
    input: sequence of three Keras tensors used as the sources of the query, key and
      value projections, in that order.
    d_k: width of the query and key projections.
    d_v: width of the value projection, and therefore of this head's output.

  Returns:
    The output tensor of the Keras `Attention` layer for this head.
  """
  #base process
  q = Dense(d_k, bias_initializer='glorot_uniform')(input[0])
  k = Dense(d_k, bias_initializer='glorot_uniform')(input[1])
  v = Dense(d_v, bias_initializer='glorot_uniform')(input[2])

  #Keras' Attention layer takes [query, value, key], NOT [query, key, value]:
  #passing [q, k, v] scores q against v and only works while d_k == d_v.
  return Attention(use_scale=True)([q, v, k]) #expects q.shape(batch, Tq, dim)

In [None]:
def build_attention_encoder(input, d_k, d_v, n_heads, ff_dim, dropout=0.1): #input = (in_seq, in_seq, in_seq)
  """Build one encoder block: multi-head attention followed by a feed-forward net.

  Both sub-layers are wrapped in a residual connection and layer normalisation.

  Args:
    input: sequence of three Keras tensors handed to every attention head;
      input[0] also feeds the first residual connection and fixes the output width.
    d_k, d_v: projection widths passed through to `pay_attention`.
    n_heads: number of attention heads built and concatenated.
    ff_dim: width of the hidden feed-forward layer.
    dropout: rate used by both Dropout layers.

  Returns:
    Keras tensor whose last dimension equals that of input[0].
  """
  residual_in = input[0]
  model_dim = residual_in.shape[-1]

  #multi-head attention: build the heads, concatenate, project back to model width
  head_outputs = [pay_attention(input, d_k, d_v) for _ in range(n_heads)]
  attended = Concatenate(axis=-1)(head_outputs)
  attended = Dense(model_dim, bias_initializer='glorot_uniform')(attended)
  attended = Dropout(dropout)(attended) #dropout on the sub-layer output, before the residual add
  attended = LayerNormalization()(residual_in + attended) #residual connection

  #position-wise feed-forward net
  hidden = Dense(ff_dim, activation='relu')(attended)
  hidden = Dropout(dropout)(hidden) #extra dropout between the two feed-forward layers
  hidden = Dense(model_dim)(hidden)
  return LayerNormalization()(attended + hidden) #residual connection

In [None]:
def build_encoder_stack_model():
  """Assemble and compile the stacked-encoder classifier.

  Reads the notebook-level settings SequenceLength, Features, nEncoders, dK, dV,
  nHeads and ffDim, so those must be defined before this is called.

  Returns:
    A compiled tf.keras.Model mapping (batch, SequenceLength, Features) inputs to a
    sigmoid output with one unit per timestep.
  """
  #series function: input -> layer norm -> nEncoders encoder blocks -> dense head
  inputs = tf.keras.Input(shape=(SequenceLength, Features))
  hidden = LayerNormalization()(inputs)
  #hidden = encode_position(hidden)

  for _ in range(nEncoders):
    #extra residual connection around each whole encoder block (experimental)
    encoded = build_attention_encoder([hidden, hidden, hidden], dK, dV, nHeads, ffDim)
    hidden = hidden + encoded

  hidden = Dropout(0.1)(hidden)
  hidden = Dense(ffDim, activation='relu')(hidden)
  hidden = Dropout(0.1)(hidden)
  prediction = Dense(1, activation='sigmoid')(hidden)

  model = tf.keras.Model(inputs=inputs, outputs=prediction)
  #model.summary()
  model.compile(
      optimizer='Nadam',
      loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=.1),
      metrics=[tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
  )
  return model

# Analysis Functions

In [None]:
def make_binary_confusion_matrix(truth, prediction): #assuming shape(none, seqlen, 1) to start
  """Print a 2x2 confusion matrix for per-timestep binary predictions.

  Only channel 0 of the last axis is scored. A prediction counts as positive when it
  is >= .5 and as negative when it is < .5; a label counts only when it is exactly
  1 or 0. Entries matching neither (other label values, NaN predictions) are left
  out of the table and show up as a gap between the two reported totals.

  Args:
    truth: array-like of 0/1 labels, indexed as [sample, timestep, channel].
    prediction: array-like of scores with the same shape as `truth`.

  Returns:
    None. The result is printed. If the shapes differ, only the error message is
    printed and no table is produced.
  """
  truth = np.asarray(truth)
  prediction = np.asarray(prediction)

  if truth.shape != prediction.shape:
    print("you fucked up. truth and perdictions aren't the same shape")
    return #an all-zero table would only be misleading

  labels = truth[:, :, 0]
  scores = prediction[:, :, 0]
  said_yes = scores >= .5
  said_no = scores < .5 #not simply ~said_yes: NaN scores belong to neither side
  is_yes = labels == 1
  is_no = labels == 0

  tp = int(np.count_nonzero(is_yes & said_yes))
  fp = int(np.count_nonzero(is_no & said_yes))
  tn = int(np.count_nonzero(is_no & said_no))
  fn = int(np.count_nonzero(is_yes & said_no))

  counted = tp+fp+tn+fn
  total = truth.shape[0]*truth.shape[1]*truth.shape[2]
  if counted == total:
    print("all acounted for")
  else:
    print("not all accounted for")
  print("in the table: ", counted)
  print("in existence: ", total)

  print("-------------------------")
  print("--------pred Y---pred N--")
  print("true Y | ", tp, " | ", fn)
  print("true N | ", fp, " | ", tn)
  print("-------------------------")

In [None]:
def visualize(truth, pred, check_length = 10):
  """Plot labels against predictions for the first `check_length` samples.

  One subplot per sample, laid out in a single row, each showing the label trace and
  the prediction trace over the timestep index.

  Args:
    truth: array indexed as [sample, timestep, ...] holding the labels.
    pred: array of predictions; assumed to share truth's layout -- TODO confirm at call sites.
    check_length: number of leading samples to plot; both arrays need at least this many.

  Returns:
    The matplotlib Figure containing the subplots.
  """
  #squeeze=False always yields a 2-D axes array, so check_length=1 works too
  fig, axs = plt.subplots(1, check_length, squeeze=False)
  fig.suptitle('experiment results')
  #take the x-axis from the data itself instead of the notebook-level SequenceLength
  x = range(truth.shape[1])
  for i in range(check_length):
    axs[0, i].plot(x, truth[i,:], label="signal label")
    axs[0, i].plot(x, pred[i,:], label="signal prediction")

  return fig

# Data Prep

In [None]:
# Colab-only: mount Google Drive at /content/drive, where the data-loading cell reads
# its .npy files from.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Training split: inputs and per-timestep labels.
pathData = "/content/drive/My Drive/Colab Notebooks/BCI/prepedbci/dataTrain.npy"
pathLabel = "/content/drive/My Drive/Colab Notebooks/BCI/prepedbci/labelTrain.npy"
data, label = np.load(pathData), np.load(pathLabel)

# Test split, used as the validation set during fitting.
pathData = "/content/drive/My Drive/Colab Notebooks/BCI/prepedbci/dataTest.npy"
pathLabel = "/content/drive/My Drive/Colab Notebooks/BCI/prepedbci/labelTest.npy"
valD, valL = np.load(pathData), np.load(pathLabel)

print(data.shape, label.shape, valD.shape, valL.shape)

# Positional encoding is applied to the inputs only; labels stay as loaded.
data, valD = encode_position(data), encode_position(valD)

#print(dataTrain.shape, labelTrain.shape, dataVal.shape, labelVal.shape)

(41780, 256, 22) (41780, 256, 1) (9560, 256, 22) (9560, 256, 1)


In [None]:
# Training schedule.
Generations, BatchSize = 32, 128

# Input dimensions are read off the loaded training array.
SequenceLength, Features = data.shape[1], data.shape[-1]
#Stride = 23

# Transformer sizes.
dK = dV = 256
ffDim = 1024
nHeads = nEncoders = 6

# Copy every setting onto the W&B run config.
config = wandb.config
config.epochs = Generations
config.batchSize = BatchSize
config.seqLen = SequenceLength
config.features = Features
config.dK = dK
config.dV = dV
config.ffDim = ffDim
config.nHeads = nHeads
config.nEncoders = nEncoders

# Testing

In [None]:
# Build the encoder stack and fit it, validating on the test split after each epoch.
model = build_encoder_stack_model()
model.fit(
    x=data,
    y=label,
    batch_size=BatchSize,
    epochs=Generations,
    validation_data=(valD, valL),
    verbose=2,
    callbacks=[WandbCallback()],
    shuffle=True,
)

# Validation split: confusion matrix plus a sample of label/prediction traces.
prediction1 = model.predict(valD)
make_binary_confusion_matrix(valL, prediction1)
fig1 = visualize(valL, prediction1)

# Training split: same report, to compare against validation.
prediction2 = model.predict(data)
make_binary_confusion_matrix(label, prediction2)
fig2 = visualize(label, prediction2)

wandb.log({"validation prediction": fig1, "training prediction": fig2})

Epoch 1/32
327/327 - 306s - loss: 0.6726 - precision: 0.3789 - recall: 0.0190 - val_loss: 0.6694 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/32
327/327 - 289s - loss: 0.6653 - precision: 0.4822 - recall: 0.0019 - val_loss: 0.6792 - val_precision: 0.6000 - val_recall: 1.3158e-05
Epoch 3/32
327/327 - 288s - loss: 0.6616 - precision: 0.5126 - recall: 0.0213 - val_loss: 0.6707 - val_precision: 0.3642 - val_recall: 0.0030
Epoch 4/32
327/327 - 289s - loss: 0.6566 - precision: 0.5359 - recall: 0.0710 - val_loss: 0.6742 - val_precision: 0.3180 - val_recall: 0.0120
Epoch 5/32
327/327 - 288s - loss: 0.6507 - precision: 0.5589 - recall: 0.1278 - val_loss: 0.6809 - val_precision: 0.3460 - val_recall: 0.0314
Epoch 6/32
327/327 - 289s - loss: 0.6432 - precision: 0.5782 - recall: 0.1864 - val_loss: 0.6854 - val_precision: 0.3455 - val_recall: 0.0498
Epoch 7/32
327/327 - 289s - loss: 0.6342 - precision: 0.5969 - recall: 0.2510 - val_loss: 0.6860 - val_precision: 0.3672 - val_recall: 0

Note: every signal has a start and a stop, and they always come as a pair, so if the model can identify those two events, the span in between can simply be filled with the "on" state. If the previous state is off, look only for an "on" transition; if the previous state is on, look only for an "off" transition. This approach would be invariant to signal length.

Note: possibly add a decisive function that snaps each prediction to either 1 or 0.