In [45]:
import pickle
import numpy as np
import torch
import pandas as pd
from torch import nn

In [46]:
# Load the precomputed UTR embedding bundle. The pickle holds four objects:
#   genes          - gene identifiers, zipped position-by-position with `tokenized` below
#   tokenized      - one token sequence per entry of `genes`
#   embedding_keys - presumably the vocabulary behind the embedding rows (TODO confirm)
#   embedding_mat  - 2-D embedding matrix, later used as frozen Embedding weights
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('../processed_data/utrs_embeddings_5_10_50.pkl', 'rb') as inp:
    genes, tokenized, embedding_keys, embedding_mat = pickle.load(inp)

In [47]:
# Load the fluorescence measurements; rows without an intensity value are dropped.
fDf = pd.read_csv('../processed_data/fluorescence_dataset.csv')
fDf = fDf.dropna(subset=['intensity'])

# Genes present both in the embedding bundle and in the fluorescence table.
common_geneset = set(genes) & set(fDf['gene'])

# One boolean per entry of `genes` (same order); it drives the filtering of
# both the gene list and the tokenized sequences so they stay aligned.
seq_geneset_mask = [gene in common_geneset for gene in genes]
seq_geneset = [gene for gene, keep in zip(genes, seq_geneset_mask) if keep]
tokenized = [seq for seq, keep in zip(tokenized, seq_geneset_mask) if keep]

# Re-index by gene and reorder the rows to follow `seq_geneset`, so that row i
# of `fDf` matches `tokenized[i]`.
# NOTE(review): this assumes each gene occurs once in the CSV; duplicated genes
# would make .loc return extra rows and break the alignment - confirm.
fDf = fDf.set_index('gene').loc[seq_geneset, :]

In [48]:
# Keep only strictly positive intensities: negative intensities make no sense,
# and the log transform applied afterwards needs values > 0.
# NOTE: rows with an intensity of exactly 0 are removed as well, even though
# the message below only mentions negative values.
mask = fDf['intensity'] > 0
n_removed = sum(~mask)
print(f"Removing {n_removed} rows that have negative intensity")

# Apply the same row filter to the table and to the aligned token sequences.
fDf = fDf[mask]
tokenized = [seq for keep, seq in zip(mask, tokenized) if keep]

# Sanity check: both lengths must match.
len(fDf), len(tokenized)

Removing 106 rows that have negative intensity


(3646, 3646)

In [49]:
# Natural log of the intensity; this becomes the regression-style target `y`.
fDf['logInt'] = np.log(fDf['intensity'])

# Treat log-intensities at or below -20 as outliers.
maskLog = fDf['logInt'] > -20
n_outliers = sum(~maskLog)
print(f"Removing {n_outliers} rows that have outliers log Intensity values <= -20")

# Filter the table and the aligned token sequences with the same mask.
fDf = fDf[maskLog]
tokenized = [seq for keep, seq in zip(maskLog, tokenized) if keep]

# Independent copy of the target column.
y = fDf['logInt'].copy()

# Sanity check: both lengths must match.
len(fDf), len(tokenized)

Removing 6 rows that have outliers log Intensity values <= -20


(3640, 3640)

In [72]:
# Bin edges at the 0%, 10%, ..., 90% quantiles of the log-intensity.
decile_edges = np.quantile(y, np.arange(0, 1, 0.1))

# Decile index of every sample: 1 for the lowest tenth, 10 for the highest.
y_dig = np.digitize(y, decile_edges)

# Binary label: True when the sample sits in decile 5 or above, i.e. at or
# above the 40% quantile.
# NOTE(review): this yields a roughly 60/40 split rather than a median split -
# confirm that `>= 5` (and not `> 5`) is intended.
y_bin = y_dig >= 5

In [50]:
# Training hyper-parameters, collected in one place.
# NOTE(review): the meanings below are inferred from the names only - confirm.

# Data shaping
BATCH_SIZE = 32      # presumably samples per optimisation step
SEQ_LEN = 35         # presumably a fixed sequence/window length

# Optimisation
LR = 20              # presumably a learning rate; unusually large - verify the optimizer it targets
CLIP = 0.25          # presumably a gradient-clipping threshold

# Reporting
LOG_INTERVAL = 200   # presumably the number of steps between progress logs

In [81]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for testing.
# train_test_split returns the pieces grouped per input array, i.e.
# (x_train, x_test, y_train, y_test). Unpacking in any other order silently
# binds sequences to the y_* names and targets to the x_* names.
# A fixed random_state makes the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    tokenized, y, test_size=0.1, random_state=42
)

In [85]:
from tensorflow.keras.utils import to_categorical
from tensorflow.ragged import constant

In [None]:
# Requires TensorFlow 2.x: the `mixed_precision.Policy` / `set_global_policy`
# API used below is not available in TF 1.x / Keras 2.0-2.2 (it is believed to
# have become stable in TF 2.4 - confirm against the installed version).
# `tf` is imported explicitly because `create_models` calls
# `tf.keras.backend.sum`; without this import the cell only works when `tf`
# happens to be left over in the kernel from an earlier session.
import tensorflow as tf
from keras.models import Model
from keras.layers import *  # provides Input, Embedding, LSTM, Bidirectional, Dense, ... used below
import numpy as np
from tensorflow.keras import mixed_precision

# NOTE(review): this cell mixes the standalone `keras` package with
# `tensorflow.keras`; that only works when both resolve to the same Keras
# implementation - confirm for the installed versions.

# Global mixed-precision policy: layers compute in float16 while keeping their
# variables in float32. The model's final activation overrides this with an
# explicit float32 dtype.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)
def create_models():
    """Build a BiLSTM binary classifier that pools timesteps with learned attention.

    The network reads a variable-length sequence of token indices, embeds them
    with the frozen, pre-trained module-level `embedding_mat`, encodes them
    with a bidirectional LSTM, and collapses the time axis with a softmax
    weighted sum before a sigmoid output.

    Returns:
        (model, model_with_attention_output): two Keras models sharing the
        same layers. `model` outputs the sigmoid prediction and is compiled
        with Adam and binary cross-entropy. `model_with_attention_output`
        outputs `[prediction, attention_weights]` and is left uncompiled.
    """
    # Token indices; (None,) lets Keras accept any sequence length.
    token_ids = Input(shape=(None,))

    # Look up the pre-trained vectors; the table is not updated during training.
    # Shape: (batch, time, embedding_mat.shape[1])
    embedded = Embedding(input_dim=embedding_mat.shape[0],
                         output_dim=embedding_mat.shape[1],
                         weights=[embedding_mat],
                         trainable=False)(token_ids)

    # Encode every timestep; 64 units per direction, concatenated by the
    # Bidirectional default. Shape: (batch, time, 128)
    encoded = Bidirectional(LSTM(units=64, return_sequences=True))(embedded)

    ## Attention block
    # One unnormalised score per timestep. Shape: (batch, time, 1)
    scores = TimeDistributed(Dense(units=1))(encoded)
    # Softmax along the time axis (axis=1) turns the scores into weights that
    # sum to 1 over the sequence. Shape unchanged: (batch, time, 1)
    weights = Softmax(axis=1)(scores)
    # Scale each encoded timestep by its weight (broadcast over features).
    # Shape: (batch, time, 128)
    weighted = Multiply()([encoded, weights])
    # Sum over time to obtain a single context vector. Shape: (batch, 128)
    context = Lambda(lambda t: tf.keras.backend.sum(t, axis=1))(weighted)
    ## End of attention block

    # Classification head; the sigmoid is forced to float32 so the output is
    # full precision even under the global mixed_float16 policy.
    logit = Dense(units=1)(context)
    prob = Activation('sigmoid', dtype='float32', name='predictions')(logit)

    model = Model(inputs=token_ids, outputs=prob)
    model_with_attention_output = Model(inputs=token_ids, outputs=[prob, weights])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model, model_with_attention_output

# Build the classifier and its attention-exposing twin (they share layers).
model, model_with_attention_output = create_models()


# Pack the variable-length token sequences into a ragged int16 tensor and
# train on the full dataset against the binary labels.
ragged_tokens = constant([np.array(t) for t in tokenized], dtype='int16')
model.fit(ragged_tokens, np.array(y_bin), batch_size=32, epochs=30)

# Show the attention weights (second model output) for a toy 3-token input.
toy_attention = model_with_attention_output.predict(np.array([[1,2,3]]), batch_size=1)[1]
print ('Attention Over each word: ', toy_attention)

Epoch 1/30




Epoch 2/30
  6/114 [>.............................] - ETA: 4:22 - loss: 0.6535