In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam

import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, KFold
import collections
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow import cast,float32
from keras import backend as kb
from statistics import mean, stdev

In [None]:
def build_kmers(x, k):
    """Split every sequence in ``x`` into its overlapping k-mers.

    Args:
        x: Iterable of sliceable sequences (for example strings).
        k: Window length of each k-mer.

    Returns:
        A list with one entry per input sequence. Each entry lists every
        length-``k`` window of that sequence, taken left to right with a
        stride of one. A sequence shorter than ``k`` yields an empty list.
    """
    return [
        [seq[start:start + k] for start in range(len(seq) - k + 1)]
        for seq in x
    ]

class Vocab:  #@save
    """Two-way mapping between tokens and integer indices.

    Index 0 always belongs to ``'<unk>'``. Reserved tokens come next, followed
    by corpus tokens in descending order of frequency.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary.

        Args:
            tokens: Corpus handed to ``count_corpus`` (a flat list of tokens
                or a list of token lists). ``None`` means an empty corpus.
            min_freq: Corpus tokens occurring fewer times than this are left
                out of the index.
            reserved_tokens: List of tokens placed right after ``'<unk>'``
                regardless of their frequency. ``None`` means no such tokens.
        """
        tokens = [] if tokens is None else tokens
        reserved_tokens = [] if reserved_tokens is None else reserved_tokens

        frequencies = count_corpus(tokens)
        self._token_freqs = sorted(
            frequencies.items(), key=lambda pair: pair[1], reverse=True
        )

        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = dict(
            (tok, pos) for pos, tok in enumerate(self.idx_to_token)
        )
        for tok, count in self._token_freqs:
            # Frequencies are sorted in descending order, so everything after
            # the first too-rare token is too rare as well.
            if count < min_freq:
                break
            if tok in self.token_to_idx:
                continue
            self.token_to_idx[tok] = len(self.idx_to_token)
            self.idx_to_token.append(tok)

    def __len__(self):
        """Return the number of indexed tokens, ``'<unk>'`` included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Look up the index of a token, or of each token in a list/tuple.

        Tokens missing from the vocabulary map to ``self.unk``.
        """
        if isinstance(tokens, (list, tuple)):
            return [self.__getitem__(tok) for tok in tokens]
        return self.token_to_idx.get(tokens, self.unk)

    def to_tokens(self, indices):
        """Translate one index, or a list/tuple of indices, back to tokens."""
        if isinstance(indices, (list, tuple)):
            return [self.idx_to_token[pos] for pos in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):
        """Index used for unknown tokens."""
        return 0

    @property
    def token_freqs(self):
        """List of ``(token, count)`` pairs, most frequent first."""
        return self._token_freqs

def count_corpus(tokens):  #@save
    """Count how often each token occurs.

    Args:
        tokens: Either a flat list of tokens or a list of token lists. The
            nested form is detected by looking at the first element, and an
            empty input is handled the same way as the nested form.

    Returns:
        A ``collections.Counter`` mapping each token to its count.
    """
    is_nested = len(tokens) == 0 or isinstance(tokens[0], list)
    if is_nested:
        flattened = []
        for line in tokens:
            flattened.extend(line)
        tokens = flattened
    return collections.Counter(tokens)

def truncate_pad(line, num_steps, padding_token):
    """Force ``line`` to exactly ``num_steps`` items.

    Args:
        line: List of items (token indices in this notebook).
        num_steps: Target length.
        padding_token: Value appended on the right when ``line`` is too short.

    Returns:
        A new list: the first ``num_steps`` items when ``line`` is too long,
        otherwise ``line`` followed by as many ``padding_token`` entries as
        needed.
    """
    missing = num_steps - len(line)
    if missing < 0:
        return line[:num_steps]
    return line + [padding_token] * missing

def seed_tensorflow(seed=42):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Also sets the ``TF_DETERMINISTIC_OPS`` and ``PYTHONHASHSEED`` environment
    variables for the current process.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Imported here because the notebook's import cell provides neither module.
    import os
    import random

    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    # TF2 spelling; the TF1-style top-level ``tf.set_random_seed`` is not
    # available in the TF2 namespace this notebook relies on.
    tf.random.set_seed(seed)
    
class TransformerBlock(layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    The output of each of the two stages goes through dropout, is added back
    to that stage's input and is then layer-normalized.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        """Create the sub-layers.

        Args:
            embed_dim: Used as ``key_dim`` of the attention layer and as the
                width of the feed-forward output layer.
            num_heads: Number of attention heads.
            ff_dim: Width of the ReLU hidden layer in the feed-forward network.
            rate: Dropout rate used after both stages.
        """
        super().__init__()
        self.att = layers.MultiHeadAttention(key_dim=embed_dim, num_heads=num_heads)
        feed_forward_layers = [
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
        self.ffn = keras.Sequential(feed_forward_layers)
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        """Apply the block; ``training`` is forwarded to both dropout layers."""
        # Stage 1: self-attention (query and value are both ``inputs``).
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        residual = self.layernorm1(inputs + attended)
        # Stage 2: position-wise feed-forward network.
        transformed = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + transformed)

class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        """Create the two embedding tables.

        Args:
            maxlen: Number of rows in the position table.
            vocab_size: Number of rows in the token table.
            embed_dim: Width of both embeddings.
        """
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed ``x`` and add position embeddings along its last axis.

        The size of the last axis of ``x`` is taken as the sequence length.
        """
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors
    
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with warm-up followed by inverse-sqrt decay.

    ``lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)``
    """

    def __init__(self, d_model, warmup_steps=4000):
        """Store the schedule parameters.

        Args:
            d_model: Scaling constant; the rate is multiplied by
                ``d_model**-0.5``.
            warmup_steps: Step count at which the two terms of the ``min``
                meet, i.e. where the schedule switches from growth to decay.
        """
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for ``step``."""
        # Optimizers may hand over their integer iteration counter, and
        # ``tf.math.rsqrt`` only accepts floating-point input.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {
            "d_model": float(self.d_model),
            "warmup_steps": self.warmup_steps,
        }

def test_rmse(model,X_test,Y_test):
    test_preds = model.predict(X_test)
    mse = mean_squared_error(Y_test, test_preds)
    rmse = sqrt(mse)
    return rmse
    
def root_mean_squared_error(y_true, y_pred):
    """RMSE loss: square root of the mean squared difference.

    ``y_true`` is cast to float32 before the subtraction; the mean is taken
    over all elements.
    """
    target = cast(y_true,float32)
    squared_error = kb.square(y_pred - target)
    return kb.sqrt(kb.mean(squared_error))

In [None]:
# Load the reads table: 'sequence' is the model input, 'copy_number' the target.
data = pd.read_csv("~/autodl-tmp/full_length_reads.csv")
X, Y = data['sequence'], data['copy_number']

# Hold out 20% of the rows with a fixed random state.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Tokenize every sequence into overlapping 6-mers.
train_arr, test_arr = build_kmers(X_train, 6), build_kmers(X_test, 6)


In [None]:
# Build the vocabulary from the training k-mers only. '<pad>' and '<eos>' are
# registered as reserved tokens so that the vocab['<pad>'] / vocab['<eos>']
# lookups used when encoding get their own indices instead of silently
# falling back to '<unk>' (index 0).
vocab = Vocab(train_arr, min_freq=200, reserved_tokens=['<pad>', '<eos>'])
print(list(vocab.token_to_idx.items())[:10])

In [None]:
# Fixed input length: every encoded read is cut or padded to this many steps.
num_steps = 1600
maxlen = num_steps

# Per read: map k-mers to vocabulary indices, append the '<eos>' index, then
# truncate or right-pad with the '<pad>' index to num_steps entries.
x_train = [
    truncate_pad(vocab[kmers] + [vocab['<eos>']], num_steps, vocab['<pad>'])
    for kmers in train_arr
]

x_test = [
    truncate_pad(vocab[kmers] + [vocab['<eos>']], num_steps, vocab['<pad>'])
    for kmers in test_arr
]

In [None]:
# Run both encoded splits through pad_sequences; x_val is built from the test split.
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

In [None]:
def create_model(embed_dim, num_heads, ff_dim, vocab_size, d_model):
    """Build and compile the transformer regression model.

    The input length is read from the module-level ``maxlen``.

    Args:
        embed_dim: Embedding width, also passed to the transformer block.
        num_heads: Number of attention heads in the transformer block.
        ff_dim: Hidden width of the block's feed-forward network.
        vocab_size: Number of rows in the token embedding table.
        d_model: Scaling constant passed to ``CustomSchedule``.

    Returns:
        A compiled ``keras.Model`` with a single linear output, trained with
        Adam on ``root_mean_squared_error``.
    """
    inputs = layers.Input(shape=(maxlen,))
    embedded = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(inputs)
    encoded = TransformerBlock(embed_dim, num_heads, ff_dim)(embedded)

    # Average over the sequence axis, then a small dense regression head.
    features = layers.GlobalAveragePooling1D()(encoded)
    for units in (64, 32):
        features = layers.Dense(units, activation="relu")(features)
    outputs = layers.Dense(1, activation="linear")(features)

    model = keras.Model(inputs=inputs, outputs=outputs)
    optimizer = tf.keras.optimizers.Adam(
        CustomSchedule(d_model),
        beta_1=0.9,
        beta_2=0.98,
        epsilon=1e-9,
        global_clipnorm=0.5,
    )
    model.compile(optimizer=optimizer, loss=root_mean_squared_error)
    return model

# temp_learning_rate_schedule = CustomSchedule(d_model)
# plt.plot(temp_learning_rate_schedule(tf.range(40000, dtype=tf.float32)))
# plt.ylabel("Learning Rate")
# plt.xlabel("Train Step")

In [None]:
# Cut X and Y into five contiguous, unshuffled folds of `multiplicand` rows
# each; any rows beyond 5 * multiplicand are not part of a fold.
multiplicand = int(X.shape[0]*0.2)
X_list = [X[fold * multiplicand:(fold + 1) * multiplicand] for fold in range(5)]
Y_list = [Y[fold * multiplicand:(fold + 1) * multiplicand] for fold in range(5)]

In [None]:
# Model and training hyper-parameters.
embed_dim = 32
num_heads = 2 # Number of Attention Heads
ff_dim = 128  # Hidden layer size in FFN
d_model = 128  # Scaling constant handed to CustomSchedule via create_model
num_steps = 1600
maxlen = num_steps
epoch_num = 20

# 5-fold cross-validation: fold i is held out, the other four are trained on.
rmse = []
for i in range(0,5,1):
    X_test = X_list[i].values
    Y_test = Y_list[i]
    X_train = pd.concat([X_list[j] for j in range(5) if j != i], axis=0).values
    Y_train = pd.concat([Y_list[j] for j in range(5) if j != i], axis=0)

    train_arr = build_kmers(X_train, 6)
    test_arr = build_kmers(X_test, 6)

    # The vocabulary is rebuilt from this fold's training data, so its size
    # must be measured per fold: an embedding table sized from an earlier
    # vocabulary can be too small for the indices produced here.
    # '<pad>' and '<eos>' are reserved so they do not collapse onto '<unk>'.
    vocab = Vocab(train_arr, min_freq=200, reserved_tokens=['<pad>', '<eos>'])
    vocab_size = len(vocab)

    # Encode: k-mers -> indices, append '<eos>', truncate/pad to num_steps.
    x_train = [
        truncate_pad(vocab[kmers] + [vocab['<eos>']], num_steps, vocab['<pad>'])
        for kmers in train_arr
    ]
    x_test = [
        truncate_pad(vocab[kmers] + [vocab['<eos>']], num_steps, vocab['<pad>'])
        for kmers in test_arr
    ]
    # Every list already has num_steps entries, so no further padding or
    # truncation happens here.
    x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen)
    x_val = keras.preprocessing.sequence.pad_sequences(x_test, maxlen)

    model = create_model(embed_dim,num_heads,ff_dim,vocab_size,d_model)
    model.fit(x_train, Y_train, batch_size=64, epochs=epoch_num, validation_data=(x_val, Y_test), verbose=0)
    rmse.append(test_rmse(model,x_val,Y_test))
    print(rmse[i])

In [None]:
# Persist the per-fold RMSE values as a one-column CSV without the index.
pd.DataFrame({"Transformer": rmse}).to_csv(
    "transformer_full_length.csv", index=False
)