In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import keras_nlp
import os
import pickle
import random
import pandas as pd

def main():
    """Run the whole pipeline: data preparation, tokenization, then train or evaluate.

    When ``WILL_DIRECTLY_EVALUATE`` is True a previously saved model is loaded
    from ``./lstm_model_2.h5`` and evaluated on the test split.  Otherwise a
    new model is built, trained, its training history pickled to
    ``./trainHistoryDict``, the model saved to ``./lstm_model_2.h5`` and
    evaluated on the validation split.
    """
    # Seeding must happen before any dataset or model object is created.
    make_reproducible()

    # Manual switch: set to True to skip training and evaluate the saved model.
    WILL_DIRECTLY_EVALUATE = False

    prepare_actual_datasets()
    (
        BATCH_SIZE,
        SEQ_LEN,
        MIN_TRAINING_SEQ_LEN,
        EMBED_DIM,
        LSTM_DIM,
        NUM_LAYERS,
        VOCAB_SIZE,
        EPOCHS,
        NUM_TOKENS_TO_GENERATE,
        LR,
    ) = initialize_hyper_parameters()

    raw_train_ds, raw_val_ds, raw_test_ds = prepare_dataset(MIN_TRAINING_SEQ_LEN, BATCH_SIZE)
    print('Dataset was prepared')
    vocab = get_vocab(raw_train_ds, VOCAB_SIZE)
    print('Vocabulary was prepared')
    tokenizer = get_tokenizer(vocab, SEQ_LEN)
    print('Tokenizer was prepared')
    train_ds, val_ds, test_ds = tokenize_dataset(SEQ_LEN, tokenizer, raw_train_ds, raw_val_ds, raw_test_ds)
    print('Dataset was tokenized')

    if WILL_DIRECTLY_EVALUATE:
        print('Model is being read')
        # NOTE(review): loading may need the keras_nlp custom layers/metrics
        # to be resolvable by the loader -- confirm.
        model = keras.models.load_model("./lstm_model_2.h5")
        print('Model was read')
        do_inference(model, tokenizer, NUM_TOKENS_TO_GENERATE, test_ds, 'Test')
    else:
        model = build_model(VOCAB_SIZE, SEQ_LEN, EMBED_DIM, NUM_LAYERS, LSTM_DIM, LR)
        print('Model was built')
        history = train_model(model, train_ds, val_ds, EPOCHS)
        # Persist the per-epoch metrics so they can be inspected later.
        with open('./trainHistoryDict', 'wb') as file_pi:
            pickle.dump(history.history, file_pi)
        print('Model was trained')
        model.save("./lstm_model_2.h5")
        do_inference(model, tokenizer, NUM_TOKENS_TO_GENERATE, val_ds, 'Validation')
    print('Inference was made')

def initialize_hyper_parameters():
    """Return the fixed hyper-parameters of the pipeline as a tuple.

    Returns:
        A 10-tuple in this order: BATCH_SIZE, SEQ_LEN, MIN_TRAINING_SEQ_LEN,
        EMBED_DIM, LSTM_DIM, NUM_LAYERS, VOCAB_SIZE, EPOCHS,
        NUM_TOKENS_TO_GENERATE, LR.
    """
    seq_len = 2**10 - 1

    # Insertion order defines the order of the returned tuple.
    params = {
        # Data
        'BATCH_SIZE': 64,
        'SEQ_LEN': seq_len,
        'MIN_TRAINING_SEQ_LEN': 2,
        # Model
        'EMBED_DIM': 256,
        'LSTM_DIM': 256,
        'NUM_LAYERS': 2,
        'VOCAB_SIZE': 8,
        # Training
        'EPOCHS': 40,
        # Inference: generation length is one less than the sequence length.
        'NUM_TOKENS_TO_GENERATE': seq_len - 1,
        'LR': 1e-3,
    }
    return tuple(params.values())

def make_reproducible():
    """Seed Python, NumPy and TensorFlow with one fixed value and enable op determinism."""
    seed = 364187

    # NOTE(review): the interpreter presumably reads PYTHONHASHSEED only at
    # start-up, so this assignment would affect child processes rather than
    # the running one -- confirm.
    os.environ['PYTHONHASHSEED'] = f'{seed}'

    # Applied in this order: stdlib random, NumPy, TensorFlow, Keras utility.
    seeders = (
        random.seed,
        np.random.seed,
        tf.random.set_seed,
        tf.keras.utils.set_random_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    tf.config.experimental.enable_op_determinism()

def prepare_actual_datasets():
    """Convert the train, validation and test CSV files into text files, in that order."""
    splits = (
        ('../cleaned_dataset/train.csv', 'train'),
        ('../cleaned_dataset/validation.csv', 'validation'),
        ('../cleaned_dataset/test.csv', 'test'),
    )
    for csv_path, split_name in splits:
        prepare_actual_dataset(csv_path, split_name)

def prepare_actual_dataset(path, name):
    """Convert one CSV split into a text file under ``./Datasets``.

    Reads the ``NucleotideSequence`` column of the CSV at *path*, drops the
    first and last character of every sequence, joins the remaining
    characters with single spaces and writes each sequence followed by an
    empty line to ``./Datasets/<name>.txt``.  Progress is printed every 100
    rows.

    Args:
        path: Path of the CSV file to read.
        name: Base name (without extension) of the text file to write.
    """
    sequences = pd.read_csv(path)['NucleotideSequence'].tolist()
    total = len(sequences)

    # Create the output directory; no error if it already exists.
    os.makedirs('./Datasets', exist_ok=True)

    # The context manager closes the file even if a write raises.
    with open('./Datasets/' + name + '.txt', 'w') as out_file:
        for i, sequence in enumerate(sequences):
            # [1:-1] strips the first and last character of the sequence.
            out_file.write(' '.join(sequence[1:-1]))
            out_file.write('\n\n')

            if i % 100 == 0:
                print(i, '/', total)

def prepare_dataset(MIN_TRAINING_SEQ_LEN, BATCH_SIZE):
    """Build batched line datasets for the train, validation and test text files.

    Every split is read line by line from ``./Datasets/``, lines whose string
    length is not greater than ``MIN_TRAINING_SEQ_LEN`` are dropped, and the
    rest are grouped into batches of ``BATCH_SIZE``.  Only the training split
    is shuffled.

    Returns:
        The tuple ``(raw_train_ds, raw_val_ds, raw_test_ds)``.
    """
    dataset_dir = os.path.expanduser("./Datasets/")

    def is_long_enough(line):
        return tf.strings.length(line) > MIN_TRAINING_SEQ_LEN

    def load_batched(file_name):
        lines = tf.data.TextLineDataset(dataset_dir + file_name)
        return lines.filter(is_long_enough).batch(BATCH_SIZE)

    # shuffle() follows batch(), so whole batches are reordered rather than
    # individual lines.
    raw_train_ds = load_batched("train.txt").shuffle(buffer_size=256)
    raw_val_ds = load_batched("validation.txt")
    raw_test_ds = load_batched("test.txt")

    return raw_train_ds, raw_val_ds, raw_test_ds

def get_vocab(raw_train_ds, VOCAB_SIZE):
    """Compute a lower-cased WordPiece vocabulary from the raw training dataset.

    The tokens ``[PAD]``, ``[UNK]`` and ``[BOS]`` are passed as reserved tokens.

    Returns:
        Whatever ``compute_word_piece_vocabulary`` returns for these arguments.
    """
    special_tokens = ["[PAD]", "[UNK]", "[BOS]"]
    return keras_nlp.tokenizers.compute_word_piece_vocabulary(
        raw_train_ds,
        vocabulary_size=VOCAB_SIZE,
        lowercase=True,
        reserved_tokens=special_tokens,
    )

def get_tokenizer(vocab, SEQ_LEN):
    """Create a lower-casing WordPiece tokenizer over *vocab* with sequence length ``SEQ_LEN``."""
    settings = {
        'vocabulary': vocab,
        'sequence_length': SEQ_LEN,
        'lowercase': True,
    }
    return keras_nlp.tokenizers.WordPieceTokenizer(**settings)

def tokenize_dataset(SEQ_LEN, tokenizer, raw_train_ds, raw_val_ds, raw_test_ds):
    """Turn the three raw text datasets into ``(features, labels)`` datasets.

    Labels are the tokenizer output.  Features are the same token ids passed
    through a ``StartEndPacker`` configured with the ``[BOS]`` id as start
    value and ``SEQ_LEN`` as sequence length.

    Returns:
        The tuple ``(train_ds, val_ds, test_ds)``, each mapped in parallel and
        prefetched.
    """
    bos_id = tokenizer.token_to_id("[BOS]")
    start_packer = keras_nlp.layers.StartEndPacker(
        sequence_length=SEQ_LEN,
        start_value=bos_id,
    )

    def preprocess(inputs):
        token_ids = tokenizer(inputs)
        # presumably this shifts the features one step right of the labels -- TODO confirm
        return start_packer(token_ids), token_ids

    def tokenized(raw_ds):
        mapped = raw_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
        return mapped.prefetch(tf.data.AUTOTUNE)

    train_ds = tokenized(raw_train_ds)
    val_ds = tokenized(raw_val_ds)
    test_ds = tokenized(raw_test_ds)
    return train_ds, val_ds, test_ds

def build_model(VOCAB_SIZE, SEQ_LEN, EMBED_DIM, NUM_LAYERS, LSTM_DIM, LR):
    """Build and compile the LSTM language model.

    Architecture: token-and-position embedding, ``NUM_LAYERS`` stacked LSTM
    layers returning full sequences, and a dense layer producing
    ``VOCAB_SIZE`` logits per position.  Compiled with Adam at learning rate
    ``LR``, sparse categorical cross-entropy on logits, and a perplexity
    metric that masks token id 0.

    Returns:
        The compiled ``keras.Model``.
    """
    token_ids = keras.layers.Input(shape=(None,), dtype=tf.int32)

    hidden = keras_nlp.layers.TokenAndPositionEmbedding(
        vocabulary_size=VOCAB_SIZE,
        sequence_length=SEQ_LEN,
        embedding_dim=EMBED_DIM,
        mask_zero=True,
    )(token_ids)

    for _ in range(NUM_LAYERS):
        hidden = tf.keras.layers.LSTM(LSTM_DIM, return_sequences=True)(hidden)

    # No activation: the loss and the metric both consume raw logits.
    logits = keras.layers.Dense(VOCAB_SIZE)(hidden)
    model = keras.Model(inputs=token_ids, outputs=logits)

    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    perplexity_metric = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)
    adam = tf.keras.optimizers.Adam(learning_rate=LR)

    model.compile(optimizer=adam, loss=cross_entropy, metrics=[perplexity_metric])
    return model

def train_model(model, train_ds, val_ds, EPOCHS):
    """Print the model summary and fit the model with early stopping.

    Training runs for at most ``EPOCHS`` epochs and stops early when
    ``val_loss`` has not improved for 5 consecutive epochs.

    Args:
        model: Compiled Keras model to train.
        train_ds: Dataset used for fitting.
        val_ds: Dataset used as validation data.
        EPOCHS: Maximum number of epochs.

    Returns:
        The history object returned by ``model.fit``.
    """
    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
    history = model.fit(
        train_ds,
        validation_data=val_ds,
        verbose=2,
        epochs=EPOCHS,
        callbacks=[stop_early],
    )
    return history

def do_inference(model, tokenizer, NUM_TOKENS_TO_GENERATE, ds, name):
    """Evaluate the model on *ds* and greedily generate one sequence from ``[BOS]``.

    The evaluation result is printed with *name* as a prefix.  Generation
    starts from a single ``[BOS]`` token and repeatedly appends the
    highest-scoring next token until the sequence (prompt included) is
    ``NUM_TOKENS_TO_GENERATE`` tokens long.

    The greedy loop is written out here because the installed keras_nlp no
    longer provides ``keras_nlp.utils.greedy_search``.

    Args:
        model: Model mapping a batch of token ids to per-position logits.
        tokenizer: Tokenizer providing ``token_to_id`` and ``detokenize``.
        NUM_TOKENS_TO_GENERATE: Target length of the generated sequence.
        ds: Dataset passed to ``model.evaluate``.
        name: Label used in the printed messages.

    Returns:
        The detokenized generated sequence.
    """
    res = model.evaluate(ds)
    print(name, 'Set Result:', res)

    def token_logits_fn(inputs):
        # Logits for the last position of the current sequence.
        cur_len = inputs.shape[1]
        output = model(inputs)
        return output[:, cur_len - 1, :]

    # Batch of one sequence holding only the [BOS] token: shape (1, 1).
    generated = tf.convert_to_tensor([[tokenizer.token_to_id("[BOS]")]])
    while generated.shape[1] < NUM_TOKENS_TO_GENERATE:
        next_token = tf.argmax(
            token_logits_fn(generated),
            axis=-1,
            output_type=generated.dtype,
        )
        generated = tf.concat([generated, next_token[:, tf.newaxis]], axis=1)

    # Drop the batch dimension so the result is 1-D like the original prompt.
    output_tokens = generated[0]
    txt = tokenizer.detokenize(output_tokens)
    print(name, 'Generated Sequence:', txt)
    return txt

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

0 / 22593
100 / 22593
200 / 22593
300 / 22593
400 / 22593
500 / 22593
600 / 22593
700 / 22593
800 / 22593
900 / 22593
1000 / 22593
1100 / 22593
1200 / 22593
1300 / 22593
1400 / 22593
1500 / 22593
1600 / 22593
1700 / 22593
1800 / 22593
1900 / 22593
2000 / 22593
2100 / 22593
2200 / 22593
2300 / 22593
2400 / 22593
2500 / 22593
2600 / 22593
2700 / 22593
2800 / 22593
2900 / 22593
3000 / 22593
3100 / 22593
3200 / 22593
3300 / 22593
3400 / 22593
3500 / 22593
3600 / 22593
3700 / 22593
3800 / 22593
3900 / 22593
4000 / 22593
4100 / 22593
4200 / 22593
4300 / 22593
4400 / 22593
4500 / 22593
4600 / 22593
4700 / 22593
4800 / 22593
4900 / 22593
5000 / 22593
5100 / 22593
5200 / 22593
5300 / 22593
5400 / 22593
5500 / 22593
5600 / 22593
5700 / 22593
5800 / 22593
5900 / 22593
6000 / 22593
6100 / 22593
6200 / 22593
6300 / 22593
6400 / 22593
6500 / 22593
6600 / 22593
6700 / 22593
6800 / 22593
6900 / 22593
7000 / 22593
7100 / 22593
7200 / 22593
7300 / 22593
7400 / 22593
7500 / 22593
7600 / 22593
7700 / 2259

None
Epoch 1/40




354/354 - 63s - 179ms/step - loss: 1.3604 - perplexity: 3.8485 - val_loss: 1.3403 - val_perplexity: 3.7744
Epoch 2/40
354/354 - 56s - 159ms/step - loss: 1.3370 - perplexity: 3.7704 - val_loss: 1.3331 - val_perplexity: 3.7597
Epoch 3/40
354/354 - 82s - 232ms/step - loss: 1.3316 - perplexity: 3.7551 - val_loss: 1.3286 - val_perplexity: 3.7462
Epoch 4/40
354/354 - 56s - 157ms/step - loss: 1.3266 - perplexity: 3.7381 - val_loss: 1.3241 - val_perplexity: 3.7256
Epoch 5/40
354/354 - 83s - 233ms/step - loss: 1.3205 - perplexity: 3.7166 - val_loss: 1.3168 - val_perplexity: 3.7072
Epoch 6/40
354/354 - 82s - 232ms/step - loss: 1.3172 - perplexity: 3.7041 - val_loss: 1.3128 - val_perplexity: 3.6879
Epoch 7/40
354/354 - 56s - 158ms/step - loss: 1.3082 - perplexity: 3.6719 - val_loss: 1.3312 - val_perplexity: 3.7543
Epoch 8/40
354/354 - 56s - 158ms/step - loss: 1.3014 - perplexity: 3.6472 - val_loss: 1.3164 - val_perplexity: 3.7008
Epoch 9/40
354/354 - 142s - 402ms/step - loss: 1.2867 - perplexity:



Model was trained
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 56ms/step - loss: 1.0934 - perplexity: 2.9693
Validation Set Result: [1.0870920419692993, 2.950958251953125]


AttributeError: module 'keras_hub.api.utils' has no attribute 'greedy_search'