In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.notebook import tqdm
import itertools
import pickle
import sentencepiece as spm

# Report whether TensorFlow found a GPU registered as /device:GPU:0.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    print('GPU device not found')

These are URLs for the training set values and labels. They are signed links that expire after 24 hours, so the ones
below are almost certainly out of date; you can get fresh URLs from https://www.drivendata.org/competitions/63/genetic-engineering-attribution/data/.
They can also be replaced with paths to local copies of the CSVs.

In [None]:
# Signed S3 URLs for the competition CSVs. Their query strings carry
# X-Amz-Date=20201101T175059Z and X-Amz-Expires=86400 (seconds), i.e. each link was
# issued on 2020-11-01 with a 24-hour lifetime, so fresh URLs must be substituted
# before the download cell can fetch the files.
TRAIN_DATA_URL = "https://drivendata-prod.s3.amazonaws.com/data/63/public/train_values.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCYVI2LMPSY%2F20201101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201101T175059Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=b3930e87c598086453a329938cc488411d78ff3ec09266474a08fbb0a069b509"
TRAIN_LABELS_URL = "https://drivendata-prod.s3.amazonaws.com/data/63/public/train_labels.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCYVI2LMPSY%2F20201101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201101T175059Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=461e78cc0f26a1684f968c96be7dd02bc04c992333d95e681579081c06f2a999"

In [None]:
# Batch size used when batching the shuffled training dataset.
TRAIN_BATCH_SIZE = 38
# Batch size used when batching the unshuffled training dataset and the validation dataset.
INFER_BATCH_SIZE = 38
# Vocabulary size requested from the BPE trainer. Encoded ids are later stored as np.uint16,
# so every id has to fit in 16 bits.
VOCAB_SIZE = 65500
VALIDATION_PERCENT_SPLIT = 0.1 # fraction (0-1, not a percentage) of an eligible class's samples that go to validation; if you change this will need to build a new file of train_indexes
VALIDATION_THRESHOLD = 40 # a class needs MORE than this many selected samples to contribute to the validation set; otherwise none of its samples are included
TRAIN_SET_FRAC = 1 # fraction of each class's samples to use across the training and validation sets
BASES = ["G","A","T","C","N"] # NOTE(review): not referenced in the cells visible here - confirm it is still needed

This code downloads the data and splits it, class by class, into a training set and a validation set.

In [None]:
# download the features and labels

train_features_file_path = tf.keras.utils.get_file("train_features.csv", TRAIN_DATA_URL)
train_features_df = pd.read_csv(train_features_file_path, index_col="sequence_id")
train_labels_file_path = tf.keras.utils.get_file("train_labels.csv", TRAIN_LABELS_URL)
train_labels_df = pd.read_csv(train_labels_file_path, index_col="sequence_id")

# one label column per class
NUM_LABELS = len(train_labels_df.columns)

# separate training set into a training and validation set

# fixed seed + global NumPy RNG: keep these two calls unchanged to reproduce the same split
indexes = list(train_features_df.index)
np.random.seed(26082020)
np.random.shuffle(indexes)
# ensure that the number of labels for each class in each subset are balanced
# The class of every sequence is the arg-max of its label row; compute it for all rows
# in one vectorised pass (in shuffled order) instead of one pandas lookup per row.
class_ids = train_labels_df.loc[indexes].values.argmax(axis=1)
indexes_by_class = {key:[] for key in range(NUM_LABELS)}
for index, class_id in zip(indexes, class_ids):
    indexes_by_class[class_id].append(index)
validation_indexes = []
train_indexes = []
for class_no in range(NUM_LABELS):
    number_of_samples = len(indexes_by_class[class_no])
    # if we don't want the whole training set, then at minimum we will take 2 samples (one for each subset), as long as there are at least 2
    number_of_samples_to_take = max(int(number_of_samples*TRAIN_SET_FRAC),min(number_of_samples,2))
    # a class contributes to the validation set only if MORE than VALIDATION_THRESHOLD of its samples are selected
    if number_of_samples_to_take > VALIDATION_THRESHOLD:
        validation_samples = int(number_of_samples_to_take*VALIDATION_PERCENT_SPLIT)
    else:
        validation_samples = 0
    # the first `validation_samples` selected samples go to validation, the rest to training
    selected_samples = indexes_by_class[class_no][:number_of_samples_to_take]
    validation_indexes.extend(selected_samples[:validation_samples])
    train_indexes.extend(selected_samples[validation_samples:])

In [None]:
# train the BPE encoder, this takes about 15 minutes
# Write the training-split sequences one per line. The context manager guarantees the file
# is flushed and closed before SentencePiece opens it.
with open("train_sequences.txt", "w") as sequences_file:
    sequences_file.write("\n".join(train_features_df.loc[train_indexes]["sequence"].values))
# bos_id/eos_id of -1 is SentencePiece's convention for disabling those tokens; pad_id=0 reserves
# id 0 for padding, which matches the zero padding/masking used further down the notebook.
spm.SentencePieceTrainer.train(input='train_sequences.txt', model_prefix='GEA_SWEM_encoder', vocab_size=VOCAB_SIZE, model_type="bpe", bos_id=-1, eos_id=-1, pad_id=0, unk_id=1)

In [None]:
# encode the sequences using the trained BPE encoder

# 'GEA_SWEM_encoder.model' is expected to be the file produced by the BPE training cell
# (which uses model_prefix='GEA_SWEM_encoder').
encoder = spm.SentencePieceProcessor(model_file='GEA_SWEM_encoder.model')

def encode_sequence(features_df, encoder):
    """Replace the ``sequence`` column with arrays of de-duplicated token ids.

    Every value of ``features_df["sequence"]`` is passed to ``encoder.encode``. Repeated ids
    are then dropped, keeping only the first occurrence of each id in its original order.

    Args:
        features_df: DataFrame with a ``sequence`` column of raw sequences.
        encoder: object exposing ``encode(sequence)`` that returns a list of integer ids.

    Returns:
        The same ``features_df`` object; its ``sequence`` column is overwritten in place
        with one ``np.uint16`` array per row.
    """
    encoded_column = []
    for raw_sequence in tqdm(features_df["sequence"]):
        token_ids = encoder.encode(raw_sequence)
        # dict keys keep insertion order, so this removes repeats while preserving
        # the order in which each id first occurred
        first_occurrences = list(dict.fromkeys(token_ids))
        # 2 byte ints cover a vocabulary of up to 65,500 ids and halve the memory
        # needed compared with 4 byte ints
        encoded_column.append(np.array(first_occurrences, dtype=np.uint16))
    features_df["sequence"] = encoded_column
    return features_df

# replace every raw sequence with its array of unique BPE ids
train_features_df = encode_sequence(train_features_df, encoder)

# convert one-hot features to int
# Select the feature columns by name rather than by position, so the cast does not depend
# on "sequence" being the first column of the CSV.
column_type_dict = {"sequence":object}
for column in train_features_df.columns.drop("sequence"):
    column_type_dict[column] = np.int16
train_features_df = train_features_df.astype(column_type_dict)
# cache the encoded frame on disk; the file name records the vocabulary size
train_features_df.to_pickle("bpe_{}_encoded_train_features_df.pickle".format(VOCAB_SIZE))

# shuffle again so indexes are not ordered by class
np.random.seed(27082020)
np.random.shuffle(validation_indexes)
np.random.seed(28082020)
np.random.shuffle(train_indexes)
# set up their dataframes
# NOTE: train_features_df / train_labels_df are rebound to the training subset here, so this
# cell cannot be re-run without first re-running the cells that create the full frames.
validation_features_df = train_features_df.loc[validation_indexes]
validation_labels_df = train_labels_df.loc[validation_indexes]
train_features_df = train_features_df.loc[train_indexes]
train_labels_df = train_labels_df.loc[train_indexes]

# the only way to get uneven lists into tf.data.Dataset is using ragged tensors, but padded
# batch does not support ragged tensors, and we can not pad before training as we will run out
# of memory, so we just convert the lists to binary and then convert them back to ints in the
# pipeline

train_features_df["sequence"] = [pickle.dumps(sequence) for sequence in train_features_df["sequence"]]
validation_features_df["sequence"] = [pickle.dumps(sequence) for sequence in validation_features_df["sequence"]]


In [None]:
# build datasets
# Each element is ({"sequence": pickled ids, "other_features": remaining columns}, label row).

# save unshuffled train dataset for evaluation
unshuffled_train_dataset = tf.data.Dataset.from_tensor_slices((
    {"sequence": train_features_df["sequence"].values,
     "other_features": train_features_df.drop(columns="sequence").values},
    train_labels_df.values))
validation_dataset = tf.data.Dataset.from_tensor_slices((
    {"sequence": validation_features_df["sequence"].values,
     "other_features": validation_features_df.drop(columns="sequence").values},
    validation_labels_df.values))

# shuffle train
# Derive the shuffled training dataset from the unshuffled one instead of building the same
# tensors a second time; shuffle() returns a new dataset and leaves the source untouched.
# The buffer covers the whole training set.
train_dataset = unshuffled_train_dataset.shuffle(len(train_features_df))

# convert binary to ints

def bin_to_int(sequence_tensor):
    """Unpickle the bytes carried by ``sequence_tensor``.

    Args:
        sequence_tensor: object whose ``numpy()`` method returns a pickled byte string
            (here, the eager tensor handed over by ``tf.py_function``).

    Returns:
        The Python object stored in the pickle.

    Note:
        ``pickle.loads`` must only see trusted bytes; the pipeline feeds it data that
        this notebook pickled itself.
    """
    return pickle.loads(sequence_tensor.numpy())

def tf_bin_to_int(*tensors):
    """Dataset ``map`` function that decodes the pickled ``"sequence"`` feature.

    Accepts either ``(features_dict, labels_tensor)`` or just ``features_dict`` as
    positional arguments. The ``"sequence"`` entry of ``features_dict`` is replaced (the
    dict is modified in place) by the result of running ``bin_to_int`` through
    ``tf.py_function``, declared as an int32 tensor of rank 1 with unknown length.

    Returns:
        ``(features_dict, labels_tensor)`` when exactly two arguments were given,
        otherwise ``features_dict`` alone.
    """
    has_labels = len(tensors) == 2
    features_dict = tensors[0]
    decoded_sequence = tf.py_function(bin_to_int, inp=[features_dict["sequence"]], Tout=tf.int32)
    # py_function output has no static shape; padded_batch needs to know the rank
    decoded_sequence.set_shape([None])
    features_dict["sequence"] = decoded_sequence
    if has_labels:
        return (features_dict, tensors[1])
    return features_dict

# Every dataset goes through the same four stages, in this order:
#   1. map       - turn the pickled "sequence" bytes back into int ids
#   2. prefetch  - overlap decoding with downstream work
#   3. padded_batch - pad each batch to the longest element in that batch
#   4. prefetch  - keep ready batches queued
train_dataset = (
    train_dataset
    .map(tf_bin_to_int, num_parallel_calls=multiprocessing.cpu_count())
    .prefetch(tf.data.experimental.AUTOTUNE)
    .padded_batch(TRAIN_BATCH_SIZE, padded_shapes=({"sequence": [None], "other_features": [None]},[None]))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

unshuffled_train_dataset = (
    unshuffled_train_dataset
    .map(tf_bin_to_int, num_parallel_calls=multiprocessing.cpu_count())
    .prefetch(tf.data.experimental.AUTOTUNE)
    .padded_batch(INFER_BATCH_SIZE, padded_shapes=({"sequence": [None], "other_features": [None]},[None]))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

validation_dataset = (
    validation_dataset
    .map(tf_bin_to_int, num_parallel_calls=multiprocessing.cpu_count())
    .prefetch(tf.data.experimental.AUTOTUNE)
    .padded_batch(INFER_BATCH_SIZE, padded_shapes=({"sequence": [None], "other_features": [None]},[None]))
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [None]:
# dev decay as proposed in "The Marginal Value of Adaptive Gradient Methods in Machine Learning"

class DevDecayCallback(tf.keras.callbacks.Callback):
    """Multiply the learning rate by ``gamma`` whenever an evaluation fails to improve.

    Implements "dev decay" as proposed in "The Marginal Value of Adaptive Gradient
    Methods in Machine Learning": after each evaluation the monitored metric is compared
    with the best value seen so far; if it is not strictly better, the optimizer's
    learning rate is scaled by ``gamma``.

    Args:
        performance_metric_name: key to read from the ``logs`` dict passed to
            ``on_test_end``. Larger values are treated as better.
        gamma: multiplicative factor applied to the learning rate on a non-improving
            evaluation.
    """

    def __init__(self, performance_metric_name, gamma):
        super().__init__()
        # start below any possible value so the first evaluation always counts as an improvement
        self.best_test_performance = -float('inf')
        self.performance_metric_name = performance_metric_name
        self.gamma = gamma

    def on_test_end(self, logs=None):
        """Update the best score, or decay the learning rate if there was no improvement.

        Args:
            logs: dict of metric results for the evaluation that just finished.

        Raises:
            KeyError: if ``performance_metric_name`` is not present in ``logs``.
        """
        logs = logs or {}
        if self.performance_metric_name not in logs:
            raise KeyError(
                "DevDecayCallback: metric {!r} not found in evaluation logs (available: {})".format(
                    self.performance_metric_name, sorted(logs)))
        test_performance = logs[self.performance_metric_name]
        if test_performance > self.best_test_performance:
            self.best_test_performance = test_performance
        else:
            # read and write the same attribute; `lr` is only a legacy alias of `learning_rate`
            learning_rate_variable = self.model.optimizer.learning_rate
            lr = float(tf.keras.backend.get_value(learning_rate_variable))
            new_lr = lr * self.gamma
            print("Lr decayed from {} to {}".format(lr, new_lr))
            tf.keras.backend.set_value(learning_rate_variable, new_lr)

In [None]:
# other features encoder start
# NOTE(review): 39 is assumed to be the number of non-sequence feature columns - confirm against the data
inp_other_features = tf.keras.Input(shape=[39],name="other_features")
x = tf.keras.layers.Dense(128, activation="relu")(inp_other_features)
x = tf.keras.layers.LayerNormalization()(x)
other_features_encoder_end = tf.keras.layers.Dropout(0.6)(x)
# other features encoder end

# sequence encoder start
# variable-length id sequences; id 0 is padding and is masked by the embedding
inp_sequence = tf.keras.Input(shape=[None],name="sequence")
x = tf.keras.layers.Embedding(VOCAB_SIZE+2, 300, mask_zero=True, name="Embedding")(inp_sequence)
x = tf.keras.layers.LayerNormalization()(x)
# max over the sequence axis collapses each sequence to a single 300-d vector
x = tf.keras.layers.GlobalMaxPool1D()(x)
sequence_encoder_out = tf.keras.layers.Dropout(0.6)(x)
# sequence encoder end

# classify from the concatenation of both encoders
x_concat = tf.keras.layers.concatenate([sequence_encoder_out,other_features_encoder_end])
out = tf.keras.layers.Dense(NUM_LABELS,activation="softmax")(x_concat)
model = tf.keras.Model([inp_sequence,inp_other_features], out)

# `learning_rate` is the supported keyword; `lr` is a deprecated alias
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=5e-3, nesterov=True, momentum=0.8),
    loss="categorical_crossentropy",
    metrics=["accuracy",tf.keras.metrics.TopKCategoricalAccuracy(k=10,name="top_10_accuracy")])
# keep only the full model with the best validation top-10 accuracy
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="GEA_SWEM",
    save_weights_only=False,
    monitor='val_top_10_accuracy',
    mode='max',
    save_best_only=True)

# decay the learning rate by 5% whenever the "accuracy" reported at the end of an evaluation does not improve
dev_decay_callback = DevDecayCallback(performance_metric_name="accuracy", gamma=0.95)

history = model.fit(train_dataset,epochs=50,verbose=2,validation_data=validation_dataset,callbacks=[model_checkpoint_callback,dev_decay_callback])