In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import multiprocessing
from tqdm.notebook import tqdm
import itertools
import pickle
import sentencepiece as spm

# Report whether TensorFlow exposes a GPU under the name '/device:GPU:0'
device_name = tf.test.gpu_device_name()
print(
    'Found GPU at: {}'.format(device_name)
    if device_name == '/device:GPU:0'
    else 'GPU device not found'
)

In [None]:
# Download location of the test feature CSV.
# NOTE(review): this is a pre-signed S3 URL (X-Amz-Date=20201101T200602Z,
# X-Amz-Expires=86400), so it presumably stops working 24 hours after that
# timestamp - swap in a fresh link before re-running on a machine that has
# no previously downloaded copy of the file.
TEST_DATA_URL = "https://drivendata-prod.s3.amazonaws.com/data/63/public/test_values.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCYVI2LMPSY%2F20201101%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20201101T200602Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=4a1b12dbdb6b13b8b980fe89ef44f342c7773e2d53515f8f3ad8f50395754012"
# SentencePiece model file handed to spm.SentencePieceProcessor(model_file=...)
BPE_ENCODER_PATH = "trained_GEA_SWEM_encoder.model"
# Saved Keras model location handed to tf.keras.models.load_model
MODEL_PATH = "trained_GEA_SWEM"

In [None]:
# Number of examples per batch; passed to padded_batch when the test dataset is built
INFER_BATCH_SIZE = 38

In [None]:
# preprocess the features

# Subword encoder restored from the SentencePiece model file
encoder = spm.SentencePieceProcessor(model_file=BPE_ENCODER_PATH)

# Fetch the test CSV to a local path and load it, keyed by sequence_id
test_features_file_path = tf.keras.utils.get_file(
    fname="test_features.csv",
    origin=TEST_DATA_URL,
)
test_features_df = pd.read_csv(
    test_features_file_path,
    index_col="sequence_id",
)

# encode sequence
def encode_sequence(features_df, encoder):
    """Replace each raw sequence with its unique subword ids, in first-seen order.

    Every value of ``features_df["sequence"]`` is passed to ``encoder.encode``.
    Repeated ids are dropped while the order of first occurrence is kept, and
    the result is stored as a ``np.uint16`` array (2 bytes per id, which halves
    the memory needed compared with 4-byte ints).

    Args:
        features_df: DataFrame with a "sequence" column. It is modified in
            place: the "sequence" column is overwritten with the id arrays.
        encoder: Object exposing ``encode(sequence)`` that returns an iterable
            of integer ids (here a SentencePiece processor).

    Returns:
        The same ``features_df`` object, with "sequence" holding one
        ``np.uint16`` array per row.

    Raises:
        OverflowError: If an id does not fit in ``np.uint16``. Without this
            check older NumPy versions wrap such ids silently, corrupting the
            encoded sequence.
    """
    uint16_limits = np.iinfo(np.uint16)
    new_sequence_column = []
    for sequence in tqdm(features_df["sequence"]):
        token_ids = encoder.encode(sequence)
        # dict keys keep insertion order, so this keeps the first occurrence of
        # each id in the order it appeared
        unique_ids = list(dict.fromkeys(token_ids))
        if unique_ids and not (
            uint16_limits.min <= min(unique_ids) and max(unique_ids) <= uint16_limits.max
        ):
            raise OverflowError(
                "encoded token id outside the np.uint16 range "
                "[{}, {}]; use a smaller vocabulary or a wider dtype".format(
                    uint16_limits.min, uint16_limits.max
                )
            )
        new_sequence_column.append(np.array(unique_ids, dtype=np.uint16))
    features_df["sequence"] = new_sequence_column
    return features_df

# Swap every raw sequence for its array of unique subword ids
test_features_df = encode_sequence(test_features_df, encoder)

# Pickle each id array to bytes; bin_to_int unpickles them again inside the
# dataset map step.
# NOTE(review): presumably done because the arrays differ in length and so
# cannot be stacked into a single tensor - confirm.
test_features_df["sequence"] = list(map(pickle.dumps, test_features_df["sequence"]))


In [None]:
# build datasets
# One element per row: the pickled sequence bytes plus all remaining columns
test_dataset = tf.data.Dataset.from_tensor_slices(
    dict(
        sequence=test_features_df["sequence"].values,
        other_features=test_features_df.drop(columns="sequence").values,
    )
)

# convert binary to ints

def bin_to_int(sequence_tensor):
    """Unpickle the bytes carried by ``sequence_tensor`` and return the object.

    ``sequence_tensor`` must expose ``.numpy()`` returning the pickled bytes.
    The bytes are produced by ``pickle.dumps`` earlier in this notebook, so
    they are not untrusted input.
    """
    return pickle.loads(sequence_tensor.numpy())

def tf_bin_to_int(*tensors):
    """Dataset map function that decodes the pickled "sequence" feature.

    Accepts either ``(features_dict,)`` or ``(features_dict, labels_tensor)``.
    The "sequence" entry of the features dict is replaced by the int32 tensor
    obtained by running ``bin_to_int`` through ``tf.py_function``.

    Returns:
        ``(features_dict, labels_tensor)`` when exactly two tensors were
        passed, otherwise just ``features_dict``.
    """
    with_labels = len(tensors) == 2
    features_dict = tensors[0]

    decoded = tf.py_function(bin_to_int, inp=[features_dict["sequence"]], Tout=tf.int32)
    # Declare the result as rank 1 with unknown length
    decoded.set_shape([None])
    features_dict["sequence"] = decoded

    return (features_dict, tensors[1]) if with_labels else features_dict

# Decode the sequences, then prefetch, pad-and-batch, and prefetch again
test_dataset = (
    test_dataset
    .map(tf_bin_to_int, num_parallel_calls=multiprocessing.cpu_count())
    # pre fetch
    .prefetch(tf.data.experimental.AUTOTUNE)
    # batch datasets: both features are padded to the longest entry in the batch
    .padded_batch(
        INFER_BATCH_SIZE,
        padded_shapes={"sequence": [None], "other_features": [None]},
    )
    # pre fetch
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [None]:
# predict test set and save in submission format
model = tf.keras.models.load_model(MODEL_PATH)
test_prob = model.predict(test_dataset)

# Read one column name per line. The context manager closes the file handle,
# and iterating the lines (instead of read().split("\n")) avoids an empty
# trailing column name when the file ends with a newline, which would make the
# DataFrame construction below fail on a column-count mismatch.
with open("column_names.txt", "r") as column_names_file:
    columns = [line.rstrip("\n") for line in column_names_file]

# Rows of test_prob are labelled with the index of test_features_df
test_predicted_labels = pd.DataFrame(test_prob, columns=columns, index=test_features_df.index)
test_predicted_labels.to_csv("predicted_labels.csv")