In [1]:
import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import os
import random
import tensorflow as tf
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, InputLayer
from keras.utils import print_summary
from tqdm import tqdm_notebook as tqdm
import json

Using TensorFlow backend.


In [2]:
# Set TensorFlow's logging verbosity threshold to ERROR (presumably to keep
# the notebook output free of lower-severity TensorFlow messages).
tf.logging.set_verbosity(tf.logging.ERROR)

In [3]:
def set_seed(seed:int):
    """Seed the random number generators used in this notebook.
        seed:int, the value handed to NumPy, Python's `random` and TensorFlow.
    """
    # Each seeder receives the very same seed, one after the other.
    for seeder in (np.random.seed, random.seed, tf.set_random_seed):
        seeder(seed)

In [4]:
def load_dataset(path:str)->Tuple[np.ndarray, np.ndarray]:
    """Return a placeholder dataset; the given path is currently ignored.
        path:str, path of the dataset (unused until the real data is available).

    Random noise is generated in place of real data: 1000 rows of 26 uniform
    features, and one boolean label per row that is True when a
    Binomial(100, 0.7) draw exceeds 80. Values are drawn from NumPy's global
    random state, so the result depends on how that state was seeded.
    """
    samples_number, features_number = 1000, 26
    features = np.random.uniform(size=(samples_number, features_number))
    draws = np.random.binomial(100, 0.7, size=(samples_number, 1))
    labels = draws > 80
    return features, labels

In [5]:
def scale(train:np.ndarray, test:np.ndarray):
    """Return the min-max scaled training and test vectors.
        train:np.ndarray, the training vector; the scaler is fitted on it alone.
        test:np.ndarray, the test vector, transformed with the scaler fitted on `train`.

    Only the two scaled vectors are returned (training first, test second);
    the fitted scaler itself is not part of the result.
    """
    fitted_scaler = MinMaxScaler()
    fitted_scaler.fit(train)
    scaled_train = fitted_scaler.transform(train)
    scaled_test = fitted_scaler.transform(test)
    return scaled_train, scaled_test

In [6]:
def split_dataset(dataset:Tuple[np.ndarray, np.ndarray], seed:int, test_size:float=0.3)->Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Return the given dataset split into a training and a test set.
        dataset:Tuple[np.ndarray, np.ndarray], the dataset to split.
        seed:int, the seed used as random state of the split.
        test_size:float=0.3, forwarded unchanged as `test_size` of train_test_split.
    """
    # Every array of the dataset goes through the same split, driven by the seed.
    split_options = dict(test_size=test_size, random_state=seed)
    return train_test_split(*dataset, **split_options)

In [7]:
def scale_split_dataset(dataset, seed:int, test_size:float=0.3):
    """Split the dataset, then scale its inputs.
        dataset, the dataset to split, handed as-is to split_dataset.
        seed:int, the seed used for the random split.
        test_size:float=0.3, forwarded to split_dataset.

    Returns a tuple (scaled x_train, scaled x_test, y_train, y_test); only the
    inputs are scaled, the labels are returned as produced by the split.
    """
    x_train, x_test, y_train, y_test = split_dataset(dataset, seed, test_size)
    scaled_x_train, scaled_x_test = scale(x_train, x_test)
    return scaled_x_train, scaled_x_test, y_train, y_test

In [8]:
def auprc(y_true, y_pred)->float:
    """Area under the precision-recall curve, passed as a metric to model.compile.
        y_true, the ground-truth labels.
        y_pred, the predicted scores.

    NOTE(review): the `float` annotation looks inaccurate. The returned value is
    element [1] of what `tf.metrics.auc` returns, not a Python float; per the
    TensorFlow 1.x documentation that element is the metric's update op. Confirm.
    """
    # Keep only element [1] of the result of tf.metrics.auc (see the note above).
    score = tf.metrics.auc(y_true, y_pred, curve="PR", summation_method="careful_interpolation")[1]
    # Run the local-variables initializer in the Keras session once the metric
    # has been created -- presumably so that the variables created by
    # tf.metrics.auc are initialized before the metric is evaluated (TODO confirm).
    # Keep this after the line above: the ordering looks intentional.
    K.get_session().run(tf.local_variables_initializer())
    return score

In [9]:
def mlp(input_size:int):
    """Build and compile a multi-layer perceptron.
        input_size:int, number of input features, also used as width of each hidden layer.

    The random state is reset to 42 before the layers are created. The network
    has three ReLU hidden layers and a single sigmoid output unit, and is
    compiled with the nadam optimizer, mean squared error loss and the auprc
    metric.
    """
    set_seed(42)
    hidden_layers_number = 3
    layers = [InputLayer(input_shape=(input_size,))]
    for _ in range(hidden_layers_number):
        layers.append(Dense(input_size, activation="relu"))
    layers.append(Dense(1, activation="sigmoid"))
    model = Sequential(layers)
    model.compile(optimizer="nadam", loss='mean_squared_error', metrics=[auprc])
    return model

In [10]:
def fit(model:Sequential, x_train:np.ndarray, x_test:np.ndarray, y_train:np.ndarray, y_test:np.ndarray, epochs:int, batch_size:int):
    """Train the given model and return whatever its `fit` method returns.
        model:Sequential, the model to be trained.
        x_train:np.ndarray, the input for training the model.
        x_test:np.ndarray, the input used as validation data.
        y_train:np.ndarray, the output labels for training the model.
        y_test:np.ndarray, the output labels used as validation data.
        epochs:int, number of epochs for which to train the model.
        batch_size:int, number of datapoints per training batch.

    Training data is shuffled, nothing is printed (verbose=0) and the test
    split is passed as validation data.
    """
    training_options = dict(
        shuffle=True,
        verbose=0,
        validation_data=(x_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
    )
    return model.fit(x_train, y_train, **training_options)

In [11]:
def train_holdouts(holdouts:int, batch_size:int, path:str, epochs:int):
    """Return the mean final validation AUPRC over the given number of holdouts.
        holdouts:int, number of holdouts to run; each holdout index is also the seed of its split.
        batch_size:int, number of datapoints per training batch.
        path:str, path handed to load_dataset.
        epochs:int, number of epochs for which each model is trained.

    For every holdout a fresh model is built, trained on the scaled training
    split and scored with the `val_auprc` value of its last epoch; the scores
    are then averaged with np.mean.
    """
    dataset = load_dataset(path)
    # Size the network from the loaded inputs instead of assuming 26 features,
    # so the function keeps working when the dataset has a different width.
    input_size = dataset[0].shape[1]
    scores = []
    for holdout in tqdm(range(holdouts), desc="Holdouts", leave=False):
        history = fit(
            mlp(input_size),
            *scale_split_dataset(dataset, holdout),
            epochs,
            batch_size
        ).history
        # Only the validation score reached at the last epoch is kept.
        scores.append(history["val_auprc"][-1])
    return np.mean(scores)

In [12]:
def train_batch_sizes(min_batch:int, max_batch:int, path:str, holdouts:int, epochs:int):
    """Evaluate every batch size in range(min_batch, max_batch).
        min_batch:int, first batch size to evaluate (included).
        max_batch:int, upper bound of the batch sizes (excluded, as in `range`).
        path:str, path forwarded to train_holdouts.
        holdouts:int, number of holdouts run for each batch size.
        epochs:int, number of training epochs for each model.

    Returns a list of two tuples, the batch sizes and the matching
    train_holdouts results, or an empty list when the range is empty.
    """
    results = []
    for batch_size in tqdm(range(min_batch, max_batch), desc="Batch sizes"):
        score = train_holdouts(holdouts, batch_size, path, epochs)
        results.append((batch_size, score))
    # Transpose the (batch size, score) pairs into two parallel sequences.
    return list(zip(*results))

In [14]:
# Experiment settings: the path is empty because load_dataset ignores it for now.
path = ""
holdouts = 2
epochs = 10
# Batch sizes 1 to 9 are evaluated: train_batch_sizes iterates over
# range(min_batch, max_batch), so the upper bound is excluded.
auprcs = train_batch_sizes(
    min_batch=1,
    max_batch=10,
    path=path,
    holdouts=holdouts,
    epochs=epochs,
)

HBox(children=(IntProgress(value=0, description='Batch sizes', max=9, style=ProgressStyle(description_width='i…

HBox(children=(IntProgress(value=0, description='Holdouts', max=2, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Holdouts', max=2, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Holdouts', max=2, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Holdouts', max=2, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Holdouts', max=2, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Holdouts', max=2, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Holdouts', max=2, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Holdouts', max=2, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Holdouts', max=2, style=ProgressStyle(description_width='init…




In [None]:
# Persist the (batch sizes, mean AUPRCs) pair computed above.
with open("auprcs.json", "w") as f:
    # The scores come from np.mean and may be NumPy scalars that the json
    # module cannot serialize (e.g. np.float32); `default=float` converts only
    # those values, leaving already-serializable ones untouched.
    json.dump(auprcs, f, default=float)