# Create List Novel Size

In [10]:
import numpy as np
import scipy
from scipy import stats

In [None]:
def create_list_novel_sizes(cfg, sp, red_light_batch_index):
    """
    Spread the novel-example budget over the batches using a beta distribution.

    The batches from ``red_light_batch_index`` up to ``cfg["batch_number"]`` are
    mapped onto equal-width bins of [0, 1]. Each batch receives the beta
    probability mass of its bin multiplied by the total novel-example budget,
    truncated to an integer and clamped to ``[0, cfg["batch_size"]]``.

    Args:
        cfg: mapping providing "batch_number", "batch_size" and
            "beta_dist_params". The latter maps a distribution name to its two
            beta shape parameters, e.g.

            "beta_dist_params": {
                "low": [1.2,1.8],
                "mid": [2,2],
                "high": [1.8,1.2],
                "flat": [1,1]
            }
        sp: mapping providing "dist_type" (a key of cfg["beta_dist_params"])
            and "prop_unknown" (the fraction used to size the novel budget).
        red_light_batch_index: index of the first batch that takes part in the
            distribution.

    Returns:
        A list with one integer per batch from ``red_light_batch_index`` onward
        (length ``cfg["batch_number"] - red_light_batch_index``); entry ``k`` is
        the number of novel examples assigned to the k-th of those batches.
        Because each entry is truncated (and possibly clamped), the entries may
        add up to less than the total budget.
    """
    beta_params = cfg["beta_dist_params"][sp["dist_type"]]
    shape_a, shape_b = beta_params[0], beta_params[1]

    # Total number of novel examples to hand out.
    n_batch_novel = int((cfg["batch_number"] - red_light_batch_index) * sp["prop_unknown"])
    novel_size = n_batch_novel * cfg["batch_size"]

    # One bin per remaining batch -> one more edge than there are bins.
    edges = np.linspace(0, 1, cfg["batch_number"] - red_light_batch_index + 1).tolist()

    # Evaluate the CDF once per edge; consecutive differences give each bin's mass.
    cdf_at_edge = [
        scipy.stats.beta.cdf(edge, shape_a, shape_b, loc=0, scale=1)
        for edge in edges
    ]
    raw_counts = [
        int((upper - lower) * novel_size)
        for lower, upper in zip(cdf_at_edge, cdf_at_edge[1:])
    ]

    # A batch can hold no fewer than 0 and no more than batch_size novel examples.
    return [max(0, min(cfg["batch_size"], count)) for count in raw_counts]

In [18]:
# Scratch check of the novel-example budget, mirroring create_list_novel_sizes:
# [1.2, 1.8] are the "low" beta shape parameters, (30 - 6) batches remain after
# the red-light index, 0.3 is the novel proportion and 40 the batch size.
distparams = [1.2, 1.8]
n_batch_novel = int(0.3 * (30 - 6))
novel_size = 40 * n_batch_novel
novel_size

280

In [23]:
# Build the bin edges for the novel distribution: 35 evenly spaced points on
# [0, 1], i.e. 34 equal-width bins.
# NOTE(review): the In [18] cell uses 30 in the batch-count position and 40 as
# the batch size; here 40 is used as the batch count -- confirm which is meant.
bin_prob = np.linspace(0.0, 1.0, num=40 - 6 + 1).tolist()
print(len(bin_prob))
bin_prob

35


[0.0,
 0.029411764705882353,
 0.058823529411764705,
 0.08823529411764705,
 0.11764705882352941,
 0.14705882352941177,
 0.1764705882352941,
 0.20588235294117646,
 0.23529411764705882,
 0.2647058823529412,
 0.29411764705882354,
 0.3235294117647059,
 0.3529411764705882,
 0.38235294117647056,
 0.4117647058823529,
 0.4411764705882353,
 0.47058823529411764,
 0.5,
 0.5294117647058824,
 0.5588235294117647,
 0.5882352941176471,
 0.6176470588235294,
 0.6470588235294118,
 0.6764705882352942,
 0.7058823529411764,
 0.7352941176470588,
 0.7647058823529411,
 0.7941176470588235,
 0.8235294117647058,
 0.8529411764705882,
 0.8823529411764706,
 0.9117647058823529,
 0.9411764705882353,
 0.9705882352941176,
 1.0]

In [24]:
# Beta CDF evaluated at the third bin edge, bin_prob[2].
cdf_1 = stats.beta.cdf(bin_prob[2], distparams[0], distparams[1], loc=0, scale=1)
cdf_1

0.0633750792491789

In [25]:
# Beta CDF evaluated at the second bin edge, bin_prob[1] (the edge just below
# the one used for cdf_1).
cdf_2 = stats.beta.cdf(bin_prob[1], distparams[0], distparams[1], loc=0, scale=1)
cdf_2

0.027951263750381622

In [26]:
# NOTE(review): cdf_2 is the CDF at the lower edge (bin_prob[1]) and cdf_1 the
# CDF at the upper edge (bin_prob[2]), so this difference comes out negative.
# create_list_novel_sizes subtracts the other way round (upper edge minus lower).
(cdf_2 - cdf_1) * novel_size

-9.918668339663238

In [27]:
# Per-bin novel counts: evaluate the beta CDF once at every bin edge, take the
# mass between consecutive edges, scale by the novel budget and truncate.
edge_cdfs = [
    scipy.stats.beta.cdf(edge, distparams[0], distparams[1], loc=0, scale=1)
    for edge in bin_prob
]
list_unknown_sizes = [
    int((upper - lower) * novel_size)
    for lower, upper in zip(edge_cdfs, edge_cdfs[1:])
]

# Clamp every count to [0, 30].
# NOTE(review): create_list_novel_sizes clamps at cfg["batch_size"], and the
# In [18] cell uses 40 as the batch size -- confirm that 30 is intended here.
list_unknown_sizes = [max(0, min(30, count)) for count in list_unknown_sizes]

In [28]:
# One size per bin, so this prints len(bin_prob) - 1.
print(len(list_unknown_sizes))

34
