In this assignment, we are going to implement see if we can optimally select a subset of training instances for supervised learning.

In [1]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

2024-01-06 18:15:13.507172: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-06 18:15:13.530424: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-06 18:15:13.646180: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-06 18:15:13.646213: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-06 18:15:13.646884: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

We are going to work with the MNIST dataset, a popular dataset for hand-written digit recognition. Here we load the datatset.

In [2]:
# Model / data parameters
num_classes = 10
input_shape = (28, 28, 1)

# Load the data and split it between train and test sets
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Scale images to the [0, 1] range
x_train = x_train.astype("float32") / 255
x_test = x_test.astype("float32") / 255
# Make sure images have shape (28, 28, 1)
x_train = np.expand_dims(x_train, -1) # -1 means the last axis
x_test = np.expand_dims(x_test, -1)
print("x_train shape:", x_train.shape)
print("Loaded {} train samples".format(x_train.shape[0]))
print("Loaded {} test samples".format(x_test.shape[0]))

#! scale down the training set to 10_000 samples
# import random
# random.seed(42)
# cut_factor = 3
# x_train = x_train[:int(x_train.shape[0]/cut_factor)]
# y_train = y_train[:int(y_train.shape[0]/cut_factor)]
# x_test = x_test[:int(x_test.shape[0]/cut_factor)]
# y_test = y_test[:int(y_test.shape[0]/cut_factor)]
# print("x_train shape:", x_train.shape)
# print("Loaded {} train samples".format(x_train.shape[0]))
# print("Loaded {} test samples".format(x_test.shape[0]))


x_train shape: (60000, 28, 28, 1)
Loaded 60000 train samples
Loaded 10000 test samples


Now corrupt the labels with common types of mistakes. The variable 'noise_probability' controls the amount of errors introduced.

In [3]:
import random
noise_probability = 0.5
SEED = 314159

random.seed(SEED)

def index(array, item):
    for i in range(len(array)):
        if item == array[i]:
            return i
    return -1

def corrupt_label(y, y_index, err):
    n = len(err)
    # select an element at random (index != found)
    if (y_index == n-1):
        noisy_label = err[0]
    else:
        noisy_label = err[(y_index + 1)%n]
    return noisy_label

# We corrupt the MNIST data with some common mistakes, such as 3-->8, 8-->3, 1-->{4, 7}, 5-->6 etc.
def corrupt_labels(y_train, noise_probability):
    num_samples = y_train.shape[0]
    err_es_1 = np.array([0, 2, 3, 5, 6, 8, 9])
    err_es_2 = np.array([1, 4, 7])

    corruptions = {}
    corrupted_indexes = {}

    for i in range(num_samples):
        p = random.random()

        if p < noise_probability:
            y = y_train[i]

            y_index = index(err_es_1, y)
            if y_index >= 0:
                y_noisy = corrupt_label(y, y_index, err_es_1)
            else:
                y_index = index(err_es_2, y)
                y_noisy = corrupt_label(y, y_index, err_es_2)

            key = str(y_train[i]) + '->' + str(y_noisy)
            corrupted_indexes[i] = i

            if key in corruptions:
                corruptions[key] += 1
            else:
                corruptions[key] = 0

            y_train[i] = y_noisy

    return corruptions, corrupted_indexes

corruptions, corrupted_indexes = corrupt_labels(y_train, noise_probability)
print ("Corruptions: " + str(corruptions))
print ("Number of corruptions: {}".format(len(list(corrupted_indexes.keys()))))


Corruptions: {'5->6': 2666, '0->2': 2917, '4->7': 2888, '1->4': 3385, '9->0': 2997, '2->3': 2969, '3->5': 3027, '7->1': 3204, '8->9': 2911, '6->8': 2960}
Number of corruptions: 29934


In [4]:
# convert class vectors to binary class matrices
y_train_onehot = keras.utils.to_categorical(y_train, num_classes)
y_test_onehot = keras.utils.to_categorical(y_test, num_classes)

Supervised (parametric) training with the (noisy) labeled examples. Note that this model is trained on the entire dataset (the value of the parameter pruned_indexes is null here, which means that we leave out no points), which is noisy (20% of the labels are corrupted). Now the question is: is this the best model that we can train or can we do better?

In [5]:
batch_size = 128
epochs = 3
validation_split=0.1


model = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),
    ]
)
model.summary()
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

def prune_points(x_train, y_train, pruned_indexes):
    num_samples = x_train.shape[0]
    x_train_pruned = []
    y_train_pruned = []
    for i in range(num_samples):
        if not i in pruned_indexes:
            x_train_pruned.append(x_train[i])
            y_train_pruned.append(y_train[i])

    return np.array(x_train_pruned), np.array(y_train_pruned)

def trainAndEvaluateModel(x_train, y_train, x_test, y_test, model, pruned_indexes):

    if not pruned_indexes == None:
        x_train_pruned, y_train_pruned = prune_points(x_train, y_train, pruned_indexes)
    else:
        x_train_pruned = x_train
        y_train_pruned = y_train

    model.fit(x_train_pruned, y_train_pruned, batch_size=batch_size, epochs=epochs)
    loss, accuracy = model.evaluate(x_test, y_test)
    keras.backend.clear_session() # remove previous training weights
    
    return loss, accuracy
    


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 26, 26, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 13, 13, 32)        0         
 D)                                                              
                                                                 
 flatten (Flatten)           (None, 5408)              0         
                                                                 
 dropout (Dropout)           (None, 5408)              0         
                                                                 
 dense (Dense)               (None, 10)                54090     
                                                                 
Total params: 54410 (212.54 KB)
Trainable params: 54410 (212.54 KB)
Non-trainable params: 0 (0.00 Byte)
__________________

And we call the following function to train a model on the entire dataset and evaluate it on the test set. The accuracy on the test set is quite good, but can we do better?

In [6]:
trainAndEvaluateModel(x_train, y_train_onehot, x_test, y_test_onehot, model, None)

Epoch 1/3
Epoch 2/3
Epoch 3/3


(0.8952493071556091, 0.4449000060558319)

You need to implement a subset selection function that when called will return a subset of instances which will be used to train the model. This setup ensures that you also pass in another dictionary which contains the indexes of the instances that you would not want to use while training the model, i.e., it should contain a list of indexes that you would decide to **leave out** for training.

Here's the code and a sample implementation that returns a randomly chosen set of instances that you are to be left out. Since we chose 70% probability of label corruption (check the **noise_probability** parameter), we also select a subset where we leave out the same proportion of points. This is a baseline implementation and obviously you should aim to achieve better results than this.

In [7]:
# Here 'x_train', 'y_train' and model' are an unused parameters. But you may get better results by leveraging these.
def baseLinePrunedSubsetMethod(x_train, y_train, model):
    pruned_indexes = {}
    num_samples = x_train.shape[0]
    for i in range(num_samples):
        p = random.random()

        if p < noise_probability: # this is the global variable (only useful for this naive approach)
            pruned_indexes[i] = i
    return pruned_indexes

Let's see how this naive baseline works.

In [8]:
pruned_indexes = baseLinePrunedSubsetMethod(x_train, y_train, model)
trainAndEvaluateModel(x_train, y_train_onehot, x_test, y_test_onehot, model, pruned_indexes)

Epoch 1/3
Epoch 2/3
Epoch 3/3


(0.826309859752655, 0.5539000034332275)

Let's now see if we had known what points were actually corrupted (more of a hypothetical unrealistic situation), does leaving out those points actually improve the model's effectiveness. It turns out that it does!

In [9]:
trainAndEvaluateModel(x_train, y_train_onehot, x_test, y_test_onehot, model, corrupted_indexes)

Epoch 1/3
Epoch 2/3
Epoch 3/3


(0.09345085918903351, 0.9732000231742859)

Your task is to implement your own version of (say of name **myPrunedSubsetMethod** (which should take as arguments x_train, y_train, and the model). The function should return a dictionary of indexes that are to be left out. Plug your function in and evaluate the results. Write a thorough report on the methodology and analyse the results.

Some hints:
You can approach this as a discrete state space optimisation problem, where firstly you can define a "selection batch size" (this is not the same as training batch size), which decides which batch of instances you're going to leave out. For instance, if you are in a state where the training set is $X$, you may select (by some heuristics) which points you're gonna leave out (let that set be $\delta \subset X$) so that a child state becomes $X' = X - \delta$. Similarly, if you choose a different $\delta$ you get a different child state. You then need to train and evaluate (call the function *trainAndEvaluateModel*) to see if that child state led to an improvement or not.

You are free to use any algorithm, e.g., simulated annealing, A* search, genetic algorithm etc. to implement this discrete state space optimisation.

# Using Genetic Algorithm:

In [27]:
INPUT_SIZE = x_train.shape[0]
# OUTPUT_SIZE = INPUT_SIZE
# Define GA parameters
POPULATION_SIZE = 100 # number of individuals in population
MUTATION_RATE = 0.01 # probability of mutating each individual
CROSSOVER_RATE = 0.3 # probability of crossing over two individuals
GENERATIONS = 1 # number of generations
ELITE_NUM = 2 # number of elite individuals to keep from one generation to the next

# different from the training batch size. This is the batch size used for selecting the number of points to prune.
SELECTION_BATCH_SIZE = int(POPULATION_SIZE * 1)

## Initialise Population:

In [28]:
# 0 means prune the point, 1 means keep the point:
population = []

# all ones:
if len(population) < POPULATION_SIZE:
    print(f"making population of all ones")
    # need hashable type for set so use tuple:
    population.append(np.ones(INPUT_SIZE, dtype=int))

# half ones, half zeros
half_size = int(INPUT_SIZE/2)
if len(population) < POPULATION_SIZE:
    print(f"making population of half ones, half zeros")
    population.append(np.concatenate((np.ones(half_size, dtype=int), np.zeros(
        half_size, dtype=int))))

# half zeros, half ones
if len(population) < POPULATION_SIZE:
    print(f"making population of half zeros, half ones")
    population.append(np.concatenate((np.zeros(half_size, dtype=int), np.ones(
        half_size, dtype=int))))
    
# 1/4 ones, 1/4 zeros, 1/4 ones, 1/4 zeros:
quarter_size = int(INPUT_SIZE/4)
if len(population) < POPULATION_SIZE:
    print(f"making population of 1/4 ones, 1/4 zeros, 1/4 ones, 1/4 zeros")
    population.append(np.concatenate((np.ones(quarter_size, dtype=int), np.zeros(
        quarter_size, dtype=int), np.ones(quarter_size, dtype=int), np.zeros(quarter_size, dtype=int))))

# 1/4 zeros, 1/4 ones, 1/4 zeros, 1/4 ones:
if len(population) < POPULATION_SIZE:
    print(f"making population of 1/4 zeros, 1/4 ones, 1/4 zeros, 1/4 ones")
    population.append(np.concatenate((np.zeros(quarter_size, dtype=int), np.ones(
        quarter_size, dtype=int), np.zeros(quarter_size, dtype=int), np.ones(quarter_size, dtype=int))))

# random
if len(population) < POPULATION_SIZE:
    print(f"making population of random")
    
while len(population) < POPULATION_SIZE:
    individual = np.random.choice([0, 1], size=INPUT_SIZE)
    population.append(individual)

    
print(f"Sample individual: {population[0]}")
print(f"We created {len(population)} individuals in the population, each with {len(population[0])} genes")

making population of all ones
making population of half ones, half zeros
making population of half zeros, half ones
making population of 1/4 ones, 1/4 zeros, 1/4 ones, 1/4 zeros
making population of 1/4 zeros, 1/4 ones, 1/4 zeros, 1/4 ones
making population of random
Sample individual: [1 1 1 ... 1 1 1]
We created 100 individuals in the population, each with 60000 genes


## Run Genetic Algorithm:
- Evaluate fitness of each individual
- Select parents
- Crossover
- Mutation

In [29]:
import time

def evaluate_individual(individual):
    corrupted_indexes = {index: index for index, value in enumerate(individual) if value == 0}
    return trainAndEvaluateModel(x_train, y_train_onehot, x_test, y_test_onehot, model, corrupted_indexes)[1] # return the accuracy only

def myPrunedSubsetMethod(x_train, y_train, model):
    start_time = time.time()
    global population
    good_enough = []
    good_enough_fitness = 0
    # Run GA
    for generation in range(GENERATIONS):
        random.seed(SEED) # reset the seed so that we get the same results each time
        
        #' Evaluate fitness
        # fitness is the accuracy of the model:
        print("Evaluating fitness...")
        fitness_scores = []
        for individual in population:
            corrupted_indexes = {index: index for index, value in enumerate(individual) if value == 0}
            accuracy = trainAndEvaluateModel(x_train, y_train_onehot, x_test, y_test_onehot, model, corrupted_indexes)[1]
            fitness_scores.append(accuracy)
            print(f"Accuracy: {accuracy}")
            if accuracy > good_enough_fitness:
                good_enough_fitness = accuracy
                good_enough = individual
                if accuracy > 0.6:
                    print("Found a good enough solution!")
                    good_enough = population[fitness_scores.index(max(fitness_scores))]
                    return {index: index for index, value in enumerate(good_enough) if value == 0}
        
        
        #' Selection
        print("Selecting...")
        selected_population = []
        # Elitism:
        sorted_population = sorted(zip(population, fitness_scores), key=lambda x: x[1], reverse=True)
        for i in range(ELITE_NUM):
            selected_population.append(sorted_population[i][0])

        sorted_population = sorted_population[ELITE_NUM:]
            
            
        while len(selected_population) < SELECTION_BATCH_SIZE:
            # select two individuals at random
            individual_index1 , individual_index2 = random.randint(0, len(sorted_population)-1), random.randint(0, len(sorted_population)-1)
            # select the fitter individual (who has the higher fitness score):
            selected_population.append(sorted_population[individual_index1][0] if fitness_scores[individual_index1] > fitness_scores[individual_index2] else sorted_population[individual_index2][0])
        
        
        if generation < GENERATIONS - 1:
        
            #' Crossover
            print("Crossing over...")
            for i, individual in enumerate(selected_population):
                if random.random() < CROSSOVER_RATE:
                    # select another individual at random:
                    another_individual_index = random.randint(0, len(selected_population)-1)
                    # select a random crossover point:
                    crossover_point = random.randint(0, INPUT_SIZE-1)
                    # swap the genes after the crossover point:
                    selected_population[i] = np.concatenate((individual[:crossover_point], selected_population[another_individual_index][crossover_point:]))

            
            #' Mutation
            print("Mutating...")
            for i, individual in enumerate(selected_population):
                for j, gene in enumerate(individual):
                    if random.random() < MUTATION_RATE:
                        # flip the gene:
                        selected_population[i][j] = 0 if gene == 1 else 1
                        
        # replace the old population with the new one for the next generation:
        population = selected_population
        
        
        #' Report the progress
        estimated_time_remaining = (time.time() - start_time) * (GENERATIONS - generation - 1) # in seconds
        units = "seconds"
        if estimated_time_remaining > 60:
            estimated_time_remaining = estimated_time_remaining / 60 # in minutes
            units = "minutes" if estimated_time_remaining > 1 else "minute"
        if estimated_time_remaining > 60:
            estimated_time_remaining = estimated_time_remaining / 60 # in hours
            units = "hours" if estimated_time_remaining > 1 else "hour"
        if estimated_time_remaining > 24:
            estimated_time_remaining = estimated_time_remaining / 24 # in days
            units = "days" if estimated_time_remaining > 1 else "day"
        print(f"Estimated time remaining: {round(estimated_time_remaining, 2)} {units}.")
        print(f"Best solution so far: {good_enough_fitness} at generation {generation}")
        
    
    best_solution = sorted(zip(population, fitness_scores), key=lambda x: x[1])[0][0]
    
    # Return the indexes of the points to prune
    return {index: index for index, value in enumerate(good_enough) if value == 0}

pruned_indexes = myPrunedSubsetMethod(x_train, y_train, model)
print(f"Num of points to prune: {len(pruned_indexes)}")

Evaluating fitness...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.4828000068664551
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.4812999963760376
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.516700029373169
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5072000026702881
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.47850000858306885
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.4918999969959259
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.48410001397132874
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.491100013256073
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5040000081062317
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5023000240325928
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.48980000615119934
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.48570001125335693
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5149999856948853
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.492000013589859
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.49939998984336853
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.4805000126361847
Epoch 1/3
Epoch 2/3
Epoch 3/3
Ac

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

19 mins for 100 population, 1 generation

In [None]:

trainAndEvaluateModel(x_train, y_train_onehot, x_test, y_test_onehot, model, pruned_indexes)