In this assignment, we are going to implement see if we can optimally select a subset of training instances for supervised learning.

In [1]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

We are going to work with the MNIST dataset, a popular dataset for hand-written digit recognition. Here we load the datatset.

In [2]:
# Model / data parameters
num_classes = 10
input_shape = (28, 28, 1)

# Load the data and split it between train and test sets
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Scale images to the [0, 1] range
x_train = x_train.astype("float32") / 255
x_test = x_test.astype("float32") / 255
# Make sure images have shape (28, 28, 1)
x_train = np.expand_dims(x_train, -1) # -1 means the last axis
x_test = np.expand_dims(x_test, -1)
print("x_train shape:", x_train.shape)
print("Loaded {} train samples".format(x_train.shape[0]))
print("Loaded {} test samples".format(x_test.shape[0]))

#! scale down the training set to 10_000 samples
import random
random.seed(42)
train_size = 10_200
test_size = 200
x_train = x_train[:train_size]
y_train = y_train[:train_size]
# get test sets from x_train and y_train:
random_indices = np.random.choice(x_train.shape[0], size=test_size, replace=False)
x_test = x_train[random_indices]
y_test = y_train[random_indices]
# delete the test sets from x_train and y_train:
x_train = np.delete(x_train, random_indices, axis=0)
y_train = np.delete(y_train, random_indices)
# x_test = x_test[:int(x_test.shape[0]/cut_factor)]
# y_test = y_test[:int(y_test.shape[0]/cut_factor)]
print("x_train shape:", x_train.shape)
print("Loaded {} train samples".format(x_train.shape[0]))
print("Loaded {} test samples".format(x_test.shape[0]))


x_train shape: (60000, 28, 28, 1)
Loaded 60000 train samples
Loaded 10000 test samples
x_train shape: (10000, 28, 28, 1)
Loaded 10000 train samples
Loaded 200 test samples


Now corrupt the labels with common types of mistakes. The variable 'noise_probability' controls the amount of errors introduced.

In [3]:
import random
noise_probability = 0.5
SEED = 314159

random.seed(SEED)

def index(array, item):
    for i in range(len(array)):
        if item == array[i]:
            return i
    return -1

def corrupt_label(y, y_index, err):
    n = len(err)
    # select an element at random (index != found)
    if (y_index == n-1):
        noisy_label = err[0]
    else:
        noisy_label = err[(y_index + 1)%n]
    return noisy_label

# We corrupt the MNIST data with some common mistakes, such as 3-->8, 8-->3, 1-->{4, 7}, 5-->6 etc.
def corrupt_labels(y_train, noise_probability):
    num_samples = y_train.shape[0]
    err_es_1 = np.array([0, 2, 3, 5, 6, 8, 9])
    err_es_2 = np.array([1, 4, 7])

    corruptions = {}
    corrupted_indexes = {}

    for i in range(num_samples):
        p = random.random()

        if p < noise_probability:
            y = y_train[i]

            y_index = index(err_es_1, y)
            if y_index >= 0:
                y_noisy = corrupt_label(y, y_index, err_es_1)
            else:
                y_index = index(err_es_2, y)
                y_noisy = corrupt_label(y, y_index, err_es_2)

            key = str(y_train[i]) + '->' + str(y_noisy)
            corrupted_indexes[i] = i

            if key in corruptions:
                corruptions[key] += 1
            else:
                corruptions[key] = 0

            y_train[i] = y_noisy

    return corruptions, corrupted_indexes

corruptions, corrupted_indexes = corrupt_labels(y_train, noise_probability)
print ("Corruptions: " + str(corruptions))
print ("Number of corruptions: {}".format(len(list(corrupted_indexes.keys()))))


Corruptions: {'5->6': 461, '0->2': 488, '4->7': 470, '1->4': 567, '9->0': 501, '2->3': 492, '6->8': 498, '8->9': 469, '7->1': 513, '3->5': 520}
Number of corruptions: 4989


In [4]:
# convert class vectors to binary class matrices
y_train_onehot = keras.utils.to_categorical(y_train, num_classes)
y_test_onehot = keras.utils.to_categorical(y_test, num_classes)

Supervised (parametric) training with the (noisy) labeled examples. Note that this model is trained on the entire dataset (the value of the parameter pruned_indexes is null here, which means that we leave out no points), which is noisy (20% of the labels are corrupted). Now the question is: is this the best model that we can train or can we do better?

In [5]:
batch_size = 128
epochs = 3
validation_split=0.1


model = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),
    ]
)
model.summary()
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

def prune_points(x_train, y_train, pruned_indexes):
    num_samples = x_train.shape[0]
    x_train_pruned = []
    y_train_pruned = []
    for i in range(num_samples):
        if not i in pruned_indexes:
            x_train_pruned.append(x_train[i])
            y_train_pruned.append(y_train[i])

    return np.array(x_train_pruned), np.array(y_train_pruned)

def trainAndEvaluateModel(x_train, y_train, x_test, y_test, model, pruned_indexes):

    if not pruned_indexes == None:
        x_train_pruned, y_train_pruned = prune_points(x_train, y_train, pruned_indexes)
    else:
        x_train_pruned = x_train
        y_train_pruned = y_train

    model.fit(x_train_pruned, y_train_pruned, batch_size=batch_size, epochs=epochs)
    loss, accuracy = model.evaluate(x_test, y_test)
    keras.backend.clear_session() # remove previous training weights
    
    return loss, accuracy
    


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 26, 26, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2  (None, 13, 13, 32)        0         
 D)                                                              
                                                                 
 flatten (Flatten)           (None, 5408)              0         
                                                                 
 dropout (Dropout)           (None, 5408)              0         
                                                                 
 dense (Dense)               (None, 10)                54090     
                                                                 
Total params: 54410 (212.54 KB)
Trainable params: 54410 (212.54 KB)
Non-trainable params: 0 (0.00 Byte)
__________________

And we call the following function to train a model on the entire dataset and evaluate it on the test set. The accuracy on the test set is quite good, but can we do better?

In [6]:
trainAndEvaluateModel(x_train, y_train_onehot, x_test, y_test_onehot, model, None)

Epoch 1/3
Epoch 2/3
Epoch 3/3


(1.083177924156189, 0.46000000834465027)

You need to implement a subset selection function that when called will return a subset of instances which will be used to train the model. This setup ensures that you also pass in another dictionary which contains the indexes of the instances that you would not want to use while training the model, i.e., it should contain a list of indexes that you would decide to **leave out** for training.

Here's the code and a sample implementation that returns a randomly chosen set of instances that you are to be left out. Since we chose 70% probability of label corruption (check the **noise_probability** parameter), we also select a subset where we leave out the same proportion of points. This is a baseline implementation and obviously you should aim to achieve better results than this.

In [7]:
# Here 'x_train', 'y_train' and model' are an unused parameters. But you may get better results by leveraging these.
def baseLinePrunedSubsetMethod(x_train, y_train, model):
    pruned_indexes = {}
    num_samples = x_train.shape[0]
    for i in range(num_samples):
        p = random.random()

        if p < noise_probability: # this is the global variable (only useful for this naive approach)
            pruned_indexes[i] = i
    return pruned_indexes

Let's see how this naive baseline works.

In [8]:
pruned_indexes = baseLinePrunedSubsetMethod(x_train, y_train, model)
trainAndEvaluateModel(x_train, y_train_onehot, x_test, y_test_onehot, model, pruned_indexes)

Epoch 1/3
Epoch 2/3
Epoch 3/3


(1.013536810874939, 0.49000000953674316)

Let's now see if we had known what points were actually corrupted (more of a hypothetical unrealistic situation), does leaving out those points actually improve the model's effectiveness. It turns out that it does!

In [9]:
trainAndEvaluateModel(x_train, y_train_onehot, x_test, y_test_onehot, model, corrupted_indexes)

Epoch 1/3
Epoch 2/3
Epoch 3/3


(0.19734010100364685, 0.9350000023841858)

Your task is to implement your own version of (say of name **myPrunedSubsetMethod** (which should take as arguments x_train, y_train, and the model). The function should return a dictionary of indexes that are to be left out. Plug your function in and evaluate the results. Write a thorough report on the methodology and analyse the results.

Some hints:
You can approach this as a discrete state space optimisation problem, where firstly you can define a "selection batch size" (this is not the same as training batch size), which decides which batch of instances you're going to leave out. For instance, if you are in a state where the training set is $X$, you may select (by some heuristics) which points you're gonna leave out (let that set be $\delta \subset X$) so that a child state becomes $X' = X - \delta$. Similarly, if you choose a different $\delta$ you get a different child state. You then need to train and evaluate (call the function *trainAndEvaluateModel*) to see if that child state led to an improvement or not.

You are free to use any algorithm, e.g., simulated annealing, A* search, genetic algorithm etc. to implement this discrete state space optimisation.

# Using Genetic Algorithm:

In [10]:
INPUT_SIZE = x_train.shape[0]

#' Define GA parameters
POPULATION_SIZE = 1 # number of individuals in population
SELECTION_SIZE = 4 # number of individuals to select only one of them to cross over
MUTATION_RATE = 0.01 # probability of mutating each individual
CROSSOVER_RATE = 0.3 # probability of crossing over two individuals
GENERATIONS = 1 # number of generations
ELITE_NUM = 2 # number of elite individuals to keep from one generation to the next

## Initialise Population:

In [11]:
def create_population(population_size, input_size):
    # 0 means prune the point, 1 means keep the point:
    population = []
    # all ones:
    if len(population) < POPULATION_SIZE:
        # print(f"making population of all ones")
        # need hashable type for set so use tuple:
        population.append(np.ones(INPUT_SIZE, dtype=int))

    # half ones, half zeros
    half_size = int(INPUT_SIZE/2)
    if len(population) < POPULATION_SIZE:
        # print(f"making population of half ones, half zeros")
        population.append(np.concatenate((np.ones(half_size, dtype=int), np.zeros(
            half_size, dtype=int))))

    # half zeros, half ones
    if len(population) < POPULATION_SIZE:
        # print(f"making population of half zeros, half ones")
        population.append(np.concatenate((np.zeros(half_size, dtype=int), np.ones(
            half_size, dtype=int))))
        
    # 1/4 ones, 1/4 zeros, 1/4 ones, 1/4 zeros:
    quarter_size = int(INPUT_SIZE/4)
    if len(population) < POPULATION_SIZE:
        # print(f"making population of 1/4 ones, 1/4 zeros, 1/4 ones, 1/4 zeros")
        population.append(np.concatenate((np.ones(quarter_size, dtype=int), np.zeros(
            quarter_size, dtype=int), np.ones(quarter_size, dtype=int), np.zeros(quarter_size, dtype=int))))

    # 1/4 zeros, 1/4 ones, 1/4 zeros, 1/4 ones:
    if len(population) < POPULATION_SIZE:
        # print(f"making population of 1/4 zeros, 1/4 ones, 1/4 zeros, 1/4 ones")
        population.append(np.concatenate((np.zeros(quarter_size, dtype=int), np.ones(
            quarter_size, dtype=int), np.zeros(quarter_size, dtype=int), np.ones(quarter_size, dtype=int))))

    # random
    # if len(population) < POPULATION_SIZE:
        # print(f"making population of random")
    while len(population) < POPULATION_SIZE:
        individual = np.random.choice([0, 1], size=INPUT_SIZE)
        population.append(individual)

    # convert to tuple for hashability:
    population = [tuple(individual) for individual in population]

    return population

# population = create_population(POPULATION_SIZE, INPUT_SIZE)
# print(f"Sample individual: {population[0]}")
# print(f"We created {len(population)} individuals in the population, each with {len(population[0])} genes")

## Run Genetic Algorithm:
- Evaluate fitness of each individual
- Select parents
- Crossover
- Mutation

In [12]:
from functools import lru_cache
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from threading import Lock

result_file_lock = Lock()

# cache the results of this function so that it doesn't have to be recalculated each time. Cache size is unlimited:
@lru_cache(maxsize=None)
def evaluate_fitness(population):
    '''Returns a list of fitness scores for each individual in the population.'''
    global x_train, y_train_onehot, x_test, y_test_onehot, model
    fitness_scores = []
    for individual in population:
        corrupted_indexes = {index: index for index, value in enumerate(individual) if value == 0}
        accuracy = trainAndEvaluateModel(x_train, y_train_onehot, x_test, y_test_onehot, model, corrupted_indexes)[1]
        if accuracy > 0.6:
            print("Found a good enough solution!")
            return (individual, accuracy)
        # print(f"Accuracy: {accuracy}")
        fitness_scores.append(accuracy)
    return tuple(fitness_scores)

def myPrunedSubsetMethod(x_train, y_train, model, population_size, selection_size, mutation_rate, crossover_rate, generations, elite_num):
    #! global population, ELITE_NUM, SELECTION_SIZE
    good_enough = []
    good_enough_fitness = 0.00
    
    POPULATION_SIZE = population_size
    SELECTION_SIZE = selection_size
    MUTATION_RATE = mutation_rate
    CROSSOVER_RATE = crossover_rate
    GENERATIONS = generations
    ELITE_NUM = elite_num
    
    start_time = time.time()

    population = create_population(POPULATION_SIZE, INPUT_SIZE)
    
    # Run GA
    for generation in range(GENERATIONS):
        random.seed(SEED) # reset the seed so that we get the same results each time
        
        #' Evaluate fitness
        # fitness is the accuracy of the model:
        # print("Evaluating fitness...")
        # evaluate fitness returns a tuple:
        fitness_scores = evaluate_fitness(tuple(population))
        if len(fitness_scores) == 2 and type(fitness_scores[1]) == type(0.0):
            # we found a good enough solution:
            good_enough = fitness_scores[0]
            good_enough_fitness = fitness_scores[1]
            break

        if max(fitness_scores) > good_enough_fitness:
            good_enough = population[fitness_scores.index(max(fitness_scores))]
            good_enough_fitness = round(max(fitness_scores), 2)
        
        fitness_scores = list(fitness_scores)
        
        selected_population = []
        
        #' Elitism:
        ELITE_NUM = min(ELITE_NUM, POPULATION_SIZE) # make sure that the number of elite individuals is not greater than the population size
        zipped_population = list(zip(population, fitness_scores))
        if len(population) == ELITE_NUM:
            selected_population = population
        else:
            sorted_population = sorted(zipped_population, key=lambda x: x[1], reverse=True)
            # sorted_population[index of individual][0 for individual, 1 for fitness score]:
            selected_population = [individual[0] for individual in sorted_population[:ELITE_NUM]]
            sorted_population = sorted_population[ELITE_NUM:]
        
        
        #' Selection
        # print("Selecting...")
        SELECTION_SIZE = min(SELECTION_SIZE, POPULATION_SIZE) # make sure that the number of selected individuals is not greater than the population size
        while len(selected_population) < POPULATION_SIZE:
            random.seed(SEED)
            to_battle = random.sample(zipped_population, SELECTION_SIZE)
            # select the fitter individual (who has the higher fitness score). x[1] is the fitness score, max()[0] to get individual array only (not fitness score):
            selected_population.append(max(to_battle, key=lambda x: x[1])[0])
        
        # replace the old population with the new one for the next generation:
        population = selected_population
        
        if generation < GENERATIONS - 1:
        
            #' Crossover
            # print("Crossing over...")
            for i, individual in enumerate(population):
                random.seed(SEED)
                if random.random() < CROSSOVER_RATE:
                    # select another individual at random:
                    random.seed(SEED)
                    another_individual_index = random.randint(0, len(population)-1)
                    # select a random crossover point:
                    random.seed(SEED)
                    crossover_point = random.randint(0, INPUT_SIZE-1)
                    # swap the genes after the crossover point:
                    population[i] = np.concatenate((individual[:crossover_point], population[another_individual_index][crossover_point:]))

            
            #' Mutation
            # print("Mutating...")
            for i, individual in enumerate(population):
                for j, gene in enumerate(individual):
                    random.seed(SEED)
                    if random.random() < MUTATION_RATE:
                        # flip the gene:
                        population[i][j] = 0 if gene == 1 else 1
        
        
        #' Report the progress
        # print(f"Best at generation {generation}: {sorted(zip(population, fitness_scores), key=lambda x: x[1], reverse=True)[0][1]}")
        print(f"Best solution so far: {good_enough_fitness} at generation {generation}")
        
    
    # best_solution = sorted(zip(population, fitness_scores), key=lambda x: x[1])[0][0]
    
    # write this result to CSV file with the hyperparameters:
    global result_file_lock
    with result_file_lock:
        with open('results.csv', 'a') as f:
            f.write(f"{SEED},{POPULATION_SIZE},{SELECTION_SIZE},{MUTATION_RATE},{CROSSOVER_RATE},{GENERATIONS},{ELITE_NUM},{good_enough_fitness},{round(time.time() - start_time, 2)}\n")
        
    
    # Return the indexes of the points to prune
    return {index: index for index, value in enumerate(good_enough) if value == 0}


# ------------------------------
# set up the CSV file:
with open('results.csv', 'w') as f:
    # reset it first:
    f.write("")
    f.write("SEED,POPULATION_SIZE,SELECTION_SIZE,MUTATION_RATE,CROSSOVER_RATE,GENERATIONS,ELITE_NUM,ACCURACY,TIME_TAKEN\n")

def process(population_size, selection_size, mutation_rate, crossover_rate, generations, elite_num):
    # print(f"population_size: {population_size}, selection_size: {selection_size}, mutation_rate: {mutation_rate}, crossover_rate: {crossover_rate}, generations: {generations}, elite_num: {elite_num}")
    start_time = time.time()
    myPrunedSubsetMethod(x_train, y_train, model, population_size, selection_size, mutation_rate, crossover_rate, generations, elite_num)
    # print(f"Num of points to prune: {len(pruned_indexes)}")
    return time.time() - start_time

# global POPULATION_SIZE, SELECTION_SIZE, MUTATION_RATE, CROSSOVER_RATE, GENERATIONS, ELITE_NUM
# population_size_list = sorted([50, 100, 150, 200])
# selection_size_list = [2, 4, 8, 10]
# mutation_rate_list = [0.01, 0.05, 0.1, 0.3]
# crossover_rate_list = [0.1, 0.3, 0.5]
# generations_list = [10, 30, 100]
# elite_num_list = [1, 2]
population_size_list = [1]
selection_size_list = [1]
mutation_rate_list = [0.01]
crossover_rate_list = [0.3]
generations_list = [1]
elite_num_list = [0]

with ThreadPoolExecutor() as executor:
    futures = []
    for population_size in population_size_list:
        # Clear the cache with each new population size:
        evaluate_fitness.cache_clear()
        
        for selection_size in selection_size_list:
            for mutation_rate in mutation_rate_list:
                for crossover_rate in crossover_rate_list:
                    if crossover_rate > mutation_rate:
                        for generations in generations_list:
                            for elite_num in elite_num_list:
                                # Dispatch to the executor
                                futures.append(executor.submit(process, population_size, selection_size, mutation_rate, crossover_rate, generations, elite_num))

# Wait for all threads to complete
for future in as_completed(futures):
    # If your function returns results, you can collect them here
    time_elapsed = round(future.result(), 2)
    unit = "seconds"
    if time_elapsed > 60:
        time_elapsed = time_elapsed/60
        unit = "minutes"
    if time_elapsed > 60:
        time_elapsed = time_elapsed/60
        unit = "hours"
    print(f"Thread finished in {time_elapsed} {unit}")


Epoch 1/3
Epoch 2/3
Epoch 3/3
Best solution so far: 0.51 at generation 0
Thread finished in 5.29 seconds


19 mins for 100 population, 1 generation

In [13]:

# trainAndEvaluateModel(x_train, y_train_onehot, x_test, y_test_onehot, model, pruned_indexes)