# Assignment 11: What’s Wrong With Our Data? - Kai Ponel & Hannan Mahadik


# Happy new Year 2023! 
(Finally, I'm first)

# Data Stuff 

In [None]:
import os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow import keras
from keras import layers
from keras.layers import RandomFlip, RandomRotation, Dense, Conv2D, MaxPooling2D, Flatten, GlobalAveragePooling2D, BatchNormalization, Dropout, InputLayer
from keras.models import Sequential

In [None]:
# Make Google Drive available to this Colab runtime under /content/drive.
# force_remount=True requests a fresh mount even if one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Switch into the assignment's data folder so the relative "data{N}.npz"
# paths used when loading the datasets resolve against it.
os.chdir('/content/drive/MyDrive/ColabNotebooks/IDL_Hannan_Kai/Task 11/data')

In [None]:
### Unzipping files

# !unzip data/cifar_attempts.zip

In [None]:
### Open all four dataset archives up front (if this causes mem issues, open them one at a time)

# Class names; a numeric label is used as an index into this list.
label_def = [
  'airplane', 'automobile', 'bird', 'cat', 'deer',
  'dog', 'frog', 'horse', 'ship', 'truck',
]

# data[k] holds the archive loaded from "data{k+1}.npz".
data = []
for i in (1, 2, 3, 4):
  archive_name = f"data{i}.npz"
  data.append(np.load(archive_name))

# Some inspections

In [None]:
### Check the size of the dataset
# `.f` exposes the archive's arrays as attributes, so dir() lists their names.
print("Available data : {}".format(dir(data[0].f)))
for index, dataset in enumerate(data):
  # Use this loop's own counter. The previous version printed `i`, a leftover
  # global from the loading cell, so every dataset got the same number.
  print("Checking dataset {}:".format(index))
  print("Available Train images: {}".format(len(dataset["train_imgs"])))
  print("Available Test images: {}".format(len(dataset["test_imgs"])))
  print("Available Val images: {}".format(len(dataset["val_imgs"])))

Available data : ['test_imgs', 'test_lbls', 'train_imgs', 'train_lbls', 'val_imgs', 'val_lbls']
Checking dataset 0:
Available Train images: 36004
Available Test images: 10000
Available Val images: 3996
Checking dataset 1:
Available Train images: 50000
Available Test images: 10000
Available Val images: 5000
Checking dataset 2:
Available Train images: 45000
Available Test images: 10000
Available Val images: 5000
Checking dataset 3:
Available Train images: 45000
Available Test images: 10000
Available Val images: 5000


In [None]:
### Check the distribution of the labels within the sets:
import collections
for index, dataset in enumerate(data):
  print("dataset {}".format(index))
  for subset in ("train", "test", "val"):
    # Tally how often each label value occurs in this split.
    count_map = collections.Counter(dataset["{}_lbls".format(subset)])
    # Sort by label so missing classes are easy to spot in the printout.
    by_label = sorted(count_map.items())
    print("{}: {}".format(subset, collections.OrderedDict(by_label)))

dataset 0
train: OrderedDict([(0, 4507), (1, 4512), (2, 4521), (4, 4525), (5, 4471), (6, 4512), (8, 4468), (9, 4488)])
test: OrderedDict([(0, 1000), (1, 1000), (2, 1000), (3, 1000), (4, 1000), (5, 1000), (6, 1000), (7, 1000), (8, 1000), (9, 1000)])
val: OrderedDict([(0, 493), (1, 488), (2, 479), (4, 475), (5, 529), (6, 488), (8, 532), (9, 512)])
dataset 1
train: OrderedDict([(0, 5000), (1, 5000), (2, 5000), (3, 5000), (4, 5000), (5, 5000), (6, 5000), (7, 5000), (8, 5000), (9, 5000)])
test: OrderedDict([(0, 1000), (1, 1000), (2, 1000), (3, 1000), (4, 1000), (5, 1000), (6, 1000), (7, 1000), (8, 1000), (9, 1000)])
val: OrderedDict([(0, 493), (1, 488), (2, 479), (3, 519), (4, 475), (5, 529), (6, 488), (7, 485), (8, 532), (9, 512)])
dataset 2
train: OrderedDict([(0, 4507), (1, 4512), (2, 4521), (3, 4481), (4, 4525), (5, 4471), (6, 4512), (7, 4515), (8, 4468), (9, 4488)])
test: OrderedDict([(0, 1000), (1, 1000), (2, 1000), (3, 1000), (4, 1000), (5, 1000), (6, 1000), (7, 1000), (8, 1000), (9,

In [None]:
### Plot a rows x columns grid of sample images
rows = 4
columns = 4

def print_some_images(images, labels):
  """Show a grid of `rows * columns` images with their class names as titles.

  The samples shown are the ones at positions 20 .. 20 + rows*columns - 1 of
  `images` / `labels`. Each image is multiplied by 255 and cast to int32
  before plotting (assumes pixel values are floats in [0, 1] -- TODO confirm).

  Args:
    images: indexable collection of images.
    labels: indexable collection of integer class ids; each id is looked up
      in `label_def` for the subplot title.
  """
  figure = plt.figure(figsize=(7, 7))
  for cell in range(columns * rows):
    sample = cell + 20  # skip the first 20 entries
    pixels = tf.cast(images[sample] * 255, tf.int32)
    figure.add_subplot(rows, columns, cell + 1)  # subplot slots are 1-based
    plt.title(label_def[labels[sample]])
    plt.imshow(pixels)
  plt.show()

In [None]:
# Preview a grid of samples for every split of every dataset.
for index, dataset in enumerate(data):
  print(f"Images of the dataset: {index}")
  for subset in ("train", "test", "val"):
    print(f"{subset}-set:")
    subset_images = dataset[f"{subset}_imgs"]
    subset_labels = dataset[f"{subset}_lbls"]
    print_some_images(images=subset_images, labels=subset_labels)


Output hidden; open in https://colab.research.google.com to view.

# Model

## Model Definition
(Copied from the adversarial Training task)

In [None]:
### Hyperparameters
# Weight initialisation and L2 weight penalty used by the conv layers.
initializer = tf.keras.initializers.HeNormal()
regularizer = tf.keras.regularizers.L2(l2=1e-4)

### Early stopping callback
# Stops once val_loss has not improved for 3 epochs and restores the best
# weights seen so far.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", restore_best_weights=True, patience=3)

In [None]:
### A simple CNN model for classifying the CIFAR10 images
# NOTE: this list holds layer *instances*. Any two Sequential models built
# directly from it use the very same layer objects and therefore the same
# weights; copy the layers if independent models are needed.
model_base = [
  # The input spec must come first and take the shape as one tuple.
  # InputLayer(32,32,3) would pass 32/32/3 as input_shape/batch_size/dtype.
  InputLayer(input_shape=(32, 32, 3)),
  # Data augmentation: random flips and rotations.
  RandomFlip("horizontal_and_vertical"),
  RandomRotation(0.2),
  # Three conv/pool stages with 32 -> 64 -> 128 filters.
  Conv2D(32, 3, activation='relu', kernel_regularizer=regularizer, kernel_initializer=initializer),
  MaxPooling2D((2, 2)),
  Conv2D(64, 3, activation='relu', padding='same', kernel_regularizer=regularizer, kernel_initializer=initializer),
  MaxPooling2D((2, 2)),
  Conv2D(128, 3, activation='relu', padding='same', kernel_regularizer=regularizer, kernel_initializer=initializer),
  MaxPooling2D((2, 2)),
  # Classifier head: one output per entry of label_def.
  Flatten(),
  Dense(128, activation='relu'),
  Dense(10, activation='softmax')
]

## Model Training (Iteratively!)

In [None]:
# clone_model comes from the same package as the Sequential class used here.
from keras.models import clone_model

models = []
# model_base holds layer instances, so Sequential(model_base) built once per
# dataset would reuse the same weights every time: each run would continue
# from the previous dataset's weights and every entry of `models` would end up
# identical. The template therefore only supplies the architecture, and every
# dataset trains its own clone with freshly created layers and weights.
template = Sequential(model_base)

# Loop over all the datasets and train equivalent models after one another
for index, dataset in enumerate(data):
  print("Model trained on dataset {}".format(index+1))
  # Independent copy of the architecture for this dataset
  model = clone_model(template)
  model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

  # The training stream repeats indefinitely; steps_per_epoch below defines
  # how many batches make up one epoch.
  train_ds = tf.data.Dataset.from_tensor_slices((dataset["train_imgs"], dataset["train_lbls"])).batch(64).repeat()
  test_ds = tf.data.Dataset.from_tensor_slices((dataset["test_imgs"], dataset["test_lbls"])).batch(64)
  val_ds = tf.data.Dataset.from_tensor_slices((dataset["val_imgs"], dataset["val_lbls"])).batch(64)

  # Train the model
  # Early stopping is deliberately disabled and the epochs are fixed to 25 to
  # see what happens.
  # NOTE(review): the *test* split is passed as Keras validation data and the
  # *val* split is used for the final number below. Kept as in the original
  # experiment -- confirm this is intended.
  model.fit(train_ds, epochs=25, steps_per_epoch=256, validation_data=test_ds)
  # Store the model
  models.append(model)
  # Same 1-based numbering as the "Model trained on dataset" header above.
  print("Accuracy on the {}. validation set: {}".format(index+1, model.evaluate(val_ds)[1]))


Model trained on dataset 1
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Accuracy on the 0. validation set: 0.7207207083702087
Model trained on dataset 2
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Accuracy on the 1. validation set: 0.6931999921798706
Model trained on dataset 3
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/2

In [None]:
### Some evaluation stats on the test set:
# NOTE: these numbers are only meaningful if every entry of `models` owns its
# layers. Models built via Sequential(...) from one shared list of layer
# instances share their weights, so they would all reflect the model trained
# last -- which would explain test accuracies that differ from the ones seen
# right after training.
for index, (model, current_data) in enumerate(zip(models, data)):
  # Evaluate each model on the test split of the dataset it was trained on.
  test_ds = tf.data.Dataset.from_tensor_slices(
    (current_data["test_imgs"], current_data["test_lbls"])
  ).batch(64)
  loss_and_accuracy = model.evaluate(test_ds)
  print("Loss/Test accuracy on {}. dataset: {}".format(index+1, loss_and_accuracy))

# Summary

## Dataset 1: 
train and val set do not contain examples of labels 3 and 7, but test set does <br>
*This may happen in the real world because the different sets originate from different environments, or due to human error.*
## Dataset 2:
Seems alright at first glance - Val set is not perfectly balanced but this also holds for 0 and 2. <br>
Validation Set is a subset of the training set. 
*This may happen in the real world due to normal inaccuracies? Perhaps I missed something here*
## Dataset 3:
- Pixel values appear to be "invalid" on the test data, but are alright on the train/val data. This causes issues when trying to maximize the test acc.
- Train and val set is not perfectly balanced as all things should be (+/- ~20-50 images). This might be impactful when measuring performance since the val set is way smaller. <br>

*This may happen in the real world due to measurement errors when creating the subsets in different environments / at different times*

## Dataset 4: 
- Images are included in multiple subsets and are not mutually exclusive.
- Train and val set only contain very few images, but these ones are repeated very often. <br>
 
*This may happen in the real world due to basic human error or unsuitable environments when collecting the data (?)*

## General notes:
- Images are shared across the datasets (but this is wanted, I assume)
- Set 3&4 have the same amount of images in the subsets, while 2 contains 5k more train images and 1 about 10k less (which is probably caused by the missing classes in ds 1)


For Dataset 3, using batch normalisation (BN) before every conv layer will improve the model performance.
The difference in the test images is not noise; it was a form of normalisation.