# Cifar-10 : Computer Vision Classification

In [None]:
#!pip install numpy==1.26.4

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt
import tensorflow.keras.ops as ops
import pandas as pd

import random

import pickle

from sklearn.metrics import confusion_matrix, classification_report
import itertools

In [None]:
#tfds.list_builders()

In [None]:
# Load CIFAR-10 and carve it into three subsets (roughly 80/10/10 %):
#   data_train <- train[:48000]
#   data_test  <- train[48000:] + test[:4000]
#   data_val   <- test[4000:]
# as_supervised yields (image, label) pairs; with_info also returns the metadata.
(data_train, data_test, data_val), infos = tfds.load(
    'cifar10',
    split=['train[:48000]', 'train[48000:]+test[:4000]', 'test[4000:]'],
    with_info=True,
    as_supervised=True,
    shuffle_files=True,
)

## Explore data :

In [None]:
# Label names as listed in the dataset metadata returned by tfds.load.
class_names=infos.features['label'].names

In [None]:
# Number of examples in each of the three subsets (train, test, validation).
data_train.cardinality(), data_test.cardinality(), data_val.cardinality()

In [None]:
# Display sample training images with the tfds helper; the trailing ';' hides the cell's return value.
tfds.show_examples(data_train,infos);

In [None]:
# Inspect one training example: element dtype, then smallest and largest pixel value.
# `image` and `label` stay bound after the loop; the visualisation cell reuses `image`.
for image, label in data_train.take(1):
    print(image.dtype, image.numpy().min(), image.numpy().max(), sep='\n')

## Preprocessing :

### Rescale, Resize, Batch :

In [None]:
# Shared preprocessing: force every image to 32x32 and multiply pixel values
# by 1/255 so they land in [0, 1].
resize_rescale = tf.keras.Sequential(layers=[
    tf.keras.layers.Resizing(height=32, width=32),
    tf.keras.layers.Rescaling(scale=1.0 / 255),
])

# Stronger augmentation pipeline: every random factor is 0.2.
data_augmentation = tf.keras.Sequential(layers=[
    tf.keras.layers.RandomFlip(mode="horizontal"),
    tf.keras.layers.RandomRotation(factor=0.2),
    tf.keras.layers.RandomZoom(height_factor=0.2),
    tf.keras.layers.RandomHeight(factor=0.2),
    tf.keras.layers.RandomWidth(factor=0.2),
])

# Milder augmentation pipeline: same layers, every random factor is 0.1.
data_augmentation_01 = tf.keras.Sequential(layers=[
    tf.keras.layers.RandomFlip(mode="horizontal"),
    tf.keras.layers.RandomRotation(factor=0.1),
    tf.keras.layers.RandomZoom(height_factor=0.1),
    tf.keras.layers.RandomHeight(factor=0.1),
    tf.keras.layers.RandomWidth(factor=0.1),
])

def preprocess(data, val=0, augment=False, shuffle=True, batch_size=64, shuffle_buffer=1000):
  """Turn a dataset of (image, label) pairs into batched, prefetched model input.

  Steps, in order: optional shuffle -> optional random augmentation ->
  resize/rescale -> batch -> prefetch.

  Args:
    data: dataset of (image, label) pairs exposing ``shuffle``, ``map``,
      ``batch`` and ``prefetch`` (the notebook passes tfds datasets).
    val: augmentation strength, 0.1 or 0.2. Only read when ``augment`` is True.
    augment: when True, apply the random augmentation pipeline matching ``val``.
    shuffle: when True (default, the historical behaviour) shuffle the examples
      first; pass False for evaluation data where ordering does not matter.
    batch_size: number of examples per batch (default 64).
    shuffle_buffer: size of the shuffle buffer (default 1000).

  Returns:
    The transformed dataset, batched and prefetched.

  Raises:
    ValueError: if ``augment`` is True but ``val`` is neither 0.1 nor 0.2.
      Previously this case silently produced a non-augmented dataset.
  """
  # Validate first so a bad request fails before any pipeline is built.
  if augment and val not in (0.1, 0.2):
    raise ValueError(
        f"augment=True requires val to be 0.1 or 0.2, got {val!r}")

  if shuffle:
    data = data.shuffle(buffer_size=shuffle_buffer)

  if augment:
    # The augmentation pipelines are looked up lazily (only the requested one
    # is referenced) and called with training=True so the random layers are
    # explicitly active inside the tf.data map.
    if val == 0.1:
      data = data.map(lambda x, y: (data_augmentation_01(x, training=True), y),
                      num_parallel_calls=tf.data.AUTOTUNE)
    else:
      data = data.map(lambda x, y: (data_augmentation(x, training=True), y),
                      num_parallel_calls=tf.data.AUTOTUNE)

  # Resize/rescale comes after augmentation, so every image ends up 32x32.
  data = data.map(lambda x, y: (resize_rescale(x), y),
                  num_parallel_calls=tf.data.AUTOTUNE)

  data = data.batch(batch_size)

  return data.prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# Build the plain (non-augmented) input pipelines, in the same order as the
# source subsets: train, test, validation.
batch_train, batch_test, batch_val = map(preprocess, (data_train, data_test, data_val))

# Echo the three resulting dataset objects.
batch_train, batch_test, batch_val

In [None]:
# Augmented variants of the training pipeline, both built from data_train.
aug_train = preprocess(data_train, val=0.2, augment=True)     # 0.2-factor augmentation

aug_train_01 = preprocess(data_train, val=0.1, augment=True)  # 0.1-factor augmentation

### Visualisation :

In [None]:
# 2x2 grid comparing the sample `image` (left bound by the inspection loop)
# with its rescaled and augmented versions. Each panel is produced lazily, in
# order, so the random layers are invoked in the same sequence as before.
plt.figure(figsize=(5, 5))

_panels = (
    ('Original', lambda: image),
    ('Scaled', lambda: resize_rescale(image)),
    ('aug 0.2', lambda: resize_rescale(data_augmentation(image))),
    ('aug 0.1', lambda: resize_rescale(data_augmentation_01(image))),
)
for _slot, (_title, _make) in enumerate(_panels, start=1):
    plt.subplot(2, 2, _slot)
    plt.imshow(_make())
    plt.title(_title)
    plt.axis(False)

## Modelling :

In [None]:
# Enable mixed precision training: better use of the GPU's memory by using
# float32 and float16 where possible. The policy is set globally.
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')

In [None]:
# Echo the currently active global dtype policy.
mixed_precision.global_policy()

In [None]:
# File that ModelCheckpoint writes the best weights to.
checkpoint_path="best_model_cifar10.weights.h5"

# Callbacks shared by every fit() call in this notebook.

# Stop once val_loss has not improved for 5 epochs and roll the model back to
# the weights of its best epoch; without restore_best_weights the model that
# gets evaluated is the one from the last (already degraded) epoch.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1,
                                              restore_best_weights=True)

# Multiply the learning rate by 0.2 when val_loss has not gone down for 2 epochs (floor: 1e-7).
LrReducer = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, verbose=1, min_lr=1e-7)

# Save weights only, and only when val_accuracy improves.
# NOTE(review): this single instance, writing to a single file, is passed to every
# fit() call for models with different architectures -- confirm whether its
# best-so-far value carries over between runs and whether the file is ever reloaded.
checkpoint = tf.keras.callbacks.ModelCheckpoint(checkpoint_path, monitor='val_accuracy', verbose=0, save_best_only=True, save_weights_only=True, mode='max')

### Baseline & data augmentation or not :

In [None]:
# Baseline: one convolutional block (2 x Conv+BatchNorm, then max-pooling)
# followed by global average pooling and a 10-way softmax classifier.
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_0 = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    # One block only, 32 filters
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    kl.GlobalAveragePooling2D(),
    kl.Dense(10),
    # The softmax is given dtype float32 explicitly (global policy is mixed_float16).
    kl.Activation('softmax', dtype=tf.float32),
])

model_0.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

history_0 = model_0.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Print the baseline's layer-by-layer summary.
model_0.summary()

In [None]:
# Score the baseline on the test batches (figure in the trailing comment is the author's recorded result).
res_0=model_0.evaluate(batch_test) # 65% accuracy

In [None]:
# Baseline training curves as a DataFrame, without the learning-rate column.
# Columns are renamed by suffixing the history keys (accuracy -> accuracy_0, ...)
# rather than by position, so labels stay correct whatever order the keys are in.
df_0 = pd.DataFrame(history_0.history).drop(columns='learning_rate').add_suffix('_0')
df_0_acc = df_0[['accuracy_0', 'val_accuracy_0']]    # accuracy curves only
df_0_loss = df_0[['loss_0', 'val_loss_0']]           # loss curves only

In [None]:
# Persist the trained baseline, then pickle its per-epoch metrics dict so the
# architecture section can be restarted without retraining.
model_0.save('model_0.keras')

with open('history_0', mode='wb') as fh:
    pickle.dump(history_0.history, fh)

#### With data augmentation (val=0.2) :

In [None]:
# Same architecture as the baseline, trained on the 0.2-augmented pipeline.
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_0_aug = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    kl.GlobalAveragePooling2D(),
    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_0_aug.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

# Only the training data is augmented; validation still uses batch_val.
history_0_aug = model_0_aug.fit(
    aug_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Score the 0.2-augmentation model on the test batches (author's recorded result in the trailing comment).
model_0_aug.evaluate(batch_test) # 41% accuracy

Data augmentation with val=0.2 does not help generalization at all; it even worsens learning.

In [None]:
# Drop the names of the 0.2-augmentation experiment; they are not used again.
del (
    model_0_aug,
    history_0_aug,
    aug_train
)

#### With data augmentation (val=0.1) :

In [None]:
# Same architecture as the baseline, trained on the 0.1-augmented pipeline.
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_0_aug01 = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    kl.GlobalAveragePooling2D(),
    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_0_aug01.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

# Only the training data is augmented; validation still uses batch_val.
history_0_aug01 = model_0_aug01.fit(
    aug_train_01,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Score the 0.1-augmentation model on the test batches (author's recorded result in the trailing comment).
model_0_aug01.evaluate(batch_test) # 50% accuracy

#### Conclusion :

In [None]:
# Plot baseline vs. 0.1-augmentation training curves on one chart.
# Each run's columns get their own suffix; without it both frames carry the
# same column names and the legend cannot tell the two runs apart.
pd.concat(
    [
        pd.DataFrame(history_0.history).drop(columns='learning_rate').add_suffix('_0'),
        pd.DataFrame(history_0_aug01.history).drop(columns='learning_rate').add_suffix('_aug01'),
    ],
    axis=1,
).plot()

Data augmentation does not help generalization and worsens learning in general. The benchmark to improve on is model_0 (68% accuracy).

In [None]:
# Drop the names of the 0.1-augmentation experiment, including its augmentation pipeline.
del (
    aug_train_01,
    model_0_aug01,
    history_0_aug01,
    data_augmentation_01
)

### Valid vs same padding

#### Valid :

In [None]:
# Baseline architecture again, but both convolutions use 'valid' padding.
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_0_valid = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    # 'valid' instead of 'same' padding
    kl.Conv2D(32, 3, padding='valid', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='valid', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    kl.GlobalAveragePooling2D(),
    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_0_valid.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

history_0_valid = model_0_valid.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Score the valid-padding model on the test batches (author's recorded accuracy in the trailing comment).
model_0_valid.evaluate(batch_test) # 64%

#### Conclusion  :

In [None]:
# Valid-padding training curves, without the learning-rate column.
# Columns are suffixed by name (accuracy -> accuracy_valid, ...) instead of being
# renamed by position, so labels cannot be swapped by a different key order.
df_valid = pd.DataFrame(history_0_valid.history).drop(columns='learning_rate').add_suffix('_valid')
df_valid_acc = df_valid[['accuracy_valid', 'val_accuracy_valid']]
df_valid_loss = df_valid[['loss_valid', 'val_loss_valid']]

In [None]:
# Accuracy curves, then loss curves: baseline ('same' padding) vs. 'valid' padding.
pd.concat((df_0_acc, df_valid_acc), axis='columns').plot()
pd.concat((df_0_loss, df_valid_loss), axis='columns').plot()

'Same' padding seems to give better loss and accuracy in this case, but the two are close to each other.

In [None]:
# Drop the names of the valid-padding experiment.
del (
    model_0_valid,
    history_0_valid,
    df_valid,
    df_valid_acc,
    df_valid_loss
)

### Relu vs tanh vs sigmoid

#### Sigmoid :

In [None]:
# Baseline architecture again, with sigmoid activations in both convolutions.
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_0_sig = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    # sigmoid instead of relu
    kl.Conv2D(32, 3, padding='same', activation='sigmoid'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='same', activation='sigmoid'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    kl.GlobalAveragePooling2D(),
    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_0_sig.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

history_0_sig = model_0_sig.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Score the sigmoid model on the test batches (author's recorded accuracy in the trailing comment).
model_0_sig.evaluate(batch_test) # 53%

#### Tanh :

In [None]:
# Baseline architecture again, with tanh activations in both convolutions.
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_0_tanh = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    # tanh instead of relu
    kl.Conv2D(32, 3, padding='same', activation='tanh'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='same', activation='tanh'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    kl.GlobalAveragePooling2D(),
    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_0_tanh.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

history_0_tanh = model_0_tanh.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Score the tanh model on the test batches (author's recorded accuracy in the trailing comment).
model_0_tanh.evaluate(batch_test) # 55%

#### Conclusion :

In [None]:
# Sigmoid and tanh training curves, without the learning-rate column.
# Columns are suffixed by name rather than renamed by position, so labels
# stay correct whatever order the history keys are stored in.
df_sig = pd.DataFrame(history_0_sig.history).drop(columns='learning_rate').add_suffix('_sig')
df_sig_acc = df_sig[['accuracy_sig', 'val_accuracy_sig']]
df_sig_loss = df_sig[['loss_sig', 'val_loss_sig']]

df_tanh = pd.DataFrame(history_0_tanh.history).drop(columns='learning_rate').add_suffix('_tanh')
df_tanh_acc = df_tanh[['accuracy_tanh', 'val_accuracy_tanh']]
df_tanh_loss = df_tanh[['loss_tanh', 'val_loss_tanh']]

In [None]:
# Accuracy curves, then loss curves: relu baseline vs. sigmoid vs. tanh.
pd.concat((df_0_acc, df_sig_acc, df_tanh_acc), axis='columns').plot()
pd.concat((df_0_loss, df_sig_loss, df_tanh_loss), axis='columns').plot()

ReLU is the best option here.

In [None]:
# Drop the names of the sigmoid and tanh experiments.
del (
    model_0_sig,
    history_0_sig,
    model_0_tanh,
    history_0_tanh,
    df_sig,
    df_sig_acc,
    df_sig_loss,
    df_tanh,
    df_tanh_acc,
    df_tanh_loss
)

### Adam vs AdamW

#### AdamW

In [None]:
# Baseline architecture again, optimised with AdamW instead of Adam.
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_0_adamw = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    kl.GlobalAveragePooling2D(),
    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_0_adamw.compile(
    # AdamW with weight decay 0.004 (noted by the author as the default value)
    optimizer=tf.keras.optimizers.AdamW(weight_decay=0.004),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

history_0_adamw = model_0_adamw.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Score the AdamW model on the test batches (author's recorded accuracy in the trailing comment).
model_0_adamw.evaluate(batch_test) # 66%

#### Conclusion :

In [None]:
# AdamW training curves, without the learning-rate column.
# Columns are suffixed by name rather than renamed by position, so labels
# stay correct whatever order the history keys are stored in.
df_adamw = pd.DataFrame(history_0_adamw.history).drop(columns='learning_rate').add_suffix('_adamw')
df_adamw_acc = df_adamw[['accuracy_adamw', 'val_accuracy_adamw']]
df_adamw_loss = df_adamw[['loss_adamw', 'val_loss_adamw']]

In [None]:
# Accuracy curves, then loss curves: Adam baseline vs. AdamW.
pd.concat((df_0_acc, df_adamw_acc), axis='columns').plot()
pd.concat((df_0_loss, df_adamw_loss), axis='columns').plot()

Adam is better here.

### Architecture (Can start from here) :

In [None]:
# Restart point: reload the saved baseline model and its training history so the
# architecture experiments can run without retraining model_0.
model_0 = tf.keras.models.load_model('model_0.keras')

# NOTE: pickle.load can execute arbitrary code; only open a 'history_0' file
# that was written by this notebook.
with open('history_0', "rb") as f:
    history_0 = pickle.load(f)

# From here on history_0 is the unpickled metrics object itself (what was dumped
# was history.history), hence no `.history` attribute access below.
# Columns are suffixed by name rather than renamed by position, so labels stay
# correct whatever order the keys were stored in.
df_0 = pd.DataFrame(history_0).drop(columns='learning_rate').add_suffix('_0')
df_0_acc = df_0[['accuracy_0', 'val_accuracy_0']]
df_0_loss = df_0[['loss_0', 'val_loss_0']]

#### Model 1 : 1 Bloc 64 filters

In [None]:
# Model 1: a single convolutional block like the baseline, but with 64 filters.
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_1 = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    # Wider block: 32 -> 64 filters
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    kl.GlobalAveragePooling2D(),
    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_1.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

history_1 = model_1.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Score model 1 on the test batches (author's recorded accuracy in the trailing comment).
res_1=model_1.evaluate(batch_test) #70%

In [None]:
# Model 1 training curves, without the learning-rate column.
# Columns are suffixed by name rather than renamed by position, so labels
# stay correct whatever order the history keys are stored in.
df_1 = pd.DataFrame(history_1.history).drop(columns='learning_rate').add_suffix('_1')
df_1_acc = df_1[['accuracy_1', 'val_accuracy_1']]
df_1_loss = df_1[['loss_1', 'val_loss_1']]

In [None]:
# Accuracy curves, then loss curves: model 0 vs. model 1.
pd.concat((df_0_acc, df_1_acc), axis='columns').plot()
pd.concat((df_0_loss, df_1_loss), axis='columns').plot()

64 filters work better than 32: the model converges faster and reaches a better accuracy.

In [None]:
# Persist model 1, then pickle its per-epoch metrics dict.
model_1.save('model_1.keras')

with open('history_1', mode='wb') as fh:
    pickle.dump(history_1.history, fh)

#### Model 2 : 2 Blocs 32->64 filters

In [None]:
# Model 2: two convolutional blocks, 32 then 64 filters.
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_2 = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    # Block 1: 32 filters
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    # Block 2 (new): 64 filters
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    kl.GlobalAveragePooling2D(),
    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_2.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

history_2 = model_2.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Score model 2 on the test batches (author's recorded accuracy in the trailing comment).
res_2=model_2.evaluate(batch_test) # 81%

In [None]:
# Model 2 training curves, without the learning-rate column.
# Columns are suffixed by name rather than renamed by position, so labels
# stay correct whatever order the history keys are stored in.
df_2 = pd.DataFrame(history_2.history).drop(columns='learning_rate').add_suffix('_2')
df_2_acc = df_2[['accuracy_2', 'val_accuracy_2']]
df_2_loss = df_2[['loss_2', 'val_loss_2']]

In [None]:
# Accuracy curves, then loss curves: models 0, 1 and 2.
pd.concat((df_0_acc, df_1_acc, df_2_acc), axis='columns').plot()
pd.concat((df_0_loss, df_1_loss, df_2_loss), axis='columns').plot()

In [None]:
# Persist model 2, then pickle its per-epoch metrics dict.
model_2.save('model_2.keras')

with open('history_2', mode='wb') as fh:
    pickle.dump(history_2.history, fh)

#### Model 3 : 3 Blocs 32->64->128 filters

In [None]:
# Model 3: three convolutional blocks, 32 -> 64 -> 128 filters.
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_3 = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    # Block 1: 32 filters
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    # Block 2: 64 filters
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    # Block 3 (new): 128 filters
    kl.Conv2D(128, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(128, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    kl.GlobalAveragePooling2D(),

    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_3.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

history_3 = model_3.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Score model 3 on the test batches (author's recorded accuracy in the trailing comment).
res_3=model_3.evaluate(batch_test) # 85%

In [None]:
# Model 3 training curves, without the learning-rate column.
# Columns are suffixed by name rather than renamed by position, so labels
# stay correct whatever order the history keys are stored in.
df_3 = pd.DataFrame(history_3.history).drop(columns='learning_rate').add_suffix('_3')
df_3_acc = df_3[['accuracy_3', 'val_accuracy_3']]
df_3_loss = df_3[['loss_3', 'val_loss_3']]

In [None]:
# Accuracy curves, then loss curves: models 0 through 3.
pd.concat((df_0_acc, df_1_acc, df_2_acc, df_3_acc), axis='columns').plot()
pd.concat((df_0_loss, df_1_loss, df_2_loss, df_3_loss), axis='columns').plot()

The model starts to overfit, so I'll try adding dropout after the convolutions to see if it helps. Models 0 and 1 can be left out of the comparisons from here on.

In [None]:
# Persist model 3, then pickle its per-epoch metrics dict.
model_3.save('model_3.keras')

with open('history_3', mode='wb') as fh:
    pickle.dump(history_3.history, fh)

#### Model 4 : 3 blocs and dropout (best model in the end)

In [None]:
# Model 4: the three blocks of model 3, each followed by Dropout(0.2).
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_4 = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    # Block 1: 32 filters + dropout
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    # Block 2: 64 filters + dropout
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    # Block 3: 128 filters + dropout
    kl.Conv2D(128, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(128, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    kl.GlobalAveragePooling2D(),

    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_4.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

history_4 = model_4.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Score model 4 on the test batches (author's recorded accuracy in the trailing comment).
res_4=model_4.evaluate(batch_test) # 87%

In [None]:
# Model 4 training curves, without the learning-rate column.
# Columns are suffixed by name rather than renamed by position, so labels
# stay correct whatever order the history keys are stored in.
df_4 = pd.DataFrame(history_4.history).drop(columns='learning_rate').add_suffix('_4')
df_4_acc = df_4[['accuracy_4', 'val_accuracy_4']]
df_4_loss = df_4[['loss_4', 'val_loss_4']]

In [None]:
# Accuracy curves, then loss curves: models 2, 3 and 4.
pd.concat((df_2_acc, df_3_acc, df_4_acc), axis='columns').plot()
pd.concat((df_2_loss, df_3_loss, df_4_loss), axis='columns').plot()

In [None]:
# Persist model 4, then pickle its per-epoch metrics dict.
model_4.save('model_4.keras')

with open('history_4', mode='wb') as fh:
    pickle.dump(history_4.history, fh)

Model 4 seems to do slightly better on validation, with less overfitting.

#### Model 5 : Dropout after the GlobalAveragePooling

In [None]:
# Model 5: the three blocks of model 3 (no per-block dropout) with a single
# Dropout(0.3) placed after the global average pooling.
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_5 = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    # Block 1: 32 filters
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    # Block 2: 64 filters
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    # Block 3: 128 filters
    kl.Conv2D(128, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(128, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),

    kl.GlobalAveragePooling2D(),
    kl.Dropout(0.3),

    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_5.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

history_5 = model_5.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Score model 5 on the test batches (author's recorded accuracy in the trailing comment).
res_5=model_5.evaluate(batch_test) # 85%

In [None]:
# Model 5 training curves, without the learning-rate column.
# Columns are suffixed by name rather than renamed by position, so labels
# stay correct whatever order the history keys are stored in.
df_5 = pd.DataFrame(history_5.history).drop(columns='learning_rate').add_suffix('_5')
df_5_acc = df_5[['accuracy_5', 'val_accuracy_5']]
df_5_loss = df_5[['loss_5', 'val_loss_5']]

In [None]:
# Accuracy curves, then loss curves: models 2 through 5.
pd.concat((df_2_acc, df_3_acc, df_4_acc, df_5_acc), axis='columns').plot()
pd.concat((df_2_loss, df_3_loss, df_4_loss, df_5_loss), axis='columns').plot()

In [None]:
# model_5.save('model_5.keras')

# with open('history_5', 'wb') as f:
#     pickle.dump(history_5.history, f)

Model 4 is better. Let's try to get better acc on model 4.

#### Model 6 : model 4 with a better classifier

In [None]:
# Model 6: model 4's convolutional stack with a deeper classifier head
# (Dense 256 -> Dense 128, each with BatchNorm and Dropout(0.3)).
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_6 = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    # Block 1: 32 filters + dropout
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    # Block 2: 64 filters + dropout
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    # Block 3: 128 filters + dropout
    kl.Conv2D(128, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(128, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    kl.GlobalAveragePooling2D(),

    # Classifier head
    kl.Dense(256, activation='relu'),
    kl.BatchNormalization(),
    kl.Dropout(0.3),
    kl.Dense(128, activation='relu'),
    kl.BatchNormalization(),
    kl.Dropout(0.3),
    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_6.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

history_6 = model_6.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Score model 6 on the test batches (author's recorded accuracy in the trailing comment).
res_6=model_6.evaluate(batch_test) # 86%

In [None]:
# Model 6 training curves, without the learning-rate column.
# Columns are suffixed by name rather than renamed by position, so labels
# stay correct whatever order the history keys are stored in.
df_6 = pd.DataFrame(history_6.history).drop(columns='learning_rate').add_suffix('_6')
df_6_acc = df_6[['accuracy_6', 'val_accuracy_6']]
df_6_loss = df_6[['loss_6', 'val_loss_6']]

In [None]:
# Accuracy curves, then loss curves: models 2, 4 and 6.
pd.concat((df_2_acc, df_4_acc, df_6_acc), axis='columns').plot()
pd.concat((df_2_loss, df_4_loss, df_6_loss), axis='columns').plot()

In [None]:
# model_6.save('model_6.keras')

# with open('history_6', 'wb') as f:
#     pickle.dump(history_6.history, f)

Very close to model 4. Let's try adding a convolutional block to model 6 (as it overfits a bit less than model 4).

#### Model 7 : Model 6 + 1 block

In [None]:
# Model 7: model 6 plus a fourth convolutional block with 256 filters.
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_7 = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    # Block 1: 32 filters + dropout
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    # Block 2: 64 filters + dropout
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    # Block 3: 128 filters + dropout
    kl.Conv2D(128, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(128, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    # Block 4 (new): 256 filters + dropout
    kl.Conv2D(256, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(256, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    kl.GlobalAveragePooling2D(),

    # Classifier head
    kl.Dense(256, activation='relu'),
    kl.BatchNormalization(),
    kl.Dropout(0.3),
    kl.Dense(128, activation='relu'),
    kl.BatchNormalization(),
    kl.Dropout(0.3),
    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_7.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

history_7 = model_7.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Score model 7 on the test batches (author's recorded accuracy in the trailing comment).
res_7=model_7.evaluate(batch_test) # 87%

In [None]:
# Model 7 training curves, without the learning-rate column.
# Columns are suffixed by name rather than renamed by position, so labels
# stay correct whatever order the history keys are stored in.
df_7 = pd.DataFrame(history_7.history).drop(columns='learning_rate').add_suffix('_7')
df_7_acc = df_7[['accuracy_7', 'val_accuracy_7']]
df_7_loss = df_7[['loss_7', 'val_loss_7']]

In [None]:
# Accuracy curves, then loss curves: models 2, 4, 6 and 7.
pd.concat((df_2_acc, df_4_acc, df_6_acc, df_7_acc), axis='columns').plot()
pd.concat((df_2_loss, df_4_loss, df_6_loss, df_7_loss), axis='columns').plot()

In [None]:
# model_7.save('model_7.keras')

# with open('history_7', 'wb') as f:
#     pickle.dump(history_7.history, f)

Once again very close to model 4. Let's try AdamW: its weight decay may help with both overfitting and learning.

#### Model 8 : AdamW decay = 5e-3 (relatively big architecture)

In [None]:
# Model 8: same architecture as model 7, optimised with AdamW (weight decay 0.005).
random.seed(42)
tf.random.set_seed(42)

kl = tf.keras.layers
model_8 = tf.keras.Sequential([
    kl.Input(shape=(32, 32, 3)),

    # Block 1: 32 filters + dropout
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(32, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    # Block 2: 64 filters + dropout
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(64, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    # Block 3: 128 filters + dropout
    kl.Conv2D(128, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(128, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    # Block 4: 256 filters + dropout
    kl.Conv2D(256, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.Conv2D(256, 3, padding='same', activation='relu'),
    kl.BatchNormalization(),
    kl.MaxPool2D(pool_size=2),
    kl.Dropout(0.2),

    kl.GlobalAveragePooling2D(),

    # Classifier head
    kl.Dense(256, activation='relu'),
    kl.BatchNormalization(),
    kl.Dropout(0.3),
    kl.Dense(128, activation='relu'),
    kl.BatchNormalization(),
    kl.Dropout(0.3),
    kl.Dense(10),
    kl.Activation('softmax', dtype=tf.float32),
])

model_8.compile(
    optimizer=tf.keras.optimizers.AdamW(weight_decay=0.005),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

history_8 = model_8.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Evaluate model 8 on the test batches; accuracy was about 87% when this was run.
res_8=model_8.evaluate(batch_test)

In [None]:
# Per-epoch training history of model 8 as a DataFrame (one row per epoch).
# Columns are renamed by *name* (suffix "_8") rather than by position, so the
# result stays correct whatever order Keras stores the metrics in. The
# learning-rate column is dropped when present because it is not plotted with
# the accuracy/loss curves.
df_8 = (
    pd.DataFrame(history_8.history)
    .drop(columns='learning_rate', errors='ignore')
    .add_suffix('_8')
)

# Split into accuracy curves and loss curves for the comparison plots.
df_8_acc = df_8[['accuracy_8', 'val_accuracy_8']]
df_8_loss = df_8[['loss_8', 'val_loss_8']]

In [None]:
# Learning curves of models 2, 4, 6, 7 and 8 side by side:
# first figure = train/validation accuracy, second figure = train/validation loss.
pd.concat(
    [df_2_acc, df_4_acc, df_6_acc, df_7_acc, df_8_acc],
    axis='columns',
).plot()
pd.concat(
    [df_2_loss, df_4_loss, df_6_loss, df_7_loss, df_8_loss],
    axis='columns',
).plot()

In [None]:
# model_8.save('model_8.keras')

# with open('history_8', 'wb') as f:
#     pickle.dump(history_8.history, f)

It looks like my model's performance has plateaued. Let's try one last model with many changes to see whether I simply haven't pushed far enough or whether this really is the best I can do.

#### Model 9 : ALL IN

In [None]:
# Seed Python's and TensorFlow's RNGs before any layer is created.
random.seed(42)
tf.random.set_seed(42)


def _conv_stage_9(n_filters, drop_rate):
    """Return one convolutional stage: 2 x (Conv 3x3 + BatchNorm), MaxPool, Dropout."""
    stage = []
    for _ in range(2):
        stage.append(tf.keras.layers.Conv2D(filters=n_filters, kernel_size=3,
                                            activation='relu', padding='same'))
        stage.append(tf.keras.layers.BatchNormalization())
    stage.append(tf.keras.layers.MaxPool2D(pool_size=2))
    stage.append(tf.keras.layers.Dropout(drop_rate))
    return stage


def _dense_stage_9(n_units):
    """Return one classifier stage: Dense(relu) + BatchNorm + Dropout(0.5)."""
    return [
        tf.keras.layers.Dense(units=n_units, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
    ]


# (filters, dropout rate) for each convolutional stage. Dropout grows with depth.
_conv_plan_9 = [
    (32, 0.2),
    (64, 0.2),
    (128, 0.3),
    (256, 0.4),  # one more block
    (512, 0.5),  # one more block
]

# Layers are instantiated in exactly the order they are stacked.
_layers_9 = [tf.keras.layers.Input(shape=(32, 32, 3))]
for _filters_9, _rate_9 in _conv_plan_9:
    _layers_9.extend(_conv_stage_9(_filters_9, _rate_9))

_layers_9.append(tf.keras.layers.GlobalAveragePooling2D())

for _units_9 in (512, 256, 128):
    _layers_9.extend(_dense_stage_9(_units_9))

# 10-way output; the softmax is kept as a separate float32 activation layer.
_layers_9.append(tf.keras.layers.Dense(units=10))
_layers_9.append(tf.keras.layers.Activation('softmax', dtype=tf.float32))

model_9 = tf.keras.Sequential(_layers_9)

# Plain Adam with default settings, integer labels -> sparse categorical cross-entropy.
model_9.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

# Train for at most 30 epochs with the callbacks defined earlier in the notebook.
history_9 = model_9.fit(
    batch_train,
    validation_data=batch_val,
    epochs=30,
    callbacks=[early_stop, LrReducer, checkpoint],
)

In [None]:
# Evaluate model 9 on the test batches and keep the result for the final model comparison.
res_9=model_9.evaluate(batch_test)

In [None]:
# Per-epoch training history of model 9 as a DataFrame (one row per epoch).
# Columns are renamed by *name* (suffix "_9") rather than by position, so the
# result stays correct whatever order Keras stores the metrics in. The
# learning-rate column is dropped when present because it is not plotted with
# the accuracy/loss curves.
df_9 = (
    pd.DataFrame(history_9.history)
    .drop(columns='learning_rate', errors='ignore')
    .add_suffix('_9')
)

# Split into accuracy curves and loss curves for the comparison plots.
df_9_acc = df_9[['accuracy_9', 'val_accuracy_9']]
df_9_loss = df_9[['loss_9', 'val_loss_9']]

In [None]:
# Learning curves of models 4, 6, 7, 8 and 9 side by side:
# first figure = train/validation accuracy, second figure = train/validation loss.
pd.concat(
    [df_4_acc, df_6_acc, df_7_acc, df_8_acc, df_9_acc],
    axis='columns',
).plot()
pd.concat(
    [df_4_loss, df_6_loss, df_7_loss, df_8_loss, df_9_loss],
    axis='columns',
).plot()

In [None]:
# model_9.save('model_9.keras')

# with open('history_9', 'wb') as f:
#     pickle.dump(history_9.history, f)

Model 9 performs worse than the other models. Let's stop here.

## Conclusion :

In [None]:
# Summary table of the test-set evaluations: one row per model (model_0 ... model_9),
# with the two values of each evaluation result labelled as loss and accuracy.
res = pd.DataFrame(
    [res_0, res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8, res_9],
    columns=['loss', 'accuracy'],
    index=[f'model_{k}' for k in range(10)],
)

In [None]:
# Bar chart of test loss and accuracy per model, best accuracy first.
res.sort_values('accuracy', ascending=False).plot.bar()

We will go with model 4: it is one of the best models in terms of accuracy and loss combined and, compared with models 7 and 8, it has the fewest parameters (about 800k for model 4 versus about 3M each for models 7 and 8). It reaches an accuracy of 86% on the test set.

### Confusion matrix :

In [None]:
# Reload the selected model from disk. This expects 'model_4.keras' to already
# exist in the working directory (presumably written by an earlier save step — verify).
model_4 = tf.keras.models.load_model('model_4.keras')

In [None]:
# Adapted from:
# https://stackoverflow.com/questions/64622210/how-to-extract-classes-from-prefetched-dataset-in-tensorflow-for-confusion-matri

y_true = []  # true labels, one tensor per batch
y_pred = []  # predicted class indices, one array per batch

# Go through the test batches exactly once. Labels and predictions are taken
# from the same batch in the same iteration, so both lists stay aligned even
# if the dataset yields its batches in a different order on another pass.
# (The referenced answer suggests dataset.unbatch() when the dataset repeats.)
for image_batch, label_batch in batch_test:
    y_true.append(label_batch)
    # One forward pass on an already-formed batch. predict_on_batch avoids the
    # per-call overhead of running Model.predict() inside a Python loop.
    preds = model_4.predict_on_batch(image_batch)
    # Most probable class for each image of the batch.
    y_pred.append(np.argmax(preds, axis=1))

# Stitch the per-batch pieces into one flat tensor each.
correct_labels = tf.concat(y_true, axis=0)
predicted_labels = tf.concat(y_pred, axis=0)

In [None]:
# Plotting code adapted from:
# https://github.com/mrdbourke/tensorflow-deep-learning/blob/main/06_transfer_learning_in_tensorflow_part_3_scaling_up.ipynb

n_classes = len(class_names)

# Rows = true class, columns = predicted class. The label set is pinned to
# 0 .. n_classes-1 so the matrix is always n_classes x n_classes and lines up
# with the tick labels below, even if a class never shows up in the data.
cm = confusion_matrix(correct_labels, predicted_labels, labels=np.arange(n_classes))

# Plot the figure and make it pretty
fig, ax = plt.subplots(figsize=(8, 8))
cax = ax.matshow(cm, cmap=plt.cm.Blues)  # darker cell == higher count
fig.colorbar(cax)

# Label the axes
ax.set(title="Confusion Matrix",
       xlabel="Predicted label",
       ylabel="True label",
       xticks=np.arange(n_classes),  # one tick slot per class
       yticks=np.arange(n_classes),
       xticklabels=class_names,
       yticklabels=class_names)

# matshow puts the x-axis on top by default; move its label and ticks to the bottom
ax.xaxis.set_label_position("bottom")
ax.xaxis.tick_bottom()

# Rotate the x tick labels and enlarge the font so class names stay readable
plt.xticks(rotation=70, fontsize=12)
plt.yticks(fontsize=12)

# Cells above the midpoint of the count range are dark, so their text is drawn in white
threshold = (cm.max() + cm.min()) / 2.

# Write the count in each cell (x = column = predicted, y = row = true)
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    ax.text(j, i, f"{cm[i, j]}",
            ha="center",
            va='center',
            color="white" if cm[i, j] > threshold else "black",
            size=12)