In [1]:
# https://www.kaggle.com/c/digit-recognizer/submit

from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from keras.models import Sequential
%matplotlib inline

# create the training & test sets, skipping the header row with [1:]
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
xtrain = train.drop('label',axis=1).values.reshape((42000,28,28,1))/ 255.0
ytrain = train['label'].values
# Maybe need to use trainLabels = np_utils.to_categorical(trainLabels, 10)
xtest = test.values.reshape((28000,28,28,1))/ 255.0

Using TensorFlow backend.


In [2]:
# http://www.pyimagesearch.com/2016/08/01/lenet-convolutional-neural-network-in-python/
# http://deeplearning.net/tutorial/lenet.html
# https://arxiv.org/pdf/1606.02228v2.pdf

# conv layer argument: stride, size, padding
# stride is how much the filter moves. 
# With no padding, a stride of 1 and a size nxn, the size of the output is reduce by n-1.
# With padding and a stride of 1, the size is not reduced.
# Stide bigger than 1 reduce the size of the output.
# O = (I - K + P)/S + 1
# O: output size, I = input size, K = filter size, P is the padding, S is the stride.
# The 2 padding choice in keras are valid (no padding) and same (output as the same size as input)
# It seems strange that same would be O = I for stride different than 1 (as it would add purely 0 result). Is it an error of the 
# documentation?
# Someone say than "same" means that for filter size k, the padding is round down k/2 on the RHS and round up k/2 on the LHS.
# By default stride is 1.
# It doesn't seem useful to have a different stride for the convolution since the reduction of the size can be done my pooling
# Pooling should be done with a stride of same size as the kernel size.

from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation
from keras.layers.core import Flatten
from keras.layers.core import Dense

model = Sequential()
# 2D convolution filters, where each filter is of size 5 x 5
# 28 x 28 inputs with a single channel for depth 
# padding="same" means that padding is done such that the output as the same size as the input.
model.add(Conv2D(filters=20, kernel_size=5, padding="same", input_shape=(28, 28, 1)))
model.add(Activation("relu"))
# 2 x 2 max-pooling moving by step of 2 in both directions. Reduce shape to 14 x 14
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
# 50 convolution filters, where each filter is of size 5 x 5
model.add(Conv2D(filters=50, kernel_size=5, padding="same"))
model.add(Activation("relu"))
# Reduce shape to 7 x 7 with 50 channels
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
# Fully connected layers
model.add(Flatten())
model.add(Dense(500))
model.add(Activation("relu"))
# softmax classifier  For 10 results
model.add(Dense(10))
model.add(Activation("softmax"))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [3]:
from keras.utils import to_categorical
label = to_categorical(ytrain, num_classes=10)
print(xtrain.shape)
print(label.shape)

(42000, 28, 28, 1)
(42000, 10)


In [None]:
# Need to execute this if I don't execute the following cells.

from sklearn.metrics import accuracy_score
from numpy import argmax
# Split into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(xtrain, label, test_size=0.2, random_state=42)

from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

In [4]:
# Use only the first 1000
xtrain0 = xtrain[:1000]
print(xtrain0.shape)
label0 = label[:1000]
print(label0.shape)

(1000, 28, 28, 1)
(1000, 10)


In [5]:
model.fit(xtrain0, label0, epochs=1, batch_size=32,verbose=2)

Epoch 1/1
3s - loss: 1.0754 - acc: 0.6560


<keras.callbacks.History at 0x201154ea048>

In [6]:
from sklearn.metrics import accuracy_score
from numpy import argmax

x1 = xtrain[1000:2000]
y1 = ytrain[1000:2000]

ans = model.predict(x1)
print(ans.shape)
ans = argmax(ans, axis=-1)
print(ans.shape)

accuracy_score(y1, ans)

(1000, 10)
(1000,)


0.82899999999999996

In [8]:
# Split into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(xtrain, label, test_size=0.2, random_state=42)

model.fit(X_train, y_train, epochs=1, batch_size=32,verbose=2)
ans = argmax(model.predict(X_test), axis=-1)
truth = argmax(y_test, axis=-1)
accuracy_score(truth, ans)

Epoch 1/1
52s - loss: 0.0446 - acc: 0.9863


0.98642857142857143

In [11]:
# Save result for Kaggle test, to get the Kaggle test score.

ans = argmax(model.predict(xtest), axis=-1)
ans_s = pd.Series(data=ans,index=range(1,28001)).rename('Label')
ans_s.to_csv(path='Result0',header=True,index_label='ImageId')

# I scored 0.98500, that is 420 mistakes

In [12]:
model.fit(X_train, y_train, epochs=5, batch_size=32,verbose=2)
ans = argmax(model.predict(X_test), axis=-1)
truth = argmax(y_test, axis=-1)
print(accuracy_score(truth, ans))


# Save result for Kaggle test, to get the Kaggle test score.

ans = argmax(model.predict(xtest), axis=-1)
ans_s = pd.Series(data=ans,index=range(1,28001)).rename('Label')
ans_s.to_csv(path='Result1',header=True,index_label='ImageId')

# I scored 0.98514, that is 416 mistakes

Epoch 1/5
54s - loss: 0.0332 - acc: 0.9900
Epoch 2/5
52s - loss: 0.0247 - acc: 0.9928
Epoch 3/5
52s - loss: 0.0193 - acc: 0.9938
Epoch 4/5
51s - loss: 0.0181 - acc: 0.9954
Epoch 5/5
51s - loss: 0.0146 - acc: 0.9960
0.986547619048


In [14]:
from keras.callbacks import EarlyStopping
# val_acc or val_loss
stop = [EarlyStopping(monitor='val_acc', min_delta=0, patience=0)]

model.fit(X_train, y_train, epochs=5, batch_size=32,verbose=2,callbacks=stop,shuffle=True, 
          validation_data=(X_test, y_test))

# My val_acc is 0.9912, my test_acc should be of this order, but it is likely to be slightly lower because I picked this
# val_acc because it's the biggest one which is a bias.
# 0.9912 is significantly better than 0.98514 my previous result even if we consider that the real result might be slightly lower.

Train on 33600 samples, validate on 8400 samples
Epoch 1/5
54s - loss: 0.0132 - acc: 0.9965 - val_loss: 0.0715 - val_acc: 0.9871
Epoch 2/5
54s - loss: 0.0133 - acc: 0.9964 - val_loss: 0.0465 - val_acc: 0.9912
Epoch 3/5
54s - loss: 0.0109 - acc: 0.9973 - val_loss: 0.0535 - val_acc: 0.9907


<keras.callbacks.History at 0x1df00847d68>

In [7]:
from keras.callbacks import ModelCheckpoint

callbacks = [EarlyStopping(monitor='val_acc', min_delta=0, patience=0),
             ModelCheckpoint(filepath='SavedModel{epoch:02d}.hdf5', monitor='val_acc', save_weights_only=True, period=1)]

model.fit(X_train, y_train, epochs=100, batch_size=32,verbose=2,callbacks=callbacks,shuffle=True, 
          validation_data=(X_test, y_test))

Train on 33600 samples, validate on 8400 samples
Epoch 1/100
56s - loss: 0.0168 - acc: 0.9954 - val_loss: 0.0517 - val_acc: 0.9892
Epoch 2/100
57s - loss: 0.0141 - acc: 0.9965 - val_loss: 0.0551 - val_acc: 0.9889


<keras.callbacks.History at 0x1ee0e7c9400>

In [8]:
callbacks = [EarlyStopping(monitor='val_acc', min_delta=0, patience=3),
             ModelCheckpoint(filepath='SavedModel{epoch:02d}.hdf5', monitor='val_acc', save_weights_only=True, period=1)]

model.fit(X_train, y_train, epochs=100, batch_size=32,verbose=2,callbacks=callbacks,shuffle=True, 
          validation_data=(X_test, y_test))

# I interupted it. My 'patience=3' was too large.

# I didn't get a better result than before even though it's the same model.

Train on 33600 samples, validate on 8400 samples
Epoch 1/100
55s - loss: 0.0109 - acc: 0.9970 - val_loss: 0.0743 - val_acc: 0.9881
Epoch 2/100
54s - loss: 0.0094 - acc: 0.9973 - val_loss: 0.0590 - val_acc: 0.9899
Epoch 3/100
55s - loss: 0.0075 - acc: 0.9983 - val_loss: 0.0685 - val_acc: 0.9894
Epoch 4/100
54s - loss: 0.0063 - acc: 0.9985 - val_loss: 0.0781 - val_acc: 0.9900
Epoch 5/100
54s - loss: 0.0055 - acc: 0.9987 - val_loss: 0.0989 - val_acc: 0.9892
Epoch 6/100
53s - loss: 0.0052 - acc: 0.9986 - val_loss: 0.0888 - val_acc: 0.9900
Epoch 7/100
55s - loss: 0.0045 - acc: 0.9989 - val_loss: 0.0725 - val_acc: 0.9901
Epoch 8/100
55s - loss: 0.0050 - acc: 0.9991 - val_loss: 0.0924 - val_acc: 0.9895
Epoch 9/100
56s - loss: 0.0050 - acc: 0.9991 - val_loss: 0.0874 - val_acc: 0.9890
Epoch 10/100
56s - loss: 0.0037 - acc: 0.9992 - val_loss: 0.0982 - val_acc: 0.9901
Epoch 11/100


KeyboardInterrupt: 

In [11]:
# Epoch starts at 0 even though they write 1/100.

for epoch in range(6):
    filename = 'SavedModel0'+str(epoch)+'.hdf5'
    print(filename)
    model.load_weights(filename)
    ans = argmax(model.predict(xtest), axis=-1)
    ans_s = pd.Series(data=ans,index=range(1,28001)).rename('Label')
    filename = 'Result'+str(epoch)
    ans_s.to_csv(path=filename,header=True,index_label='ImageId')
    
    
# Result0: 0.98814  333 mistakes  It's better than my previous result, but it's just luck since it's still only one epoch.
# I got a very smilar result as my CV result 0.9881
# Result3: 0.98686  369 mistakes   It's less good than the CV result 0.9900 which is normal since I picked it because of its
# large CV

# Both are less than my best val_acc is 0.9912, but I didn't check the real result for this one.

SavedModel00.hdf5
SavedModel01.hdf5
SavedModel02.hdf5
SavedModel03.hdf5
SavedModel04.hdf5
SavedModel05.hdf5


In [13]:
    epoch = 6
    filename = 'SavedModel0'+str(epoch)+'.hdf5'
    model.load_weights(filename)
    ans = argmax(model.predict(xtest), axis=-1)
    ans_s = pd.Series(data=ans,index=range(1,28001)).rename('Label')
    filename = 'Result'+str(epoch)
    ans_s.to_csv(path=filename,header=True,index_label='ImageId')
# Result6: 0.98914   305 mistakes   My CV result was 0.9901

In [None]:
# It is clear that my model overfit quickly since the training accuracy is nearly 1 and the validation accuracy doesn't increase
# much if at all after the first epoch which takes less than 1 minute.
# It is thus a good idea to have a regularizator such as dropout or weight decay.
# Dropout is bad for small network. 
# It is important that the probabily that the hidden layer be smaller than the output be very small.

# A good way to get good results is to try to vary:
# Learning Rate
# Momentum
# Dropout P robability
# Weight Decay Rate

# I need to see if I want to change relu to leaky relu, parametric relu or random relu, but it's not simple and not everybody
# agree on which one is better.

In [17]:
# One can write model architecture as:
# model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(100, 100, 3)))
# model.add(Conv2D(32, (3, 3), activation='relu'))

# One can add dropout after MaxPool
# model.add(Dropout(0.5))

from keras.layers.core import Dropout

model = Sequential()
# 2D convolution filters, where each filter is of size 5 x 5
# 28 x 28 inputs with a single channel for depth 
# padding="same" means that padding is done such that the output as the same size as the input.
model.add(Conv2D(filters=40, kernel_size=5, padding="same", input_shape=(28, 28, 1), activation='relu'))
# 2 x 2 max-pooling moving by step of 2 in both directions. Reduce shape to 14 x 14
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
# Dropout
model.add(Dropout(0.5))
# 50 convolution filters, where each filter is of size 5 x 5
model.add(Conv2D(filters=100, kernel_size=5, padding="same", activation='relu'))
# Reduce shape to 7 x 7 with 50 channels
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
# Dropout
model.add(Dropout(0.5))
# Fully connected layers
model.add(Flatten())
model.add(Dense(1000, activation='relu'))
# Dropout
model.add(Dropout(0.5))
# softmax classifier  For 10 results
model.add(Dense(10, activation='softmax'))
          
# Maybe change optimizer
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [19]:
callbacks = [EarlyStopping(monitor='val_acc', min_delta=0.002, patience=2),
             ModelCheckpoint(filepath='SavedModelDropout{epoch:02d}.hdf5', monitor='val_acc', save_weights_only=True, period=1)]

model.fit(X_train, y_train, epochs=100, batch_size=32,verbose=2,callbacks=callbacks,shuffle=True, 
          validation_data=(X_test, y_test))

Train on 33600 samples, validate on 8400 samples
Epoch 1/100
176s - loss: 0.1445 - acc: 0.9562 - val_loss: 0.0572 - val_acc: 0.9802
Epoch 2/100
173s - loss: 0.0940 - acc: 0.9722 - val_loss: 0.0443 - val_acc: 0.9863
Epoch 3/100
170s - loss: 0.0878 - acc: 0.9747 - val_loss: 0.0406 - val_acc: 0.9869
Epoch 4/100
7132s - loss: 0.0885 - acc: 0.9766 - val_loss: 0.1175 - val_acc: 0.9804
Epoch 5/100
175s - loss: 0.0958 - acc: 0.9749 - val_loss: 0.1530 - val_acc: 0.9844


<keras.callbacks.History at 0x1ee0f9a4160>

In [20]:
for epoch in range(5):
    filename = 'SavedModelDropout0'+str(epoch)+'.hdf5'
    print(filename)
    model.load_weights(filename)
    ans = argmax(model.predict(xtest), axis=-1)
    ans_s = pd.Series(data=ans,index=range(1,28001)).rename('Label')
    filename = 'ResultDropout'+str(epoch)
    ans_s.to_csv(path=filename,header=True,index_label='ImageId')
    
    
# ResultDropout0: 0.98143  520 mistakes, very bad but it's normal since learning is slower with dropout. It's also close of my CV
# ResultDropout2: 0.98614  388 mistakes  which is not better than my best result, but it's not tha surprising since the CV wasn't
# better.

# The main reason why it didn't improve are probably:
# I should have kept patience=3 or even higher. In my previous result Result6 was obtained after 2 bad result.
# The NN was underfitting. In fact, training accuracy was lower than validation accuracy, which is not supposed to happen.
# It must be because dropout is not used for validating and the use of the full network is better than one with dropout.
# I should use more training time with a bigger NN, so that my network be able to fit the data.

# Also, I read that with dropout one must use a bigger learning rate than without it. I should look at how the learning rate is
# determined
# However, because of the higher learnig rate, a lot of Relu units die. It's better to replace them with ELU or Maxout.
# LRelu, PRelu and RRelu that have been created to solve the same problem since to be less good.
# Maxout seems to have the best performance, but it's slower.

# rmsprop is a creation of G. Hinton which seems to be better than momentum. It's a variation of Stochastic Gradient Descent
# in which the learning rate change, like momentum is.

# Choice that one must test on:
# non-linearity (ReLU, ELU, maxout, compatability with batch normalization)
# pooling variants (stochastic, max, average, mixed)
# network width
# classifier design (convolutional, fully-connected, SPP)
# image pre-processing
# learning parameters: learning rate, batch size, cleanliness of the data
# Should I use Batch normalization? If yes, before or after Relu.
#    Batch normalization potentially helps in two ways: faster learning and higher overall accuracy.
#    It is said that Batch normalisation washes out differences between ReLU-family variants, so we should use Relu.
#    It's seems it is better to use it than using ELU, and it should usually be after Relu.

# In this case: https://arxiv.org/pdf/1606.02228v2.pdf
# Which is however for a neuronet that has been trained a very long time on a more complex problem.
# Maxout is best, but ELU at the beggining and Maxout at the end is a good compromise to reduce computational coat.
# Mixing of Max pooling and Average pooling is the best.
# Linear decay of learning rate is the best learning rate policy. lr = L0(1 − i/M) where M is the number of learning iterations
# Final recommendations:
# use ELU non-linearity without batchnorm or ReLU with it.
# use the linear learning rate decay policy.
# use a sum of the average and max pooling layers.
# use mini-batch size around 128 or 256. If this is too big for your GPU,
#   decrease the learning rate proportionally to the batch size.
# use fully-connected layers as convolutional and average the predictions for the final decision

# It is often the case that the model stops improving and then jump to a better result. So one should have a large patience.

# For batch normalization, put it after Relu
# from keras.layers.normalization import BatchNormalization

SavedModelDropout00.hdf5
SavedModelDropout01.hdf5
SavedModelDropout02.hdf5
SavedModelDropout03.hdf5
SavedModelDropout04.hdf5


In [23]:
# elu
# patience = 3
# Wider network

# I should maybe make a grid that select the best parameter to get the best val-acc
# I will use save_best_only=True for ModelCheckPoint to get only the best model.

# I can change learning rate of rmsprop. It is recommended not to change the other parameters.
# keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

from keras.optimizers import RMSprop

for Nfilters1 in [40, 80]:
    for Nfilters2 in [100, 200]:
        for kernelsize1 in [3, 5]:
            for kernelsize2 in [3, 5]:
                for learningRate in [0.001, 0.01, 0.1]:
                    for dropout in [0.2, 0.5]:
                        model = Sequential()
                        model.add(Conv2D(filters=Nfilters1, kernel_size=kernelsize1, padding="same", input_shape=(28, 28, 1), activation='elu'))
                        model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
                        model.add(Dropout(0.5))
                        model.add(Conv2D(filters=Nfilters2, kernel_size=kernelsize2, padding="same", activation='elu'))
                        model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
                        model.add(Dropout(0.5))
                        model.add(Flatten())
                        model.add(Dense(2000, activation='elu'))
                        model.add(Dropout(0.5))
                        model.add(Dense(10, activation='softmax'))

                        model.compile(optimizer=RMSprop(lr=learningRate),
                                      loss='categorical_crossentropy',
                                      metrics=['accuracy'])
                        
                        filename = ("SavedModel_" + str(Nfilters1) + "_" + str(Nfilters2) + "_" + str(kernelsize1) + "_" + 
                                    str(kernelsize2) + "_" + str(learningRate) + "_" + str(dropout) + ".hdf5")

                        callbacks = [EarlyStopping(monitor='val_acc', min_delta=0., patience=3),
                                     ModelCheckpoint(filepath=filename, monitor='val_acc', save_best_only=True, save_weights_only=True, 
                                                     period=1)]
                        
                        print("Nfilters1: "+ str(Nfilters1) +", Nfilters2: "+ str(Nfilters2) +", kernelsize1: "+ 
                              str(kernelsize1) +", "+ "kernelsize2: "+ str(kernelsize2) +", learningRate: "+ str(learningRate) +
                              ", dropout: "+ str(dropout))

                        model.fit(X_train, y_train, epochs=100, batch_size=32,verbose=2,callbacks=callbacks,shuffle=True, 
                                  validation_data=(X_test, y_test))
                        
                        print("_______________________________________________________________________________")
                        print("_______________________________________________________________________________")
                        print("_______________________________________________________________________________")
                        
                        

Nfilters1: 40, Nfilters2: 100, kernelsize1: 3, kernelsize2: 3, learningRate: 0.001, dropout: 0.2
Train on 33600 samples, validate on 8400 samples
Epoch 1/100
219s - loss: 0.2752 - acc: 0.9201 - val_loss: 0.1056 - val_acc: 0.9744
Epoch 2/100
215s - loss: 0.1778 - acc: 0.9526 - val_loss: 0.1054 - val_acc: 0.9735
Epoch 3/100
214s - loss: 0.1665 - acc: 0.9575 - val_loss: 0.0852 - val_acc: 0.9787
Epoch 4/100
209s - loss: 0.1596 - acc: 0.9607 - val_loss: 0.1011 - val_acc: 0.9751
Epoch 5/100
212s - loss: 0.1603 - acc: 0.9638 - val_loss: 0.0885 - val_acc: 0.9799
Epoch 6/100
209s - loss: 0.1590 - acc: 0.9658 - val_loss: 0.1057 - val_acc: 0.9765
Epoch 7/100
212s - loss: 0.1503 - acc: 0.9677 - val_loss: 0.0880 - val_acc: 0.9821
Epoch 8/100
209s - loss: 0.1589 - acc: 0.9685 - val_loss: 0.1375 - val_acc: 0.9719
Epoch 9/100
218s - loss: 0.1644 - acc: 0.9704 - val_loss: 0.0754 - val_acc: 0.9850
Epoch 10/100
205s - loss: 0.1438 - acc: 0.9748 - val_loss: 0.1192 - val_acc: 0.9819
Epoch 11/100
3737s - lo

KeyboardInterrupt: 

In [24]:
# I forgot to vary the number of units in the dense layer. I should do that next.
# I forgot to vary batch_size, and I read it's recommended to use 128 or 256.
# I made a mistake, dropout is always 0.5

# Big time are due to my computer going to sleep.

# Conclusion of the variation:
# Learning rate of 0.01 and 0.1 are too big. They are a waste of time.

# Nfilters1: 40, Nfilters2: 100, kernelsize1: 3, kernelsize2: 3, learningRate: 0.001, dropout: 0.5
# 0.9900   225s by epoch, 22 epochs
# The same thing got me:    0.9854   210s by epoch, 13 epochs     
# but it might be because patience is too low since it's still underfitting.
# It beats kernelsize2 = 5   0.9875   250s by epoch, 14 epochs
# It beats again kernelsize2 = 5   0.9879   250s by epoch, 10 epochs


# What I read about gradient descent:
# Momentum is good to make the parameters evolve well when it's a ravine. Nesterov accelerated gradient is an modified momentum.
# Momentum: Compute the gradient, then add a fraction of previous step.
# Nesterov accelerated: Add fraction of previoud step before computing the gradient.
# However, it seems that people are prefer momentum over Nesterov now.

# Adadelta is similar to momentum, but the learning rate is replaced by the root mean squared error of parameter updates
# so we don't need a learning rate.
# RMSprop is similar but need a learning rate that the author say should be 0.001 by default.
# Both Adadelta and RMSprop have been developped at the same time independantly.

# Adam is a combination of Momentum and RMSprop, while NAdam is a combination of Nesterov and RMSprop.

# I read that: "Adam might be the best overall choice."

# Advice about gradient descent:
# Shuffle data after each epoch. (Unless we want to have them by increasing difficulty which is not my case.)
# One should thus always monitor error on a validation set during training and stop (with some patience) if the validation error does not improve enough.
# Batch normalization is good. It allows higher learning rate and reduce the need for dropout.

# Adam is considered as the state of art optimizer.
# Other say that SGD with momentum is better.
# It seems that for sparse data Adam is better, for non-sparse data SDG with momentum can be better, but it needs to tune the
# learning rate.



# I read:
# The patience is often set somewhere between 10 and 100 (10 or 20 is more common).

In [27]:
# I just want to compare val-acc, I don't need to save model.

from keras.layers.normalization import BatchNormalization

for Nfilters1 in [40, 80]:
    for Nfilters2 in [100, 200]:
        for kernelsize1 in [3, 5]:
            for kernelsize2 in [3, 5]:
                for Nunits in [1000, 2000]:
                    for dropout in [0., 0.2, 0.5]:
                      for batchsize in [128, 256]:
                        model = Sequential()
                        model.add(Conv2D(filters=Nfilters1, kernel_size=kernelsize1, padding="same", input_shape=(28, 28, 1), 
                                         activation='relu'))
                        model.add(BatchNormalization())
                        model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
                        model.add(Dropout(dropout))
                        model.add(Conv2D(filters=Nfilters2, kernel_size=kernelsize2, padding="same", activation='relu'))
                        model.add(BatchNormalization())
                        model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
                        model.add(Dropout(dropout))
                        model.add(Flatten())
                        model.add(Dense(Nunits, activation='relu'))
                        model.add(BatchNormalization())
                        model.add(Dropout(dropout))
                        model.add(Dense(10, activation='softmax'))

                        model.compile(optimizer='adam',
                                      loss='categorical_crossentropy',
                                      metrics=['accuracy'])

                        callbacks = [EarlyStopping(monitor='val_acc', min_delta=0., patience=10)]
                        
                        print("Nfilters1: "+ str(Nfilters1) +", Nfilters2: "+ str(Nfilters2) +", kernelsize1: "+ 
                              str(kernelsize1) +", "+ "kernelsize2: "+ str(kernelsize2) +", Nunits: "+ str(Nunits) +
                              ", dropout: "+ str(dropout) + ", batchsize: "+ str(batchsize))

                        model.fit(X_train, y_train, epochs=1000, batch_size=32,verbose=2,callbacks=callbacks,shuffle=True, 
                                  validation_data=(X_test, y_test))
                        
                        print("_______________________________________________________________________________")
                        print("_______________________________________________________________________________")
                        print("_______________________________________________________________________________")
                        
                        

Nfilters1: 40, Nfilters2: 100, kernelsize1: 3, kernelsize2: 3, Nunits: 1000, dropout: 0.0, batchsize: 128
Train on 33600 samples, validate on 8400 samples
Epoch 1/1000
224s - loss: 0.1380 - acc: 0.9598 - val_loss: 0.0680 - val_acc: 0.9788
Epoch 2/1000
220s - loss: 0.0577 - acc: 0.9816 - val_loss: 0.0715 - val_acc: 0.9767
Epoch 3/1000
213s - loss: 0.0448 - acc: 0.9857 - val_loss: 0.0647 - val_acc: 0.9804
Epoch 4/1000
231s - loss: 0.0305 - acc: 0.9900 - val_loss: 0.0673 - val_acc: 0.9796
Epoch 5/1000
226s - loss: 0.0300 - acc: 0.9906 - val_loss: 0.0600 - val_acc: 0.9824
Epoch 6/1000
225s - loss: 0.0266 - acc: 0.9915 - val_loss: 0.0626 - val_acc: 0.9840
Epoch 7/1000
224s - loss: 0.0194 - acc: 0.9936 - val_loss: 0.0317 - val_acc: 0.9917
Epoch 8/1000
211s - loss: 0.0182 - acc: 0.9944 - val_loss: 0.0423 - val_acc: 0.9885
Epoch 9/1000
208s - loss: 0.0169 - acc: 0.9947 - val_loss: 0.0349 - val_acc: 0.9906
Epoch 10/1000
208s - loss: 0.0154 - acc: 0.9948 - val_loss: 0.0435 - val_acc: 0.9885
Epoc

KeyboardInterrupt: 

In [None]:
# I made a mistake, batchsize doesn't vary. It allows me to compare two runs with the same parameters.

# Nfilters1: 40, Nfilters2: 100, kernelsize1: 3, kernelsize2: 3, Nunits: 1000, dropout: 0.0, batchsize: 32
# Best val_acc 0.9917 at 7
# identical run: Best val_acc 0.9926 at 25

# With dropout = 0.2
# Best val_acc 0.9921 at 17

# So the result is similar with dropout.

In [1]:
# Each run takes more than a hour. I cannot do that 128 times. So I need to do a less complete search.

# Let's read
# https://arxiv.org/pdf/1512.00567.pdf
# http://papers.nips.cc/paper/4522-practical-bayesian-optimization-of-machine-learning-algorithms.pdf

In [2]:
# Test to compare gpu, cpu. I have gpu in the notebook tensorflowgpu.ipynb

from IPython.display import display
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from keras.models import Sequential
%matplotlib inline

# create the training & test sets, skipping the header row with [1:]
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
xtrain = train.drop('label',axis=1).values.reshape((42000,28,28,1))/ 255.0
ytrain = train['label'].values
# Maybe need to use trainLabels = np_utils.to_categorical(trainLabels, 10)
xtest = test.values.reshape((28000,28,28,1))/ 255.0

from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation
from keras.layers.core import Flatten
from keras.layers.core import Dense
from keras.layers.core import Dropout
from keras.layers.normalization import BatchNormalization

from keras.utils import to_categorical
label = to_categorical(ytrain, num_classes=10)
print(xtrain.shape)
print(label.shape)

from sklearn.metrics import accuracy_score
from numpy import argmax
# Split into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(xtrain, label, test_size=0.2, random_state=42)

from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

model = Sequential()
model.add(Conv2D(filters=20, kernel_size=3, padding="same", input_shape=(28, 28, 1), 
                 activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Dropout(0.2))
model.add(Conv2D(filters=50, kernel_size=3, padding="same", activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(500, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(10, activation='softmax'))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

callbacks = [EarlyStopping(monitor='val_acc', min_delta=0., patience=10)]

for batchsize in [32, 64, 128, 256]:
    model.fit(X_train, y_train, epochs=1, batch_size=32,verbose=2,callbacks=callbacks,shuffle=True, 
              validation_data=(X_test, y_test))

Using TensorFlow backend.


(42000, 28, 28, 1)
(42000, 10)
Train on 33600 samples, validate on 8400 samples
Epoch 1/1
79s - loss: 0.1773 - acc: 0.9464 - val_loss: 0.0846 - val_acc: 0.9736
Train on 33600 samples, validate on 8400 samples
Epoch 1/1
73s - loss: 0.0847 - acc: 0.9734 - val_loss: 0.0667 - val_acc: 0.9799
Train on 33600 samples, validate on 8400 samples
Epoch 1/1
73s - loss: 0.0650 - acc: 0.9786 - val_loss: 0.0580 - val_acc: 0.9813
Train on 33600 samples, validate on 8400 samples
Epoch 1/1
72s - loss: 0.0515 - acc: 0.9838 - val_loss: 0.0820 - val_acc: 0.9788
