In [50]:
import numpy as np
import pandas as pd
from keras.models import Sequential

from keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

from keras.utils import to_categorical
from keras.optimizers import RMSprop
from keras.callbacks import ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator

import matplotlib.pyplot as plt
%matplotlib inline

### Load training data

Load `train.csv` from Kaggle into a pandas DataFrame.

In [2]:
# Read the Kaggle training CSV; the trailing comment records the expected (rows, columns) shape.
df = pd.read_csv('train.csv')   # (42000, 785)

### Set up X and y

NOTE: Keras expects `numpy` arrays rather than `pandas` objects, so pull the underlying arrays out with `.values`.

In [3]:
# Target: the digit stored in the `label` column, as a NumPy array.
y = df['label'].values
# Features: every column after the first one, as a NumPy array.
X = df.iloc[:, 1:].values

### Preprocessing

1. Since this is image data, I normalized `X` by dividing each value by the maximum pixel value (255), which scales every pixel into the range 0–1.
2. Since this is a multiclass classification problem, Keras needs `y` to be a one-hot encoded matrix.

In [None]:
# Derive both arrays from `df` rather than from the current X / y, so that
# re-running this cell cannot divide by 255 a second time or one-hot encode
# an already encoded y.
X = df.iloc[:, 1:].values / 255.          # scale by the maximum pixel value
y = to_categorical(df['label'].values)    # one-hot encode the digit labels

### Train/Test Split

We want to create a validation set that the model will never see to approximate how it's going to do with Kaggle's `test.csv`. Use `sklearn`'s `train_test_split` to do this.

In [5]:
# Hold out 20% of the rows for validation. A fixed `random_state` makes the
# split reproducible, so validation scores are comparable from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Sanity check on the training split: (number of rows, number of pixel features).
X_train.shape

(33600, 784)

### Create neural network

Create a neural network using the `Dense` and `Dropout` layers from `keras`. The activation function for the final output layer needs to be `softmax` to accommodate the ten different classes.

In [8]:
# Baseline fully connected network: one ReLU hidden layer with as many units
# as X_train has columns, dropout, then a softmax with one unit per class.
# NOTE: `model` is reassigned by the CNN cell that follows, so on a
# top-to-bottom run this baseline is replaced before it is compiled or fitted.
model = Sequential([
    Dense(X_train.shape[1], input_shape=(784,), activation='relu'),
    Dropout(.4),
    Dense(y_train.shape[1], activation='softmax'),
])

In [16]:
# Convolutional network: two conv/conv/pool/dropout stages followed by a
# dense classifier head. Layers are listed in the order they are applied.
model = Sequential([
    # Stage 1: two 5x5 convolutions with 32 filters each on 28x28x1 inputs.
    Conv2D(filters=32, kernel_size=(5,5), padding='Same',
           activation='relu', input_shape=(28,28,1)),
    Conv2D(filters=32, kernel_size=(5,5), padding='Same',
           activation='relu'),
    MaxPool2D(pool_size=(2,2)),
    Dropout(.25),

    # Stage 2: two 3x3 convolutions with 64 filters each.
    Conv2D(filters=64, kernel_size=(3,3), padding='Same',
           activation='relu'),
    Conv2D(filters=64, kernel_size=(3,3), padding='Same',
           activation='relu'),
    MaxPool2D(pool_size=(2,2), strides=(2,2)),
    Dropout(.25),

    # Classifier head: flatten, one hidden dense layer, 10-way softmax.
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(.5),
    Dense(10, activation='softmax'),
])

### Compile model

Since this is a multiclass classification problem, the loss function is `categorical_crossentropy`. Instead of passing the `'adam'` string, I configured an `RMSprop` optimizer explicitly so that its hyper-parameters are visible.

In [21]:
# RMSprop built explicitly so that every hyper-parameter is visible here.
optimizer = RMSprop(
    lr=0.001,
    rho=0.9,
    epsilon=0.00000008,
    decay=0.0,
)

# Categorical cross-entropy pairs with the one-hot targets and softmax output;
# accuracy is tracked as the reporting metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

### Fit the model

Use X_test, y_test from the `train_test_split` step for the `validation_data` parameter.

In [25]:
# Training schedule shared by the fit cells below.
batch_size = 100
epochs = 50

# Learning-rate annealer: when validation accuracy has stopped improving for
# `patience` epochs, multiply the learning rate by `factor`, never going
# below `min_lr`.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=5,
    min_lr=0.00001,
)

In [33]:
# Reshape both splits to four dimensions, (n_samples, 28, 28, 1), matching the
# `input_shape=(28,28,1)` declared on the first Conv2D layer.
X_train, X_test = (
    images.reshape(-1, 28, 28, 1) for images in (X_train, X_test)
)

In [28]:
# Data augmentation to reduce overfitting: each batch is randomly perturbed
# with small rotations, zooms and shifts. Flips stay off, and every
# centering / normalization / whitening option is disabled.
datagen = ImageDataGenerator(
    # Active augmentations
    rotation_range = 10,
    zoom_range = 0.1,
    width_shift_range = 0.1,
    height_shift_range = 0.1,
    # Flips are turned off
    horizontal_flip = False,
    vertical_flip = False,
    # No dataset-level or sample-level normalization
    featurewise_center = False,
    samplewise_center = False,
    featurewise_std_normalization = False,
    samplewise_std_normalization = False,
    zca_whitening = False,
)

# NOTE(review): per the Keras docs, fit() only computes statistics for the
# featurewise_* and ZCA options, all disabled above, so this call is
# presumably not required for the augmentations used here. Confirm before removing.
datagen.fit(X_train)

In [34]:
# Train on augmented batches and validate on the untouched hold-out split.
# One epoch covers the training set once: len(X_train) // batch_size steps.
# NOTE(review): the saved output below reports "Epoch n/5", while `epochs` is
# set to 50 in the annealer cell, so the output comes from an earlier run.
model.fit_generator(
    datagen.flow(X_train, y_train, batch_size=batch_size),
    steps_per_epoch=X_train.shape[0] // batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
    callbacks=[learning_rate_reduction],
)

Epoch 1/5
 - 167s - loss: 0.4785 - acc: 0.8463 - val_loss: 0.0661 - val_acc: 0.9796
Epoch 2/5
 - 171s - loss: 0.1455 - acc: 0.9565 - val_loss: 0.0607 - val_acc: 0.9808
Epoch 3/5
 - 167s - loss: 0.1084 - acc: 0.9684 - val_loss: 0.0559 - val_acc: 0.9820
Epoch 4/5
 - 166s - loss: 0.0904 - acc: 0.9731 - val_loss: 0.0410 - val_acc: 0.9888
Epoch 5/5
 - 166s - loss: 0.0878 - acc: 0.9745 - val_loss: 0.0361 - val_acc: 0.9899


<keras.callbacks.History at 0x152c78a20>

In [15]:
# Fit without augmentation (result recorded when using the basic model).
# NOTE(review): on a top-to-bottom run, `model` is the CNN at this point and
# X_train is already 4-D, so this cell trains the CNN for 7 more epochs
# rather than fitting the basic dense model.
model.fit(
    X_train,
    y_train,
    batch_size=100,
    epochs=7,
    validation_data=(X_test, y_test),
)

Train on 33600 samples, validate on 8400 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x125d1dc88>

### Load in Kaggle's `test.csv`

Do the **same** preprocessing you did for the training `X`.

In [39]:
# Load Kaggle's test set and apply the same 1/255 scaling used for the training X.
test = pd.read_csv('test.csv') / 255.

In [41]:
# Convert to a NumPy array and reshape to (n_samples, 28, 28, 1) for the CNN.
# `np.asarray` accepts both the DataFrame and an already converted array, so
# re-running this cell no longer fails on `ndarray.values`.
test = np.asarray(test).reshape(-1, 28, 28, 1)

### Create your predictions

Use `predict_classes` to get the actual numerical values (0-9).

In [42]:
# Take the most probable class (0-9) for each test image. `predict` + `argmax`
# yields the same labels as `model.predict_classes(test)` for a softmax output,
# and also works on Keras releases where `predict_classes` has been removed.
pred = np.argmax(model.predict(test), axis=-1)



### Prepare your submission

1. Add predictions to a column called `Label`
2. I need to manually create the `ImageId` column, which is just a list of 1..[NUMBER OF TEST SAMPLES]

In [52]:
# Wrap the predicted classes in a Series named after the required `Label` column.
preds = pd.Series(pred, name='Label')

In [69]:
# Promote the named Series to a one-column DataFrame (column `Label`).
submission = preds.to_frame()

In [70]:
# ImageId is a 1-based row counter. Size it from the frame itself instead of
# hard-coding 28001, so it always matches the number of predictions.
submission['ImageId'] = range(1, len(submission) + 1)

In [72]:
# Put the columns in submission order: ImageId first, then Label.
submission = submission.loc[:, ['ImageId', 'Label']]

In [73]:
# Preview the first rows to confirm the column order and the 1-based ImageId.
submission.head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,0
4,5,3


### Create submission csv

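Need to set `index=False`! Otherwise pandas writes the DataFrame index as an extra, unnamed first column, and the file no longer has just the `ImageId` and `Label` columns.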

In [74]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
submission.to_csv('submission.csv', index=False)