# MNIST dataset

http://yann.lecun.com/exdb/mnist/

<img width="400" src=
"https://3qeqpr26caki16dnhd19sv6by6v-wpengine.netdna-ssl.com/wp-content/uploads/2016/05/Examples-from-the-MNIST-dataset.png">

## Reading the raw bytes

In [1]:
# Load the gzip-compressed image file (t10k) from the data folder as raw bytes.
# gzip.open decompresses transparently while reading; see
# https://docs.python.org/3/library/gzip.html

import gzip

test_images_path = 'data/t10k-images-idx3-ubyte.gz'
with gzip.open(test_images_path, 'rb') as gz_file:
    # Pull the whole decompressed file into memory in one read.
    file_content = gz_file.read()

In [2]:
# Peek at the first four bytes of the file (the start of its header).
file_content[:4]

b'\x00\x00\x08\x03'

In [3]:
# Show the Python type of the loaded data: reading a file opened in 'rb'
# (binary) mode yields a bytes object rather than str or int.
type(file_content)

bytes

## Byte order (endianness)

In [4]:
# Interpret the first four bytes as a single big-endian integer (most
# significant byte first). b'\x00\x00\x08\x03' is 0x0803, i.e. 2051.
int.from_bytes(file_content[:4], 'big')

2051

In [5]:
# Bytes 4-7 (slice 4:8) hold the next header field: the number of images
# stored in the file, which is 10000 here.
int.from_bytes(file_content[4:8], 'big')

10000

In [6]:
# Bytes 8-11 (slice 8:12) give the number of rows in each image.
int.from_bytes(file_content[8:12], 'big')

28

In [7]:
# Bytes 12-15 (slice 12:16) give the number of columns in each image.
int.from_bytes(file_content[12:16], 'big')

28

In [8]:
# Read one pixel value. Byte 278 is 262 bytes past the 16-byte header, which
# in a 28-column image is row 9, column 10 (0-indexed) of the first image.
int.from_bytes(file_content[278:279], 'big')

163

## Outputting an image

In [9]:
# Pixels follow the 16-byte header; one 28x28 image is 784 bytes, so the
# first image occupies bytes 16 through 799.
l = file_content[16:16 + 28 * 28]

In [10]:
# Slicing a bytes object gives back another bytes object, not a list of ints.
type(l)

bytes

In [11]:
import numpy as np

# Decode the first image straight from the byte buffer: skip the 16-byte
# header and read 28 * 28 = 784 unsigned bytes. np.frombuffer interprets the
# bytes in place, so no intermediate Python list of ints is built.
first_image_pixels = np.frombuffer(file_content, dtype=np.uint8, count=28 * 28, offset=16)

# ~ on a uint8 array maps every pixel p to 255 - p, i.e. it inverts the
# greyscale (presumably so the digit shows dark on a light background when
# plotted). The result is a new, writable 28x28 uint8 array.
image = ~first_image_pixels.reshape(28, 28)

In [12]:
import matplotlib.pyplot as plt

# Render the 28x28 array as a greyscale picture via the explicit Axes API.
fig, ax = plt.subplots()
ax.imshow(image, cmap='gray')
plt.show()

<Figure size 640x480 with 1 Axes>

## Reading labels

In [13]:
import gzip

# Load the gzip-compressed label file that accompanies the images as raw bytes.
test_labels_path = 'data/t10k-labels-idx1-ubyte.gz'
with gzip.open(test_labels_path, 'rb') as gz_file:
    labels = gz_file.read()

In [14]:
# The label file has an 8-byte header followed by one byte per label, so
# byte 8 is the label of the first image.
int.from_bytes(labels[8:9], "big")

7

## Neural Network

In [15]:
# Import Library
import keras as kr

# Describe the network as an ordered list of layers handed to Sequential.
model = kr.models.Sequential([
    # Hidden layer: 1000 ReLU units fed by a 784-value input (28 * 28 pixels).
    kr.layers.Dense(units=1000, activation='relu', input_dim=784),
    # Output layer: 10 softmax units, one per digit class.
    kr.layers.Dense(units=10, activation='softmax'),
])

# Build the graph: categorical cross-entropy loss, plain SGD, report accuracy.
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [16]:
# Read the training images and their labels as raw bytes, then decode both
# into NumPy arrays ready for training.
with gzip.open('data/train-images-idx3-ubyte.gz', 'rb') as f:
    train_img_bytes = f.read()

with gzip.open('data/train-labels-idx1-ubyte.gz', 'rb') as f:
    train_lbl_bytes = f.read()

# np.frombuffer reads the bytes directly as uint8 instead of first expanding
# ~47 million bytes into a Python list of ints, which is far slower and uses
# much more memory.
# Images: skip the 16-byte header, shape to (60000, 28, 28), then invert the
# greyscale with ~ (p -> 255 - p on uint8), which also yields a fresh writable
# array.
train_img = ~np.frombuffer(train_img_bytes, dtype=np.uint8, offset=16).reshape(60000, 28, 28)
# Labels: skip the 8-byte header; copy so the array is writable like before
# (frombuffer on bytes returns a read-only view).
train_lbl = np.frombuffer(train_lbl_bytes, dtype=np.uint8, offset=8).copy()

In [17]:
# Flatten each 28x28 image into one 784-value row and scale the 0-255 pixel
# values into [0, 1] as float32. Large unscaled inputs commonly keep plain SGD
# from converging on a dense network like this one, leaving accuracy at
# chance level.
inputs = train_img.reshape(60000, 784).astype(np.float32) / 255.0

In [18]:
# For encoding categorical variables.
import sklearn.preprocessing as pre

# One-hot encode the digit labels (one-vs-all): every label becomes a 0/1 row
# with a single 1 in the column of its class.
encoder = pre.LabelBinarizer()
outputs = encoder.fit_transform(train_lbl)  # learn the classes and encode in one step

# Show the encoded form of the first training label.
outputs[0]

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [19]:
# Train for 15 passes over the data, updating the weights after every 10
# samples. Such a small batch means 6000 updates per epoch, which is slow:
# the author's recorded run took about 73 seconds for the first epoch and
# reported roughly 1-in-10 accuracy, i.e. chance level for ten classes.
# A larger batch size would make each epoch faster, not slower.
model.fit(x=inputs, y=outputs, batch_size=10, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x2e2d145e0f0>