# MNIST

[http://yann.lecun.com/exdb/mnist/](http://yann.lecun.com/exdb/mnist/)

## Reading bytes from files

In [1]:
# Adapted from: https://docs.python.org/3/library/gzip.html

import gzip

# Decompress the whole test-image archive into memory as a single bytes object.
with gzip.open('data/t10k-images-idx3-ubyte.gz', mode='rb') as images_gz:
    file_content = images_gz.read()

In [2]:
# Show the Python type of the decompressed data (the recorded output is `bytes`).
type(file_content)

bytes

In [3]:
# Peek at the first four raw bytes of the file.
file_content[:4]

b'\x00\x00\x08\x03'

## Little and big endian

In [4]:
# Adapted from: https://stackoverflow.com/questions/51220161/how-to-convert-from-bytes-to-int

# Bytes 0-3 interpreted most-significant byte first. The recorded result, 2051,
# is the magic number for image files in the IDX format described on the MNIST page.
int.from_bytes(file_content[:4], 'big')

2051

In [5]:
# Bytes 4-7, big-endian: the number of images in the file (recorded result: 10000).
int.from_bytes(file_content[4:8], 'big')

10000

In [6]:
# Bytes 8-11, big-endian: presumably the row count per image (recorded result: 28).
int.from_bytes(file_content[8:12], 'big')

28

In [7]:
# Bytes 12-15, big-endian: presumably the column count per image (recorded result: 28).
int.from_bytes(file_content[12:16], 'big')

28

In [8]:
# A single byte at offset 278, i.e. 262 bytes past the 16-byte header, so it lies
# inside the first image's pixel data. Byte order is irrelevant for one byte.
int.from_bytes(file_content[278:279], 'big')

163

## Reading a single image

In [9]:
# The 28 * 28 = 784 bytes that follow the 16-byte header: the first image's pixels.
l = file_content[16:16 + 28 * 28]

In [10]:
# Slicing a bytes object yields another bytes object (the recorded output is `bytes`).
type(l)

bytes

In [11]:
import numpy as np

# Build the first image as a 28x28 uint8 array.
# np.frombuffer reads the 784 pixel bytes after the 16-byte header directly,
# avoiding the intermediate Python list of ints and the int64 array that
# np.array(list(...)) would create before the cast to uint8.
# The bitwise NOT maps every uint8 value v to 255 - v, inverting the grayscale,
# and returns a new (writable) array.
image = ~np.frombuffer(file_content, dtype=np.uint8, count=28 * 28, offset=16).reshape(28, 28)

In [12]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt

# Display the 28x28 array built above using a grayscale colour map.
plt.imshow(image, cmap='gray')

ImportError: cannot import name '_path'

## Reading a label

In [None]:
# Adapted from: https://docs.python.org/3/library/gzip.html

import gzip

# Decompress the whole test-label archive into memory as a single bytes object.
with gzip.open('data/t10k-labels-idx1-ubyte.gz', mode='rb') as labels_gz:
    labels = labels_gz.read()

In [None]:
# The byte at offset 8. Elsewhere in this notebook the label data is taken from
# byte 8 onwards, so this is the first label in the file.
int.from_bytes(labels[8:9], "big")

## Neural network

In [13]:
# Import keras.
import keras as kr

# Start a neural network, building it by layers.
model = kr.models.Sequential()

# Add a hidden layer with 1000 neurons and an input layer with 784.
model.add(kr.layers.Dense(units=1000, activation='relu', input_dim=784))
# Add a ten neuron output layer with softmax activation.
model.add(kr.layers.Dense(units=10, activation='softmax'))

# Build the graph: categorical cross-entropy loss, SGD optimizer, accuracy metric.
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

Using TensorFlow backend.


In [14]:
# Load the training images: skip the 16-byte header and view the remaining bytes
# as uint8 directly. np.frombuffer avoids materialising tens of millions of
# Python ints in a list (and an int64 array) before the cast to uint8.
# The bitwise NOT maps each uint8 value v to 255 - v and returns a new array,
# so the decompressed bytes are released once the `with` block finishes.
with gzip.open('data/train-images-idx3-ubyte.gz', 'rb') as f:
    train_img = ~np.frombuffer(f.read(), dtype=np.uint8, offset=16).reshape(60000, 28, 28)

# Load the training labels: skip the 8-byte header, one uint8 per label.
# .copy() yields a writable array that does not keep the bytes buffer alive.
with gzip.open('data/train-labels-idx1-ubyte.gz', 'rb') as f:
    train_lbl = np.frombuffer(f.read(), dtype=np.uint8, offset=8).copy()

In [15]:
# Flatten every 28x28 image into one row of 784 values for the dense network.
inputs = train_img.reshape((60000, 28 * 28))

In [16]:
# For encoding categorical variables.
import sklearn.preprocessing as pre

# Fit the binarizer on the training labels (fit returns the estimator itself),
# then turn each label into its binary indicator row.
encoder = pre.LabelBinarizer().fit(train_lbl)
outputs = encoder.transform(train_lbl)

# Show the first label next to its encoded form.
print(train_lbl[0], outputs[0])

ImportError: No module named 'sklearn.__check_build._check_build'
___________________________________________________________________________
Contents of C:\Users\Michael\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\__check_build:
setup.py                  _check_build.cp37-win_amd64.pyd__init__.py
__pycache__
___________________________________________________________________________
It seems that scikit-learn has not been built correctly.

If you have installed scikit-learn from source, please do not forget
to build the package before using it: run `python setup.py install` or
`make` in the source directory.

If you have used an installer, please check that it is suited for your
Python version, your operating system and your platform.

In [17]:
# Train for 15 passes over the training set, updating weights every 10 samples.
model.fit(x=inputs, y=outputs, batch_size=10, epochs=15)

NameError: name 'outputs' is not defined

In [24]:
# Load the test images: skip the 16-byte header and view the remaining bytes as
# uint8 directly. np.frombuffer avoids building a Python list of ints (and an
# int64 array) before the cast to uint8. Each image is flattened to 784 values.
# The bitwise NOT maps each uint8 value v to 255 - v and returns a new array,
# so the decompressed bytes are released once the `with` block finishes.
with gzip.open('data/t10k-images-idx3-ubyte.gz', 'rb') as f:
    test_img = ~np.frombuffer(f.read(), dtype=np.uint8, offset=16).reshape(10000, 784)

# Load the test labels: skip the 8-byte header, one uint8 per label.
# .copy() yields a writable array that does not keep the bytes buffer alive.
with gzip.open('data/t10k-labels-idx1-ubyte.gz', 'rb') as f:
    test_lbl = np.frombuffer(f.read(), dtype=np.uint8, offset=8).copy()

In [25]:
# Count how many test images the network labels correctly: map each row of
# predicted scores back to a label and compare with the true labels.
# NOTE(review): the recorded result (1010 of 10000) is about chance level for
# ten classes. The pixel inputs are fed in unscaled (0-255); consider scaling
# both training and test inputs to [0, 1] — confirm by retraining.
np.sum(
    encoder.inverse_transform(model.predict(test_img)) == test_lbl
)

1010

## End