# example of gradient checking

In [1]:
# gradient checking: compare the analytical gradient with the numerical gradient
# taking the affine layer as an example
from gradient_check import eval_numerical_gradient_array
import numpy as np
from layers import *
N = 2
D = 3
M = 4
x = np.random.normal(size=(N, D))
w = np.random.normal(size=(D, M))
b = np.random.normal(size=(M, ))
dout = np.random.normal(size=(N, M))

# do a forward pass first
out, cache = affine_forward(x,w,b)
# check grad f/grad w, the [0] below gets the output out of the (output, cache) original output
f=lambda w: affine_forward(x,w,b)[0]
# compute the analytical gradient you wrote, [1] get the dw out of the (dx, dw, db) original output
grad = affine_backward(dout, cache)[1]
# compute the numerical gradient using the provided utility function
ngrad = eval_numerical_gradient_array(f, w, dout)
print(grad)
print(ngrad)
# they should be similar enough within some small error tolerance

[[-1.09842782  2.36980251 -1.71311115 -1.69784127]
 [-1.10818927 -0.89843516  1.18975116  1.46538825]
 [ 0.37443655  0.19727268 -0.30769843 -0.39242183]]
[[-1.09842782  2.36980251 -1.71311115 -1.69784127]
 [-1.10818927 -0.89843516  1.18975116  1.46538825]
 [ 0.37443655  0.19727268 -0.30769843 -0.39242183]]


# example of training a network

In [1]:
def normalization(X):
    Xn = np.zeros(X.shape)
    for i in range(X.shape[0]):
        x = X[i, :]
        Xn[i, :] = (x- np.min(x)/(np.max(x)-np.min(x)))
    return Xn

# def normalization(X):
#     Xn = np.zeros(X.shape)
#     for i in range(X.shape[0]):
#         x = X[i,:]
#         Xn[i,:] = (x/(np.sqrt(x.dot(x))+1e-15))
#     return Xn

In [180]:
# Load the dataset
import scipy.io
import numpy as np
data = scipy.io.loadmat("mnist_data.mat")
X = data['training_data']
y = data['training_labels'].ravel()
X_test = data['test_data']
# X = normalization(X)
# X_test = normalization(X_test)

# Split the data into a training set and validation set.
num_train = X.shape[0]
indices = np.array(range(num_train))
np.random.shuffle(indices)
train_indices, val_indices = indices[0:50000], indices[50000:]
X_train, X_val = X[train_indices], X[val_indices]
y_train, y_val = y[train_indices], y[val_indices]

# one hot encoding
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_train_en = le.fit_transform(y_train)
y_val_en = le.fit_transform(y_val)
y_train_en[y_train_en == 0] = 0.01
y_train_en[y_train_en == 1] = 0.99
y_val_en[y_val_en == 0] = 0.01
y_val_en[y_val_en == 1] = 0.99

# Feature Normalization 
X_train = X_train.astype('float32'); X_val= X_val.astype('float32'); X_test = X_test.astype('float32')
X_train /= 255; X_val /= 255; X_test /= 255

from solver import Solver
from classifiers.fc_net import FullyConnectedNet

In [187]:

data = {
      'X_train': X_train,
      'y_train': y_train_en,
      'X_val': X_val,
      'y_val': y_val_en, 
    'X_test': X_test} # change to X_train to check predict function


# TODO: fill out the hyperparamets
hyperparams = {'lr_decay': 1, # 1 is the best
               'num_epochs': 30, # usually 10, 100, 500, 1000, and larger. 10-20 is better
               'batch_size': 90, # usually 32, 64, and 128  120 is good
               'learning_rate':1e-3 # 1e-3 is the best
              }

# TODO: fill out the number of units in your hidden layers
hidden_dim = [700,500,350,200] # this should be a list of units for each hiddent layer

In [188]:
# hyperparams = {'lr_decay': 1, # 1 is the best
#                'num_epochs': 20, # usually 10, 100, 500, 1000, and larger. 10-20 is better
#                'batch_size': 100, # usually 32, 64, and 128 
#                'learning_rate':1e-3 # 1e-3 is the best
#               }

model = FullyConnectedNet(input_dim=784,
                          hidden_dim=hidden_dim)
solver = Solver(model, data,
                update_rule='sgd',
                optim_config={
                  'learning_rate': hyperparams['learning_rate'],
                },
                lr_decay=hyperparams['lr_decay'],
                num_epochs=hyperparams['num_epochs'], 
                batch_size=hyperparams['batch_size'],
                print_every=100)
solver.train()

(Iteration 1 / 19980) loss: 494.270804
(Epoch 0 / 30) train acc: 0.091000; val_acc: 0.102900
(Iteration 101 / 19980) loss: 32.322357
(Iteration 201 / 19980) loss: 31.752244
(Iteration 301 / 19980) loss: 12.892911
(Iteration 401 / 19980) loss: 13.884229
(Iteration 501 / 19980) loss: 19.979101
(Iteration 601 / 19980) loss: 11.992356
(Epoch 1 / 30) train acc: 0.967000; val_acc: 0.966300
(Iteration 701 / 19980) loss: 5.149894
(Iteration 801 / 19980) loss: 7.292631
(Iteration 901 / 19980) loss: 9.208693
(Iteration 1001 / 19980) loss: 4.958672
(Iteration 1101 / 19980) loss: 7.288830
(Iteration 1201 / 19980) loss: 8.211078
(Iteration 1301 / 19980) loss: 10.748854
(Epoch 2 / 30) train acc: 0.990000; val_acc: 0.982200
(Iteration 1401 / 19980) loss: 9.932001
(Iteration 1501 / 19980) loss: 3.435816
(Iteration 1601 / 19980) loss: 9.617993
(Iteration 1701 / 19980) loss: 4.775626
(Iteration 1801 / 19980) loss: 1.420054
(Iteration 1901 / 19980) loss: 5.179353
(Epoch 3 / 30) train acc: 0.987000; val_a

(Iteration 16801 / 19980) loss: 0.029400
(Iteration 16901 / 19980) loss: 0.018437
(Iteration 17001 / 19980) loss: 0.009872
(Iteration 17101 / 19980) loss: 0.015708
(Iteration 17201 / 19980) loss: 0.018686
(Iteration 17301 / 19980) loss: 0.005891
(Epoch 26 / 30) train acc: 1.000000; val_acc: 1.000000
(Iteration 17401 / 19980) loss: 0.008886
(Iteration 17501 / 19980) loss: 0.016581
(Iteration 17601 / 19980) loss: 0.005945
(Iteration 17701 / 19980) loss: 0.013497
(Iteration 17801 / 19980) loss: 0.010685
(Iteration 17901 / 19980) loss: 0.000970
(Epoch 27 / 30) train acc: 1.000000; val_acc: 1.000000
(Iteration 18001 / 19980) loss: 0.016987
(Iteration 18101 / 19980) loss: 0.011529
(Iteration 18201 / 19980) loss: 0.014291
(Iteration 18301 / 19980) loss: 0.004119
(Iteration 18401 / 19980) loss: 0.012931
(Iteration 18501 / 19980) loss: 0.002196
(Iteration 18601 / 19980) loss: 0.003573
(Epoch 28 / 30) train acc: 1.000000; val_acc: 1.000000
(Iteration 18701 / 19980) loss: 0.001582
(Iteration 1880

In [189]:
# predict X_test
y_pred = solver.predict()
y_pred.shape,X_test.shape

((10000,), (10000, 784))

In [190]:
y_pred

array([7, 2, 0, ..., 4, 5, 6], dtype=int64)

In [191]:
import pandas as pd
def results_to_csv(y_test):
    y_test = y_test.astype(int)
    df = pd.DataFrame({'Category': y_test})
    df.index += 1  # Ensures that the index starts at 1. 
    df.to_csv('submission_m_8.8.15.57.csv', index_label='Id')
results_to_csv(y_pred)

In [9]:
# check my predict function
y_pred_X_train = solver.predict()

In [10]:
# seems my predict fn is correct
np.sum(y_pred_X_train == y_train)/len(X_train)

  


0.0

In [11]:
newdict = {}
newdict['cache%d'%1] = 1 
newdict['cache%d' %(1+2)] = 3
newdict

{'cache1': 1, 'cache3': 3}