In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
mnist=fetch_openml('mnist_784',version=1) # the data type of mnist is bunch
X,y=mnist['data'],mnist['target'] # X is DataFrame, and y is Series
npar_X, npar_y = X.values, y.astype(int).values # the original data in y is string

In [11]:
# one_hot encoding for y
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder(sparse_output=False)
npar_y_oh=encoder.fit_transform(npar_y.reshape(-1,1))
print(npar_y.shape)
print(npar_y_oh.shape)

(70000,)
(70000, 10)


In [15]:
# train_test_split
X_train,X_test,y_train,y_test = train_test_split(npar_X,npar_y_oh,test_size=0.2,random_state=42)
print(X_train.shape)
print(y_train.shape)
print(X_train.shape)
print(y_test.shape)

(56000, 784)
(56000, 10)
(56000, 784)
(14000, 10)


In [17]:
def sigmoid(x):
    return 1/1+np.exp(-x)
def softmax(x):
    x_max=np.max(x)
    x_new=x-x_max
    exp_x=np.exp(x_new)
    sum_exp_x=np.sum(exp_x)
    return exp_x/sum_exp_x

In [19]:
def cross_entropy_error(y,t):
    if y.ndim==1 or t.ndim==1:
        t.reshape(1,t.size)
        y.reshape(1,y.size)
    batch_size=y.shape[0]
    return -np.sum(t*np.log(y+1e-7))/batch_size

In [51]:
def numerical_gradient(f,x): # x is a vector
    grads=np.zeros_like(x)
    h=1e-4
    for idx in range(x.size): #'int' object is not iterable
        tmp=x[idx]
        x[idx]=tmp+h
        f1=f(x)
        x[idx]=tmp-h
        f2=f(x)
        grads[idx]=(f1-f2)/2*h
    return grads
def numerical_gradient_W(f,x):
    grads=np.zeros_like(x)
    h=1e-4
    for idx1 in range(x.shape[0]): 
        for idx2 in range(x.shape[1]): #'int' object is not iterable
            tmp=x[idx1][idx2]
            x[idx1][idx2]=tmp+h
            f1=f(x)
            x[idx1][idx2]=tmp-h
            f2=f(x)
            grads[idx1][idx2]=(f1-f2)/2*h
    return grads

In [59]:
class TwoLayerNet:
    def __init__(self,input_size,hidden_size,output_size,weight_init_std=0.01):
        self.params={}
        self.params['W1']=weight_init_std*np.random.randn(input_size,hidden_size)
        self.params['b1']=np.zeros(hidden_size)
        self.params['W2']=weight_init_std*np.random.randn(hidden_size,output_size)
        self.params['b2']=np.zeros(output_size)
    def predict(self,x):
        W1,W2=self.params['W1'],self.params['W2']
        b1,b2=self.params['b1'],self.params['b2']
        a1=np.dot(x,W1)+b1
        Z1=sigmoid(a1)
        a2=np.dot(Z1,W2)+b2
        y=softmax(a2)
        return y # Must remember to add a return here! Otherwise, y will become Nonetype !
    def loss(self,x,t):
        y=self.predict(x)
        return cross_entropy_error(y,t)
    def accuracy(self,x,t):
        y=self.predict(x)
        y=np.argmax(y,axis=1)# row direction
        t=np.argmax(t,axis=1)
        accuracy=np.sum(y==t)/float(x.shape[0])
        return accuracy
    def numerical_gradient(self,x,t):
        loss_W=lambda W: self.loss(x,t) # anonymous function
        grads={}
        grads['W1']=numerical_gradient_W(loss_W,self.params['W1'])
        grads['b1']=numerical_gradient(loss_W,self.params['b1'])
        grads['W2']=numerical_gradient_W(loss_W,self.params['W2'])
        grads['b2']=numerical_gradient(loss_W,self.params['b2'])
        return grads

In [61]:
import numpy as np
network=TwoLayerNet(input_size=784,hidden_size=100,output_size=10)

In [37]:
print(network.params['W1'].shape)
print(network.params['b1'].shape)
print(network.params['W2'].shape)
print(network.params['b1'].shape)

(784, 100)
(100,)
(100, 10)
(100,)


In [63]:
x=np.random.rand(100,784)
y=network.predict(x)
print(y.shape)
print(y[:5])

(100, 10)
[[0.00067407 0.00115557 0.00090699 0.00088504 0.00078098 0.00125954
  0.0010759  0.0012445  0.00114749 0.00088299]
 [0.00067713 0.00114939 0.00092821 0.00086369 0.00077747 0.00129363
  0.00107318 0.00121851 0.00112876 0.00086322]
 [0.00067601 0.00115974 0.0009185  0.00087777 0.00078587 0.00125612
  0.00106831 0.0012458  0.00112124 0.00088044]
 [0.00067168 0.0011682  0.00092299 0.00087547 0.00077395 0.00125561
  0.00107265 0.00123325 0.00113988 0.00086844]
 [0.00068102 0.00116788 0.00092083 0.00086285 0.00077981 0.00123935
  0.0010802  0.00122227 0.00114768 0.00087507]]


In [47]:
t=np.random.rand(100,10) # tag and supervision data
print(t.shape)
print(t[:5])

(100, 10)
[[0.70463618 0.27538688 0.91193995 0.59485648 0.74581229 0.44461943
  0.19487171 0.87154424 0.49608018 0.46656214]
 [0.68963219 0.37598222 0.81540459 0.30188208 0.26383209 0.82316901
  0.20735306 0.90424706 0.03366783 0.92266239]
 [0.28513915 0.89189896 0.14239904 0.45718301 0.31300986 0.76403229
  0.71506301 0.55496826 0.21568884 0.58507453]
 [0.62915294 0.85649429 0.06531954 0.69326936 0.21264488 0.84041212
  0.77577135 0.23081563 0.01981447 0.06898094]
 [0.934198   0.62894788 0.15205622 0.25851171 0.57944887 0.28224965
  0.30017834 0.11382467 0.78980168 0.16235504]]


In [67]:
grads=network.numerical_gradient(x,t)
print(grads['W1'].shape)
print(grads['b1'].shape)
print(grads['W2'].shape)
print(grads['b1'].shape)

(784, 100)
(100,)
(100, 10)
(100,)


In [None]:
#setting hyper parameter
iters_num=10000
train_size=X_train.shape[0]
batch_size=100
learning_rate=0.1
train_loss_list=[]#an empty list
network=TwoLayerNet(input_size=784,hidden_size=50,output_size=10)
for i in range(iters_num):
    #get mini batch
    batch_mask=np.random.choice(train_size,batch_size)
    X_batch=X_train[batch_mask]
    t_batch=y_train[batch_mask]
    grad=network.numerical_gradient(X_batch,t_batch)
    # renew the parameters
    for key in ('W1','b1','W2','b2'): # tuple is also an iterable object
        network.params[key]=-learning_rate*grad[key]
    loss=network.loss(X_batch,t_batch)
    train_loss_list.append(loss)
        