In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from random import randint
import time
import utils

In [2]:
# Compute device shared by the rest of the notebook; this notebook runs on the CPU only.
device = torch.device("cpu")
print(str(device))

cpu


In [3]:
from utils import check_mnist_dataset_exists

# Resolve the directory that holds the MNIST tensors. Judging by the cell output,
# the helper downloads the dataset first when it is missing -- confirm in utils.
data_path = check_mnist_dataset_exists()

# Load (not download) the pre-processed tensors from disk.
# Training set: 60,000 gray scale pictures with their labels, each picture is 28 by 28 pixels.
train_data, train_label, test_data, test_label = (
    torch.load(data_path + 'mnist/' + file_name)
    for file_name in (
        'train_data.pt',
        'train_label.pt',
        'test_data.pt',
        'test_label.pt',
    )
)

MNIST dataset missing - downloading...
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz


1.7%

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ../../data/mnist/temp/MNIST/raw/train-images-idx3-ubyte.gz


100.0%


Extracting ../../data/mnist/temp/MNIST/raw/train-images-idx3-ubyte.gz to ../../data/mnist/temp/MNIST/raw


102.8%
2.8%


Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ../../data/mnist/temp/MNIST/raw/train-labels-idx1-ubyte.gz
Extracting ../../data/mnist/temp/MNIST/raw/train-labels-idx1-ubyte.gz to ../../data/mnist/temp/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ../../data/mnist/temp/MNIST/raw/t10k-images-idx3-ubyte.gz


100.0%


Extracting ../../data/mnist/temp/MNIST/raw/t10k-images-idx3-ubyte.gz to ../../data/mnist/temp/MNIST/raw


112.7%


Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ../../data/mnist/temp/MNIST/raw/t10k-labels-idx1-ubyte.gz
Extracting ../../data/mnist/temp/MNIST/raw/t10k-labels-idx1-ubyte.gz to ../../data/mnist/temp/MNIST/raw




  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [4]:
class three_layer_net(nn.Module):
    """Bias-free multilayer perceptron: three linear layers with ReLU in between.

    Args:
        input_size: number of features in each input row.
        hidden_size1: width of the first hidden layer.
        hidden_size2: width of the second hidden layer.
        output_size: number of raw scores produced for each input row.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2,  output_size):
        super().__init__()

        # Three fully connected layers, all created without a bias term.
        self.layer1 = nn.Linear(input_size, hidden_size1, bias=False)
        self.layer2 = nn.Linear(hidden_size1, hidden_size2, bias=False)
        self.layer3 = nn.Linear(hidden_size2, output_size, bias=False)

    def forward(self, x):
        """Return the raw (unnormalised) scores for the rows of ``x``.

        No softmax is applied here: ``nn.CrossEntropyLoss`` expects raw scores
        and performs the log-softmax internally. To train with ``nn.NLLLoss``
        instead, the network would have to return log-probabilities, e.g.
        ``torch.log_softmax(scores, dim=1)`` (or the log of a softmax output).
        """
        # First and second layers are each followed by a ReLU activation.
        hidden1 = torch.relu(self.layer1(x))
        hidden2 = torch.relu(self.layer2(hidden1))
        # The last layer is purely linear and its output is returned as-is.
        return self.layer3(hidden2)

In [6]:
# 784 inputs (one per pixel of a 28 x 28 picture), two hidden layers of 50 units, 10 output scores.
net = three_layer_net(
    input_size=784,
    hidden_size1=50,
    hidden_size2=50,
    output_size=10,
)
print(net)
utils.display_num_param(net)

three_layer_net(
  (layer1): Linear(in_features=784, out_features=50, bias=False)
  (layer2): Linear(in_features=50, out_features=50, bias=False)
  (layer3): Linear(in_features=50, out_features=10, bias=False)
)
There are 42200 (0.04 million) parameters in this neural network


In [7]:
# Place the network's parameters on the compute device selected earlier in the notebook.
net = net.to(device)

In [8]:
# Mini-batch size used for both training and evaluation.
bs = 200

# Cross-entropy loss applied to the raw scores of the network
# (nn.NLLLoss would be the alternative if the network returned log-probabilities).
criterion = nn.CrossEntropyLoss()

In [9]:
def eval_on_test_set():
    """Run ``net`` over the whole test set and print its error rate in percent.

    Reads the notebook-level ``net``, ``test_data``, ``test_label``, ``bs`` and
    ``device``. Returns nothing; the result is only printed.

    The per-batch errors are weighted by the batch length, so the reported
    figure stays exact even when the test-set size is not a multiple of ``bs``.
    """
    num_samples = test_data.size(0)
    weighted_error = 0.0

    # Evaluation never back-propagates, so skip building the autograd graph.
    with torch.no_grad():
        for first in range(0, num_samples, bs):

            # extract the minibatch (the last one may be shorter than bs)
            minibatch_data = test_data[first: first + bs].to(device)
            minibatch_label = test_label[first: first + bs].to(device)
            batch_len = minibatch_data.size(0)

            # flatten every picture into a row of 784 = 28 x 28 values
            inputs = minibatch_data.view(batch_len, 784)

            # feed it to the network
            scores = net(inputs)

            # error made on this batch
            # NOTE(review): assumes utils.get_error returns the fraction of
            # misclassified samples in the batch -- confirm against utils.
            error = utils.get_error(scores, minibatch_label)

            weighted_error += error.item() * batch_len

    total_error = weighted_error / num_samples
    print('test error  = ', total_error * 100, ' percent')

In [10]:
start = time.time()

lr = 0.05 # initial learning rate
num_epochs = 200
num_train = train_data.size(0)  # number of training pictures

# One optimizer for the whole run; its learning rate is updated in place below.
# Plain SGD keeps no per-parameter state, so this matches re-creating it every epoch.
optimizer = torch.optim.SGD(net.parameters(), lr=lr)

for epoch in range(num_epochs):

    # learning rate strategy: divide the learning rate by 1.5 every 10 epochs,
    # starting at epoch 20 (epochs 0-19 keep the initial rate because of `epoch > 10`)
    if epoch % 10 == 0 and epoch > 10:
        lr = lr / 1.5
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

    # sums over the epoch, weighted by the number of samples in each batch
    running_loss = 0.0
    running_error = 0.0

    # visit the training set in a fresh random order every epoch
    shuffled_indices = torch.randperm(num_train)

    for count in range(0, num_train, bs):

        # reset the gradients accumulated by the previous step
        optimizer.zero_grad()

        # take the next bs shuffled samples (the last batch may be shorter)
        indices = shuffled_indices[count: count + bs]
        minibatch_data = train_data[indices].to(device)
        minibatch_label = train_label[indices].to(device)
        batch_len = minibatch_data.size(0)

        # flatten every picture into a row of 784 = 28 x 28 values
        # (no requires_grad_ on the inputs: only the weights need gradients)
        inputs = minibatch_data.view(batch_len, 784)

        # forward the minibatch through the net
        scores = net(inputs)

        # average loss over the data points of the minibatch
        loss = criterion(scores, minibatch_label)

        # backward pass: compute the gradient of the loss w.r.t. every weight matrix
        loss.backward()

        # one step of stochastic gradient descent: W = W - lr * dL/dW for each weight matrix W
        optimizer.step()

        # accumulate statistics for this epoch
        running_loss += loss.item() * batch_len

        # NOTE(review): assumes utils.get_error returns the fraction of
        # misclassified samples in the batch -- confirm against utils.
        error = utils.get_error(scores.detach(), minibatch_label)
        running_error += error.item() * batch_len

    # per-sample averages over the full training set for this epoch
    total_loss = running_loss / num_train
    total_error = running_error / num_train
    # wall-clock time elapsed since the start of training (cumulative, not per epoch)
    elapsed_time = time.time() - start

    # every 10 epoch we display the stats and compute the error rate on the test set
    if epoch % 10 == 0 :
        print(' ')
        print('epoch = ', epoch, ' time = ', elapsed_time, ' loss = ', total_loss, ' error = ', total_error * 100, ' percent lr = ', lr)
        eval_on_test_set()

 
epoch =  0  time =  0.5236721038818359  loss =  1.3819054213166237  error =  36.825000027815506  percent lr =  0.05
test error  =  14.660000085830688  percent
 
epoch =  10  time =  5.694929122924805  loss =  0.17998844516774018  error =  5.126667161782582  percent lr =  0.05
test error  =  5.349999785423279  percent
 
epoch =  20  time =  10.660156011581421  loss =  0.10820389812812209  error =  3.06499973932902  percent lr =  0.03333333333333333
test error  =  3.700000047683716  percent
 
epoch =  30  time =  15.705405235290527  loss =  0.0824780416302383  error =  2.32499893506368  percent lr =  0.022222222222222223
test error  =  3.18999981880188  percent
 
epoch =  40  time =  20.70580005645752  loss =  0.07029626129815976  error =  1.9466654459635417  percent lr =  0.014814814814814815
test error  =  3.039999485015869  percent
 
epoch =  50  time =  25.767770051956177  loss =  0.06325585604955752  error =  1.714998682339986  percent lr =  0.009876543209876543
test error  =  2.8

In [None]:
# choose a picture at random from the test set
idx = randint(0, test_data.size(0) - 1)
im = test_data[idx]

# display the picture
utils.show(im)

# feed it to the net and display the confidence scores
# The net expects a batch dimension, so the picture is reshaped to 1 x 784
# (a batch holding a single flattened 28 x 28 image) rather than to a bare
# vector of 784 values.
# Inference only: no autograd graph is needed.
with torch.no_grad():
    scores = net(im.view(1, 784).to(device))

# scores has shape 1 x 10, so the softmax is taken along dim=1 (across the
# ten scores of the single picture), not along the batch dimension.
probs = torch.softmax(scores, dim=1)
utils.show_prob_mnist(probs.cpu())