In [None]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms


# Device configuration: run on the GPU when CUDA is usable, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Hyper-parameters
input_size = 28 * 28    # features fed to the first linear layer (784)
hidden_size = 500       # width of the hidden layer
num_classes = 10        # size of the network output
num_epochs = 5          # full passes over the training loader
batch_size = 100        # samples per mini-batch for both data loaders
learning_rate = 1e-3    # step size handed to the optimizer

: 

In [2]:
# Sanity check shown as the cell output: True when PyTorch can use a CUDA device.
torch.cuda.is_available()

True

In [3]:
# Report the CUDA devices visible to PyTorch. Valid device indices run from 0 to
# torch.cuda.device_count() - 1; only device 0 is described here.
# get_device_properties() raises when no CUDA device is usable, so the call is
# guarded to keep this cell runnable on the CPU fallback chosen for `device`.
if torch.cuda.is_available():
    print('device properties:', torch.cuda.get_device_properties(0))
else:
    print('device properties: no CUDA device available')
print('device count:', torch.cuda.device_count())

device properties: _CudaDeviceProperties(name='Tesla V100-SXM2-32GB', major=7, minor=0, total_memory=32502MB, multi_processor_count=80)
device count: 2


In [4]:
# MNIST dataset
# ToTensor() converts each image into a tensor so the loaders can batch it.
# download=True fetches the files into `root` if they are not there yet.
train_dataset = torchvision.datasets.MNIST(
    root='../../data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)

# No download flag here: this relies on the files already being under the same
# root (presumably fetched by the training-set call above -- verify on a fresh checkout).
test_dataset = torchvision.datasets.MNIST(
    root='../../data',
    train=False,
    transform=transforms.ToTensor(),
)

# Data loaders: the training batches are reshuffled every epoch, while the test
# batches keep the dataset order.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [5]:
class NeuralNet(nn.Module):
    """Fully connected network with a single hidden layer.

    Architecture: Linear(input_size -> hidden_size) -> ReLU ->
    Linear(hidden_size -> num_classes). No softmax is applied, so the output
    is the raw result of the last linear layer.

    Args:
        input_size: number of features in each input row.
        hidden_size: number of units in the hidden layer.
        num_classes: number of values produced per input row.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # Registration order determines the order reported by children().
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Apply fc1, ReLU and fc2 in sequence to `x` and return the result."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# model_cpu.children() is a generator over the direct sub-modules;
# list(model_cpu.children()) gives
#   [Linear(in_features=784, out_features=500, bias=True),
#    ReLU(),
#    Linear(in_features=500, out_features=10, bias=True)]
model_cpu = NeuralNet(input_size, hidden_size, num_classes)
# Weights of the first child (fc1) right after construction.
print(next(model_cpu.children()).weight)

# model_cpu.to(...) accepts a device, a dtype or a tensor.
# nn.Module.to() moves the parameters in place and returns the module itself,
# so `model` and `model_cpu` name the same object after this line.
model = model_cpu.to(device)
# Same weights again, now reported on the configured device.
print(next(model.children()).weight)

Parameter containing:
tensor([[-0.0296,  0.0275,  0.0136,  ...,  0.0286,  0.0252,  0.0215],
        [ 0.0140, -0.0282, -0.0238,  ..., -0.0155,  0.0353, -0.0355],
        [ 0.0090, -0.0312,  0.0236,  ...,  0.0159,  0.0317,  0.0126],
        ...,
        [ 0.0246,  0.0286,  0.0017,  ..., -0.0235, -0.0090,  0.0258],
        [ 0.0233, -0.0357,  0.0156,  ...,  0.0336, -0.0284,  0.0288],
        [ 0.0173, -0.0182, -0.0280,  ...,  0.0077,  0.0042,  0.0346]],
       requires_grad=True)
Parameter containing:
tensor([[-0.0296,  0.0275,  0.0136,  ...,  0.0286,  0.0252,  0.0215],
        [ 0.0140, -0.0282, -0.0238,  ..., -0.0155,  0.0353, -0.0355],
        [ 0.0090, -0.0312,  0.0236,  ...,  0.0159,  0.0317,  0.0126],
        ...,
        [ 0.0246,  0.0286,  0.0017,  ..., -0.0235, -0.0090,  0.0258],
        [ 0.0233, -0.0357,  0.0156,  ...,  0.0336, -0.0284,  0.0288],
        [ 0.0173, -0.0182, -0.0280,  ...,  0.0077,  0.0042,  0.0346]],
       device='cuda:0', requires_grad=True)


In [6]:
# Loss and optimizer
# Cross-entropy is computed from the network output and the integer labels;
# Adam updates every parameter of `model` using the configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [7]:
# Train the model
total_step = len(train_loader)  # number of mini-batches per epoch

# Log the loss about six times per epoch. Integer division keeps the interval an
# int, so the modulo test below still fires when total_step is not a multiple
# of 6 (a float interval such as 16.67 would never match and logging would
# silently stop); max(1, ...) covers loaders with fewer than six batches.
log_interval = max(1, total_step // 6)

model.train()  # make sure the model is in training mode before optimizing
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Flatten each 28x28 image into a row vector and move the batch to the
        # configured device. Without .to(device) the forward pass fails with a
        # backend mismatch (CPU tensors vs. a model on CUDA).
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize: clear old gradients, backpropagate, update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % log_interval == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

Epoch [1/5], Step [100/600], Loss: 0.3901
Epoch [1/5], Step [200/600], Loss: 0.2135
Epoch [1/5], Step [300/600], Loss: 0.1688
Epoch [1/5], Step [400/600], Loss: 0.1252
Epoch [1/5], Step [500/600], Loss: 0.1077
Epoch [1/5], Step [600/600], Loss: 0.1319
Epoch [2/5], Step [100/600], Loss: 0.1955
Epoch [2/5], Step [200/600], Loss: 0.1654
Epoch [2/5], Step [300/600], Loss: 0.2031
Epoch [2/5], Step [400/600], Loss: 0.1387
Epoch [2/5], Step [500/600], Loss: 0.1193
Epoch [2/5], Step [600/600], Loss: 0.0847
Epoch [3/5], Step [100/600], Loss: 0.0274
Epoch [3/5], Step [200/600], Loss: 0.1189
Epoch [3/5], Step [300/600], Loss: 0.0786
Epoch [3/5], Step [400/600], Loss: 0.1031
Epoch [3/5], Step [500/600], Loss: 0.0316
Epoch [3/5], Step [600/600], Loss: 0.1281
Epoch [4/5], Step [100/600], Loss: 0.0952
Epoch [4/5], Step [200/600], Loss: 0.0358
Epoch [4/5], Step [300/600], Loss: 0.0378
Epoch [4/5], Step [400/600], Loss: 0.0395
Epoch [4/5], Step [500/600], Loss: 0.0175
Epoch [4/5], Step [600/600], Loss:

In [8]:
# Test the model
# Put the model in evaluation mode so that mode-dependent layers (e.g. dropout,
# batch norm), if any are added later, use their inference behaviour.
model.eval()

# In the test phase gradients are not needed; torch.no_grad() is a context
# manager that disables gradient calculation, which saves memory.
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.reshape(-1, 28*28).to(device)
        labels = labels.to(device)
        outputs = model(images)

        # torch.max(input, dim) returns (max_value, index_of_max_value); the
        # index along dim 1 is the predicted class of each sample. Under
        # no_grad the legacy `.data` access is unnecessary.
        _, predicted = torch.max(outputs, 1)

        # labels.size() is a torch.Size; labels.size(0) is the batch size (int).
        total += labels.size(0)
        # (predicted == labels) is an element-wise comparison tensor; .sum()
        # counts the matches and .item() converts the result to a Python number.
        correct += (predicted == labels).sum().item()

    print('Accuracy of the network on the 10000 test images: {} %'.format(100 * correct / total))

# Save the model checkpoint (parameters only, via state_dict)
torch.save(model.state_dict(), 'model.ckpt')

Accuracy of the network on the 10000 test images: 97.96 %
