### Batch & Layer Normalization and Dropout
------------------------
+ __Batch Normalization__ normalizes the input distribution within each mini-batch so that it has mean 0 and standard deviation 1, and then applies a learnable scale factor and shift factor.
+ __Dropout__ is a technique in which some nodes are randomly selected at a given rate and temporarily removed (their outputs are zeroed) during the training phase.

In [None]:
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import random
from tqdm.auto import tqdm

# Fix the seeds of both Python's and PyTorch's RNGs so runs are repeatable.
random.seed(111)
torch.manual_seed(777)

# Pick the compute device; when a GPU is present, seed every CUDA device too.
if torch.cuda.is_available():
    device = 'cuda'
    torch.cuda.manual_seed_all(777)
else:
    device = 'cpu'

# Hyper-parameters shared by the cells below.
learning_rate = 0.001
training_epochs = 15
batch_size = 100

In [None]:
# Preprocessing pipeline (exercise: this is the part you are expected to edit).
# The same pipeline is applied to both the training and the test split.
transform = transforms.Compose([
    # PIL image -> float tensor
    transforms.ToTensor(),
    # Data normalization: per-channel mean 0.5 and std 0.5 for the 3 channels
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),

    # Data augmentation (disabled by default; uncomment to experiment)
    # transforms.CenterCrop(28),  # crop size must be smaller than the real image size
    # transforms.RandomHorizontalFlip(p=0.5),  # p is the probability of applying the flip
    # transforms.RandomVerticalFlip(p=0.5),  # p is the probability of applying the flip
    # transforms.RandomRotation(70),  # maximum rotation angle in degrees
    # transforms.ColorJitter(brightness=20, contrast=25, saturation=0, hue=0)
])

# CIFAR-10 train split first, then test split; both are downloaded into ./data.
trainset, testset = (
    dsets.CIFAR10(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Mini-batch iterators: only the training data is reshuffled every epoch.
trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
)

# Human-readable class names, indexed by the integer label.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

In [None]:
# model define
class Net(nn.Module):
    """Fully connected classifier for 32x32 RGB images.

    The image is flattened to a 3072-element vector and passed through
    three ReLU-activated linear layers (256 -> 512 -> 256 units) followed
    by a 10-way linear output layer that returns raw class scores.
    """

    def __init__(self):
        super().__init__()
        in_features = 32 * 32 * 3  # height * width * channels
        n_classes = 10

        self.flatten = nn.Flatten()
        self.input_layer = nn.Linear(in_features, 256)
        self.hidden_layer1 = nn.Linear(256, 512)
        self.hidden_layer2 = nn.Linear(512, 256)
        self.output_layer = nn.Linear(256, n_classes)
        self.relu = nn.ReLU()

        # Optional layers for the exercise (disabled by default):
        # self.batchnorm = nn.BatchNorm2d(100) # batch normalization
        # self.dropout = nn.Dropout(p=0.5) # Dropout
        # NOTE: the activations here are flattened to (batch, features);
        # nn.BatchNorm2d expects 4-D input, so nn.BatchNorm1d(num_features)
        # is the variant that applies after a Linear layer.

    def forward(self, x):
        """Return the unnormalized class scores for a batch of images ``x``."""
        h = self.flatten(x)
        # Every layer except the last one is followed by a ReLU.
        for layer in (self.input_layer, self.hidden_layer1, self.hidden_layer2):
            h = self.relu(layer(h))
        return self.output_layer(h)

### Regularization
------------------------
+ __Regularization__ is a kind of penalty condition. In general, machine learning and statistical inference proceed by making the cost function (or error function) smaller; regularization adds a penalty term to that cost, for example on large weights, to discourage overfitting.

In [None]:
# Instantiate the network and move its parameters to the selected device.
# This must happen before the optimizer is built, because the optimizer
# is given the model's parameters.
model = Net().to(device)

# set up criterion and optimizer
criterion = torch.nn.CrossEntropyLoss().to(device)    # Softmax is internally computed.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Alternative for the regularization exercise (weight_decay > 0 adds an L2 penalty):
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

In [None]:
# training
for epoch in tqdm(range(training_epochs)):  # loop over the dataset multiple times
    # Training-mode behaviour; this matters once Dropout/BatchNorm layers are enabled.
    model.train()
    running_loss = 0.0
    for inputs, labels in trainloader:
        # each batch is a pair (inputs, labels); move both to the compute device
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # accumulate the per-batch loss for the epoch summary
        running_loss += loss.item()

    # report the mean per-batch loss of this epoch
    print('epochs:', epoch + 1, ', loss:', running_loss / len(trainloader))
print('Finished Training')

In [None]:
# test
correct = 0
total = 0

# Evaluate in inference mode so that Dropout/BatchNorm layers (if enabled in
# the model) behave deterministically; the previous mode is restored afterwards
# so re-running the training cell is unaffected.
was_training = model.training
model.eval()
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for images, labels in testloader:
        images, labels = images.to(device), labels.to(device)

        # the class with the highest score is what we choose as prediction
        predicted = model(images).argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
model.train(was_training)

print(f'Accuracy of the network on the 10000 test images: {100 * correct // total} %')

In [None]:
# prepare to count predictions for each class
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

# Evaluate in inference mode (relevant once Dropout/BatchNorm are enabled) and
# restore the previous mode afterwards.
was_training = model.training
model.eval()
# again no gradients needed
with torch.no_grad():
    for images, labels in testloader:
        predictions = model(images.to(device)).argmax(dim=1)
        # Convert to plain Python ints once per batch instead of comparing and
        # indexing with individual tensor elements.
        for label, prediction in zip(labels.tolist(), predictions.tolist()):
            name = classes[label]
            total_pred[name] += 1
            if label == prediction:
                correct_pred[name] += 1
model.train(was_training)


# print accuracy for each class
for classname, correct_count in correct_pred.items():
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print(f'Accuracy for class: {classname:5s} is {accuracy:.1f} %')