In [1]:
import numpy as np

from torch.utils.data import DataLoader
from torchvision import datasets

from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torchvision.transforms as Trans

import torch.nn.init as init


In [39]:
print(torch.max(init.kaiming_normal_(torch.empty(size=(28*28, 100)), nonlinearity='relu')))

tensor(0.6043)


In [29]:
BATCH_SIZE = 32
NUM_WORKERS = 4

EPOCHS = 10

In [30]:
transform = Trans.ToTensor()


train_dataset = datasets.MNIST(
    root='datasets',
    train=True,
    transform=transform,
    download=True
)

test_dataset = datasets.MNIST(
    root='datasets',
    train=False,
    transform=transform,
    download=True
)

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
)

test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

In [41]:
a = test_dataset[0][0]
a = a.reshape(1, -1)
l = nn.Linear(28*28, 100)
init.kaiming_normal_(l.weight, nonlinearity='relu')

b= l(a)
print(torch.max(a))
print(torch.max(b))
print(torch.max(l.weight))


tensor(1.)
tensor(0.8363, grad_fn=<MaxBackward1>)
tensor(0.2428, grad_fn=<MaxBackward1>)


In [10]:
class MyNet(nn.Module):
    def __init__(self):
        super().__init__()

        self.flatten = nn.Flatten()

        self.fc1 = nn.Sequential(
            nn.Linear(28*28, 100),
            nn.Relu()
        )

        self.fc2 = nn.Sequential(
            nn.Linear(100, 50),
            nn.Relu(),
        )

        self.fcRes = nn.Sequential(
            nn.Linear(50, 10),
            nn.Relu()
        )

    def forward(self, x):
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fcRes(x)
        return x

CELoss = nn.CrossEntropyLoss()
net = MyNet()

optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

In [11]:
def train(net, optimizer, criterion):
    running_loss = 0
    for images, labels in train_dataloader:
        
        optimizer.zero_grad()

        output = net(images)

        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss
    with torch.no_grad():
        train_loss = running_loss / len(train_dataloader)
    return train_loss


def valid(net, criterion):
    running_loss = 0
    correct_total = 0
    with torch.no_grad():
        for images, labels in test_dataloader:
            output = net(images)

            loss = criterion(output, labels)
            running_loss += loss

            pred = output.max(dim = 1, keepdim=True)[1]
            correct_total += pred.eq(labels.view_as(pred)).sum()
            
        precison = correct_total / len(test_dataloader.dataset)
        valid_loss = running_loss / len(test_dataloader)
        return valid_loss, precison

In [12]:
for epoch in (pbar := tqdm(range(EPOCHS))):
    train_loss = train(net, optimizer, CELoss)
    valid_loss, prec = valid(net, CELoss)

    print(f"[{epoch}] train/valid loss: {train_loss:.4f}/{valid_loss:.4f} prec: {prec:.4f}")
    pbar.set_description(f"train/valid loss: {train_loss:.4f}/{valid_loss:.4f} prec: {prec:.4f}")

  0%|          | 0/10 [00:00<?, ?it/s]

[0] train/valid loss: 2.2967/2.2838 prec: 0.1135
[1] train/valid loss: 2.1400/1.9457 prec: 0.5384
[2] train/valid loss: 1.8487/1.7904 prec: 0.6173
[3] train/valid loss: 1.7696/1.7531 prec: 0.6235
[4] train/valid loss: 1.7417/1.7294 prec: 0.6248
[5] train/valid loss: 1.7213/1.7103 prec: 0.5927
[6] train/valid loss: 1.7033/1.6921 prec: 0.5683
[7] train/valid loss: 1.6877/1.6777 prec: 0.5480
[8] train/valid loss: 1.6762/1.6676 prec: 0.5570
[9] train/valid loss: 1.6668/1.6586 prec: 0.5736
