In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [44]:
import torchvision.datasets as dsets
import torchvision.transforms as transforms

mnist_train = dsets.MNIST(root="MNIST_data/", train=True, transform=transforms.ToTensor(), download=True)
mnist_test = dsets.MNIST(root="MNIST_data/", train=False, transform=transforms.ToTensor(), download=True)

data_loader = torch.utils.data.DataLoader(dataset=mnist_train, batch_size = 100, shuffle = True, drop_last=True)

learning_rate = 0.001
training_epochs=15


In [45]:

cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")
print(device)

cuda


In [46]:
linear1 = nn.Linear(784,256,bias=True).to(device)
linear2 = nn.Linear(256,256,bias=True).to(device)
linear3 = nn.Linear(256,10,bias=True).to(device)
relu = nn.ReLU()

nn.init.normal_(linear1.weight)
nn.init.normal_(linear2.weight)
nn.init.normal_(linear3.weight)

model = nn.Sequential(linear1, relu, linear2, relu, linear3)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [47]:
total_batch = len(data_loader)
total_batch

600

In [48]:
len(mnist_train)

60000

In [49]:
for epoch in range(training_epochs):
  avg_cost=0
  for X,y in data_loader:
    X=X.view(-1,28*28).to(device)
    hypothesis = model(X)
    y = y.to(device)
    cost=criterion(hypothesis, y)
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()
    avg_cost += cost/total_batch
  print(f'epoch: {epoch+1:04d}, cost: {avg_cost:.9f}')

epoch: 0001, cost: 134.919097900
epoch: 0002, cost: 34.796195984
epoch: 0003, cost: 21.408962250
epoch: 0004, cost: 14.904853821
epoch: 0005, cost: 10.834650993
epoch: 0006, cost: 8.049036980
epoch: 0007, cost: 6.074109077
epoch: 0008, cost: 4.601957798
epoch: 0009, cost: 3.459908962
epoch: 0010, cost: 2.565858841
epoch: 0011, cost: 2.038970232
epoch: 0012, cost: 1.483794093
epoch: 0013, cost: 1.131309986
epoch: 0014, cost: 0.928988159
epoch: 0015, cost: 0.790462732


# weight 초기화

In [54]:
linear1 = nn.Linear(784,256,bias=True).to(device)
linear2 = nn.Linear(256,256,bias=True).to(device)
linear3 = nn.Linear(256,10,bias=True).to(device)
relu = nn.ReLU()

nn.init.xavier_uniform_(linear1.weight)
nn.init.xavier_uniform_(linear2.weight)
nn.init.xavier_uniform_(linear3.weight)

model = nn.Sequential(linear1, relu, linear2, relu, linear3).to(device)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [55]:
linear1.weight

Parameter containing:
tensor([[ 0.0403,  0.0759, -0.0277,  ..., -0.0018, -0.0389,  0.0754],
        [-0.0254,  0.0086, -0.0216,  ...,  0.0518,  0.0256,  0.0715],
        [-0.0376, -0.0484,  0.0104,  ...,  0.0452, -0.0006,  0.0437],
        ...,
        [-0.0268,  0.0552,  0.0349,  ...,  0.0130, -0.0700, -0.0150],
        [ 0.0092, -0.0497,  0.0274,  ...,  0.0707, -0.0240,  0.0130],
        [ 0.0583,  0.0641, -0.0295,  ...,  0.0113,  0.0480,  0.0682]],
       device='cuda:0', requires_grad=True)

In [56]:
for epoch in range(training_epochs):
  avg_cost=0
  for X,y in data_loader:
    X=X.view(-1,28*28).to(device)
    hypothesis = model(X)
    y = y.to(device)
    cost=criterion(hypothesis, y)
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()
    avg_cost += cost/total_batch
  print(f'epoch: {epoch+1:04d}, cost: {avg_cost:.9f}')

epoch: 0001, cost: 0.242756695
epoch: 0002, cost: 0.092285402
epoch: 0003, cost: 0.062039033
epoch: 0004, cost: 0.042520743
epoch: 0005, cost: 0.031513788
epoch: 0006, cost: 0.028479766
epoch: 0007, cost: 0.021315072
epoch: 0008, cost: 0.015401734
epoch: 0009, cost: 0.015323811
epoch: 0010, cost: 0.013908255
epoch: 0011, cost: 0.012902365
epoch: 0012, cost: 0.013531565
epoch: 0013, cost: 0.009962415
epoch: 0014, cost: 0.013476742
epoch: 0015, cost: 0.008597630


In [58]:
import random
with torch.no_grad():
    X_test = mnist_test.test_data.view(-1, 28 * 28).float().to(device)
    Y_test = mnist_test.test_labels.to(device)

    prediction = model(X_test)
    correct_prediction = torch.argmax(prediction, 1) == Y_test
    accuracy = correct_prediction.float().mean()
    print('Accuracy:', accuracy.item())

    # Get one and predict
    r = random.randint(0, len(mnist_test) - 1)
    X_single_data = mnist_test.test_data[r:r + 1].view(-1, 28 * 28).float().to(device)
    Y_single_data = mnist_test.test_labels[r:r + 1].to(device)

    print('Label: ', Y_single_data.item())
    single_prediction = model(X_single_data)
    print('Prediction: ', torch.argmax(single_prediction, 1).item())

Accuracy: 0.9788999557495117
Label:  8
Prediction:  8




# solution for overfitting : Dropout

In [59]:
drop_prob = 0.3

In [61]:
linear1 = nn.Linear(784,256,bias=True).to(device)
linear2 = nn.Linear(256,256,bias=True).to(device)
linear3 = nn.Linear(256,10,bias=True).to(device)
relu = nn.ReLU()
dropout = nn.Dropout(p=drop_prob)

nn.init.xavier_uniform_(linear1.weight)
nn.init.xavier_uniform_(linear2.weight)
nn.init.xavier_uniform_(linear3.weight)

model = nn.Sequential(linear1, relu,dropout, linear2, relu,dropout, linear3).to(device)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [63]:
model.train()
for epoch in range(training_epochs):
  avg_cost=0
  for X,y in data_loader:
    X=X.view(-1,28*28).to(device)
    hypothesis = model(X)
    y = y.to(device)
    cost=criterion(hypothesis, y)
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()
    avg_cost += cost/total_batch
  print(f'epoch: {epoch+1:04d}, cost: {avg_cost:.9f}')

epoch: 0001, cost: 0.061630543
epoch: 0002, cost: 0.057797752
epoch: 0003, cost: 0.051700540
epoch: 0004, cost: 0.048283633
epoch: 0005, cost: 0.045227014
epoch: 0006, cost: 0.044656452
epoch: 0007, cost: 0.038138166
epoch: 0008, cost: 0.037211888
epoch: 0009, cost: 0.037101697
epoch: 0010, cost: 0.033996876
epoch: 0011, cost: 0.031392924
epoch: 0012, cost: 0.033477139
epoch: 0013, cost: 0.029849797
epoch: 0014, cost: 0.031477891
epoch: 0015, cost: 0.028862039


In [64]:
with torch.no_grad():
    model.eval()    # set the model to evaluation mode (dropout=False)

    # Test the model using test sets
    X_test = mnist_test.test_data.view(-1, 28 * 28).float().to(device)
    Y_test = mnist_test.test_labels.to(device)

    prediction = model(X_test)
    correct_prediction = torch.argmax(prediction, 1) == Y_test
    accuracy = correct_prediction.float().mean()
    print('Accuracy:', accuracy.item())

    # Get one and predict
    r = random.randint(0, len(mnist_test) - 1)
    X_single_data = mnist_test.test_data[r:r + 1].view(-1, 28 * 28).float().to(device)
    Y_single_data = mnist_test.test_labels[r:r + 1].to(device)

    print('Label: ', Y_single_data.item())
    single_prediction = model(X_single_data)
    print('Prediction: ', torch.argmax(single_prediction, 1).item())

Accuracy: 0.9817000031471252
Label:  3
Prediction:  3




In [70]:
linear1 = nn.Linear(784,32,bias=True).to(device)
linear2 = nn.Linear(32,32,bias=True).to(device)
linear3 = nn.Linear(32,10,bias=True).to(device)
relu = nn.ReLU()
# dropout = nn.Dropout(p=drop_prob)
bn1 = nn.BatchNorm1d(32)
bn2 = nn.BatchNorm1d(32)


nn.init.xavier_uniform_(linear1.weight)
nn.init.xavier_uniform_(linear2.weight)
nn.init.xavier_uniform_(linear3.weight)

model = nn.Sequential(linear1, bn1, relu,linear2, relu, bn2, linear3).to(device)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [71]:
model.train()
for epoch in range(training_epochs):
  avg_cost=0
  for X,y in data_loader:
    X=X.view(-1,28*28).to(device)
    hypothesis = model(X)
    y = y.to(device)
    cost=criterion(hypothesis, y)
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()
    avg_cost += cost/total_batch
  print(f'epoch: {epoch+1:04d}, cost: {avg_cost:.9f}')

epoch: 0001, cost: 0.437240452
epoch: 0002, cost: 0.178472072
epoch: 0003, cost: 0.136187404
epoch: 0004, cost: 0.114239112
epoch: 0005, cost: 0.098816007
epoch: 0006, cost: 0.090406254
epoch: 0007, cost: 0.083359987
epoch: 0008, cost: 0.077698804
epoch: 0009, cost: 0.071442246
epoch: 0010, cost: 0.066118881
epoch: 0011, cost: 0.063378260
epoch: 0012, cost: 0.059494715
epoch: 0013, cost: 0.056903966
epoch: 0014, cost: 0.052922558
epoch: 0015, cost: 0.051940672


In [89]:
with torch.no_grad():
    model.eval()    # set the model to evaluation mode (dropout=False)

    # Test the model using test sets
    X_test = mnist_test.test_data.view(-1, 28 * 28).float().to(device)
    Y_test = mnist_test.test_labels.to(device)

    prediction = model(X_test)
    correct_prediction = torch.argmax(prediction, 1) == Y_test
    accuracy = correct_prediction.float().mean()
    print('Accuracy:', accuracy.item())

    # Get one and predict
    r = random.randint(0, len(mnist_test) - 1)
    X_single_data = mnist_test.test_data[r:r + 1].view(-1, 28 * 28).float().to(device)
    Y_single_data = mnist_test.test_labels[r:r + 1].to(device)

    print('Label: ', Y_single_data.item())
    single_prediction = model(X_single_data)
    print('Prediction: ', torch.argmax(single_prediction, 1).item())

Accuracy: 0.6811000108718872
Label:  8
Prediction:  7




In [74]:
correct_prediction.shape

torch.Size([10000])

In [84]:
Y_test.shape

torch.Size([10000])

In [86]:
prediction

tensor([[  243.8239,   650.8428,  -477.4907,  ...,  2249.4023, -1879.7969,
          -748.8295],
        [ -736.4235,  2310.7439,  2804.8145,  ...,   466.2023, -2228.4526,
         -2099.3091],
        [ -755.6474,  3778.6433,   240.2356,  ...,   841.0907, -1854.3988,
          -858.4570],
        ...,
        [ -218.4949,  1747.1770, -1235.7329,  ...,  1259.5645, -2456.7734,
         -1082.5540],
        [ -443.5300,   797.8032, -2248.0110,  ...,    39.5957, -1346.3348,
         -1176.6896],
        [  143.4415,    82.2598,    59.0561,  ...,  -108.8685, -2741.8887,
         -1341.3666]], device='cuda:0')

In [83]:
prediction.shape

torch.Size([10000, 10])

In [88]:
 torch.argmax(prediction, 1,keepdim=True)

tensor([[7],
        [2],
        [1],
        ...,
        [4],
        [5],
        [6]], device='cuda:0')