Постройте модель на основе полносвязных слоёв для классификации Fashion MNIST из библиотеки torchvision (datasets). Получите качество на тестовой выборке не ниже 88%.  

Инструкция по выполнению задания. 

1. Скачайте тренировочную и тестовою часть датасета Fashion MNIST
2. Постройте модель, выбрав стартовую архитектуру
3. Обучите модель и сверьте качество на тестовой части с заданным порогом
4. Изменяйте архитектуру модели пока качество на тестовой части не будет выше порога. Вариации архитектуры можно реализовать через изменение количества слоёв, количества нейронов в слоях и использование регуляризации. Можно использовать различные оптимизаторы.

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import numpy as np
import pandas as pd
import torchvision as tv
import torchvision.transforms as transforms
import time

In [2]:
BATCH_SIZE=128
train_dataset = tv.datasets.FashionMNIST('.', train=True, transform=tv.transforms.ToTensor(), download=True)
test_dataset = tv.datasets.FashionMNIST('.', train=False, transform=tv.transforms.ToTensor(), download=True)
train = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE)
test = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [3]:
train_dataset[0][0].shape

torch.Size([1, 28, 28])

In [4]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 10),
    torch.nn.ReLU()
)

In [5]:
loss = torch.nn.CrossEntropyLoss()
trainer = torch.optim.SGD(model.parameters(), lr=.01)
num_epochs = 10

In [6]:
def train_model():
    for ep in range(num_epochs):
        train_iters, train_passed  = 0, 0
        train_loss, train_acc = 0., 0.
        start=time.time()
        
        model.train()
        for X, y in train:
            trainer.zero_grad()
            y_pred = model(X)
            l = loss(y_pred, y)
            l.backward()
            trainer.step()
            train_loss += l.item()
            train_acc += (y_pred.argmax(dim=1) == y).sum().item()
            train_iters += 1
            train_passed += len(X)
        
        test_iters, test_passed  = 0, 0
        test_loss, test_acc = 0., 0.
        model.eval()
        for X, y in test:
            y_pred = model(X)
            l = loss(y_pred, y)
            test_loss += l.item()
            test_acc += (y_pred.argmax(dim=1) == y).sum().item()
            test_iters += 1
            test_passed += len(X)
            
        print("ep: {}, taked: {:.3f}, train_loss: {}, train_acc: {}, test_loss: {}, test_acc: {}".format(
            ep, time.time() - start, train_loss / train_iters, train_acc / train_passed,
            test_loss / test_iters, test_acc / test_passed)
        )

In [7]:
train_model()

ep: 0, taked: 5.177, train_loss: 1.7016190524294432, train_acc: 0.4368666666666667, test_loss: 1.5142080089713954, test_acc: 0.4744
ep: 1, taked: 5.002, train_loss: 1.4346487191694377, train_acc: 0.5136166666666667, test_loss: 1.3958521628681617, test_acc: 0.5188
ep: 2, taked: 5.010, train_loss: 1.3500586277894628, train_acc: 0.5405833333333333, test_loss: 1.3369727270512641, test_acc: 0.5362
ep: 3, taked: 5.349, train_loss: 1.23805799857894, train_acc: 0.5934, test_loss: 1.1224244282215456, test_acc: 0.6341
ep: 4, taked: 5.148, train_loss: 1.0768371362930167, train_acc: 0.65215, test_loss: 1.0716954431956327, test_acc: 0.6475
ep: 5, taked: 5.173, train_loss: 1.040840032639534, train_acc: 0.6609333333333334, test_loss: 1.0457736389546455, test_acc: 0.6559
ep: 6, taked: 5.248, train_loss: 1.0181960528339151, train_acc: 0.66735, test_loss: 1.0272730694541448, test_acc: 0.6617
ep: 7, taked: 4.979, train_loss: 1.0012118233038163, train_acc: 0.6730833333333334, test_loss: 1.0129240804080721

In [8]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 256),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 10)
)

In [9]:
trainer = torch.optim.SGD(model.parameters(), lr=.01)
num_epochs = 10

In [10]:
train_model()

ep: 0, taked: 5.613, train_loss: 1.3996451202231939, train_acc: 0.6249833333333333, test_loss: 0.9405898580068275, test_acc: 0.6802
ep: 1, taked: 5.561, train_loss: 0.8142721149712991, train_acc: 0.7224666666666667, test_loss: 0.74983533575565, test_acc: 0.7361
ep: 2, taked: 5.347, train_loss: 0.6907286261444661, train_acc: 0.7660166666666667, test_loss: 0.6679099183293837, test_acc: 0.7691
ep: 3, taked: 5.338, train_loss: 0.6255021498782802, train_acc: 0.7905, test_loss: 0.6180896321429482, test_acc: 0.7857
ep: 4, taked: 5.668, train_loss: 0.5831251960319243, train_acc: 0.8054, test_loss: 0.5846508767785905, test_acc: 0.7983
ep: 5, taked: 5.447, train_loss: 0.55334476557876, train_acc: 0.8143333333333334, test_loss: 0.5607467033440554, test_acc: 0.8061
ep: 6, taked: 5.536, train_loss: 0.5312293526460367, train_acc: 0.8212833333333334, test_loss: 0.542773163394083, test_acc: 0.8124
ep: 7, taked: 5.576, train_loss: 0.5140624846349647, train_acc: 0.8265166666666667, test_loss: 0.52867325

In [11]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 10)
)

In [12]:
trainer = torch.optim.SGD(model.parameters(), lr=.01)
num_epochs = 10

In [13]:
train_model()

ep: 0, taked: 6.194, train_loss: 2.1650124234177155, train_acc: 0.4069333333333333, test_loss: 1.7764837636223323, test_acc: 0.4418
ep: 1, taked: 6.249, train_loss: 1.2697238545936307, train_acc: 0.5462666666666667, test_loss: 1.007527318181871, test_acc: 0.6231
ep: 2, taked: 6.371, train_loss: 0.8840221680049449, train_acc: 0.6674166666666667, test_loss: 0.8133070333094536, test_acc: 0.6939
ep: 3, taked: 6.064, train_loss: 0.7509583983339989, train_acc: 0.72235, test_loss: 0.7229021002974692, test_acc: 0.7382
ep: 4, taked: 6.341, train_loss: 0.6756322275219696, train_acc: 0.757, test_loss: 0.6611461077309861, test_acc: 0.7616
ep: 5, taked: 6.233, train_loss: 0.6231480816534078, train_acc: 0.7764666666666666, test_loss: 0.6167278293567368, test_acc: 0.7794
ep: 6, taked: 6.361, train_loss: 0.584306109752228, train_acc: 0.7933, test_loss: 0.5836895892891703, test_acc: 0.7927
ep: 7, taked: 6.446, train_loss: 0.5540322684911269, train_acc: 0.8049666666666667, test_loss: 0.5581762488884262,

In [14]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(256),
    torch.nn.Linear(256, 10)
)

In [15]:
trainer = torch.optim.SGD(model.parameters(), lr=.01)
num_epochs = 10

In [16]:
train_model()

ep: 0, taked: 5.646, train_loss: 0.5870682567294473, train_acc: 0.8095333333333333, test_loss: 0.46556350588798523, test_acc: 0.8389
ep: 1, taked: 5.697, train_loss: 0.4126950154807776, train_acc: 0.8583, test_loss: 0.41938952724390394, test_acc: 0.8535
ep: 2, taked: 5.588, train_loss: 0.3719282929958311, train_acc: 0.8700333333333333, test_loss: 0.4018903110600725, test_acc: 0.8573
ep: 3, taked: 5.428, train_loss: 0.344911246125632, train_acc: 0.8791666666666667, test_loss: 0.38484380686584907, test_acc: 0.8624
ep: 4, taked: 5.589, train_loss: 0.3243396333349285, train_acc: 0.8854666666666666, test_loss: 0.377733513052705, test_acc: 0.8645
ep: 5, taked: 5.740, train_loss: 0.30730250363410916, train_acc: 0.8911666666666667, test_loss: 0.36695722594291347, test_acc: 0.8662
ep: 6, taked: 5.554, train_loss: 0.29501085278830297, train_acc: 0.89495, test_loss: 0.3585036285693132, test_acc: 0.8727
ep: 7, taked: 5.727, train_loss: 0.28184882734121797, train_acc: 0.9003666666666666, test_loss:

In [17]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 2560),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.Linear(2560, 640),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.Linear(640, 10)
)

In [18]:
trainer = torch.optim.SGD(model.parameters(), lr=.01)
num_epochs = 10

In [19]:
train_model()

ep: 0, taked: 12.855, train_loss: 1.5964324412061208, train_acc: 0.49953333333333333, test_loss: 0.9734175537205949, test_acc: 0.6635
ep: 1, taked: 12.855, train_loss: 0.8851051049700169, train_acc: 0.68105, test_loss: 0.7460513771334781, test_acc: 0.7286
ep: 2, taked: 12.700, train_loss: 0.7430795573476535, train_acc: 0.736, test_loss: 0.6608713348455066, test_acc: 0.7636
ep: 3, taked: 13.015, train_loss: 0.6691093637999187, train_acc: 0.7662, test_loss: 0.607084937865221, test_acc: 0.786
ep: 4, taked: 12.347, train_loss: 0.620769393151757, train_acc: 0.7845166666666666, test_loss: 0.5708023422881018, test_acc: 0.7977
ep: 5, taked: 12.659, train_loss: 0.5838083424039487, train_acc: 0.7995666666666666, test_loss: 0.5440943203394926, test_acc: 0.8073
ep: 6, taked: 12.530, train_loss: 0.5597801422005269, train_acc: 0.8065833333333333, test_loss: 0.525283216298381, test_acc: 0.8156
ep: 7, taked: 13.136, train_loss: 0.537651578500581, train_acc: 0.8140666666666667, test_loss: 0.50908905380

In [20]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(256),
    torch.nn.Linear(256, 10)
)

In [21]:
trainer = torch.optim.Adam(model.parameters(), lr=.01)
num_epochs = 10

In [22]:
train_model()

ep: 0, taked: 5.741, train_loss: 0.4844061788211245, train_acc: 0.8251666666666667, test_loss: 0.4869471407389339, test_acc: 0.8249
ep: 1, taked: 5.934, train_loss: 0.3936743134501646, train_acc: 0.85815, test_loss: 0.4679542327228981, test_acc: 0.8409
ep: 2, taked: 5.882, train_loss: 0.36199607701698094, train_acc: 0.8688666666666667, test_loss: 0.44849730462213105, test_acc: 0.8468
ep: 3, taked: 6.049, train_loss: 0.3376496312206488, train_acc: 0.8768166666666667, test_loss: 0.40291006957428366, test_acc: 0.857
ep: 4, taked: 6.114, train_loss: 0.32021517565509655, train_acc: 0.8821666666666667, test_loss: 0.38691831502733354, test_acc: 0.8644
ep: 5, taked: 6.191, train_loss: 0.30893919386589197, train_acc: 0.8860333333333333, test_loss: 0.4111422500278376, test_acc: 0.8624
ep: 6, taked: 6.208, train_loss: 0.2975260585482949, train_acc: 0.88975, test_loss: 0.41211007120488563, test_acc: 0.862
ep: 7, taked: 5.998, train_loss: 0.2886652720889557, train_acc: 0.89225, test_loss: 0.4175740

In [23]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(512),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(256),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(128),
    torch.nn.Linear(128, 10)
)

In [24]:
trainer = torch.optim.Adam(model.parameters(), lr=.01)
num_epochs = 10

In [25]:
train_model()

ep: 0, taked: 7.264, train_loss: 0.48737359796759927, train_acc: 0.8198166666666666, test_loss: 0.4912898342443418, test_acc: 0.8241
ep: 1, taked: 7.361, train_loss: 0.38654602425439016, train_acc: 0.8560666666666666, test_loss: 0.5136137897077995, test_acc: 0.8145
ep: 2, taked: 7.489, train_loss: 0.34979667551100635, train_acc: 0.86965, test_loss: 0.42619315846056877, test_acc: 0.8482
ep: 3, taked: 7.813, train_loss: 0.3242322854332324, train_acc: 0.8788333333333334, test_loss: 0.40798549523836447, test_acc: 0.8518
ep: 4, taked: 7.702, train_loss: 0.3034930884647471, train_acc: 0.88695, test_loss: 0.3932411631074133, test_acc: 0.8598
ep: 5, taked: 7.803, train_loss: 0.29023533449498323, train_acc: 0.8914833333333333, test_loss: 0.40640042269531684, test_acc: 0.8596
ep: 6, taked: 7.595, train_loss: 0.2764786295989937, train_acc: 0.8977666666666667, test_loss: 0.4101532327977917, test_acc: 0.8659
ep: 7, taked: 7.754, train_loss: 0.265218635294229, train_acc: 0.9009666666666667, test_los

In [26]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(512),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(256),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(128),
    torch.nn.Linear(128, 10)
)

In [27]:
trainer = torch.optim.RMSprop(model.parameters(), lr=.01)
num_epochs = 10

In [28]:
train_model()

ep: 0, taked: 6.726, train_loss: 0.5404372358881334, train_acc: 0.8049833333333334, test_loss: 0.4963313554283939, test_acc: 0.8251
ep: 1, taked: 7.004, train_loss: 0.38995990849761314, train_acc: 0.8552333333333333, test_loss: 0.4490664755053158, test_acc: 0.842
ep: 2, taked: 7.002, train_loss: 0.34771604286328056, train_acc: 0.8710833333333333, test_loss: 0.414754246326187, test_acc: 0.8483
ep: 3, taked: 7.105, train_loss: 0.31816135137192986, train_acc: 0.8821666666666667, test_loss: 0.4377459763914724, test_acc: 0.8482
ep: 4, taked: 6.696, train_loss: 0.2960212174127859, train_acc: 0.8893666666666666, test_loss: 0.43038181390000296, test_acc: 0.8531
ep: 5, taked: 6.850, train_loss: 0.27747694266312667, train_acc: 0.8954, test_loss: 0.4264337543068053, test_acc: 0.8647
ep: 6, taked: 6.836, train_loss: 0.26407847987182104, train_acc: 0.9004666666666666, test_loss: 0.4121619495032709, test_acc: 0.8598
ep: 7, taked: 6.949, train_loss: 0.2475953085431412, train_acc: 0.9049666666666667, 

In [29]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.BatchNorm1d(512),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.BatchNorm1d(256),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.BatchNorm1d(128),
    torch.nn.Linear(128, 10)
)

In [30]:
trainer = torch.optim.Adam(model.parameters(), lr=.01)
num_epochs = 10

In [31]:
train_model()

ep: 0, taked: 7.394, train_loss: 0.6928026308891362, train_acc: 0.7461166666666667, test_loss: 0.5161974499874478, test_acc: 0.821
ep: 1, taked: 7.940, train_loss: 0.568504066673169, train_acc: 0.7986666666666666, test_loss: 0.49533548045761977, test_acc: 0.8336
ep: 2, taked: 7.875, train_loss: 0.5382526452734526, train_acc: 0.8096833333333333, test_loss: 0.45311844801600976, test_acc: 0.8385
ep: 3, taked: 7.931, train_loss: 0.5137144877458177, train_acc: 0.8188833333333333, test_loss: 0.4414772474313084, test_acc: 0.8412
ep: 4, taked: 8.089, train_loss: 0.5082014259626108, train_acc: 0.8209333333333333, test_loss: 0.43102669187738923, test_acc: 0.8492
ep: 5, taked: 8.218, train_loss: 0.4994320380789385, train_acc: 0.8226666666666667, test_loss: 0.43365675324126135, test_acc: 0.8497
ep: 6, taked: 7.808, train_loss: 0.48215906471331743, train_acc: 0.8291166666666666, test_loss: 0.4439719441948058, test_acc: 0.8504
ep: 7, taked: 8.054, train_loss: 0.4712269718586001, train_acc: 0.8331, t

In [32]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 2560),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.BatchNorm1d(2560),
    torch.nn.Linear(2560, 128),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.BatchNorm1d(128),
    torch.nn.Linear(128, 10)
)

In [33]:
trainer = torch.optim.Adam(model.parameters(), lr=.01)
num_epochs = 10

In [34]:
train_model()

ep: 0, taked: 13.644, train_loss: 0.6012180606439423, train_acc: 0.78205, test_loss: 0.8842642228060131, test_acc: 0.8144
ep: 1, taked: 15.209, train_loss: 0.5055163416272795, train_acc: 0.8177833333333333, test_loss: 2.3824616957314406, test_acc: 0.7329
ep: 2, taked: 16.112, train_loss: 0.477633545457173, train_acc: 0.8262, test_loss: 0.46754017359093775, test_acc: 0.8457
ep: 3, taked: 16.837, train_loss: 0.4620061135495395, train_acc: 0.8344, test_loss: 0.526482364233536, test_acc: 0.8469
ep: 4, taked: 16.805, train_loss: 0.4508689435115501, train_acc: 0.8354666666666667, test_loss: 0.5644418321832826, test_acc: 0.8502
ep: 5, taked: 16.825, train_loss: 0.4400277541898715, train_acc: 0.8412333333333334, test_loss: 0.5058349902871289, test_acc: 0.8527
ep: 6, taked: 17.281, train_loss: 0.4302519623086905, train_acc: 0.8453166666666667, test_loss: 0.5703509449958801, test_acc: 0.857
ep: 7, taked: 17.175, train_loss: 0.42117647200759284, train_acc: 0.8483666666666667, test_loss: 0.4882386

In [35]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 1250),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.BatchNorm1d(1250),
    torch.nn.Linear(1250, 512),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.BatchNorm1d(512),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.BatchNorm1d(256),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.BatchNorm1d(128),
    torch.nn.Linear(128, 10)
)

In [36]:
trainer = torch.optim.Adam(model.parameters(), lr=.01)
num_epochs = 10

In [37]:
train_model()

ep: 0, taked: 11.475, train_loss: 0.7020501094713394, train_acc: 0.7458166666666667, test_loss: 0.8884765407707118, test_acc: 0.8147
ep: 1, taked: 11.442, train_loss: 0.577420278843532, train_acc: 0.7984333333333333, test_loss: 1.6608804924578606, test_acc: 0.8254
ep: 2, taked: 12.644, train_loss: 0.5408304620907505, train_acc: 0.8115166666666667, test_loss: 0.49685370922088623, test_acc: 0.8374
ep: 3, taked: 12.601, train_loss: 0.5132737905104786, train_acc: 0.8195833333333333, test_loss: 0.5096864462653293, test_acc: 0.842
ep: 4, taked: 13.077, train_loss: 0.4959651723599383, train_acc: 0.8274333333333334, test_loss: 0.523750367987005, test_acc: 0.8484
ep: 5, taked: 13.046, train_loss: 0.48549903347802315, train_acc: 0.8303166666666667, test_loss: 0.6328998308015775, test_acc: 0.8556
ep: 6, taked: 12.909, train_loss: 0.48454682487668765, train_acc: 0.8300333333333333, test_loss: 10.271532132278514, test_acc: 0.8515
ep: 7, taked: 13.056, train_loss: 0.47304833945689173, train_acc: 0.8

In [38]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 1250),
    torch.nn.ReLU(),
    torch.nn.Linear(1250, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.BatchNorm1d(128),
    torch.nn.Linear(128, 10)
)

In [39]:
trainer = torch.optim.Adam(model.parameters(), lr=.01)
num_epochs = 10

In [40]:
train_model()

ep: 0, taked: 10.865, train_loss: 1.0095461064310216, train_acc: 0.6116166666666667, test_loss: 0.5632213844528681, test_acc: 0.7937
ep: 1, taked: 12.388, train_loss: 0.5770721687182688, train_acc: 0.7987833333333333, test_loss: 0.7386319569394558, test_acc: 0.8383
ep: 2, taked: 13.623, train_loss: 0.47884672282855395, train_acc: 0.8346833333333333, test_loss: 0.47944573661949064, test_acc: 0.838
ep: 3, taked: 13.291, train_loss: 0.4282309130819113, train_acc: 0.85155, test_loss: 0.7161148758251455, test_acc: 0.8519
ep: 4, taked: 13.934, train_loss: 0.39870187023809467, train_acc: 0.86045, test_loss: 1.9825844147914573, test_acc: 0.8652
ep: 5, taked: 13.282, train_loss: 0.3740018313246241, train_acc: 0.86945, test_loss: 0.4213680624961853, test_acc: 0.866
ep: 6, taked: 13.728, train_loss: 0.35064920347763784, train_acc: 0.8779833333333333, test_loss: 0.3878108938283558, test_acc: 0.8677
ep: 7, taked: 13.461, train_loss: 0.3341475796661397, train_acc: 0.8809333333333333, test_loss: 0.39

In [41]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 128),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.BatchNorm1d(128),
    torch.nn.Linear(128, 10)
)

In [42]:
trainer = torch.optim.Adam(model.parameters(), lr=.01)
num_epochs = 10

In [43]:
train_model()

ep: 0, taked: 6.750, train_loss: 0.6978995223035181, train_acc: 0.7500333333333333, test_loss: 0.4764529770310921, test_acc: 0.8312
ep: 1, taked: 6.798, train_loss: 0.5183545487013452, train_acc: 0.81685, test_loss: 0.46663782660719716, test_acc: 0.8512
ep: 2, taked: 7.171, train_loss: 0.47001587562977887, train_acc: 0.8318333333333333, test_loss: 0.4016511866563483, test_acc: 0.8583
ep: 3, taked: 7.226, train_loss: 0.4202747120023536, train_acc: 0.8500333333333333, test_loss: 0.4598609258102465, test_acc: 0.86
ep: 4, taked: 7.130, train_loss: 0.3822564820109654, train_acc: 0.8638, test_loss: 0.36590074908129777, test_acc: 0.8673
ep: 5, taked: 7.267, train_loss: 0.3872493338038418, train_acc: 0.8588333333333333, test_loss: 0.3717942020938366, test_acc: 0.8648
ep: 6, taked: 7.362, train_loss: 0.35073992857800873, train_acc: 0.8737, test_loss: 0.36447673262674596, test_acc: 0.8742
ep: 7, taked: 7.365, train_loss: 0.33578062616685816, train_acc: 0.87895, test_loss: 0.3655428746833077, tes

In [44]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(128),
    torch.nn.Linear(128, 10)
)

In [45]:
trainer = torch.optim.NAdam(model.parameters(), lr=.01)
num_epochs = 10

In [46]:
train_model()

ep: 0, taked: 7.188, train_loss: 0.5170742955797517, train_acc: 0.8109833333333333, test_loss: 0.47171768112273155, test_acc: 0.8306
ep: 1, taked: 7.869, train_loss: 0.3683315847219943, train_acc: 0.8642166666666666, test_loss: 0.3965271954672246, test_acc: 0.8564
ep: 2, taked: 7.597, train_loss: 0.3311366995514583, train_acc: 0.8771833333333333, test_loss: 0.3857541956667659, test_acc: 0.8628
ep: 3, taked: 8.018, train_loss: 0.3066761795836471, train_acc: 0.8862833333333333, test_loss: 0.3751318040716497, test_acc: 0.8641
ep: 4, taked: 8.080, train_loss: 0.2890904584228357, train_acc: 0.8923, test_loss: 0.3834522897495499, test_acc: 0.8631
ep: 5, taked: 7.914, train_loss: 0.2732854774321066, train_acc: 0.8979, test_loss: 0.3886354031819331, test_acc: 0.8591
ep: 6, taked: 7.866, train_loss: 0.2599241914175975, train_acc: 0.9018333333333334, test_loss: 0.39048995492579064, test_acc: 0.8615
ep: 7, taked: 7.910, train_loss: 0.24877065931683157, train_acc: 0.9061333333333333, test_loss: 0.

In [64]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(256),
    torch.nn.Linear(256, 10)
)

In [73]:
trainer = torch.optim.ASGD(model.parameters(), lr=.05)
num_epochs = 10

In [74]:
train_model()

ep: 0, taked: 5.785, train_loss: 0.23209441114844545, train_acc: 0.91315, test_loss: 0.3529046943670587, test_acc: 0.8813
ep: 1, taked: 5.810, train_loss: 0.2301828177657717, train_acc: 0.9142166666666667, test_loss: 0.35201920889600924, test_acc: 0.8818
ep: 2, taked: 5.836, train_loss: 0.228756201260888, train_acc: 0.9145, test_loss: 0.351418352768391, test_acc: 0.8821
ep: 3, taked: 5.968, train_loss: 0.22764737225735365, train_acc: 0.9150333333333334, test_loss: 0.3509915761555297, test_acc: 0.8815
ep: 4, taked: 5.746, train_loss: 0.22675917253120623, train_acc: 0.9156, test_loss: 0.3506897346505636, test_acc: 0.8822
ep: 5, taked: 5.660, train_loss: 0.22601728767220144, train_acc: 0.91605, test_loss: 0.3503744174030763, test_acc: 0.8823
ep: 6, taked: 5.802, train_loss: 0.2253774476330926, train_acc: 0.9165166666666666, test_loss: 0.35016559705704076, test_acc: 0.8826
ep: 7, taked: 5.763, train_loss: 0.2248194138410249, train_acc: 0.9165833333333333, test_loss: 0.3499790814858449, tes

In [None]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(256),
    torch.nn.Linear(256, 10)
)