# **Multi-layer Perceptron**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchsummary import summary

## **MLP Basic**

**Model**

In [None]:
# Build the model in this cell so that printing it works on a fresh kernel
# (Restart & Run All); `model` was previously only defined in a later cell.
model = nn.Sequential(
    nn.Linear(1, 1),
    nn.Linear(1, 1),
    nn.Sigmoid()
)
print(model)

Sequential(
  (0): Linear(in_features=1, out_features=1, bias=True)
  (1): Linear(in_features=1, out_features=1, bias=True)
  (2): Sigmoid()
)


In [None]:
# Two stacked 1 -> 1 linear layers followed by a sigmoid
# (weight + bias per linear layer = 4 trainable parameters in total).
model = nn.Sequential(
    nn.Linear(1, 1),
    nn.Linear(1, 1),
    nn.Sigmoid()
)

In [None]:
summary(model, (1, 1))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 1]               2
            Linear-2                 [-1, 1, 1]               2
           Sigmoid-3                 [-1, 1, 1]               0
Total params: 4
Trainable params: 4
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [None]:
# Show each layer's parameters; the parameter-free Sigmoid prints an empty OrderedDict.
for layer in model.children():
    print(layer.state_dict())

OrderedDict([('weight', tensor([[-0.1094]])), ('bias', tensor([0.2590]))])
OrderedDict([('weight', tensor([[0.6766]])), ('bias', tensor([-0.4089]))])
OrderedDict()


In [None]:
# Same layout with 2 -> 2 linear layers: each holds a 2x2 weight and a
# 2-element bias, i.e. 6 parameters per layer and 12 in total.
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Linear(2, 2),
    nn.Sigmoid()
)

In [None]:
# NOTE(review): the 10,000,000-row input shape looks deliberate, to show how the
# size estimates grow with the input while the parameter count stays at 12 —
# confirm. The report below estimates roughly 534 MB for this shape.
summary(model, (10000000, 2))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1          [-1, 10000000, 2]               6
            Linear-2          [-1, 10000000, 2]               6
           Sigmoid-3          [-1, 10000000, 2]               0
Total params: 12
Trainable params: 12
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 76.29
Forward/backward pass size (MB): 457.76
Params size (MB): 0.00
Estimated Total Size (MB): 534.06
----------------------------------------------------------------


In [None]:
# Show each layer's parameters; the parameter-free Sigmoid prints an empty OrderedDict.
for layer in model.children():
    print(layer.state_dict())

OrderedDict([('weight', tensor([[ 0.5307,  0.3464],
        [ 0.2720, -0.1979]])), ('bias', tensor([0.5808, 0.1628]))])
OrderedDict([('weight', tensor([[0.1242, 0.0053],
        [0.6397, 0.2538]])), ('bias', tensor([-0.3213, -0.4534]))])
OrderedDict()


**Sample**

In [None]:
# A single unbatched sample: two input features and a float target of 0
# (nn.BCELoss, used below, takes float targets).
x = torch.tensor([1.0, 2.0])
y = torch.tensor([0.0])

In [None]:
x, y

(tensor([1., 2.]), tensor([0.]))

### BCELoss

In [None]:
# 2 inputs -> 2 hidden units -> 1 output. The final Sigmoid maps the output
# into (0, 1), which is the probability range nn.BCELoss expects.
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Linear(2, 1),
    nn.Sigmoid()
)

In [None]:
print(model)

Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): Linear(in_features=2, out_features=1, bias=True)
  (2): Sigmoid()
)


In [None]:
summary(model, (1, 2))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 2]               6
            Linear-2                 [-1, 1, 1]               3
           Sigmoid-3                 [-1, 1, 1]               0
Total params: 9
Trainable params: 9
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [None]:
# Show the randomly initialised parameters of each layer (Sigmoid has none).
for layer in model.children():
    print(layer.state_dict())

OrderedDict([('weight', tensor([[ 0.0407,  0.5425],
        [-0.1439, -0.3547]])), ('bias', tensor([-0.0699,  0.6974]))])
OrderedDict([('weight', tensor([[0.4153, 0.3386]])), ('bias', tensor([0.0457]))])
OrderedDict()


In [None]:
# Overwrite every weight and bias with the constant 0.1. The fill happens under
# no_grad so it is not recorded by autograd.
with torch.no_grad():
    for p in model.parameters():
        p.fill_(0.1)

In [None]:
# Confirm that every weight and bias now equals 0.1.
for layer in model.children():
    print(layer.state_dict())

OrderedDict([('weight', tensor([[0.1000, 0.1000],
        [0.1000, 0.1000]])), ('bias', tensor([0.1000, 0.1000]))])
OrderedDict([('weight', tensor([[0.1000, 0.1000]])), ('bias', tensor([0.1000]))])
OrderedDict()


In [None]:
y_pred = model(x)
y_pred

tensor([0.5449], grad_fn=<SigmoidBackward0>)

**Activation**

In [None]:
# Sigmoid maps each element into (0, 1). `nn` comes from the import cell at the
# top of the notebook, and the tensor is not called `input` so the built-in
# function of that name stays usable.
act = nn.Sigmoid()
act_input = torch.tensor([0.18, -0.18])
act(act_input)

tensor([0.5449, 0.4551])

In [None]:
# Same 2 -> 2 -> 1 network, now with a Sigmoid non-linearity after the hidden layer.
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Sigmoid(),
    nn.Linear(2, 1),
    nn.Sigmoid()
)

In [None]:
summary(model, (1, 2))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 2]               6
           Sigmoid-2                 [-1, 1, 2]               0
            Linear-3                 [-1, 1, 1]               3
           Sigmoid-4                 [-1, 1, 1]               0
Total params: 9
Trainable params: 9
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [None]:
# Tanh maps each element into (-1, 1) and is odd-symmetric, as the output shows.
# `nn` comes from the import cell at the top of the notebook, and the tensor is
# not called `input` so the built-in function of that name stays usable.
act = nn.Tanh()
act_input = torch.tensor([0.18, -0.18])
act(act_input)

tensor([ 0.1781, -0.1781])

In [None]:
# Same 2 -> 2 -> 1 network with Tanh as the hidden-layer activation.
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.Tanh(),
    nn.Linear(2, 1),
    nn.Sigmoid()
)

In [None]:
summary(model, (1, 2))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 2]               6
              Tanh-2                 [-1, 1, 2]               0
            Linear-3                 [-1, 1, 1]               3
           Sigmoid-4                 [-1, 1, 1]               0
Total params: 9
Trainable params: 9
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [None]:
# ReLU keeps positive elements unchanged and clamps negative ones to 0.
# `nn` comes from the import cell at the top of the notebook, and the tensor is
# not called `input` so the built-in function of that name stays usable.
act = nn.ReLU()
act_input = torch.tensor([0.18, -0.18])
act(act_input)

tensor([0.1800, 0.0000])

In [None]:
# Same 2 -> 2 -> 1 network with ReLU as the hidden-layer activation.
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.ReLU(),
    nn.Linear(2, 1),
    nn.Sigmoid()
)

In [None]:
summary(model, (1, 2))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 2]               6
              ReLU-2                 [-1, 1, 2]               0
            Linear-3                 [-1, 1, 1]               3
           Sigmoid-4                 [-1, 1, 1]               0
Total params: 9
Trainable params: 9
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


**Loss**

In [None]:
x

tensor([1., 2.])

In [None]:
# Fresh ReLU-hidden network for the loss / gradient-step walkthrough that follows.
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.ReLU(),
    nn.Linear(2, 1),
    nn.Sigmoid()
)

In [None]:
# Overwrite every weight and bias with the constant 0.1. The fill happens under
# no_grad so it is not recorded by autograd.
with torch.no_grad():
    for p in model.parameters():
        p.fill_(0.1)

In [None]:
# Confirm the constant initialisation; ReLU and Sigmoid have no parameters
# and print an empty OrderedDict.
for layer in model.children():
    print(layer.state_dict())

OrderedDict([('weight', tensor([[0.1000, 0.1000],
        [0.1000, 0.1000]])), ('bias', tensor([0.1000, 0.1000]))])
OrderedDict()
OrderedDict([('weight', tensor([[0.1000, 0.1000]])), ('bias', tensor([0.1000]))])
OrderedDict()


In [None]:
# Binary cross-entropy on probabilities, paired with the model's final Sigmoid.
# `nn` is already imported in the import cell at the top of the notebook.
loss_fn = nn.BCELoss()

In [None]:
y_pred = model(x)
y_pred

tensor([0.5449], grad_fn=<SigmoidBackward0>)

In [None]:
y

tensor([0.])

In [None]:
loss = loss_fn(y_pred, y)
loss

tensor(0.7872, grad_fn=<BinaryCrossEntropyBackward0>)

In [None]:
# Plain stochastic gradient descent over all model parameters.
learning_rate = 0.1
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate)

In [None]:
# Parameters before the update, for comparison with the values printed
# after optimizer.step().
for layer in model.children():
    print(layer.state_dict())

OrderedDict([('weight', tensor([[0.1000, 0.1000],
        [0.1000, 0.1000]])), ('bias', tensor([0.1000, 0.1000]))])
OrderedDict()
OrderedDict([('weight', tensor([[0.1000, 0.1000]])), ('bias', tensor([0.1000]))])
OrderedDict()


In [None]:
loss.backward()

In [None]:
optimizer.step()

In [None]:
# Parameters after one SGD step: every value has moved away from 0.1.
for layer in model.children():
    print(layer.state_dict())

OrderedDict([('weight', tensor([[0.0946, 0.0891],
        [0.0946, 0.0891]])), ('bias', tensor([0.0946, 0.0946]))])
OrderedDict()
OrderedDict([('weight', tensor([[0.0782, 0.0782]])), ('bias', tensor([0.0455]))])
OrderedDict()


### CrossEntropyLoss

In [None]:
# Same two-feature sample, but the target is now an integer class index
# (class 0), which is the target form nn.CrossEntropyLoss takes.
x = torch.tensor([1.0, 2.0])
y = torch.tensor(0)

In [None]:
# Build the two-logit model in this cell before printing it. On a fresh
# top-to-bottom run `model` would otherwise still be the earlier
# Sigmoid-terminated network, not the one shown in the recorded output.
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.ReLU(),
    nn.Linear(2, 2),
)
print(model)

Sequential(
  (0): Linear(in_features=2, out_features=2, bias=True)
  (1): ReLU()
  (2): Linear(in_features=2, out_features=2, bias=True)
)


In [None]:
# The last layer emits 2 raw logits with no Sigmoid/Softmax on top:
# nn.CrossEntropyLoss applies log-softmax to its input internally.
model = nn.Sequential(
    nn.Linear(2, 2),
    nn.ReLU(),
    nn.Linear(2, 2),
)

In [None]:
summary(model, (1, 2))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 2]               6
              ReLU-2                 [-1, 1, 2]               0
            Linear-3                 [-1, 1, 2]               6
Total params: 12
Trainable params: 12
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [None]:
y_pred = model(x)
y_pred

tensor([ 0.9005, -0.1670], grad_fn=<ViewBackward0>)

In [None]:
# Cross-entropy over raw logits with an integer class-index target.
# `nn` is already imported in the import cell at the top of the notebook.
loss_fn = nn.CrossEntropyLoss()

In [None]:
y = torch.tensor(0)
y

tensor(0)

In [None]:
y_pred

tensor([ 0.9005, -0.1670], grad_fn=<ViewBackward0>)

In [None]:
loss_fn(y_pred, y)

tensor(0.2955, grad_fn=<NllLossBackward0>)

## **Classification using MLP**

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [None]:
data = load_iris()

In [None]:
data.data.shape

(150, 4)

In [None]:
data.target.shape

(150,)

In [None]:
# First split: 60% of the 150 samples for training, 40% held out.
# The fixed random_state makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.4,
    random_state=7
)

In [None]:
# Second split: halve the held-out 40% into validation and test sets.
# NOTE(review): this overwrites X_test / Y_test with their own halves, so
# re-running this cell on its own splits the already-halved test set again.
X_valid, X_test, Y_valid, Y_test = train_test_split(
    X_test,
    Y_test,
    test_size=0.5,
    random_state=7
)

In [None]:
X_train.shape, X_valid.shape, X_test.shape

((90, 4), (30, 4), (30, 4))

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
scaler.fit(X_train)

In [None]:
# Apply the scaler fitted on the training split to all three splits, so no
# validation/test statistics leak into the preprocessing.
# NOTE(review): the arrays are overwritten in place by name, so re-running this
# cell standardizes data that is already standardized.
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

In [None]:
# Preview the first rows only; dumping all 150 samples floods the notebook output.
data.data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [None]:
# Preview the first standardized training rows instead of the full array.
X_train[:5]

array([[-1.03412082,  0.99571559, -1.20196482, -0.77770727],
       [ 1.02069068,  0.08360971,  0.54282282,  0.39755931],
       [ 1.74591826, -0.37244324,  1.44335838,  0.78931484],
       [-0.4297645 , -1.05652265,  0.3739724 ,  0.00580379],
       [-0.79237829, -0.82849618,  0.09255504,  0.26697414],
       [-0.18802197,  3.04795383, -1.25824829, -1.03887762],
       [ 1.625047  ,  0.31163618,  1.27450796,  0.78931484],
       [-0.55063576,  0.76768912, -1.14568135, -1.30004797],
       [-0.0671507 ,  2.13584794, -1.42709871, -1.30004797],
       [ 0.29546309, -1.05652265,  1.04937407,  0.26697414],
       [-0.18802197, -1.28454912,  0.71167324,  1.05048519],
       [-1.27586335, -0.14441676, -1.31453176, -1.43063314],
       [ 0.77894815, -0.14441676,  1.16194102,  1.31165554],
       [-1.51760588,  0.31163618, -1.31453176, -1.30004797],
       [ 0.89981941, -0.37244324,  0.48653935,  0.13638896],
       [ 1.02069068, -0.14441676,  0.82424018,  1.44224071],
       [ 0.29546309, -0.

In [None]:
# Preview the first standardized test rows instead of the full array.
X_test[:5]

array([[-0.18802197, -1.05652265, -0.13257885, -0.25536656],
       [-0.0671507 , -0.82849618,  0.09255504,  0.00580379],
       [-1.7593484 , -0.37244324, -1.31453176, -1.30004797],
       [ 0.65807688,  0.08360971,  0.9930906 ,  0.78931484],
       [ 1.02069068, -0.14441676,  0.71167324,  0.65872966],
       [-0.18802197, -0.14441676,  0.26140546,  0.00580379],
       [ 0.65807688, -0.37244324,  0.31768893,  0.13638896],
       [ 0.05372056,  0.31163618,  0.59910629,  0.78931484],
       [ 1.26243321,  0.31163618,  1.10565754,  1.44224071],
       [ 0.41633436, -0.60046971,  0.59910629,  0.78931484],
       [-0.55063576,  1.90782147, -1.14568135, -1.03887762],
       [-1.03412082,  0.53966265, -1.31453176, -1.30004797],
       [-0.79237829,  2.36387441, -1.25824829, -1.43063314],
       [ 1.625047  ,  1.22374206,  1.33079143,  1.70341106],
       [-0.4297645 , -1.51257559,  0.03627157, -0.12478139],
       [-0.91324955,  1.679795  , -1.20196482, -1.30004797],
       [-1.51760588,  1.

In [None]:
# Convert the NumPy splits to tensors.
# Features become float32, the default dtype of nn.Linear parameters.
# Labels are forced to int64: nn.CrossEntropyLoss requires class-index targets
# of dtype long, and NumPy's default integer width is platform-dependent.
X_train = torch.tensor(X_train, dtype=torch.float32)
Y_train = torch.tensor(Y_train, dtype=torch.long)
X_valid = torch.tensor(X_valid, dtype=torch.float32)
Y_valid = torch.tensor(Y_valid, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
Y_test = torch.tensor(Y_test, dtype=torch.long)

In [None]:
# Preview the first rows of the training tensor instead of all 90 of them.
X_train[:5]

tensor([[-1.0341,  0.9957, -1.2020, -0.7777],
        [ 1.0207,  0.0836,  0.5428,  0.3976],
        [ 1.7459, -0.3724,  1.4434,  0.7893],
        [-0.4298, -1.0565,  0.3740,  0.0058],
        [-0.7924, -0.8285,  0.0926,  0.2670],
        [-0.1880,  3.0480, -1.2582, -1.0389],
        [ 1.6250,  0.3116,  1.2745,  0.7893],
        [-0.5506,  0.7677, -1.1457, -1.3000],
        [-0.0672,  2.1358, -1.4271, -1.3000],
        [ 0.2955, -1.0565,  1.0494,  0.2670],
        [-0.1880, -1.2845,  0.7117,  1.0505],
        [-1.2759, -0.1444, -1.3145, -1.4306],
        [ 0.7789, -0.1444,  1.1619,  1.3117],
        [-1.5176,  0.3116, -1.3145, -1.3000],
        [ 0.8998, -0.3724,  0.4865,  0.1364],
        [ 1.0207, -0.1444,  0.8242,  1.4422],
        [ 0.2955, -0.1444,  0.4865,  0.2670],
        [-0.1880,  1.6798, -1.1457, -1.1695],
        [ 1.3833,  0.3116,  0.5428,  0.2670],
        [-1.3967,  0.3116, -1.3708, -1.3000],
        [ 0.7789, -0.1444,  0.8242,  1.0505],
        [ 2.2294, -1.0565,  1.7811

In [None]:
Y_train

tensor([0, 1, 2, 1, 1, 0, 2, 0, 0, 2, 2, 0, 2, 0, 1, 2, 1, 0, 1, 0, 2, 2, 1, 0,
        0, 1, 2, 0, 2, 2, 1, 0, 1, 0, 2, 2, 0, 0, 2, 1, 2, 2, 1, 0, 0, 2, 0, 0,
        1, 2, 2, 1, 1, 0, 2, 0, 0, 1, 1, 2, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 0, 0,
        0, 1, 1, 0, 2, 2, 1, 2, 0, 2, 1, 1, 0, 2, 1, 2, 1, 0])

In [None]:
# Seed PyTorch's RNG so the random weight initialisation, and therefore the
# losses and accuracies reported below, are reproducible across kernel restarts.
torch.manual_seed(7)

# 4 input features -> 8 hidden units (ReLU) -> 3 class logits. There is no
# softmax on top because nn.CrossEntropyLoss consumes raw logits.
model_classifier = nn.Sequential(
    nn.Linear(4, 8),
    nn.ReLU(),
    nn.Linear(8, 3)
)

In [None]:
summary(model_classifier, (1, 4))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1, 8]              40
              ReLU-2                 [-1, 1, 8]               0
            Linear-3                 [-1, 1, 3]              27
Total params: 67
Trainable params: 67
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [None]:
model_classifier

Sequential(
  (0): Linear(in_features=4, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=3, bias=True)
)

In [None]:
X_train[0]

tensor([-1.0341,  0.9957, -1.2020, -0.7777])

In [None]:
y_pred = model_classifier(X_train[0])
y_pred

tensor([ 0.1499,  0.0141, -0.4355], grad_fn=<ViewBackward0>)

In [None]:
loss_fn = nn.CrossEntropyLoss()

In [None]:
loss = loss_fn(y_pred, Y_train[0])
loss

tensor(0.8878, grad_fn=<NllLossBackward0>)

In [None]:
loss.item()

0.8878384828567505

In [None]:
# Plain stochastic gradient descent over the classifier's parameters.
learning_rate = 0.01

optimizer = optim.SGD(params=model_classifier.parameters(), lr=learning_rate)

In [None]:
def evaluate(model_classifier, X_valid, Y_valid):
    """Return the classification accuracy of ``model_classifier`` on a labelled set.

    Args:
        model_classifier: Module mapping a 2-D batch of samples to per-class
            scores with shape ``(num_samples, num_classes)``.
        X_valid: Feature tensor with one sample per row.
        Y_valid: Integer class labels, one per row of ``X_valid``.

    Returns:
        A 0-dim float tensor holding the fraction of correctly predicted labels.
    """
    # Score in eval mode so layers such as dropout/batch-norm behave
    # deterministically, then restore whatever mode the caller had set.
    was_training = model_classifier.training
    model_classifier.eval()
    try:
        with torch.no_grad():
            scores = model_classifier(X_valid)
    finally:
        model_classifier.train(was_training)

    predicted = torch.argmax(scores, dim=1)
    # Vectorised mean of the correctness mask; the built-in sum() would iterate
    # the tensor one element at a time.
    return (predicted == Y_valid).float().mean()

In [None]:
evaluate(model_classifier, X_valid, Y_valid)

tensor(0.5000)

In [None]:
num_epochs = 20
losses = []  # mean training loss of each epoch
for epoch in range(num_epochs):
    epoch_loss = []
    # Per-sample SGD: one forward/backward pass and one update per training row.
    for x_train, y_train in zip(X_train, Y_train):
        # Clear gradients left over from the previous sample before the new pass.
        optimizer.zero_grad()

        y_pred = model_classifier(x_train)
        loss = loss_fn(y_pred, y_train)

        loss.backward()
        optimizer.step()

        epoch_loss.append(loss.item())

    avg_loss = sum(epoch_loss)/len(epoch_loss)
    losses.append(avg_loss)

    # Track validation accuracy after every epoch.
    acc = evaluate(model_classifier, X_valid, Y_valid)
    print(f"{avg_loss} -- {acc}")

1.0095625638961792 -- 0.6000000238418579
0.818809684448772 -- 0.6666666865348816
0.641085204978784 -- 0.699999988079071
0.5172737292945385 -- 0.7333333492279053
0.4383059697018729 -- 0.7666666507720947
0.38455223745356004 -- 0.7666666507720947
0.3457096125827067 -- 0.7333333492279053
0.31627391715430553 -- 0.7333333492279053
0.29315664073348874 -- 0.7333333492279053
0.27442682819803144 -- 0.7333333492279053
0.25867093672148056 -- 0.7333333492279053
0.24499973835320107 -- 0.7333333492279053
0.23278310322332094 -- 0.7666666507720947
0.2216730286522458 -- 0.7666666507720947
0.21139325921475474 -- 0.7666666507720947
0.20179211201353206 -- 0.800000011920929
0.19276142589838452 -- 0.800000011920929
0.18424310129420418 -- 0.800000011920929
0.17618259065331787 -- 0.8333333134651184
0.16855647248287117 -- 0.8333333134651184


In [None]:
losses

[1.0095625638961792,
 0.818809684448772,
 0.641085204978784,
 0.5172737292945385,
 0.4383059697018729,
 0.38455223745356004,
 0.3457096125827067,
 0.31627391715430553,
 0.29315664073348874,
 0.27442682819803144,
 0.25867093672148056,
 0.24499973835320107,
 0.23278310322332094,
 0.2216730286522458,
 0.21139325921475474,
 0.20179211201353206,
 0.19276142589838452,
 0.18424310129420418,
 0.17618259065331787,
 0.16855647248287117]

In [None]:
with torch.no_grad():
    Y_pred = model_classifier(X_test)

In [None]:
Y_pred = torch.argmax(Y_pred, dim=1)

In [None]:
# Test accuracy as the mean of the correctness mask; the built-in sum() would
# iterate the tensor one element at a time.
(Y_pred == Y_test).float().mean()

tensor(0.9000)

In [None]:
Y_pred

tensor([1, 1, 0, 2, 2, 1, 1, 2, 2, 2, 0, 0, 0, 2, 1, 0, 0, 1, 2, 0, 2, 1, 1, 0,
        2, 0, 2, 0, 0, 2])