##### Mounica Subramani

# CS7180 Problem Set 1: Implement a teacher-student network setting for Gaussian inputs (20 points)

Welcome to CS7180!

Before you start, make sure to read the problem description in the handout pdf.

Collaborators: Apoorva Durai, Manaswini, Sinjini Bose. Discussed concepts.

In [24]:
# Dependencies
import argparse
import torch
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np


# hyper parameters
batch_size = 100
width = 5
d_input = 100

# Part 1: Implement a two-layer neural network with ReLU activation (5 points)

In [2]:
class Net(torch.nn.Module):
    
    def __init__(self, d_input, width):
        
        super(Net, self).__init__()
        # ------------------
        # Write your implementation here.
        
        # Linear layer obtaining input to the hidden layer from the input layer
        self.fc1 = torch.nn.Linear(d_input,width)
        
        # Applying activation to the output of hidden layer before feeding into final layer
        self.act = torch.nn.ReLU()
        
        # Linear layer -> narrowing to 1 output from hidden layer
        self.fc2 = torch.nn.Linear(width,1)
        
        # ------------------
        
        
    def forward(self, x):
        
        x = x.view(-1, d_input)
        # ------------------
        # Write your implementation here.
        
        x = self.act(self.fc1(x))
        x = self.fc2(x)
        return x
        
        # ------------------

### Generating the data

In [21]:
# sample size
N = 5 * width * d_input

# random data from standard normal distribution
x_train = torch.randn(N, d_input)
x_test = torch.randn(N, d_input)

# teacher network with random weights
teacher = Net(d_input, width)

# generate labels using the teacher network
y_train = torch.FloatTensor([teacher.forward(x) for x in x_train])
y_test = torch.FloatTensor([teacher.forward(x) for x in x_test])

# combine the data and labels into pytorch friendly format
train_data = torch.utils.data.TensorDataset(x_train, y_train)
test_data = torch.utils.data.TensorDataset(x_test, y_test)

# prepare data loaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

# Part 2: Set up the quadratic loss function and an SGD optimizer (10 points)

In [6]:
n_epochs = 2000 # the number of epochs can be tuned for better performance

criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(teacher.parameters(), lr=0.01)
teacher.train() # prep model for training

for epoch in range(n_epochs):
    train_loss = 0.0

    # train the model 
    for idx, (data, labels) in enumerate(train_loader):
        # ------------------
        # Write your implementation here.
        
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = teacher(data)
        # calculate the loss
        loss = criterion(output,labels)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss

        # ------------------

    # print the mean squared loss of the training dataset normalized by the mean square of the training dataset labels
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch+1, 
        train_loss / torch.mean(torch.pow(y_train, 2))))

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 1 	Training Loss: 21.061852
Epoch: 2 	Training Loss: 19.397266
Epoch: 3 	Training Loss: 18.120869
Epoch: 4 	Training Loss: 17.117247
Epoch: 5 	Training Loss: 16.312916
Epoch: 6 	Training Loss: 15.657936
Epoch: 7 	Training Loss: 15.116551
Epoch: 8 	Training Loss: 14.664048
Epoch: 9 	Training Loss: 14.281998
Epoch: 10 	Training Loss: 13.956030
Epoch: 11 	Training Loss: 13.675710
Epoch: 12 	Training Loss: 13.432755
Epoch: 13 	Training Loss: 13.220798
Epoch: 14 	Training Loss: 13.034711
Epoch: 15 	Training Loss: 12.870484
Epoch: 16 	Training Loss: 12.724731
Epoch: 17 	Training Loss: 12.594853
Epoch: 18 	Training Loss: 12.478600
Epoch: 19 	Training Loss: 12.374182
Epoch: 20 	Training Loss: 12.280092
Epoch: 21 	Training Loss: 12.194963
Epoch: 22 	Training Loss: 12.117729
Epoch: 23 	Training Loss: 12.047459
Epoch: 24 	Training Loss: 11.983365
Epoch: 25 	Training Loss: 11.924751
Epoch: 26 	Training Loss: 11.871025
Epoch: 27 	Training Loss: 11.821671
Epoch: 28 	Training Loss: 11.776239
E

Epoch: 446 	Training Loss: 11.108582
Epoch: 447 	Training Loss: 11.108582
Epoch: 448 	Training Loss: 11.108582
Epoch: 449 	Training Loss: 11.108582
Epoch: 450 	Training Loss: 11.108582
Epoch: 451 	Training Loss: 11.108581
Epoch: 452 	Training Loss: 11.108581
Epoch: 453 	Training Loss: 11.108581
Epoch: 454 	Training Loss: 11.108580
Epoch: 455 	Training Loss: 11.108582
Epoch: 456 	Training Loss: 11.108580
Epoch: 457 	Training Loss: 11.108580
Epoch: 458 	Training Loss: 11.108578
Epoch: 459 	Training Loss: 11.108580
Epoch: 460 	Training Loss: 11.108578
Epoch: 461 	Training Loss: 11.108578
Epoch: 462 	Training Loss: 11.108578
Epoch: 463 	Training Loss: 11.108578
Epoch: 464 	Training Loss: 11.108578
Epoch: 465 	Training Loss: 11.108577
Epoch: 466 	Training Loss: 11.108577
Epoch: 467 	Training Loss: 11.108578
Epoch: 468 	Training Loss: 11.108577
Epoch: 469 	Training Loss: 11.108576
Epoch: 470 	Training Loss: 11.108576
Epoch: 471 	Training Loss: 11.108576
Epoch: 472 	Training Loss: 11.108576
E

Epoch: 888 	Training Loss: 11.108492
Epoch: 889 	Training Loss: 11.108491
Epoch: 890 	Training Loss: 11.108491
Epoch: 891 	Training Loss: 11.108491
Epoch: 892 	Training Loss: 11.108490
Epoch: 893 	Training Loss: 11.108490
Epoch: 894 	Training Loss: 11.108490
Epoch: 895 	Training Loss: 11.108490
Epoch: 896 	Training Loss: 11.108489
Epoch: 897 	Training Loss: 11.108489
Epoch: 898 	Training Loss: 11.108490
Epoch: 899 	Training Loss: 11.108490
Epoch: 900 	Training Loss: 11.108489
Epoch: 901 	Training Loss: 11.108488
Epoch: 902 	Training Loss: 11.108489
Epoch: 903 	Training Loss: 11.108488
Epoch: 904 	Training Loss: 11.108488
Epoch: 905 	Training Loss: 11.108488
Epoch: 906 	Training Loss: 11.108488
Epoch: 907 	Training Loss: 11.108486
Epoch: 908 	Training Loss: 11.108486
Epoch: 909 	Training Loss: 11.108487
Epoch: 910 	Training Loss: 11.108486
Epoch: 911 	Training Loss: 11.108486
Epoch: 912 	Training Loss: 11.108486
Epoch: 913 	Training Loss: 11.108486
Epoch: 914 	Training Loss: 11.108484
E

Epoch: 1321 	Training Loss: 11.108400
Epoch: 1322 	Training Loss: 11.108401
Epoch: 1323 	Training Loss: 11.108400
Epoch: 1324 	Training Loss: 11.108400
Epoch: 1325 	Training Loss: 11.108400
Epoch: 1326 	Training Loss: 11.108400
Epoch: 1327 	Training Loss: 11.108399
Epoch: 1328 	Training Loss: 11.108400
Epoch: 1329 	Training Loss: 11.108399
Epoch: 1330 	Training Loss: 11.108399
Epoch: 1331 	Training Loss: 11.108398
Epoch: 1332 	Training Loss: 11.108397
Epoch: 1333 	Training Loss: 11.108398
Epoch: 1334 	Training Loss: 11.108398
Epoch: 1335 	Training Loss: 11.108397
Epoch: 1336 	Training Loss: 11.108398
Epoch: 1337 	Training Loss: 11.108397
Epoch: 1338 	Training Loss: 11.108398
Epoch: 1339 	Training Loss: 11.108397
Epoch: 1340 	Training Loss: 11.108397
Epoch: 1341 	Training Loss: 11.108397
Epoch: 1342 	Training Loss: 11.108396
Epoch: 1343 	Training Loss: 11.108396
Epoch: 1344 	Training Loss: 11.108396
Epoch: 1345 	Training Loss: 11.108396
Epoch: 1346 	Training Loss: 11.108396
Epoch: 1347 

Epoch: 1751 	Training Loss: 11.108308
Epoch: 1752 	Training Loss: 11.108308
Epoch: 1753 	Training Loss: 11.108307
Epoch: 1754 	Training Loss: 11.108307
Epoch: 1755 	Training Loss: 11.108307
Epoch: 1756 	Training Loss: 11.108307
Epoch: 1757 	Training Loss: 11.108306
Epoch: 1758 	Training Loss: 11.108306
Epoch: 1759 	Training Loss: 11.108306
Epoch: 1760 	Training Loss: 11.108306
Epoch: 1761 	Training Loss: 11.108306
Epoch: 1762 	Training Loss: 11.108305
Epoch: 1763 	Training Loss: 11.108304
Epoch: 1764 	Training Loss: 11.108304
Epoch: 1765 	Training Loss: 11.108304
Epoch: 1766 	Training Loss: 11.108305
Epoch: 1767 	Training Loss: 11.108305
Epoch: 1768 	Training Loss: 11.108305
Epoch: 1769 	Training Loss: 11.108304
Epoch: 1770 	Training Loss: 11.108304
Epoch: 1771 	Training Loss: 11.108303
Epoch: 1772 	Training Loss: 11.108303
Epoch: 1773 	Training Loss: 11.108303
Epoch: 1774 	Training Loss: 11.108303
Epoch: 1775 	Training Loss: 11.108302
Epoch: 1776 	Training Loss: 11.108302
Epoch: 1777 

In [None]:
# Test the performance of the trained model

teacher.eval()
test_loss = 0.0

for idx, (data, labels) in enumerate(test_loader):
    # forward pass
    output = teacher(data)
    test_loss += criterion(output, labels).item()

# print the mean squared loss of the test dataset normalized by the mean square of the test  labels
print('Average mean squared error {:.6f}'.format(test_loss / torch.mean(torch.pow(y_test, 2))))

# Part 3: Vary the width parameter, and plot the test error for different widths (5 points)

1. How does the test error vary as we change the width? In particular, consider varying the width of the student network from 1 to 20.
2. [Bonus] What happens if we vary the sample size?
3. [Bonus] How about adding a small amount of noise to the labels of the training dataset?

Report what you found and include the results in your submission.

In [10]:
n_epochs = 2000 # the number of epochs can be tuned for better performance

criterion = torch.nn.MSELoss()

In [14]:
def student_net(w):
    std_net = Net(d_input,w)
    std_net.train() # prep model for training
    optimizer = torch.optim.SGD(std_net.parameters(), lr=0.01)

    for epoch in range(n_epochs):
        train_loss = 0.0

        # train the model 
        for idx, (data, labels) in enumerate(train_loader):
            # ------------------
            # Write your implementation here.

            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = std_net(data)
            # calculate the loss
            loss = criterion(output,labels)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # update running training loss
            train_loss += loss
            
            t_loss = train_loss / torch.mean(torch.pow(y_train, 2))

            # ------------------
        
    # Test the performance of the trained model
    std_net.eval()
    test_loss = 0.0

    for idx, (data, labels) in enumerate(test_loader):
        # forward pass
        output = std_net(data)
        test_loss += criterion(output, labels).item()

    # print the mean squared loss of the test dataset normalized by the mean square of the test  labels
    print('Average mean squared error {:.6f}'.format(test_loss / torch.mean(torch.pow(y_test, 2))))

In [17]:
test_error = []
for w in range(1,21):
    print("width:",w)
    test_error.append(student_net(w))

width: 1
Average mean squared error 10.930061
width: 2
Average mean squared error 10.916376
width: 3
Average mean squared error 10.914468
width: 4
Average mean squared error 10.910889
width: 5
Average mean squared error 10.913124
width: 6
Average mean squared error 10.910731
width: 7
Average mean squared error 10.910593
width: 8
Average mean squared error 10.910300
width: 9
Average mean squared error 10.910397
width: 10
Average mean squared error 10.910455
width: 11
Average mean squared error 10.910155
width: 12
Average mean squared error 10.910148
width: 13
Average mean squared error 10.909992
width: 14
Average mean squared error 10.909336
width: 15
Average mean squared error 10.910934
width: 16
Average mean squared error 10.909252
width: 17
Average mean squared error 10.909606
width: 18
Average mean squared error 10.909761
width: 19
Average mean squared error 10.908634
width: 20
Average mean squared error 10.908721


###### Varying Sample size
Having width as 20 and changing the Sample size

In [18]:
# sample size
N1 = 5 * 20 * d_input

# random data from standard normal distribution
x_train = torch.randn(N1, d_input)
x_test = torch.randn(N1, d_input)

# teacher network with random weights
teacher = Net(d_input, width)

# generate labels using the teacher network
y_train = torch.FloatTensor([teacher.forward(x) for x in x_train])
y_test = torch.FloatTensor([teacher.forward(x) for x in x_test])

# combine the data and labels into pytorch friendly format
train_data = torch.utils.data.TensorDataset(x_train, y_train)
test_data = torch.utils.data.TensorDataset(x_test, y_test)

# prepare data loaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

In [19]:
def student_net(w):
    std_net = Net(d_input,w)
    std_net.train() # prep model for training
    optimizer = torch.optim.SGD(std_net.parameters(), lr=0.01)

    for epoch in range(n_epochs):
        train_loss = 0.0

        # train the model 
        for idx, (data, labels) in enumerate(train_loader):
            # ------------------
            # Write your implementation here.

            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            output = std_net(data)
            # calculate the loss
            loss = criterion(output,labels)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # update running training loss
            train_loss += loss
            
            t_loss = train_loss / torch.mean(torch.pow(y_train, 2))

            # ------------------
        
    # Test the performance of the trained model
    std_net.eval()
    test_loss = 0.0

    for idx, (data, labels) in enumerate(test_loader):
        # forward pass
        output = std_net(data)
        test_loss += criterion(output, labels).item()

    # print the mean squared loss of the test dataset normalized by the mean square of the test  labels
    print('Average mean squared error {:.6f}'.format(test_loss / torch.mean(torch.pow(y_test, 2))))

In [20]:
test_error = []
test_error.append(student_net(20))

Average mean squared error 87.164070


Varying sample size is increasing the *mean squared error*. However we can try for different vlaues of *Width* and *d_input* to experiment what actually happens for smaller to larger values of *N(Sample Size)*

In [35]:
class AddGaussianNoise(object):
    def __init__(self, mean=0., std=1.):
        self.std = std
        self.mean = mean
        
    def __call__(self, tensor):
        return tensor + torch.randn(tensor.size()) * self.std + self.mean
    
    def __repr__(self):
        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)

In [39]:
# sample size
N = 5 * width * d_input

# random data from standard normal distribution
x_train = torch.randn(N, d_input)
x_test = torch.randn(N, d_input)

# teacher network with random weights
teacher = Net(d_input, width)

# generate labels using the teacher network
y_train = torch.FloatTensor([teacher.forward(x) for x in x_train])
y_test = torch.FloatTensor([teacher.forward(x) for x in x_test])

# combine the data and labels into pytorch friendly format
train_data = torch.utils.data.TensorDataset(x_train, y_train)
test_data = torch.utils.data.TensorDataset(x_test, y_test)


transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
    AddGaussianNoise(0., 1.)
])


train_data.transform = transform
test_data.transform = transform

# prepare data loaders
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

In [40]:
n_epochs = 2000 # the number of epochs can be tuned for better performance

criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(teacher.parameters(), lr=0.01)
teacher.train() # prep model for training

for epoch in range(n_epochs):
    train_loss = 0.0

    # train the model 
    for idx, (data, labels) in enumerate(train_loader):
        # ------------------
        # Write your implementation here.
        
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = teacher(data)
        # calculate the loss
        loss = criterion(output,labels)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
        # update running training loss
        train_loss += loss

        # ------------------

    # print the mean squared loss of the training dataset normalized by the mean square of the training dataset labels
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(
        epoch+1, 
        train_loss / torch.mean(torch.pow(y_train, 2))))

Epoch: 1 	Training Loss: 47.371262
Epoch: 2 	Training Loss: 43.498249
Epoch: 3 	Training Loss: 40.663643
Epoch: 4 	Training Loss: 38.483887
Epoch: 5 	Training Loss: 36.756279
Epoch: 6 	Training Loss: 35.356884
Epoch: 7 	Training Loss: 34.207561
Epoch: 8 	Training Loss: 33.250862
Epoch: 9 	Training Loss: 32.443417
Epoch: 10 	Training Loss: 31.755939
Epoch: 11 	Training Loss: 31.163343
Epoch: 12 	Training Loss: 30.647829
Epoch: 13 	Training Loss: 30.195847
Epoch: 14 	Training Loss: 29.796682
Epoch: 15 	Training Loss: 29.442566
Epoch: 16 	Training Loss: 29.126474
Epoch: 17 	Training Loss: 28.843119
Epoch: 18 	Training Loss: 28.588062
Epoch: 19 	Training Loss: 28.357546
Epoch: 20 	Training Loss: 28.148333
Epoch: 21 	Training Loss: 27.957737
Epoch: 22 	Training Loss: 27.783419
Epoch: 23 	Training Loss: 27.623901
Epoch: 24 	Training Loss: 27.477463
Epoch: 25 	Training Loss: 27.342577
Epoch: 26 	Training Loss: 27.218096
Epoch: 27 	Training Loss: 27.102879
Epoch: 28 	Training Loss: 26.995913
E

Epoch: 225 	Training Loss: 24.993959
Epoch: 226 	Training Loss: 24.993719
Epoch: 227 	Training Loss: 24.993484
Epoch: 228 	Training Loss: 24.993254
Epoch: 229 	Training Loss: 24.993029
Epoch: 230 	Training Loss: 24.992807
Epoch: 231 	Training Loss: 24.992594
Epoch: 232 	Training Loss: 24.992376
Epoch: 233 	Training Loss: 24.992174
Epoch: 234 	Training Loss: 24.991972
Epoch: 235 	Training Loss: 24.991772
Epoch: 236 	Training Loss: 24.991575
Epoch: 237 	Training Loss: 24.991386
Epoch: 238 	Training Loss: 24.991199
Epoch: 239 	Training Loss: 24.991013
Epoch: 240 	Training Loss: 24.990837
Epoch: 241 	Training Loss: 24.990662
Epoch: 242 	Training Loss: 24.990490
Epoch: 243 	Training Loss: 24.990324
Epoch: 244 	Training Loss: 24.990154
Epoch: 245 	Training Loss: 24.989992
Epoch: 246 	Training Loss: 24.989836
Epoch: 247 	Training Loss: 24.989676
Epoch: 248 	Training Loss: 24.989532
Epoch: 249 	Training Loss: 24.989382
Epoch: 250 	Training Loss: 24.989235
Epoch: 251 	Training Loss: 24.989084
E

Epoch: 446 	Training Loss: 24.982189
Epoch: 447 	Training Loss: 24.982187
Epoch: 448 	Training Loss: 24.982180
Epoch: 449 	Training Loss: 24.982180
Epoch: 450 	Training Loss: 24.982174
Epoch: 451 	Training Loss: 24.982168
Epoch: 452 	Training Loss: 24.982172
Epoch: 453 	Training Loss: 24.982168
Epoch: 454 	Training Loss: 24.982162
Epoch: 455 	Training Loss: 24.982161
Epoch: 456 	Training Loss: 24.982159
Epoch: 457 	Training Loss: 24.982155
Epoch: 458 	Training Loss: 24.982155
Epoch: 459 	Training Loss: 24.982155
Epoch: 460 	Training Loss: 24.982151
Epoch: 461 	Training Loss: 24.982145
Epoch: 462 	Training Loss: 24.982141
Epoch: 463 	Training Loss: 24.982140
Epoch: 464 	Training Loss: 24.982140
Epoch: 465 	Training Loss: 24.982138
Epoch: 466 	Training Loss: 24.982134
Epoch: 467 	Training Loss: 24.982134
Epoch: 468 	Training Loss: 24.982134
Epoch: 469 	Training Loss: 24.982124
Epoch: 470 	Training Loss: 24.982121
Epoch: 471 	Training Loss: 24.982121
Epoch: 472 	Training Loss: 24.982119
E

Epoch: 667 	Training Loss: 24.981943
Epoch: 668 	Training Loss: 24.981943
Epoch: 669 	Training Loss: 24.981939
Epoch: 670 	Training Loss: 24.981943
Epoch: 671 	Training Loss: 24.981939
Epoch: 672 	Training Loss: 24.981943
Epoch: 673 	Training Loss: 24.981939
Epoch: 674 	Training Loss: 24.981939
Epoch: 675 	Training Loss: 24.981943
Epoch: 676 	Training Loss: 24.981937
Epoch: 677 	Training Loss: 24.981937
Epoch: 678 	Training Loss: 24.981936
Epoch: 679 	Training Loss: 24.981936
Epoch: 680 	Training Loss: 24.981932
Epoch: 681 	Training Loss: 24.981932
Epoch: 682 	Training Loss: 24.981932
Epoch: 683 	Training Loss: 24.981932
Epoch: 684 	Training Loss: 24.981932
Epoch: 685 	Training Loss: 24.981936
Epoch: 686 	Training Loss: 24.981930
Epoch: 687 	Training Loss: 24.981930
Epoch: 688 	Training Loss: 24.981932
Epoch: 689 	Training Loss: 24.981932
Epoch: 690 	Training Loss: 24.981932
Epoch: 691 	Training Loss: 24.981932
Epoch: 692 	Training Loss: 24.981930
Epoch: 693 	Training Loss: 24.981928
E

Epoch: 888 	Training Loss: 24.981817
Epoch: 889 	Training Loss: 24.981817
Epoch: 890 	Training Loss: 24.981817
Epoch: 891 	Training Loss: 24.981813
Epoch: 892 	Training Loss: 24.981813
Epoch: 893 	Training Loss: 24.981813
Epoch: 894 	Training Loss: 24.981813
Epoch: 895 	Training Loss: 24.981817
Epoch: 896 	Training Loss: 24.981812
Epoch: 897 	Training Loss: 24.981812
Epoch: 898 	Training Loss: 24.981812
Epoch: 899 	Training Loss: 24.981812
Epoch: 900 	Training Loss: 24.981810
Epoch: 901 	Training Loss: 24.981812
Epoch: 902 	Training Loss: 24.981810
Epoch: 903 	Training Loss: 24.981806
Epoch: 904 	Training Loss: 24.981810
Epoch: 905 	Training Loss: 24.981806
Epoch: 906 	Training Loss: 24.981806
Epoch: 907 	Training Loss: 24.981806
Epoch: 908 	Training Loss: 24.981806
Epoch: 909 	Training Loss: 24.981806
Epoch: 910 	Training Loss: 24.981806
Epoch: 911 	Training Loss: 24.981806
Epoch: 912 	Training Loss: 24.981806
Epoch: 913 	Training Loss: 24.981804
Epoch: 914 	Training Loss: 24.981804
E

Epoch: 1106 	Training Loss: 24.981691
Epoch: 1107 	Training Loss: 24.981688
Epoch: 1108 	Training Loss: 24.981688
Epoch: 1109 	Training Loss: 24.981691
Epoch: 1110 	Training Loss: 24.981691
Epoch: 1111 	Training Loss: 24.981688
Epoch: 1112 	Training Loss: 24.981688
Epoch: 1113 	Training Loss: 24.981688
Epoch: 1114 	Training Loss: 24.981686
Epoch: 1115 	Training Loss: 24.981686
Epoch: 1116 	Training Loss: 24.981682
Epoch: 1117 	Training Loss: 24.981682
Epoch: 1118 	Training Loss: 24.981682
Epoch: 1119 	Training Loss: 24.981686
Epoch: 1120 	Training Loss: 24.981680
Epoch: 1121 	Training Loss: 24.981682
Epoch: 1122 	Training Loss: 24.981682
Epoch: 1123 	Training Loss: 24.981678
Epoch: 1124 	Training Loss: 24.981678
Epoch: 1125 	Training Loss: 24.981678
Epoch: 1126 	Training Loss: 24.981678
Epoch: 1127 	Training Loss: 24.981674
Epoch: 1128 	Training Loss: 24.981674
Epoch: 1129 	Training Loss: 24.981674
Epoch: 1130 	Training Loss: 24.981678
Epoch: 1131 	Training Loss: 24.981674
Epoch: 1132 

Epoch: 1321 	Training Loss: 24.981560
Epoch: 1322 	Training Loss: 24.981560
Epoch: 1323 	Training Loss: 24.981556
Epoch: 1324 	Training Loss: 24.981556
Epoch: 1325 	Training Loss: 24.981554
Epoch: 1326 	Training Loss: 24.981556
Epoch: 1327 	Training Loss: 24.981554
Epoch: 1328 	Training Loss: 24.981554
Epoch: 1329 	Training Loss: 24.981552
Epoch: 1330 	Training Loss: 24.981554
Epoch: 1331 	Training Loss: 24.981552
Epoch: 1332 	Training Loss: 24.981552
Epoch: 1333 	Training Loss: 24.981552
Epoch: 1334 	Training Loss: 24.981554
Epoch: 1335 	Training Loss: 24.981552
Epoch: 1336 	Training Loss: 24.981548
Epoch: 1337 	Training Loss: 24.981552
Epoch: 1338 	Training Loss: 24.981548
Epoch: 1339 	Training Loss: 24.981546
Epoch: 1340 	Training Loss: 24.981548
Epoch: 1341 	Training Loss: 24.981546
Epoch: 1342 	Training Loss: 24.981546
Epoch: 1343 	Training Loss: 24.981546
Epoch: 1344 	Training Loss: 24.981543
Epoch: 1345 	Training Loss: 24.981543
Epoch: 1346 	Training Loss: 24.981543
Epoch: 1347 

Epoch: 1536 	Training Loss: 24.981424
Epoch: 1537 	Training Loss: 24.981424
Epoch: 1538 	Training Loss: 24.981422
Epoch: 1539 	Training Loss: 24.981421
Epoch: 1540 	Training Loss: 24.981421
Epoch: 1541 	Training Loss: 24.981421
Epoch: 1542 	Training Loss: 24.981421
Epoch: 1543 	Training Loss: 24.981417
Epoch: 1544 	Training Loss: 24.981417
Epoch: 1545 	Training Loss: 24.981417
Epoch: 1546 	Training Loss: 24.981417
Epoch: 1547 	Training Loss: 24.981417
Epoch: 1548 	Training Loss: 24.981417
Epoch: 1549 	Training Loss: 24.981417
Epoch: 1550 	Training Loss: 24.981415
Epoch: 1551 	Training Loss: 24.981417
Epoch: 1552 	Training Loss: 24.981413
Epoch: 1553 	Training Loss: 24.981415
Epoch: 1554 	Training Loss: 24.981413
Epoch: 1555 	Training Loss: 24.981413
Epoch: 1556 	Training Loss: 24.981413
Epoch: 1557 	Training Loss: 24.981413
Epoch: 1558 	Training Loss: 24.981413
Epoch: 1559 	Training Loss: 24.981413
Epoch: 1560 	Training Loss: 24.981413
Epoch: 1561 	Training Loss: 24.981409
Epoch: 1562 

Epoch: 1751 	Training Loss: 24.981287
Epoch: 1752 	Training Loss: 24.981287
Epoch: 1753 	Training Loss: 24.981287
Epoch: 1754 	Training Loss: 24.981283
Epoch: 1755 	Training Loss: 24.981283
Epoch: 1756 	Training Loss: 24.981283
Epoch: 1757 	Training Loss: 24.981283
Epoch: 1758 	Training Loss: 24.981283
Epoch: 1759 	Training Loss: 24.981281
Epoch: 1760 	Training Loss: 24.981281
Epoch: 1761 	Training Loss: 24.981283
Epoch: 1762 	Training Loss: 24.981281
Epoch: 1763 	Training Loss: 24.981277
Epoch: 1764 	Training Loss: 24.981277
Epoch: 1765 	Training Loss: 24.981276
Epoch: 1766 	Training Loss: 24.981276
Epoch: 1767 	Training Loss: 24.981276
Epoch: 1768 	Training Loss: 24.981276
Epoch: 1769 	Training Loss: 24.981276
Epoch: 1770 	Training Loss: 24.981274
Epoch: 1771 	Training Loss: 24.981274
Epoch: 1772 	Training Loss: 24.981274
Epoch: 1773 	Training Loss: 24.981274
Epoch: 1774 	Training Loss: 24.981270
Epoch: 1775 	Training Loss: 24.981270
Epoch: 1776 	Training Loss: 24.981270
Epoch: 1777 

Epoch: 1966 	Training Loss: 24.981148
Epoch: 1967 	Training Loss: 24.981148
Epoch: 1968 	Training Loss: 24.981148
Epoch: 1969 	Training Loss: 24.981148
Epoch: 1970 	Training Loss: 24.981142
Epoch: 1971 	Training Loss: 24.981142
Epoch: 1972 	Training Loss: 24.981144
Epoch: 1973 	Training Loss: 24.981142
Epoch: 1974 	Training Loss: 24.981142
Epoch: 1975 	Training Loss: 24.981142
Epoch: 1976 	Training Loss: 24.981142
Epoch: 1977 	Training Loss: 24.981142
Epoch: 1978 	Training Loss: 24.981142
Epoch: 1979 	Training Loss: 24.981142
Epoch: 1980 	Training Loss: 24.981142
Epoch: 1981 	Training Loss: 24.981138
Epoch: 1982 	Training Loss: 24.981138
Epoch: 1983 	Training Loss: 24.981136
Epoch: 1984 	Training Loss: 24.981136
Epoch: 1985 	Training Loss: 24.981136
Epoch: 1986 	Training Loss: 24.981134
Epoch: 1987 	Training Loss: 24.981134
Epoch: 1988 	Training Loss: 24.981134
Epoch: 1989 	Training Loss: 24.981134
Epoch: 1990 	Training Loss: 24.981131
Epoch: 1991 	Training Loss: 24.981129
Epoch: 1992 

In [41]:
# Test the performance of the trained model

teacher.eval()
test_loss = 0.0

for idx, (data, labels) in enumerate(test_loader):
    # forward pass
    output = teacher(data)
    test_loss += criterion(output, labels).item()

# print the mean squared loss of the test dataset normalized by the mean square of the test  labels
print('Average mean squared error {:.6f}'.format(test_loss / torch.mean(torch.pow(y_test, 2))))

Average mean squared error 24.944223


In [None]:
test_error = []
for w in range(1,21):
    print("width:",w)
    test_error.append(student_net(w))

width: 1
Average mean squared error 24.944986
width: 2
Average mean squared error 24.945137
width: 3
Average mean squared error 24.953253
width: 4
Average mean squared error 24.945488
width: 5
Average mean squared error 24.949078
width: 6
Average mean squared error 24.944769
width: 7
Average mean squared error 24.944811
width: 8
Average mean squared error 24.945496
width: 9
