HW3 <br>
Question 2 <br>
Mahdi Koloushani <br>
401300066

In [None]:
import torch
import torch.nn as nn
from torchvision import models, transforms
import torchvision

## Load CIFAR10 Dataset

In [None]:
split_ratio = 0.8 #split ratio for train and validation datasets
batch_size = 32
split_seed = 0 # fixes the train/validation partition so it is identical after a kernel restart

# Images: PIL -> float tensor, then upscaled from CIFAR's 32x32 to 224x224.
# NOTE(review): no ImageNet mean/std normalization is applied although pretrained
# ImageNet weights are used later; left unchanged to keep the reported numbers comparable.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Resize((224,224))])
# Labels: int class index -> one-hot LongTensor of shape (1, 10); the training and
# evaluation cells reshape each batch of these to (batch, 10).
target_transform = torchvision.transforms.Compose([lambda x:torch.LongTensor([x]),
                                                   lambda x:nn.functional.one_hot(x,10)])


trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform, target_transform = target_transform)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform, target_transform = target_transform)

trainset_size = int(len(trainset) * split_ratio)
validset_size = len(trainset) - trainset_size
# Seeded generator: without it the split changes on every run, so a teacher checkpoint
# reloaded after a restart may have been trained on images that are now in `validset`.
trainset, validset = torch.utils.data.random_split(
    trainset, [trainset_size, validset_size],
    generator=torch.Generator().manual_seed(split_seed))

trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=2)
# Evaluation-only loaders do not need shuffling.
validloader = torch.utils.data.DataLoader(validset, batch_size=batch_size,
                                          shuffle=False, num_workers=2)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=2)


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [None]:
# Run on the first CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Only needed when working from Google Drive (Colab):
# from google.colab import drive
# drive.mount("/content/gdrive")

# Cross-entropy criterion used by the training and evaluation cells below.
loss_fn = nn.CrossEntropyLoss()


Mounted at /content/gdrive


# Part A
## Linear Tuning the ResNet50 

### Load the pretrained weights and train only the FC layer

In [None]:
# ImageNet-pretrained ResNet50 that serves as the linear-probing backbone.
weights = models.ResNet50_Weights.IMAGENET1K_V2
model_t = models.resnet50(weights=weights)

# Freeze every pretrained parameter in a single call ...
model_t.requires_grad_(False)

# ... then swap in a fresh 10-way classification head; a newly constructed
# nn.Linear is trainable, so it is the only part of the network that learns.
num_features = model_t.fc.in_features
model_t.fc = nn.Linear(num_features, 10)

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

In [None]:
lr = 0.001 # learning rate
epochs = 20
# Hand the optimizer only the parameters that can actually receive gradients
# (the new FC head); the frozen backbone parameters are never updated.
optimizer_t = torch.optim.Adam([p for p in model_t.parameters() if p.requires_grad], lr=lr)
model_t = model_t.to(device)


best_val_loss = 1e10
for epoch in range(epochs):
  # ---- one optimization pass over the training split ----
  # NOTE: train() also lets the BatchNorm layers of the frozen backbone keep
  # updating their running statistics, as in the original training procedure.
  model_t.train()
  for X, y in trainloader:
    X, y = X.to(device), y.to(device)
    y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
    optimizer_t.zero_grad()
    yhat = model_t(X)
    loss = loss_fn(yhat,y)
    loss.backward()
    optimizer_t.step()

  # ---- metrics on the train and validation splits ----
  # eval(): BatchNorm uses (and no longer modifies) its running statistics, so
  # validation data cannot leak into the model. no_grad(): no autograd graph is
  # built, and .item() keeps the accumulators plain Python numbers.
  model_t.eval()
  epoch_metrics = {}
  with torch.no_grad():
    for split, loader, dataset in (("train", trainloader, trainset),
                                   ("valid", validloader, validset)):
      loss_sum, n_correct = 0.0, 0
      for X, y in loader:
        X, y = X.to(device), y.to(device)
        y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
        yhat = model_t(X)
        loss_sum += loss_fn(yhat,y).item()
        n_correct += torch.sum(torch.argmax(y,dim=1)==torch.argmax(yhat,dim=1)).item()
      # mean of per-batch losses, and fraction of correctly classified samples
      epoch_metrics[split] = (loss_sum/len(loader), n_correct/len(dataset))
  loss_t, acc_t = epoch_metrics["train"]
  loss_v, acc_v = epoch_metrics["valid"]

  # Keep the checkpoint with the lowest validation loss seen so far.
  if loss_v < best_val_loss:
    best_val_loss = loss_v
    torch.save(model_t, "BestModel_partA.pth")

  # report printing
  print(f"##### Epoch: {epoch+1}  #####")
  print(f"Train Loss = {loss_t}  Train Accuracy = {acc_t}  Validation Loss = {loss_v}  Validation Accuracy = {acc_v}")
print("Training is Finished")

##### Epoch: 1  #####
Train Loss = 0.6572936773300171  Train Accuracy = 0.7805749773979187  Validation Loss = 0.6946480870246887  Validation Accuracy = 0.7664999961853027
##### Epoch: 2  #####
Train Loss = 0.5857818722724915  Train Accuracy = 0.8032499551773071  Validation Loss = 0.6475958228111267  Validation Accuracy = 0.778499960899353
##### Epoch: 3  #####
Train Loss = 0.539250373840332  Train Accuracy = 0.8172999620437622  Validation Loss = 0.6169648766517639  Validation Accuracy = 0.7917999625205994
##### Epoch: 4  #####
Train Loss = 0.512354850769043  Train Accuracy = 0.8253999948501587  Validation Loss = 0.5975148677825928  Validation Accuracy = 0.7922999858856201
##### Epoch: 5  #####
Train Loss = 0.49392014741897583  Train Accuracy = 0.831849992275238  Validation Loss = 0.5864300727844238  Validation Accuracy = 0.7978000044822693
##### Epoch: 6  #####
Train Loss = 0.4967106282711029  Train Accuracy = 0.8282249569892883  Validation Loss = 0.6088764667510986  Validation Accurac

### Evaluate the model on the test dataset

In [None]:
# Load the best Part A checkpoint (a fully pickled module) onto the active device.
# NOTE(review): PyTorch >= 2.6 defaults torch.load to weights_only=True; loading a whole
# pickled module there requires passing weights_only=False explicitly.
Model_teacher = torch.load("BestModel_partA.pth", map_location=device)
# The pickled module keeps whatever train/eval flag it had when saved, so switch to
# inference mode explicitly: BatchNorm then uses its running statistics and does not
# get updated by test batches.
Model_teacher.eval()

acc_t = 0
loss_t = 0
with torch.no_grad():  # no autograd graph is needed for evaluation
    for X, y in testloader:
        X, y = X.to(device), y.to(device)
        y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
        yhat = Model_teacher(X)
        loss_t += loss_fn(yhat,y).item()
        acc_t += torch.sum(torch.argmax(y, dim=1)==torch.argmax(yhat,dim=1)).item()
acc_t = acc_t/len(testset)      # fraction of correctly classified test samples
loss_t /= len(testloader)       # mean of the per-batch losses

print(f"Part A:     Test Loss = {loss_t}    Test Accuracy = {acc_t}")

Part A:     Test Loss = 0.5958660840988159    Test Accuracy = 0.7925999760627747


# Part B
## Train the resnet18 as student

### Use the best model in part A and load ResNet18 model

In [None]:
# Teacher: best Part A checkpoint (a fully pickled module), loaded onto the active device.
# NOTE(review): PyTorch >= 2.6 defaults torch.load to weights_only=True; loading a whole
# pickled module there requires passing weights_only=False explicitly.
Model_teacher = torch.load("BestModel_partA.pth", map_location=device)
# The teacher is only used for inference during distillation: freeze all of its
# parameters and put BatchNorm into inference mode so student training cannot alter it.
Model_teacher.requires_grad_(False)
Model_teacher.eval()

# Student: randomly initialised ResNet18 with a 10-way classification head.
Model_student = models.resnet18()
Model_student.fc = nn.Linear(Model_student.fc.in_features,10)

### Define Distillation Loss

In [None]:
def distillation_loss(y, zs, zt, alpha, tau):
  """Knowledge-distillation loss combining hard labels and softened teacher outputs.

  Args:
    y: target class probabilities (one-hot floats in this notebook), same shape as `zs`.
    zs: student logits, shape (batch, num_classes).
    zt: teacher logits, same shape as `zs`; treated as constants (detached).
    alpha: weight of the distillation term; (1 - alpha) weights the hard-label term.
    tau: softmax temperature applied to both student and teacher logits.

  Returns:
    Scalar tensor: (1-alpha) * CE(zs, y) + alpha * tau^2 * CE(zs/tau, softmax(zt/tau)).
  """
  # Detach the teacher so no gradient is ever propagated into (or stored for) it.
  soft_targets = nn.functional.softmax(zt.detach()/tau, dim=1)
  # Same default cross-entropy as nn.CrossEntropyLoss(), called functionally so the
  # function does not depend on a module-level criterion object.
  hard_term = nn.functional.cross_entropy(zs, y)
  soft_term = nn.functional.cross_entropy(zs/tau, soft_targets)
  # tau**2 compensates for the 1/tau**2 scaling of the soft-target gradients.
  loss = (1-alpha)*hard_term+alpha*tau**2*soft_term
  return loss

### Train ResNet18 as Student and use the model in part A as teacher

لازم به ذکر است که برای یافتن مقدار مناسب برای پارامترهای آلفا و تاو، آزمایش های متعددی انجام شد و مقدار 0.3 برای آلفا و 5 برای تاو بهترین نتایج را در میان آزمایش های انجام شده داشتند. 

In [None]:
lr = 0.001
alpha = 0.3
tau = 5
epochs = 20

Model_student = Model_student.to(device)
Model_teacher = Model_teacher.to(device)
# The teacher only provides targets: inference mode keeps its BatchNorm statistics fixed
# (a pickled checkpoint restores whatever train/eval flag it was saved with).
Model_teacher.eval()
optimizer_s = torch.optim.Adam(Model_student.parameters(), lr=lr)

best_val_loss = 1e10
for epoch in range(epochs):
  # ---- one distillation pass over the training split ----
  Model_student.train()
  for X, y in trainloader:
    X, y = X.to(device), y.to(device)
    y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
    optimizer_s.zero_grad()
    # Teacher logits are constants for the student: no autograd graph is built for them.
    with torch.no_grad():
      yhat_t = Model_teacher(X)
    yhat_s = Model_student(X)
    loss = distillation_loss(y, yhat_s, yhat_t, alpha, tau)
    loss.backward()
    optimizer_s.step()

  # ---- student metrics (plain cross-entropy) on the train and validation splits ----
  # eval(): BatchNorm uses its running statistics and is not updated by these batches.
  Model_student.eval()
  epoch_metrics = {}
  with torch.no_grad():
    for split, loader, dataset in (("train", trainloader, trainset),
                                   ("valid", validloader, validset)):
      loss_sum, n_correct = 0.0, 0
      for X, y in loader:
        X, y = X.to(device), y.to(device)
        y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
        yhat = Model_student(X)
        loss_sum += loss_fn(yhat,y).item()
        n_correct += torch.sum(torch.argmax(y,dim=1)==torch.argmax(yhat,dim=1)).item()
      # mean of per-batch losses, and fraction of correctly classified samples
      epoch_metrics[split] = (loss_sum/len(loader), n_correct/len(dataset))
  loss_t, acc_t = epoch_metrics["train"]
  loss_v, acc_v = epoch_metrics["valid"]

  # Save the best model
  if loss_v < best_val_loss:
    best_val_loss = loss_v
    torch.save(Model_student, "BestModel_partB.pth")

  # report printing
  print(f"##### Epoch: {epoch+1}  #####")
  print(f"Train Loss = {loss_t}  Train Accuracy = {acc_t}  Validation Loss = {loss_v}  Validation Accuracy = {acc_v}")
print("Training is Finished")

##### Epoch: 1  #####
Train Loss = 0.9602156081199646  Train Accuracy = 0.6556249856948853  Validation Loss = 0.9985341727733612  Validation Accuracy = 0.642300009727478
##### Epoch: 2  #####
Train Loss = 0.6731352981925011  Train Accuracy = 0.7646499872207642  Validation Loss = 0.7578128780991125  Validation Accuracy = 0.7339000105857849
##### Epoch: 3  #####
Train Loss = 0.49725982327461243  Train Accuracy = 0.830549955368042  Validation Loss = 0.6098419675430932  Validation Accuracy = 0.7879999876022339
##### Epoch: 4  #####
Train Loss = 0.39322271631360056  Train Accuracy = 0.8723999857902527  Validation Loss = 0.53312053876563  Validation Accuracy = 0.818399965763092
##### Epoch: 5  #####
Train Loss = 0.3251759474158287  Train Accuracy = 0.8971499800682068  Validation Loss = 0.495485064415886  Validation Accuracy = 0.8331999778747559
##### Epoch: 6  #####
Train Loss = 0.23977132348418237  Train Accuracy = 0.93402498960495  Validation Loss = 0.46076193689919126  Validation Accuracy

### Evaluate the model on the test dataset

In [None]:
# Load the best distilled student (a fully pickled module) onto the active device.
# NOTE(review): PyTorch >= 2.6 defaults torch.load to weights_only=True; loading a whole
# pickled module there requires passing weights_only=False explicitly.
Model_student = torch.load("BestModel_partB.pth", map_location=device)
# Inference mode: BatchNorm uses its running statistics and is not updated by test batches.
Model_student.eval()

acc_t = 0
loss_t = 0
with torch.no_grad():  # no autograd graph is needed for evaluation
    for X, y in testloader:
        X, y = X.to(device), y.to(device)
        y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
        yhat = Model_student(X)
        loss_t += loss_fn(yhat,y).item()
        acc_t += torch.sum(torch.argmax(y, dim=1)==torch.argmax(yhat,dim=1)).item()
acc_t = acc_t/len(testset)      # fraction of correctly classified test samples
loss_t /= len(testloader)       # mean of the per-batch losses

print(f"Part B:     Test Loss = {loss_t}    Test Accuracy = {acc_t}")

Part B:     Test Loss = 0.3983389796159519    Test Accuracy = 0.8685999512672424


# Part C
## Train ResNet18 without teacher

In [None]:
lr = 0.001
epochs = 20

# Baseline: randomly initialised ResNet18 with a 10-way head, trained without a teacher.
Model_resnet18 = models.resnet18()
Model_resnet18.fc = nn.Linear(Model_resnet18.fc.in_features,10)
Model_resnet18 = Model_resnet18.to(device)
optimizer_resnet18 = torch.optim.Adam(Model_resnet18.parameters(), lr=lr)

best_val_loss = 1e10
for epoch in range(epochs):
  # ---- one optimization pass over the training split ----
  Model_resnet18.train()
  for X, y in trainloader:
    X, y = X.to(device), y.to(device)
    y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
    optimizer_resnet18.zero_grad()
    yhat = Model_resnet18(X)
    loss = loss_fn(yhat,y)
    loss.backward()
    optimizer_resnet18.step()

  # ---- metrics on the train and validation splits ----
  # eval(): BatchNorm uses its running statistics and is not updated by these batches.
  # no_grad(): no autograd graph is built while measuring.
  Model_resnet18.eval()
  epoch_metrics = {}
  with torch.no_grad():
    for split, loader, dataset in (("train", trainloader, trainset),
                                   ("valid", validloader, validset)):
      loss_sum, n_correct = 0.0, 0
      for X, y in loader:
        X, y = X.to(device), y.to(device)
        y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
        yhat = Model_resnet18(X)
        loss_sum += loss_fn(yhat,y).item()
        n_correct += torch.sum(torch.argmax(y,dim=1)==torch.argmax(yhat,dim=1)).item()
      # mean of per-batch losses, and fraction of correctly classified samples
      epoch_metrics[split] = (loss_sum/len(loader), n_correct/len(dataset))
  loss_t, acc_t = epoch_metrics["train"]
  loss_v, acc_v = epoch_metrics["valid"]

  # Save the best model
  if loss_v < best_val_loss:
    best_val_loss = loss_v
    torch.save(Model_resnet18, "BestModel_partC.pth")

  # report printing
  print(f"##### Epoch: {epoch+1}  #####")
  print(f"Train Loss = {loss_t}  Train Accuracy = {acc_t}  Validation Loss = {loss_v}  Validation Accuracy = {acc_v}")
print("Training is Finished")

##### Epoch: 1  #####
Train Loss = 1.0467313847780229  Train Accuracy = 0.6325749754905701  Validation Loss = 1.0627844510748745  Validation Accuracy = 0.6243999600410461
##### Epoch: 2  #####
Train Loss = 0.7164566014766693  Train Accuracy = 0.7508499622344971  Validation Loss = 0.774321542285121  Validation Accuracy = 0.7297999858856201
##### Epoch: 3  #####
Train Loss = 0.563320746076107  Train Accuracy = 0.8040750026702881  Validation Loss = 0.6600154584017806  Validation Accuracy = 0.7723000049591064
##### Epoch: 4  #####
Train Loss = 0.464159046292305  Train Accuracy = 0.8402249813079834  Validation Loss = 0.6178289564748922  Validation Accuracy = 0.7899999618530273
##### Epoch: 5  #####
Train Loss = 0.3230120234906673  Train Accuracy = 0.8890500068664551  Validation Loss = 0.5411715676038029  Validation Accuracy = 0.8198999762535095
##### Epoch: 6  #####
Train Loss = 0.2589898925811052  Train Accuracy = 0.9099999666213989  Validation Loss = 0.5394009427902416  Validation Accurac

### Evaluate the model on the test dataset

In [None]:
# Load the best teacher-free ResNet18 (a fully pickled module) onto the active device.
# NOTE(review): PyTorch >= 2.6 defaults torch.load to weights_only=True; loading a whole
# pickled module there requires passing weights_only=False explicitly.
Model_resnet18 = torch.load("BestModel_partC.pth", map_location=device)
# Inference mode: BatchNorm uses its running statistics and is not updated by test batches.
Model_resnet18.eval()

acc_t = 0
loss_t = 0
with torch.no_grad():  # no autograd graph is needed for evaluation
    for X, y in testloader:
        X, y = X.to(device), y.to(device)
        y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
        yhat = Model_resnet18(X)
        loss_t += loss_fn(yhat,y).item()
        acc_t += torch.sum(torch.argmax(y, dim=1)==torch.argmax(yhat,dim=1)).item()
acc_t = acc_t/len(testset)      # fraction of correctly classified test samples
loss_t /= len(testloader)       # mean of the per-batch losses

print(f"Part C:     Test Loss = {loss_t}    Test Accuracy = {acc_t}")

Part C:     Test Loss = 0.5774145754762351    Test Accuracy = 0.8134999871253967


با توجه به نتایج دو بخش ب و ج، دقت تست در حالتی که از آموزگار استفاده کرده ایم حدود 5 درصد بیشتر است و این نتیجه را به همراه دارد که استفاده از آموزگار به افزایش دقت تست منجر شده است، زیرا مدل دانش آموز از اطلاعات و ویژگی های مدل آموزگار در یادگیری خود بهره می برد.

# Part D
### Train the entire ResNet50

In [None]:
# ImageNet-pretrained ResNet50 with a new 10-way head; every layer is fine-tuned here.
weights = models.ResNet50_Weights.IMAGENET1K_V2
Model_resnet50 = models.resnet50(weights=weights)
Model_resnet50.fc = nn.Linear(Model_resnet50.fc.in_features,10)

lr = 0.001 # learning rate
epochs = 8
optimizer_RS50 = torch.optim.Adam(Model_resnet50.parameters(), lr=lr)
Model_resnet50 = Model_resnet50.to(device)


best_val_loss = 1e10
for epoch in range(epochs):
  # ---- one optimization pass over the training split ----
  Model_resnet50.train()
  for X, y in trainloader:
    X, y = X.to(device), y.to(device)
    y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
    optimizer_RS50.zero_grad()
    yhat = Model_resnet50(X)
    loss = loss_fn(yhat,y)
    loss.backward()
    optimizer_RS50.step()

  # ---- metrics on the train and validation splits ----
  # eval(): BatchNorm uses its running statistics and is not updated by these batches.
  # no_grad(): no autograd graph is built while measuring.
  Model_resnet50.eval()
  epoch_metrics = {}
  with torch.no_grad():
    for split, loader, dataset in (("train", trainloader, trainset),
                                   ("valid", validloader, validset)):
      loss_sum, n_correct = 0.0, 0
      for X, y in loader:
        X, y = X.to(device), y.to(device)
        y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
        yhat = Model_resnet50(X)
        loss_sum += loss_fn(yhat,y).item()
        n_correct += torch.sum(torch.argmax(y,dim=1)==torch.argmax(yhat,dim=1)).item()
      # mean of per-batch losses, and fraction of correctly classified samples
      epoch_metrics[split] = (loss_sum/len(loader), n_correct/len(dataset))
  loss_t, acc_t = epoch_metrics["train"]
  loss_v, acc_v = epoch_metrics["valid"]

  # Keep the checkpoint with the lowest validation loss seen so far.
  if loss_v < best_val_loss:
    best_val_loss = loss_v
    torch.save(Model_resnet50, "BestModel_partD_1.pth")

  # report printing
  print(f"##### Epoch: {epoch+1}  #####")
  print(f"Train Loss = {loss_t}  Train Accuracy = {acc_t}  Validation Loss = {loss_v}  Validation Accuracy = {acc_v}")
print("Training is Finished")

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

##### Epoch: 1  #####
Train Loss = 0.4111626893699169  Train Accuracy = 0.8620499968528748  Validation Loss = 0.4937072696681982  Validation Accuracy = 0.830299973487854
##### Epoch: 2  #####
Train Loss = 0.26940629081726075  Train Accuracy = 0.9092249870300293  Validation Loss = 0.3928187133643193  Validation Accuracy = 0.8623999953269958
##### Epoch: 3  #####
Train Loss = 0.19612814363464712  Train Accuracy = 0.9314749836921692  Validation Loss = 0.3668680572138427  Validation Accuracy = 0.8780999779701233
##### Epoch: 4  #####
Train Loss = 0.14029297150596975  Train Accuracy = 0.952174961566925  Validation Loss = 0.33070680657371926  Validation Accuracy = 0.8931999802589417
##### Epoch: 5  #####
Train Loss = 0.1161082179101184  Train Accuracy = 0.9605749845504761  Validation Loss = 0.3480715492067817  Validation Accuracy = 0.8876999616622925
##### Epoch: 6  #####
Train Loss = 0.10640660134367645  Train Accuracy = 0.9648249745368958  Validation Loss = 0.3581659682273579  Validation A

### Train ResNet18 as student

In [None]:
# Teacher: best fully fine-tuned ResNet50 checkpoint (a fully pickled module).
# NOTE(review): PyTorch >= 2.6 defaults torch.load to weights_only=True; loading a whole
# pickled module there requires passing weights_only=False explicitly.
Model_teacher = torch.load("BestModel_partD_1.pth", map_location=device)
# The teacher is only used for inference during distillation: freeze all of its
# parameters and put BatchNorm into inference mode so student training cannot alter it.
Model_teacher.requires_grad_(False)
Model_teacher.eval()

# Student: randomly initialised ResNet18 with a 10-way classification head.
Model_student = models.resnet18()
Model_student.fc = nn.Linear(Model_student.fc.in_features,10)

def distillation_loss(y, zs, zt, alpha, tau):
  """Knowledge-distillation loss combining hard labels and softened teacher outputs.

  Args:
    y: target class probabilities (one-hot floats in this notebook), same shape as `zs`.
    zs: student logits, shape (batch, num_classes).
    zt: teacher logits, same shape as `zs`; treated as constants (detached).
    alpha: weight of the distillation term; (1 - alpha) weights the hard-label term.
    tau: softmax temperature applied to both student and teacher logits.

  Returns:
    Scalar tensor: (1-alpha) * CE(zs, y) + alpha * tau^2 * CE(zs/tau, softmax(zt/tau)).
  """
  # Detach the teacher so no gradient is ever propagated into (or stored for) it.
  soft_targets = nn.functional.softmax(zt.detach()/tau, dim=1)
  # Same default cross-entropy as nn.CrossEntropyLoss(), called functionally so the
  # function does not depend on a module-level criterion object.
  hard_term = nn.functional.cross_entropy(zs, y)
  soft_term = nn.functional.cross_entropy(zs/tau, soft_targets)
  # tau**2 compensates for the 1/tau**2 scaling of the soft-target gradients.
  loss = (1-alpha)*hard_term+alpha*tau**2*soft_term
  return loss

In [None]:
lr = 0.001
alpha = 0.3
tau = 5
epochs = 20

Model_student = Model_student.to(device)
Model_teacher = Model_teacher.to(device)
# The teacher only provides targets: inference mode keeps its BatchNorm statistics fixed
# (a pickled checkpoint restores whatever train/eval flag it was saved with).
Model_teacher.eval()
optimizer_s = torch.optim.Adam(Model_student.parameters(), lr=lr)

best_val_loss = 1e10
for epoch in range(epochs):
  # ---- one distillation pass over the training split ----
  Model_student.train()
  for X, y in trainloader:
    X, y = X.to(device), y.to(device)
    y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
    optimizer_s.zero_grad()
    # Teacher logits are constants for the student: no autograd graph is built for them.
    with torch.no_grad():
      yhat_t = Model_teacher(X)
    yhat_s = Model_student(X)
    loss = distillation_loss(y, yhat_s, yhat_t, alpha, tau)
    loss.backward()
    optimizer_s.step()

  # ---- student metrics (plain cross-entropy) on the train and validation splits ----
  # eval(): BatchNorm uses its running statistics and is not updated by these batches.
  Model_student.eval()
  epoch_metrics = {}
  with torch.no_grad():
    for split, loader, dataset in (("train", trainloader, trainset),
                                   ("valid", validloader, validset)):
      loss_sum, n_correct = 0.0, 0
      for X, y in loader:
        X, y = X.to(device), y.to(device)
        y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
        yhat = Model_student(X)
        loss_sum += loss_fn(yhat,y).item()
        n_correct += torch.sum(torch.argmax(y,dim=1)==torch.argmax(yhat,dim=1)).item()
      # mean of per-batch losses, and fraction of correctly classified samples
      epoch_metrics[split] = (loss_sum/len(loader), n_correct/len(dataset))
  loss_t, acc_t = epoch_metrics["train"]
  loss_v, acc_v = epoch_metrics["valid"]

  # Save the best model
  if loss_v < best_val_loss:
    best_val_loss = loss_v
    torch.save(Model_student, "BestModel_partD_2.pth")

  # report printing
  print(f"##### Epoch: {epoch+1}  #####")
  print(f"Train Loss = {loss_t}  Train Accuracy = {acc_t}  Validation Loss = {loss_v}  Validation Accuracy = {acc_v}")
print("Training is Finished")

##### Epoch: 1  #####
Train Loss = 1.1532155022621156  Train Accuracy = 0.6640999913215637  Validation Loss = 1.172196808333595  Validation Accuracy = 0.6588000059127808
##### Epoch: 2  #####
Train Loss = 0.7690562581121921  Train Accuracy = 0.7824249863624573  Validation Loss = 0.8555902276461879  Validation Accuracy = 0.7628999948501587
##### Epoch: 3  #####
Train Loss = 0.5182725816532969  Train Accuracy = 0.8436499834060669  Validation Loss = 0.6879124032994048  Validation Accuracy = 0.8084999918937683
##### Epoch: 4  #####
Train Loss = 0.37980186268538235  Train Accuracy = 0.8798499703407288  Validation Loss = 0.6061018494942698  Validation Accuracy = 0.8264999985694885
##### Epoch: 5  #####
Train Loss = 0.27487609679996966  Train Accuracy = 0.9099499583244324  Validation Loss = 0.5368820542035202  Validation Accuracy = 0.8468999862670898
##### Epoch: 6  #####
Train Loss = 0.1834117807213217  Train Accuracy = 0.9375999569892883  Validation Loss = 0.5061329411455808  Validation Acc

### Evaluate the model on the test dataset

In [None]:
# Load the best Part D student (a fully pickled module) onto the active device.
# NOTE(review): PyTorch >= 2.6 defaults torch.load to weights_only=True; loading a whole
# pickled module there requires passing weights_only=False explicitly.
Model_student = torch.load("BestModel_partD_2.pth", map_location=device)
# Inference mode: BatchNorm uses its running statistics and is not updated by test batches.
Model_student.eval()

acc_t = 0
loss_t = 0
with torch.no_grad():  # no autograd graph is needed for evaluation
    for X, y in testloader:
        X, y = X.to(device), y.to(device)
        y = torch.reshape(y, (y.shape[0],10)).to(torch.float)
        yhat = Model_student(X)
        loss_t += loss_fn(yhat,y).item()
        acc_t += torch.sum(torch.argmax(y, dim=1)==torch.argmax(yhat,dim=1)).item()
acc_t = acc_t/len(testset)      # fraction of correctly classified test samples
loss_t /= len(testloader)       # mean of the per-batch losses

print(f"Part D:     Test Loss = {loss_t}    Test Accuracy = {acc_t}")

Part D:     Test Loss = 0.41297273423534614    Test Accuracy = 0.8768999576568604


در این حالت که آموزگار را به طور کامل فاین تیون کردیم، دقت تست روی مدل دانش آموز نسبت به قسمت ب، 0.8 درصد افزایش داشت، زیرا در این حالت از آن جا که دقت مدل آموزگار بیشتر از بخش الف است و به عبارتی مدل آموزگار بهتر روی دادگان فیت شده است، دارای اطلاعات بیشتری بوده و مدل دانش آموز را بهتر آموزش می دهد.