## ДЗ 6:
### Экспорт моделей в различные форматы: TensorRT, ONNX

In [1]:
import os

In [2]:
# Pin this process to a single GPU: devices are enumerated in PCI bus order
# and only the one with index 3 is made visible.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '3',
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
})

In [3]:
import ultralytics
from ultralytics import YOLO

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets

# Run on CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# Seed PyTorch's global random number generator so repeated runs start from
# the same random state.
torch.manual_seed(42)

<torch._C.Generator at 0x7f3745b3ff50>

In [6]:
# CIFAR-10 preprocessing: convert each image to a tensor, then normalise every
# channel. The mean/std values are the usual ImageNet channel statistics
# (matching the ImageNet-pretrained teacher), not statistics computed on CIFAR-10.
_normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
transforms_cifar = transforms.Compose([transforms.ToTensor(), _normalize])

# Training split first, then the held-out test split; both are downloaded to
# ./data if missing and share the same preprocessing.
train_dataset, test_dataset = (
    datasets.CIFAR10(root='./data', train=is_train, download=True, transform=transforms_cifar)
    for is_train in (True, False)
)

Files already downloaded and verified
Files already downloaded and verified


In [7]:
# Number of target classes, read off the size of the dataset's `class_to_idx`
# mapping; used below to size the teacher's classification head.
num_classes = len(train_dataset.class_to_idx)

In [8]:
# Both splits are served in batches of 128. Only the training split is
# shuffled; the test split keeps its order.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=10,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=128,
    shuffle=False,
    num_workers=4,
)

In [9]:
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights

In [10]:
# Teacher: pretrained EfficientNet-B0. `weights` must be a member of the
# weights enum; passing the enum class itself only works through torchvision's
# deprecated legacy `pretrained` code path.
teacher = efficientnet_b0(weights=EfficientNet_B0_Weights.DEFAULT)

# Replace the classification head with one sized for this dataset. The input
# width is read from the stock head's final linear layer rather than hard-coded.
head_in_features = teacher.classifier[-1].in_features
teacher.classifier = nn.Sequential(
    nn.Dropout(p=0.3, inplace=True),
    nn.Linear(in_features=head_in_features, out_features=num_classes)
)

teacher.to(device)
# Last statement of the cell: keeps the notebook from echoing the module repr.
print()






In [11]:
class StudentNN(nn.Module):
    """Compact CNN classifier used as the distillation student.

    Two conv -> ReLU -> 2x max-pool stages (3 -> 16 -> 16 channels) feed a
    two-layer MLP head. The head's first linear layer takes 1024 features,
    i.e. 16 channels x 64 spatial positions, which is what a 32x32 input
    yields after the two poolings.

    Args:
        num_classes: Number of output logits.
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Return the layers of one conv/ReLU/max-pool stage, in order."""
        return [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]

    def __init__(self, num_classes=10):
        super().__init__()
        # Stages are built in forward order so layer indices stay 0..5.
        self.features = nn.Sequential(
            *self._conv_stage(3, 16),
            *self._conv_stage(16, 16),
        )
        self.classifier = nn.Sequential(
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Map a batch of images to class logits."""
        feature_maps = self.features(x)
        # Keep the batch dimension, collapse everything else.
        flat = feature_maps.flatten(start_dim=1)
        return self.classifier(flat)

In [12]:
def train(model, train_loader, epochs, learning_rate, device):
    """Train ``model`` with cross-entropy loss and AdamW, printing the mean loss per epoch.

    Args:
        model: ``nn.Module`` mapping an input batch to class logits. It is
            moved to ``device`` in place before the optimizer is created.
        train_loader: Iterable yielding ``(inputs, labels)`` batches. It does
            not have to define ``__len__``.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate passed to ``optim.AdamW``.
        device: Device the model and every batch are placed on.

    Returns:
        None. The model is updated in place and left in training mode.
    """
    criterion = nn.CrossEntropyLoss()
    # Same device handling as `test`: the caller does not need to move the
    # model beforehand.
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

    model.train()

    for epoch in range(epochs):
        running_loss = 0.0
        # Count batches while iterating instead of calling len(train_loader),
        # which fails for loaders/iterables that do not define a length.
        num_batches = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # An epoch that saw no batches has no defined mean loss.
        mean_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss}")

In [13]:
def test(model, test_loader, device):
    model.to(device)
    model.eval()

    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")
    return accuracy

In [14]:
# Fine-tune the teacher on the training split for 10 epochs.
train(
    model=teacher,
    train_loader=train_loader,
    epochs=10,
    learning_rate=3e-4,
    device=device,
)

Epoch 1/10, Loss: 1.3108956644602139
Epoch 2/10, Loss: 0.7538900801440334
Epoch 3/10, Loss: 0.5705682607105625
Epoch 4/10, Loss: 0.4483378721624994
Epoch 5/10, Loss: 0.36057517340268624
Epoch 6/10, Loss: 0.2881256671398497
Epoch 7/10, Loss: 0.24069182258432784
Epoch 8/10, Loss: 0.19387548782712663
Epoch 9/10, Loss: 0.16705116221819388
Epoch 10/10, Loss: 0.14473693817853928


In [15]:
# Accuracy (in percent) of the teacher on the test split.
test_accuracy_teacher = test(model=teacher, test_loader=test_loader, device=device)

Test Accuracy: 82.57%


In [16]:
# Baseline student: trained on the labels only, without any teacher signal.
student = StudentNN().to(device)
# Emits the same blank output line the cell produced before.
print()




In [17]:
# Train the baseline student with the same schedule as the teacher.
train(
    model=student,
    train_loader=train_loader,
    epochs=10,
    learning_rate=3e-4,
    device=device,
)

Epoch 1/10, Loss: 1.6700829374210915
Epoch 2/10, Loss: 1.3536559404314632
Epoch 3/10, Loss: 1.2380183339118958
Epoch 4/10, Loss: 1.1539991257135824
Epoch 5/10, Loss: 1.0868773929908147
Epoch 6/10, Loss: 1.0336100071897287
Epoch 7/10, Loss: 0.9885983236915316
Epoch 8/10, Loss: 0.9435950840830498
Epoch 9/10, Loss: 0.903449016763731
Epoch 10/10, Loss: 0.8690053416639948


In [18]:
# Accuracy (in percent) of the baseline student on the test split.
test_accuracy_student = test(model=student, test_loader=test_loader, device=device)

Test Accuracy: 65.70%


In [22]:
# Second student with the same architecture, to be trained with knowledge
# distillation. NOTE(review): it is initialised from whatever RNG state is
# current, so its starting weights differ from the baseline student's.
new_student = StudentNN().to(device)
# Emits the same blank output line the cell produced before.
print()




In [23]:
def train_knowledge_distillation(teacher, student, train_loader, epochs, learning_rate, T, soft_target_loss_weight, ce_loss_weight, device):
    """Train ``student`` on the true labels and on the teacher's softened predictions.

    The per-batch loss is
    ``soft_target_loss_weight * soft_loss + ce_loss_weight * label_loss``, where
    ``label_loss`` is the cross-entropy of the student logits against the
    labels and ``soft_loss`` is the batch-mean cross-entropy between the
    temperature-softened teacher and student distributions, scaled by ``T**2``.

    Args:
        teacher: ``nn.Module`` providing the soft targets. It is moved to
            ``device``, put in evaluation mode and never updated.
        student: ``nn.Module`` being trained. It is moved to ``device`` and
            left in training mode.
        train_loader: Iterable yielding ``(inputs, labels)`` batches. It does
            not have to define ``__len__``.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate passed to ``optim.Adam``.
        T: Temperature dividing both sets of logits before the softmax.
        soft_target_loss_weight: Weight of the distillation term.
        ce_loss_weight: Weight of the true-label cross-entropy term.
        device: Device both models and every batch are placed on.

    Returns:
        None. The mean combined loss is printed once per epoch.
    """
    ce_loss = nn.CrossEntropyLoss()
    # Same device handling as `test`: callers do not need to move the models
    # beforehand.
    teacher.to(device)
    student.to(device)
    optimizer = optim.Adam(student.parameters(), lr=learning_rate)

    teacher.eval()  # Teacher set to evaluation mode
    student.train() # Student to train mode

    for epoch in range(epochs):
        running_loss = 0.0
        # Count batches while iterating instead of calling len(train_loader),
        # which fails for loaders/iterables that do not define a length.
        num_batches = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            # The teacher's weights are not updated, so its forward pass does
            # not need gradients.
            with torch.no_grad():
                teacher_logits = teacher(inputs)

            # Forward pass with the student model
            student_logits = student(inputs)

            # Soften both distributions with temperature T: probabilities for
            # the teacher, log-probabilities for the student.
            soft_targets = nn.functional.softmax(teacher_logits / T, dim=-1)
            soft_prob = nn.functional.log_softmax(student_logits / T, dim=-1)

            # Batch-mean cross-entropy between the softened distributions,
            # scaled by T**2 as suggested in "Distilling the Knowledge in a
            # Neural Network".
            soft_targets_loss = -torch.sum(soft_targets * soft_prob) / soft_prob.size(0) * (T**2)

            # Calculate the true label loss
            label_loss = ce_loss(student_logits, labels)

            # Weighted sum of the two losses
            loss = soft_target_loss_weight * soft_targets_loss + ce_loss_weight * label_loss

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # An epoch that saw no batches has no defined mean loss.
        mean_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss}")

In [24]:
# Distil the teacher into `new_student`: temperature 2, with the loss weighted
# 0.25 on the soft-target term and 0.75 on the true-label cross-entropy.
# NOTE(review): the learning rate here is 3e-3, while the baseline student was
# trained with 3e-4 (and AdamW rather than Adam), so the accuracy comparison
# below is not like-for-like — confirm this is intended.
train_knowledge_distillation(
    teacher=teacher,
    student=new_student,
    train_loader=train_loader,
    epochs=10,
    learning_rate=3e-3,
    T=2,
    soft_target_loss_weight=0.25,
    ce_loss_weight=0.75,
    device=device
)

Epoch 1/10, Loss: 2.6183919107822504
Epoch 2/10, Loss: 2.0680025099488475
Epoch 3/10, Loss: 1.8383152500137954
Epoch 4/10, Loss: 1.6770722555077595
Epoch 5/10, Loss: 1.5451056182841816
Epoch 6/10, Loss: 1.427340096220031
Epoch 7/10, Loss: 1.3362619992717149
Epoch 8/10, Loss: 1.2390680241462824
Epoch 9/10, Loss: 1.1710671430353619
Epoch 10/10, Loss: 1.1042206671536732


In [25]:
# Evaluate the distilled student. The model trained with knowledge
# distillation is `new_student`; `new_student_student` is not defined anywhere.
test_accuracy_student_kd = test(new_student, test_loader, device)

Test Accuracy: 70.24%


In [26]:
# Summary of the three test accuracies, one model per line.
print(
    f"Teacher accuracy: {test_accuracy_teacher:.2f}%",
    f"Student accuracy: {test_accuracy_student:.2f}%",
    f"Student + KD accuracy: {test_accuracy_student_kd:.2f}%",
    sep="\n",
)

Teacher accuracy: 82.57%
Student accuracy: 65.70%
Student + KD accuracy: 70.24%


In [27]:
# Total number of parameter elements per model, formatted with thousands
# separators. Only `student` is counted on the student side.
total_params_teacher = format(sum(w.numel() for w in teacher.parameters()), ",")
total_params_student = format(sum(w.numel() for w in student.parameters()), ",")
print(f"Teacher parameters: {total_params_teacher}")
print(f"Student parameters: {total_params_student}")

Teacher parameters: 4,020,358
Student parameters: 267,738
