## 导入必要的库

In [1]:
# Switch the working directory to the parent of the notebook's folder, so the
# relative paths used by later cells (./python, ./data) resolve from there.
%cd ..
# Run the default `make` target through the shell; the recorded output below
# shows pybind11 being found and the CUDA backend being built.
!make

/root/dlsys/needle
-- Found pybind11: /usr/local/include (found version "2.13.0dev1")
-- Found cuda, building cuda backend
Thu May  2 17:56:00 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 517.00       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| N/A   47C    P8     6W /  N/A |     10MiB /  6144MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                             

In [2]:
# Export PYTHONPATH=./python and NEEDLE_BACKEND=nd into the kernel's environment
# (the recorded output below echoes both values).
%set_env PYTHONPATH ./python
%set_env NEEDLE_BACKEND nd

import sys
# PYTHONPATH is only read when an interpreter starts, so the already-running
# kernel needs the package directory appended to sys.path explicitly as well.
sys.path.append('./python')

env: PYTHONPATH=./python
env: NEEDLE_BACKEND=nd


In [3]:
import numpy as np
import needle
import needle.nn as nn
from needle.data import Dataset, DataLoader
import glob 
import random
from tqdm import tqdm

Using needle backend


## 使用cuda作为NDArray的后端

In [4]:
# Obtain needle's CUDA device object; later cells pass it to the model
# constructor and move every batch onto it with `.to(device)`.
device = needle.cuda()
print(device)

cuda()


#### **数据集 (CIFAR10Dataset)**

In [5]:
import urllib.request
import os

# `mkdir -p` creates ./data/ptb together with its parent ./data, which is the
# directory the archive below is downloaded into and extracted under.
!mkdir -p './data/ptb'

# Download CIFAR-10 dataset
# The download and extraction are skipped when the extracted folder already exists.
if not os.path.isdir("./data/cifar-10-batches-py"):
    urllib.request.urlretrieve("https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz", "./data/cifar-10-python.tar.gz")
    # Unpack (verbose, gzip) into ./data; the check above expects this to
    # produce ./data/cifar-10-batches-py.
    !tar -xvzf './data/cifar-10-python.tar.gz' -C './data'

#### **定义 DataLoader**

In [None]:
# Mini-batch size shared by the training and test loaders.
batch_size = 128

# Build the training split first, then the test split, from the same folder.
train_dataset, test_dataset = (
    needle.data.CIFAR10Dataset("data/cifar-10-batches-py", train=is_train)
    for is_train in (True, False)
)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=do_shuffle)
    for split, do_shuffle in ((train_dataset, True), (test_dataset, False))
)

### 使用needle定义模型

In [None]:
class ConvBN(nn.Module):
    """Convolution followed by 2-D batch normalisation and a ReLU.

    Args:
        a: first positional argument of ``nn.Conv`` (presumably the number of
            input channels - confirm against ``nn.Conv``).
        b: second positional argument of ``nn.Conv``; also the feature count
            given to ``nn.BatchNorm2d``.
        k: third positional argument of ``nn.Conv`` (presumably kernel size).
        s: fourth positional argument of ``nn.Conv`` (presumably stride).
        device: forwarded to the convolution and batch-norm layers.
        dtype: forwarded to the convolution and batch-norm layers.
    """

    def __init__(self, a, b, k, s, device=None, dtype="float32") -> None:
        # Run the base-class initialiser first, as ResNet34 does, so whatever
        # state nn.Module sets up exists before sub-modules are attached.
        super().__init__()
        self.a = a
        self.b = b
        self.k = k
        self.s = s

        self.conv2d = nn.Conv(a, b, k, s, device=device, dtype=dtype)
        self.batch_norm = nn.BatchNorm2d(b, device=device, dtype=dtype)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply conv -> batch norm -> ReLU to ``x`` and return the result."""
        x = self.conv2d(x)
        x = self.batch_norm(x)
        return self.relu(x)
    

## Pooling is not implemented yet,
## so strided convolutions are used in its place.
class ResNet34(nn.Module):
    """Stack of ConvBN blocks with two residual stages and a linear classifier.

    Args:
        num_classes: output size of the final ``nn.Linear`` layer.
        device: forwarded to every layer.
        dtype: forwarded to every layer.
    """

    def __init__(self, num_classes, device=None, dtype="float32"):
        super().__init__()
        self.layer1 = nn.Sequential(
            ConvBN(3, 64, 7, 4, device=device, dtype=dtype),
            ConvBN(64, 128, 3, 2, device=device, dtype=dtype),
        )
        # Wrapped in nn.Residual in forward(), so input and output widths match.
        self.layer2 = nn.Sequential(
            ConvBN(128, 128, 3, 1, device=device, dtype=dtype),
            ConvBN(128, 128, 3, 1, device=device, dtype=dtype),
            ConvBN(128, 128, 3, 1, device=device, dtype=dtype),
            ConvBN(128, 128, 3, 1, device=device, dtype=dtype),
        )
        # The second block already widens to 512 channels, so every following
        # block must accept 512 inputs (these were declared with 256, which
        # does not match the preceding block's output width).
        self.layer3 = nn.Sequential(
            ConvBN(128, 256, 3, 2, device=device, dtype=dtype),
            ConvBN(256, 512, 3, 2, device=device, dtype=dtype),
            ConvBN(512, 512, 3, 2, device=device, dtype=dtype),
            ConvBN(512, 512, 3, 2, device=device, dtype=dtype),
            ConvBN(512, 512, 3, 2, device=device, dtype=dtype),
            ConvBN(512, 512, 3, 2, device=device, dtype=dtype),
        )
        # Also used as a residual branch: 512 channels in, 512 out.
        self.layer4 = nn.Sequential(
            ConvBN(512, 512, 3, 1, device=device, dtype=dtype),
            ConvBN(512, 512, 3, 1, device=device, dtype=dtype),
            ConvBN(512, 512, 3, 1, device=device, dtype=dtype),
        )
        self.linear1 = nn.Linear(512, num_classes, device=device, dtype=dtype)

    def forward(self, x):
        """Run layer1, residual layer2, layer3, residual layer4, flatten, linear."""
        x = self.layer1(x)
        # The Residual wrappers are created per call; layer2 / layer4 stay the
        # only attributes that hold the wrapped sub-modules.
        x = nn.Residual(self.layer2)(x)
        x = self.layer3(x)
        x = nn.Residual(self.layer4)(x)
        # NOTE(review): linear1 expects 512 features, which assumes the strided
        # blocks have reduced the spatial size to 1x1 by this point - confirm.
        x = nn.Flatten()(x)
        x = self.linear1(x)
        return x

## 训练和测试

### 超参数设置

在开始训练网络前，我们需要设置一些超参数。网络训练的超参数包括 epoch、batch size、学习率等。可以通过搜索尝试的方式找出最佳的超参数。

In [None]:
"""
这里提供的超参数不一定是最优的超参数，建议自行对超参数的大小进行一下搜索。
"""
num_classes, num_epochs = 10, 10  # number of classes, number of training epochs
learning_rate = 1e-4

model = ResNet34(num_classes=num_classes, device=device)

# Loss function and optimiser.
# Other, more advanced optimisers (e.g. Adam) can be tried here as well.
criterion = nn.SoftmaxLoss()
optimizer = needle.optim.Adam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=1e-4,
)

### 模型训练


现在，模型已经准备好进行训练了，在这里我们先了解一下在 PyTorch 中模型训练的步骤：

* 首先使用 <code>train_loader</code> 为每个批量加载输入的图像，并使用之前定义的设备变量 ```device``` 将数据移动到 GPU上

* 随后使用模型 ```model``` 对数据标签进行预测，使用之前定义的损失函数 ```criterion```  计算预测值与真实值之间的损失 

* 模型根据损失学习新的参数值，我们将损失值输入到反向传播方法中，<code>loss.backward()</code>，并更新权重，<code>optimizer.step()</code>。 每次更新之前，需要首先将参数的梯度设置为零（PyTorch 中使用 ```optimizer.zero_grad()```，needle 中对应的是 ```optimizer.reset_grad()```），否则梯度会不断累积，之前计算得到的梯度仍然存在（这是 PyTorch 的默认行为）

* 在每个 epoch 的最后，需要在验证集上测试模型，由于我们在评估时不需要梯度，因此可以使用 <code>torch.no_grad()</code> 将梯度计算关闭，加快模型推导速度。

### 任务 4（10 Points）
请对训练代码进行补充。

In [None]:
import gc

# One scalar loss per optimisation step, in the order the steps were taken.
train_loss_lst = []

model.train()
for epoch in range(num_epochs):
    # Scalar loss of the most recent batch; stays NaN if the loader yields
    # nothing, so the epoch summary below can always be printed.
    loss_value = float('nan')
    with tqdm(train_loader, desc='Train [{}/{}]'.format(epoch+1, num_epochs), mininterval=1.0) as t:
        for batch in t:
            images, labels = batch[0].to(device), batch[1].to(device)

            # Clear gradients left over from the previous step before backprop.
            optimizer.reset_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Convert the loss to a Python float once and reuse it, instead of
            # calling .numpy() separately for the list, the bar and the print.
            loss_value = loss.numpy().item()
            train_loss_lst.append(loss_value)
            t.set_postfix(loss=loss_value)

            # Drop `loss` together with the other tensors: it is the result of
            # the forward pass and presumably keeps that computation reachable,
            # so leaving it alive would defeat the explicit collection below.
            del images, labels, outputs, loss
            gc.collect()

    # Reports the loss of the last batch processed in this epoch.
    print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss_value))

### 训练损失曲线可视化

以下的代码用于将训练过程中训练集上的损失可视化为曲线，并展示结果（当前实现只绘制训练损失，不包含验证集上的精确度）。

In [None]:
import matplotlib.pyplot as plt


def learning_curve(train_loss_lst, num_epochs, _ylabel1='train loss', smooth_factor=10):
    """Plot recorded training losses against training progress measured in epochs.

    Args:
        train_loss_lst: sequence of loss values in recording order; it may hold
            any number of values (e.g. one per batch), not just one per epoch.
        num_epochs: number of epochs the recorded losses span; the x axis is
            scaled so the whole sequence covers ``[0, num_epochs)``.
        _ylabel1: label of the y axis.
        smooth_factor: positive integer; only every ``smooth_factor``-th loss
            is drawn, so larger values give a sparser, smoother-looking curve.

    Raises:
        AssertionError: if ``smooth_factor`` is not a positive integer.
    """
    # Check the type first so a non-numeric argument fails the assertion
    # instead of raising TypeError from the comparison.
    assert isinstance(smooth_factor, int) and smooth_factor > 0

    # Derive x from the positions of the losses that are actually drawn, so x
    # and y always have the same length, whatever the number of recorded steps.
    n_points = len(train_loss_lst)
    kept_steps = range(0, n_points, smooth_factor)
    train_loss_x = [step * num_epochs / n_points for step in kept_steps]
    train_loss_y = train_loss_lst[::smooth_factor]

    fig, ax1 = plt.subplots()
    color = 'tab:blue'
    ax1.set_xlabel('epoch')
    ax1.set_ylabel(_ylabel1, color=color)
    ax1.plot(train_loss_x, train_loss_y, color=color)
    ax1.tick_params(axis='y', labelcolor=color)

    fig.tight_layout()
    plt.show()


In [None]:
# Print a banner, then plot the losses collected by the training cell;
# smooth_factor=3 makes learning_curve slice the list with a step of 3.
print('========================= LEARNING CURVE =======================')
learning_curve(train_loss_lst, num_epochs, smooth_factor=3)

### 模型测试

In [None]:
model.eval()
correct = 0
total = 0
for images, labels in tqdm(test_loader, desc='Test', mininterval=1.0):
    images = images.to(device)
    outputs = model(images)

    # Predicted class = index of the largest score in each row.
    pred = np.argmax(outputs.numpy(), axis=1)
    # Compare on the host, NumPy array against NumPy array. Comparing `pred`
    # with the tensor object itself is not a well-defined element-wise test,
    # and the labels are not needed on the device for this step.
    target = np.asarray(labels.numpy()).reshape(-1)

    total += target.shape[0]
    correct += int((pred == target).sum())
    del images, labels, outputs

# Guard against an empty loader so the summary line never divides by zero.
accuracy = 100 * correct / total if total else 0.0
print('Accuracy of the network on the {} test images: {:.4f} %'.format(len(test_dataset), accuracy))