In [1]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

In [2]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# Ask PyTorch to release GPU memory it has cached but is not currently using.
# NOTE(review): presumably here to reclaim memory left over from earlier runs
# in the same kernel — confirm it is still needed.
torch.cuda.empty_cache()

In [4]:
# preprocess
# Per-channel normalisation statistics, applied identically to train and test images.
NORM_MEAN = (0.4914, 0.4822, 0.4465)
NORM_STD = (0.2023, 0.1994, 0.2010)

# Settings shared by both data loaders.
loader_kwargs = dict(batch_size=128, num_workers=2)

# Training pipeline: random augmentation, then tensor conversion and normalisation.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

# Test pipeline: no augmentation, only tensor conversion and normalisation.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

# CIFAR-10 training split, reshuffled by the loader on every pass.
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train,
)
trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, **loader_kwargs)

# CIFAR-10 test split, iterated in a fixed order.
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform_test,
)
testloader = torch.utils.data.DataLoader(testset, shuffle=False, **loader_kwargs)

Files already downloaded and verified
Files already downloaded and verified


In [5]:
# GELU
class Bottleneck(nn.Module):
    """Residual block made of 1x1 -> 3x3 -> 1x1 convolutions with GELU activations.

    The block emits ``planes * expansion`` channels and the 3x3 convolution
    carries ``stride``. When ``stride`` is not 1, or the input channel count
    differs from the output channel count, the skip path is a strided 1x1
    convolution followed by batch norm; otherwise it is an empty
    ``nn.Sequential`` that hands the input through unchanged.

    Args:
        in_planes: number of channels of the incoming feature map.
        planes: channel width of the two inner convolutions.
        stride: stride of the 3x3 convolution (and of the projection shortcut).
    """

    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = planes * self.expansion

        # Main branch. The creation order is kept as conv1, conv2, conv3,
        # shortcut so that state_dict keys and seeded initialisation stay stable.
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        # Skip path: project only when the shapes of input and output differ.
        needs_projection = stride != 1 or in_planes != out_planes
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``gelu(main_branch(x) + shortcut(x))``."""
        out = self.bn1(self.conv1(x))
        out = F.gelu(out)
        out = self.bn2(self.conv2(out))
        out = F.gelu(out)
        out = self.bn3(self.conv3(out))
        # In-place add onto the bn3 output; ``x`` itself is not modified.
        out += self.shortcut(x)
        return F.gelu(out)

In [6]:
# Half filters
# GELU
class ResNet(nn.Module):
    """ResNet-style classifier with a 3x3 stem and four residual stages.

    The stages use base widths 32, 64, 128 and 256; the first runs at stride 1
    and the remaining three at stride 2. Activations are GELU. The final
    feature map is average-pooled over its full spatial extent and fed to a
    single linear layer.

    Args:
        block: residual block class. It is called as
            ``block(in_planes, planes, stride)`` and must expose an integer
            ``expansion`` class attribute.
        num_blocks: sequence of four ints, the number of blocks per stage.
        num_classes: size of the output logit vector.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Channel count fed to the next block; _make_layer advances it.
        self.in_planes = 32

        # Stem: one 3x3 convolution that keeps the spatial resolution.
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)

        # Residual stages, built in order because each one reads self.in_planes.
        self.layer1 = self._make_layer(block, 32, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 64, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 128, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 256, num_blocks[3], stride=2)

        self.linear = nn.Linear(256 * block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage: the first block uses ``stride``, the rest stride 1.

        Updates ``self.in_planes`` to ``planes * block.expansion`` as a side
        effect so the next block (and the next stage) sees the right width.
        """
        # Same list as [stride] + [1] * (num_blocks - 1): always at least one entry.
        block_strides = [stride, *([1] * (num_blocks - 1))]
        stage = []
        for block_stride in block_strides:
            stage.append(block(self.in_planes, planes, block_stride))
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage)

    def forward(self, x):
        """Map a batch of images to a ``(batch, num_classes)`` tensor of logits."""
        features = F.gelu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            features = stage(features)
        # Average over the whole remaining spatial extent, whatever its size.
        pooled = F.avg_pool2d(features, features.shape[2:])
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)

def Net():
    """Build the notebook's network: a ResNet of Bottleneck blocks with stage depths [3, 4, 6, 3]."""
    blocks_per_stage = [3, 4, 6, 3]
    return ResNet(Bottleneck, blocks_per_stage)

In [7]:
# Network, moved to the selected device.
model = Net()
model = model.to(device)

# Classification loss on raw logits.
criterion = nn.CrossEntropyLoss()

# Adam with L2 weight decay, on a cosine learning-rate schedule spanning 100 scheduler steps.
optimizer = optim.Adam(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)

In [8]:
def train_model(num_epochs=100):
    """Train the module-level ``model`` on ``trainloader`` for ``num_epochs`` epochs.

    Relies on the notebook globals ``model``, ``trainloader``, ``device``,
    ``criterion``, ``optimizer`` and ``scheduler``. Every 100 steps it prints
    the mean loss of those 100 steps; loss accumulated after the last full
    window of an epoch is dropped when the next epoch resets the counter.
    The scheduler is stepped once at the end of each epoch.

    Args:
        num_epochs: number of passes over ``trainloader``.
    """
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, (images, labels) in enumerate(trainloader):
            images = images.to(device)
            labels = labels.to(device)

            # Clear old gradients, then run forward / backward / update.
            optimizer.zero_grad()
            loss = criterion(model(images), labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if (i + 1) % 100 != 0:
                continue
            # A full window of 100 steps has elapsed: report its mean and restart.
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}], Loss: {running_loss/100:.4f}')
            running_loss = 0.0
        scheduler.step()


In [9]:
def test_model():
    """Print the top-1 accuracy of the module-level ``model`` on ``testloader``.

    Relies on the notebook globals ``model``, ``testloader`` and ``device``.
    Switches ``model`` to eval mode and leaves it there; nothing is returned.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in testloader:
            images = images.to(device)
            labels = labels.to(device)
            # Predicted class = index of the largest logit in each row.
            predicted = torch.max(model(images), dim=1).indices
            total += len(labels)
            correct += torch.eq(predicted, labels).sum().item()
    print(f'Accuracy on test set: {100 * correct / total:.2f}%')
    

In [10]:
# Run the full 100-epoch training schedule (same length as the scheduler's T_max=100).
train_model(num_epochs=100)

Epoch [1/100], Step [100], Loss: 2.0737
Epoch [1/100], Step [200], Loss: 1.7491
Epoch [1/100], Step [300], Loss: 1.5644
Epoch [2/100], Step [100], Loss: 1.2543
Epoch [2/100], Step [200], Loss: 1.1613
Epoch [2/100], Step [300], Loss: 1.0847
Epoch [3/100], Step [100], Loss: 0.9321
Epoch [3/100], Step [200], Loss: 0.9054
Epoch [3/100], Step [300], Loss: 0.8697
Epoch [4/100], Step [100], Loss: 0.7927
Epoch [4/100], Step [200], Loss: 0.7568
Epoch [4/100], Step [300], Loss: 0.7325
Epoch [5/100], Step [100], Loss: 0.6692
Epoch [5/100], Step [200], Loss: 0.6491
Epoch [5/100], Step [300], Loss: 0.6439
Epoch [6/100], Step [100], Loss: 0.5916
Epoch [6/100], Step [200], Loss: 0.5783
Epoch [6/100], Step [300], Loss: 0.5753
Epoch [7/100], Step [100], Loss: 0.5391
Epoch [7/100], Step [200], Loss: 0.5324
Epoch [7/100], Step [300], Loss: 0.5341
Epoch [8/100], Step [100], Loss: 0.5105
Epoch [8/100], Step [200], Loss: 0.5019
Epoch [8/100], Step [300], Loss: 0.4939
Epoch [9/100], Step [100], Loss: 0.4633


In [11]:
# Measure accuracy on the test loader; the result is printed, not returned.
test_model()

Accuracy on test set: 93.58%


In [12]:
# save model
# Checkpoint file for the trained weights (state_dict only, not the whole module).
path = './models/resnet_gelu.pth'
# torch.save does not create missing parent directories, so make sure the
# target folder exists; otherwise the save fails after the long training run.
os.makedirs(os.path.dirname(path), exist_ok=True)
torch.save(model.state_dict(), path)

In [13]:
# Continue training the same model for up to 20 more epochs.
# NOTE(review): `scheduler` was created with T_max=100, so if the 100-epoch run
# above completed, these epochs step the cosine schedule past T_max — confirm
# that this is intended.
train_model(num_epochs=20)

Epoch [1/20], Step [100], Loss: 0.0047
Epoch [1/20], Step [200], Loss: 0.0060
Epoch [1/20], Step [300], Loss: 0.0044
Epoch [2/20], Step [100], Loss: 0.0036
Epoch [2/20], Step [200], Loss: 0.0045
Epoch [2/20], Step [300], Loss: 0.0046
Epoch [3/20], Step [100], Loss: 0.0048
Epoch [3/20], Step [200], Loss: 0.0043
Epoch [3/20], Step [300], Loss: 0.0054
Epoch [4/20], Step [100], Loss: 0.0052
Epoch [4/20], Step [200], Loss: 0.0040
Epoch [4/20], Step [300], Loss: 0.0051
Epoch [5/20], Step [100], Loss: 0.0043
Epoch [5/20], Step [200], Loss: 0.0039
Epoch [5/20], Step [300], Loss: 0.0036
Epoch [6/20], Step [100], Loss: 0.0044
Epoch [6/20], Step [200], Loss: 0.0043
Epoch [6/20], Step [300], Loss: 0.0046
Epoch [7/20], Step [100], Loss: 0.0048
Epoch [7/20], Step [200], Loss: 0.0048
Epoch [7/20], Step [300], Loss: 0.0038
Epoch [8/20], Step [100], Loss: 0.0047
Epoch [8/20], Step [200], Loss: 0.0040
Epoch [8/20], Step [300], Loss: 0.0045
Epoch [9/20], Step [100], Loss: 0.0053
Epoch [9/20], Step [200],

KeyboardInterrupt: 

从损失来看，追加训练期间（计划 20 个 epoch，在第 9 个 epoch 手动中断）训练损失一直在 0.004～0.006 左右波动，没有继续下降，说明前 100 个 epoch 已经基本达到理想状态了。