In [1]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import cv2
from PIL import Image
import numpy as np
import os, time
import random
random.seed(0)


readvdnames = lambda x: open(x).read().rstrip().split('\n')

################################# DEFINE DATASET #################################
class TinySegData(Dataset):
    def __init__(self, db_root="TinySeg", img_size=256, phase='train'):
        classes = ['person', 'bird', 'car', 'cat', 'plane', ]
        seg_ids = [1, 2, 3, 4, 5]

        templ_image = db_root + "/JPEGImages/{}.jpg"
        templ_mask = db_root + "/Annotations/{}.png"

        ids = readvdnames(db_root + "/ImageSets/" + phase + ".txt")

        # build training and testing dbs
        samples = []
        for i in ids:
            samples.append([templ_image.format(i), templ_mask.format(i)])
        self.samples = samples
        self.phase = phase
        self.db_root = db_root
        self.img_size = img_size

        self.color_transform = torchvision.transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.2)

        if not self.phase == 'train':
            print ("resize and augmentation will not be applied...")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        if self.phase == 'train':
            return self.get_train_item(idx)
        else:
            return self.get_train_item(idx)

    def get_train_item(self, idx):
        sample = self.samples[idx]
        image = Image.open(sample[0])

        if random.randint(0, 1) > 0:
            image = self.color_transform(image)
        image = np.asarray(image)[..., ::-1]     # to BGR
        seg_gt = (np.asarray(Image.open(sample[1]).convert('P'))).astype(np.uint8)

        image = image.astype(np.float32)
        image = image / 127.5 - 1        # -1~1

        if random.randint(0, 1) > 0:
            image = image[:, ::-1, :]       # HWC
            seg_gt = seg_gt[:, ::-1]

        # random crop to 256x256
        height, width = image.shape[0], image.shape[1]
        if height == width:
            miny, maxy = 0, 256
            minx, maxx = 0, 256
        elif height > width:
            miny = np.random.randint(0, height-256)
            maxy = miny+256
            minx = 0
            maxx = 256
        else:
            miny = 0
            maxy = 256
            minx = np.random.randint(0, width-256)
            maxx = minx+256
        image = image[miny:maxy, minx:maxx, :].copy()
        seg_gt = seg_gt[miny:maxy, minx:maxx].copy()

        if self.img_size != 256:
            new_size = (self.img_size, self.img_size)
            image = cv2.resize(image, new_size, interpolation=cv2.INTER_LINEAR)
            seg_gt = cv2.resize(seg_gt, new_size, interpolation=cv2.INTER_NEAREST)

        image = np.transpose(image, (2, 0, 1))      # To CHW

        # cv2.imwrite("test.png", np.concatenate([(image[0]+1)*127.5, seg_gt*255], axis=0))
        return image, seg_gt, sample

    def get_test_item(self, idx):
        sample = self.samples[idx]
        image = cv2.imread(sample[0])
        seg_gt = (np.asarray(Image.open(sample[1]).convert('P'))).astype(np.uint8)

        image = image.astype(np.float32)
        image = image / 127.5 - 1        # -1~1
        image = np.transpose(image, (2, 0, 1))

        # cv2.imwrite("test.png", np.concatenate([(image[0]+1)*127.5, seg_gt*255], axis=0))
        return image, seg_gt, sample

################################# FUNCTIONS #################################
def get_confusion_matrix(gt_label, pred_label, class_num):
        """
        Calcute the confusion matrix by given label and pred
        :param gt_label: the ground truth label
        :param pred_label: the pred label
        :param class_num: the number of class
        :return: the confusion matrix
        """
        index = (gt_label * class_num + pred_label).astype('int32')

        label_count = np.bincount(index)
        confusion_matrix = np.zeros((class_num, class_num))

        for i_label in range(class_num):
            for i_pred_label in range(class_num):
                cur_index = i_label * class_num + i_pred_label
                if cur_index < len(label_count):
                    confusion_matrix[i_label, i_pred_label] = label_count[cur_index]

        return confusion_matrix

def get_confusion_matrix_for_3d(gt_label, pred_label, class_num):
    confusion_matrix = np.zeros((class_num, class_num))

    for sub_gt_label, sub_pred_label in zip(gt_label, pred_label):
        sub_gt_label = sub_gt_label[sub_gt_label != 255]
        sub_pred_label = sub_pred_label[sub_pred_label != 255]
        cm = get_confusion_matrix(sub_gt_label, sub_pred_label, class_num)
        confusion_matrix += cm
    return confusion_matrix



In [2]:
dataset = TinySegData(img_size=128, phase='train')
test = TinySegData(img_size=128,phase='val')
resnetdataset = TinySegData(img_size=224, phase='train')
resnettest = TinySegData(img_size=224,phase='val')

resize and augmentation will not be applied...
resize and augmentation will not be applied...


In [11]:
train_loader = DataLoader(dataset, batch_size=32)
test_loader = DataLoader(test,batch_size=32)


In [4]:
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet,self).__init__()
        # 定义网络层  
        self.conv1 = nn.Conv2d(3, 18, kernel_size=21)  
        self.conv2 = nn.Conv2d(18, 48, kernel_size=21)  
        self.fc1 = nn.Linear(48 * 17 * 17, 360)  
        self.fc2 = nn.Linear(360, 84)  
        self.fc3 = nn.Linear(84, 20)  

    def forward(self, x):  
        # 定义前向传播过程  
        x = nn.functional.relu(self.conv1(x))  
        x = nn.functional.max_pool2d(x, 2)  
        x = nn.functional.relu(self.conv2(x))  
        x = nn.functional.max_pool2d(x, 2)  
        x = x.view(-1, 48 * 17 * 17)  # flatten  
        x = nn.functional.relu(self.fc1(x))  
        x = nn.functional.relu(self.fc2(x))  
        x = self.fc3(x)  
        return x  
    


In [5]:
class vgg16(nn.Module):
    def __init__(self,):
        super(vgg16,self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3,64,kernel_size=3,padding=1),
            nn.GELU(),
            nn.Conv2d(64,64,3,padding=1),
            nn.GELU(),
            nn.MaxPool2d(kernel_size=2,stride=2),#这里使用两个3X3的卷积核代替5X5


            nn.Conv2d(64, 128, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.Conv2d(128, 128, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.MaxPool2d(kernel_size=2, stride=2),  


            nn.Conv2d(128, 256, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.Conv2d(256, 256, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.Conv2d(256, 256, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.MaxPool2d(kernel_size=2, stride=2), 

            nn.Conv2d(256, 512, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.Conv2d(512, 512, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.Conv2d(512, 512, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.MaxPool2d(kernel_size=2, stride=2), 

            nn.Conv2d(512, 1024, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.Conv2d(1024, 1024, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.Conv2d(1024, 1024, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.MaxPool2d(kernel_size=2, stride=2), 

            nn.Conv2d(1024, 2048, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.Conv2d(2048, 2048, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.Conv2d(2048, 2048, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.MaxPool2d(kernel_size=2, stride=2), 

            nn.Conv2d(2048, 4096, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.Conv2d(4096, 4096, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.Conv2d(4096, 4096, kernel_size=3, padding=1),  
            nn.GELU(),  
            nn.MaxPool2d(kernel_size=2, stride=2), 
        )


        self.classifier = nn.Sequential(
            nn.Linear(4096 * 1 * 1, 4096),  
            nn.GELU(),  
            nn.Dropout(),  
            nn.Linear(4096, 4096),  
            nn.GELU(),  
            nn.Dropout(),  #防过拟合
            nn.Linear(4096, 20),  
        )

    def forward(self,x):
        x = self.features(x)
        x = x.view(x.size(0),-1)
        x = self.classifier(x)
        return x



In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  


In [13]:
import math

In [15]:
example = LeNet().to(device)

lr = 0.001
epoches = 40
criterion = nn.BCEWithLogitsLoss().to(device)
loss_list = []
accuracy_list = []
loss_list1 = []
accuracy_list1 = []
optimizer = torch.optim.SGD(example.parameters(),lr=lr)

for epoch in range(20):
    train_loss = 0
    train_acc = 0


    for img,label,_ in train_loader:
        label = torch.max(label.flatten(1),dim=1)[0].long()
        label = nn.functional.one_hot(label,20).float()
        img,label = img.to(device),label.to(device)
        output = example(img)
        loss = criterion(output,label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _,pred = output.max(1)
        _,result = label.max(1)
        num_correct = (pred==result).sum().item()
        acc = num_correct / img.shape[0]
        train_acc += acc
    loss_list.append(train_loss/len(train_loader))
    accuracy_list.append(train_acc/len(train_loader))
    print('epoch: {}, Train Loss: {:.6f}, Train Acc: {:.6f}'.format(epoch+1, train_loss/len(train_loader), train_acc/len(train_loader)))

epoch: 1, Train Loss: 0.690297, Train Acc: 0.059176
epoch: 2, Train Loss: 0.672497, Train Acc: 0.010472
epoch: 3, Train Loss: 0.552837, Train Acc: 0.130818
epoch: 4, Train Loss: 0.238769, Train Acc: 0.306017
epoch: 5, Train Loss: 0.144769, Train Acc: 0.511802
epoch: 6, Train Loss: 0.126908, Train Acc: 0.513298
epoch: 7, Train Loss: 0.120437, Train Acc: 0.510140
epoch: 8, Train Loss: 0.117039, Train Acc: 0.510971
epoch: 9, Train Loss: 0.115428, Train Acc: 0.509973
epoch: 10, Train Loss: 0.113891, Train Acc: 0.512965
epoch: 11, Train Loss: 0.112842, Train Acc: 0.512301
epoch: 12, Train Loss: 0.112729, Train Acc: 0.512467
epoch: 13, Train Loss: 0.111668, Train Acc: 0.512965
epoch: 14, Train Loss: 0.111061, Train Acc: 0.510805
epoch: 15, Train Loss: 0.111326, Train Acc: 0.510140
epoch: 16, Train Loss: 0.110707, Train Acc: 0.510472
epoch: 17, Train Loss: 0.110605, Train Acc: 0.510805
epoch: 18, Train Loss: 0.110458, Train Acc: 0.510306


KeyboardInterrupt: 

In [16]:
test_acc = 0
for img,label,_ in test_loader:
        label = torch.max(label.flatten(1),dim=1)[0].long()
        label = nn.functional.one_hot(label,20).float()
        img,label = img.to(device),label.to(device)
        output = example(img)



        _,pred = output.max(1)
        _,result = label.max(1)
        num_correct = (pred==result).sum().item()
        acc = num_correct / img.shape[0]
        test_acc += acc
print(test_acc/len(test_loader))

0.571875


In [7]:
example = vgg16().to(device)

lr = 0.0001
epoches = 40
criterion = nn.BCEWithLogitsLoss().to(device)
loss_list = []
accuracy_list = []
loss_list1 = []
accuracy_list1 = []
optimizer = torch.optim.Adam(example.parameters(),lr=lr)

for epoch in range(200):
    train_loss = 0
    train_acc = 0

    for img,label,_ in train_loader:
        label = torch.max(label.flatten(1),dim=1)[0].long()
        label = nn.functional.one_hot(label,20).float()
        img,label = img.to(device),label.to(device)
        output = example(img)
        loss = criterion(output,label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _,pred = output.max(1)
        _,result = label.max(1)
        num_correct = (pred==result).sum().item()
        acc = num_correct / img.shape[0]
        train_acc += acc
    loss_list.append(train_loss/len(train_loader))
    accuracy_list.append(train_acc/len(train_loader))
    print('epoch: {}, Train Loss: {:.6f}, Train Acc: {:.6f}'.format(epoch+1, train_loss/len(train_loader), train_acc/len(train_loader)))


epoch: 1, Train Loss: 0.468922, Train Acc: 0.459774
epoch: 2, Train Loss: 0.442489, Train Acc: 0.485705
epoch: 3, Train Loss: 0.126226, Train Acc: 0.493684
epoch: 4, Train Loss: 0.683302, Train Acc: 0.496509
epoch: 5, Train Loss: 0.206603, Train Acc: 0.502660
epoch: 6, Train Loss: 14.916968, Train Acc: 0.473072
epoch: 7, Train Loss: 0.179896, Train Acc: 0.500000
epoch: 8, Train Loss: 0.113079, Train Acc: 0.501330
epoch: 9, Train Loss: 0.256043, Train Acc: 0.507979
epoch: 10, Train Loss: 0.113292, Train Acc: 0.505153
epoch: 11, Train Loss: 0.105343, Train Acc: 0.522108
epoch: 12, Train Loss: 3.100542, Train Acc: 0.512965
epoch: 13, Train Loss: 0.135791, Train Acc: 0.506483
epoch: 14, Train Loss: 0.126079, Train Acc: 0.505319
epoch: 15, Train Loss: 0.178775, Train Acc: 0.508810
epoch: 16, Train Loss: 5.463059, Train Acc: 0.487367
epoch: 17, Train Loss: 1.115323, Train Acc: 0.504322
epoch: 18, Train Loss: 1.001642, Train Acc: 0.506316
epoch: 19, Train Loss: 0.111011, Train Acc: 0.512467
e

KeyboardInterrupt: 

In [None]:
test_acc = 0
for img,label,_ in test_loader:
        label = torch.max(label.flatten(1),dim=1)[0].long()
        label = nn.functional.one_hot(label,20).float()
        img,label = img.to(device),label.to(device)
        output = example(img)



        _,pred = output.max(1)
        _,result = label.max(1)
        num_correct = (pred==result).sum().item()
        acc = num_correct / img.shape[0]
        test_acc += acc
print(test_acc/len(test_loader))

0.5734375


: 

In [5]:
resnetexample = torchvision.models.resnet18(pretrained=True)
resnetexample.fc = nn.Linear(resnetexample.fc.in_features,20)
resnetexample = resnetexample.to(device)

In [9]:
lr = 0.0001
epoches = 40
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(resnetexample.parameters(),lr=lr)
loss_list = []
accuracy_list = []
loss_list1 = []
accuracy_list1 = []

for epoch in range(200):
    train_loss = 0
    train_acc = 0

    for img,label,_ in train_loader:
        label = torch.max(label.flatten(1),dim=1)[0].long()
        label = nn.functional.one_hot(label,20).float()
        img,label = img.to(device),label.to(device)
        output = resnetexample(img)
        loss = criterion(output,label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _,pred = output.max(1)
        _,result = label.max(1)
        num_correct = (pred==result).sum().item()
        acc = num_correct / img.shape[0]
        train_acc += acc
    loss_list.append(train_loss/len(train_loader))
    accuracy_list.append(train_acc/len(train_loader))
    print('epoch: {}, Train Loss: {:.6f}, Train Acc: {:.6f}'.format(epoch+1, train_loss/len(train_loader), train_acc/len(train_loader)))


epoch: 1, Train Loss: 0.645103, Train Acc: 0.827793
epoch: 2, Train Loss: 0.293204, Train Acc: 0.906749
epoch: 3, Train Loss: 0.210051, Train Acc: 0.934009
epoch: 4, Train Loss: 0.154426, Train Acc: 0.951130
epoch: 5, Train Loss: 0.145466, Train Acc: 0.952128
epoch: 6, Train Loss: 0.116719, Train Acc: 0.962932
epoch: 7, Train Loss: 0.097587, Train Acc: 0.970412
epoch: 8, Train Loss: 0.101398, Train Acc: 0.967919
epoch: 9, Train Loss: 0.095263, Train Acc: 0.969914
epoch: 10, Train Loss: 0.074589, Train Acc: 0.977726
epoch: 11, Train Loss: 0.076528, Train Acc: 0.977061
epoch: 12, Train Loss: 0.066376, Train Acc: 0.979887
epoch: 13, Train Loss: 0.067370, Train Acc: 0.977726
epoch: 14, Train Loss: 0.066079, Train Acc: 0.981051
epoch: 15, Train Loss: 0.053978, Train Acc: 0.983544
epoch: 16, Train Loss: 0.070342, Train Acc: 0.978557
epoch: 17, Train Loss: 0.060927, Train Acc: 0.979887
epoch: 18, Train Loss: 0.058307, Train Acc: 0.980053
epoch: 19, Train Loss: 0.054309, Train Acc: 0.982879
ep

KeyboardInterrupt: 