# STAT4012 Project

- Implement residual network
- 4-fold cross validation
- Ensembling
- Add lr_scheduler
- Focal loss

In [2]:
! nvidia-smi -L

Tue Apr 11 12:41:07 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import multiprocessing as mp
num_cpu = mp.cpu_count()
num_cpu

2

# Download data


# Training

In [7]:
_exp_name = "CNN4"

In [8]:
# Import necessary packages.
import numpy as np
import pandas as pd
import torch
import os
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import ConcatDataset, DataLoader, Subset, Dataset
from torchvision.datasets import DatasetFolder, VisionDataset
from tqdm import tqdm
import random

In [9]:
myseed = 4012  # set a random seed for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

## **Transforms**
Torchvision provides lots of useful utilities for image preprocessing, data wrapping as well as data augmentation.

Please refer to PyTorch official website for details about different transforms.

In [10]:
test_tfm = transforms.Compose([
    # (height = width = 128)
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

train_tfm = transforms.Compose([
    # (height = width = 128)
    #transforms.CenterCrop()
    transforms.RandomResizedCrop((128, 128), scale=(0.7, 1.0)),
    #transforms.AutoAugment(transforms.AutoAugmentPolicy.IMAGENET),
    #transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomVerticalFlip(0.5),
    transforms.RandomRotation(180),
    transforms.RandomAffine(30),
    #transforms.RandomInvert(p=0.2),
    #transforms.RandomPosterize(bits=2),
    #transforms.RandomSolarize(threshold=192.0, p=0.2),
    #transforms.RandomEqualize(p=0.2),
    transforms.RandomGrayscale(p=0.2),
    transforms.ToTensor(),
    #transforms.RandomApply(torch.nn.ModuleList([]))
])


## **Datasets**
The data is labelled by the name, so we load images and label while calling '__getitem__'

In [11]:
class FoodDataset(Dataset):

    def __init__(self,path=None,tfm=test_tfm,files=None):
        super(FoodDataset).__init__()
        self.path = path
        if path:
            self.files = sorted([os.path.join(path, x) for x in os.listdir(path) if x.endswith(".jpg")])
        else:
            self.files = files
        self.transform = tfm
  
    def __len__(self):
        return len(self.files)
  
    def __getitem__(self,idx):
        fname = self.files[idx]
        im = Image.open(fname)
        im = self.transform(im)
        #im = self.data[idx]
        try:
            label = int(fname.split("/")[-1].split("_")[0])
        except:
            label = -1 # test has no label
        return im,label

In [16]:
class Residual_Block(nn.Module):
    def __init__(self, ic, oc, stride=1):
        # torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        # torch.nn.MaxPool2d(kernel_size, stride, padding)
        super().__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(ic, oc, kernel_size=3, stride=stride, padding=1),
            nn.BatchNorm2d(oc),
            nn.ReLU(inplace=True)
        )
        
        self.conv2 = nn.Sequential(
            nn.Conv2d(oc, oc, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(oc),
        )
        
        self.relu = nn.ReLU(inplace=True)
    
        self.downsample = None
        if stride != 1 or (ic != oc):
            self.downsample = nn.Sequential(
                nn.Conv2d(ic, oc, kernel_size=1, stride=stride),
                nn.BatchNorm2d(oc),
            )
        
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.conv2(out)
        
        if self.downsample:
            residual = self.downsample(x)
            
        out += residual
        return self.relu(out)

class Classifier(nn.Module):
    def __init__(self, block, num_layers, num_classes=11):
        super().__init__()
        self.preconv = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
        )
        
        self.layer0 = self.make_residual(block, 32, 64,  num_layers[0], stride=2)
        self.layer1 = self.make_residual(block, 64, 128, num_layers[1], stride=2)
        self.layer2 = self.make_residual(block, 128, 256, num_layers[2], stride=2)
        self.layer3 = self.make_residual(block, 256, 512, num_layers[3], stride=2)
        
#         self.avgpool = nn.AvgPool2d(2)
        
        self.fc = nn.Sequential(            
            nn.Dropout(0.4),
            nn.Linear(512*4*4, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Dropout(0.2),
            nn.Linear(512, 11),
        )
        
        
    def make_residual(self, block, ic, oc, num_layer, stride=1):
        layers = []
        layers.append(block(ic, oc, stride))
        for i in range(1, num_layer):
            layers.append(block(oc, oc))
        return nn.Sequential(*layers)
    
    def forward(self, x):
        # [3, 128, 128]
        out = self.preconv(x)  # [32, 64, 64]
        out = self.layer0(out) # [64, 32, 32]
        out = self.layer1(out) # [128, 16, 16]
        out = self.layer2(out) # [256, 8, 8]
        out = self.layer3(out) # [512, 4, 4]
#         out = self.avgpool(out) # [512, 2, 2]
        out = self.fc(out.view(out.size(0), -1)) 
        return out

In [None]:
import torch.nn.functional as F
from torch.autograd import Variable

class FocalLoss(nn.Module):
    def __init__(self, class_num, alpha=None, gamma=2, size_average=True):
        super().__init__()
        if alpha is None:
            self.alpha = Variable(torch.ones(class_num, 1))
        else:
            if isinstance(alpha, Variable):
                self.alpha = alpha
            else:
                self.alpha = Variable(alpha)
        self.gamma = gamma
        self.class_num = class_num
        self.size_average = size_average
        
    def forward(self, inputs, targets):
        N = inputs.size(0)
        C = inputs.size(1)
        P = F.softmax(inputs, dim=1)
        
        class_mask = inputs.data.new(N, C).fill_(0)
        class_mask = Variable(class_mask)
        ids = targets.view(-1, 1)
        class_mask.scatter_(1, ids.data, 1.)
        
        if inputs.is_cuda and not self.alpha.is_cuda:
            self.alpha = self.alpha.cuda()
        alpha = self.alpha[ids.data.view(-1)]
        probs = (P*class_mask).sum(1).view(-1, 1)
        
        log_p = probs.log()
        
        batch_loss = -alpha*(torch.pow((1-probs), self.gamma))*log_p
        
        if self.size_average:
            loss = batch_loss.mean()
        else:
            loss = batch_loss.sum()
            
        return loss

In [17]:
batch_size = 128
num_layers = [2, 3, 3, 1] # residual number layers
# （0, 994), (1, 429), (2, 1500), (3, 986), (4, 848), (5, 1325)
#  (6, 440), (7, 280), (8, 855), (9, 1500), (10, 709)
alpha = torch.Tensor([1.4, 3.2, 1, 1.4, 1.5, 1, 3.2, 4.5, 1.5, 1, 2])

n_epochs = 300
patience = 20 # If no improvement in 'patience' epochs, early stop

k_fold = 4

In [19]:
train_dir = "food-11/training"
val_dir = "food-11/validation"

train_files = [os.path.join(train_dir, x) for x in os.listdir(train_dir) if x.endswith('.jpg')]
val_files = [os.path.join(val_dir, x) for x in os.listdir(val_dir) if x.endswith('.jpg')]
total_files = train_files + val_files
random.seed(myseed)
random.shuffle(total_files)

num = len(total_files) // k_fold
len(total_files)

13296

In [None]:
# "cuda" only when GPUs are available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

test_fold = k_fold

for i in range(test_fold):
    fold = i+1
    print(f'\n\nStarting Fold: {fold} ********************************************')
    model = Classifier(Residual_Block, num_layers).to(device)
    print(next(model.parameters()).device)
    criterion = FocalLoss(11, alpha=alpha)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0004, weight_decay=1e-5) 
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=20, T_mult=1)
    stale = 0
    best_acc = 0
    
    val_data = total_files[i*num: (i+1)*num]
    train_data = total_files[:i*num] + total_files[(i+1)*num:]
    
    train_set = FoodDataset(tfm=train_tfm, files=train_data)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=num_cpu, pin_memory=True)
    
    valid_set = FoodDataset(tfm=test_tfm, files=val_data)
    valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=True, num_workers=num_cpu, pin_memory=True)
    
    for epoch in range(n_epochs):
    
        # ---------- Training ----------
        # Make sure the model is in train mode before training.
        model.train()
    
        # These are used to record information in training.
        train_loss = []
        train_accs = []
        lr = optimizer.param_groups[0]["lr"]
        
        pbar = tqdm(train_loader)
        pbar.set_description(f'T: {epoch+1:03d}/{n_epochs:03d}')
        for batch in pbar:
    
            # A batch consists of image data and corresponding labels.
            imgs, labels = batch
            #imgs = imgs.half()
            #print(imgs.shape,labels.shape)
    
            # Forward the data. (Make sure data and model are on the same device.)
            logits = model(imgs.to(device))
    
            # Calculate the cross-entropy loss.
            # We don't need to apply softmax before computing cross-entropy as it is done automatically.
            loss = criterion(logits, labels.to(device))
    
            # Gradients stored in the parameters in the previous step should be cleared out first.
            optimizer.zero_grad()
    
            # Compute the gradients for parameters.
            loss.backward()
    
            # Clip the gradient norms for stable training.
            grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)
    
            # Update the parameters with computed gradients.
            optimizer.step()
    
            # Compute the accuracy for current batch.
            acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()
    
            # Record the loss and accuracy.
            train_loss.append(loss.item())
            train_accs.append(acc)
            pbar.set_postfix({'lr':lr, 'b_loss':loss.item(), 'b_acc':acc.item(),
                    'loss':sum(train_loss)/len(train_loss), 'acc': sum(train_accs).item()/len(train_accs)})
            
        train_loss = sum(train_loss) / len(train_loss)
        train_acc = sum(train_accs) / len(train_accs)
        # Print the information.
        print(f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")
        scheduler.step()
        
        
        # Make sure the model is in eval mode so that some modules like dropout are disabled and work normally.
        model.eval()
    
        # These are used to record information in validation.
        valid_loss = []
        valid_accs = []
    
        # Iterate the validation set by batches.
        pbar = tqdm(valid_loader)
        pbar.set_description(f'V: {epoch+1:03d}/{n_epochs:03d}')
        for batch in pbar:

            # A batch consists of image data and corresponding labels.
            imgs, labels = batch
            #imgs = imgs.half()
    
            # We don't need gradient in validation.
            # Using torch.no_grad() accelerates the forward process.
            with torch.no_grad():
                logits = model(imgs.to(device))
    
            # We can still compute the loss (but not the gradient).
            loss = criterion(logits, labels.to(device))
    
            # Compute the accuracy for current batch.
            acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()
    
            # Record the loss and accuracy.
            valid_loss.append(loss.item())
            valid_accs.append(acc)
            pbar.set_postfix({'v_loss':sum(valid_loss)/len(valid_loss), 
                              'v_acc': sum(valid_accs).item()/len(valid_accs)})
        
    
        # The average loss and accuracy for entire validation set is the average of the recorded values.
        valid_loss = sum(valid_loss) / len(valid_loss)
        valid_acc = sum(valid_accs) / len(valid_accs)

        # Print the information.
        print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")

        # update logs
        if valid_acc > best_acc:
            with open(f"{_exp_name}_fold_{fold}_log.txt","a") as f:
                newline = '\n'
                item = f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f} -> best{newline}"
                f.write(item)
                print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f} -> best")
        else:
            with open(f"{_exp_name}_fold_{fold}_log.txt","a") as f:
                newline = '\n'
                item = f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}{newline}"
                f.write(item)
                print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")
    
        # save models
        if valid_acc > best_acc:
            print(f"Best model found at fold {fold} epoch {epoch+1}, acc={valid_acc:.5f}, saving model")
            torch.save(model.state_dict(), f"Fold_{fold}_best.ckpt")
            # only save best to prevent output memory exceed error
            best_acc = valid_acc
            stale = 0
        else:
            stale += 1
            if stale > patience:
                print(f"No improvment {patience} consecutive epochs, early stopping")
                break

cuda


Starting Fold: 1 ********************************************
cuda:0


T: 001/300: 100%|██████████| 78/78 [01:33<00:00,  1.19s/it, lr=0.0004, b_loss=1.95, b_acc=0.319, loss=2.12, acc=0.247] 


[ Train | 001/300 ] loss = 2.11805, acc = 0.24658


V: 001/300: 100%|██████████| 26/26 [00:27<00:00,  1.07s/it, v_loss=2.03, v_acc=0.299]


[ Valid | 001/300 ] loss = 2.03091, acc = 0.29854
[ Valid | 001/300 ] loss = 2.03091, acc = 0.29854 -> best
Best model found at fold 1 epoch 1, acc=0.29854, saving model


T: 002/300: 100%|██████████| 78/78 [01:07<00:00,  1.16it/s, lr=0.000398, b_loss=1.81, b_acc=0.414, loss=1.94, acc=0.306]


[ Train | 002/300 ] loss = 1.94454, acc = 0.30569


V: 002/300: 100%|██████████| 26/26 [00:19<00:00,  1.34it/s, v_loss=1.82, v_acc=0.367]


[ Valid | 002/300 ] loss = 1.81855, acc = 0.36651
[ Valid | 002/300 ] loss = 1.81855, acc = 0.36651 -> best
Best model found at fold 1 epoch 2, acc=0.36651, saving model


T: 003/300: 100%|██████████| 78/78 [01:05<00:00,  1.18it/s, lr=0.00039, b_loss=2.01, b_acc=0.302, loss=1.88, acc=0.338]


[ Train | 003/300 ] loss = 1.88497, acc = 0.33820


V: 003/300: 100%|██████████| 26/26 [00:19<00:00,  1.32it/s, v_loss=2.12, v_acc=0.298]


[ Valid | 003/300 ] loss = 2.12169, acc = 0.29815
[ Valid | 003/300 ] loss = 2.12169, acc = 0.29815


T: 004/300: 100%|██████████| 78/78 [01:07<00:00,  1.16it/s, lr=0.000378, b_loss=1.75, b_acc=0.397, loss=1.79, acc=0.369]


[ Train | 004/300 ] loss = 1.79490, acc = 0.36857


V: 004/300: 100%|██████████| 26/26 [00:18<00:00,  1.38it/s, v_loss=2.06, v_acc=0.321]


[ Valid | 004/300 ] loss = 2.05865, acc = 0.32090
[ Valid | 004/300 ] loss = 2.05865, acc = 0.32090


T: 005/300: 100%|██████████| 78/78 [01:07<00:00,  1.16it/s, lr=0.000362, b_loss=1.82, b_acc=0.405, loss=1.71, acc=0.401]


[ Train | 005/300 ] loss = 1.70568, acc = 0.40063


V: 005/300: 100%|██████████| 26/26 [00:19<00:00,  1.32it/s, v_loss=1.76, v_acc=0.407]


[ Valid | 005/300 ] loss = 1.75774, acc = 0.40674
[ Valid | 005/300 ] loss = 1.75774, acc = 0.40674 -> best
Best model found at fold 1 epoch 5, acc=0.40674, saving model


T: 006/300: 100%|██████████| 78/78 [01:06<00:00,  1.17it/s, lr=0.000341, b_loss=1.74, b_acc=0.388, loss=1.65, acc=0.427]


[ Train | 006/300 ] loss = 1.65336, acc = 0.42735


V: 006/300: 100%|██████████| 26/26 [00:19<00:00,  1.32it/s, v_loss=1.63, v_acc=0.442]


[ Valid | 006/300 ] loss = 1.63194, acc = 0.44161
[ Valid | 006/300 ] loss = 1.63194, acc = 0.44161 -> best
Best model found at fold 1 epoch 6, acc=0.44161, saving model


T: 007/300: 100%|██████████| 78/78 [01:07<00:00,  1.16it/s, lr=0.000318, b_loss=1.49, b_acc=0.491, loss=1.59, acc=0.446]


[ Train | 007/300 ] loss = 1.58626, acc = 0.44640


V: 007/300: 100%|██████████| 26/26 [00:18<00:00,  1.37it/s, v_loss=1.58, v_acc=0.453]


[ Valid | 007/300 ] loss = 1.58459, acc = 0.45334
[ Valid | 007/300 ] loss = 1.58459, acc = 0.45334 -> best
Best model found at fold 1 epoch 7, acc=0.45334, saving model


T: 008/300: 100%|██████████| 78/78 [01:07<00:00,  1.15it/s, lr=0.000291, b_loss=1.53, b_acc=0.526, loss=1.51, acc=0.476]


[ Train | 008/300 ] loss = 1.51147, acc = 0.47619


V: 008/300: 100%|██████████| 26/26 [00:18<00:00,  1.38it/s, v_loss=1.59, v_acc=0.452]


[ Valid | 008/300 ] loss = 1.59191, acc = 0.45155
[ Valid | 008/300 ] loss = 1.59191, acc = 0.45155


T: 009/300: 100%|██████████| 78/78 [01:06<00:00,  1.17it/s, lr=0.000262, b_loss=1.51, b_acc=0.44, loss=1.47, acc=0.494] 


[ Train | 009/300 ] loss = 1.46563, acc = 0.49412


V: 009/300: 100%|██████████| 26/26 [00:19<00:00,  1.33it/s, v_loss=1.54, v_acc=0.466]


[ Valid | 009/300 ] loss = 1.54291, acc = 0.46638
[ Valid | 009/300 ] loss = 1.54291, acc = 0.46638 -> best
Best model found at fold 1 epoch 9, acc=0.46638, saving model


T: 010/300: 100%|██████████| 78/78 [01:06<00:00,  1.18it/s, lr=0.000231, b_loss=1.34, b_acc=0.474, loss=1.42, acc=0.51] 


[ Train | 010/300 ] loss = 1.41721, acc = 0.50988


V: 010/300: 100%|██████████| 26/26 [00:20<00:00,  1.29it/s, v_loss=1.44, v_acc=0.497]


[ Valid | 010/300 ] loss = 1.44383, acc = 0.49665
[ Valid | 010/300 ] loss = 1.44383, acc = 0.49665 -> best
Best model found at fold 1 epoch 10, acc=0.49665, saving model


T: 011/300: 100%|██████████| 78/78 [01:05<00:00,  1.19it/s, lr=0.0002, b_loss=1.25, b_acc=0.569, loss=1.37, acc=0.534]


[ Train | 011/300 ] loss = 1.36530, acc = 0.53414


V: 011/300: 100%|██████████| 26/26 [00:18<00:00,  1.40it/s, v_loss=1.56, v_acc=0.476]


[ Valid | 011/300 ] loss = 1.56298, acc = 0.47595
[ Valid | 011/300 ] loss = 1.56298, acc = 0.47595


T: 012/300: 100%|██████████| 78/78 [01:05<00:00,  1.19it/s, lr=0.000169, b_loss=1.37, b_acc=0.543, loss=1.32, acc=0.549]


[ Train | 012/300 ] loss = 1.31890, acc = 0.54883


V: 012/300: 100%|██████████| 26/26 [00:19<00:00,  1.33it/s, v_loss=1.24, v_acc=0.567]


[ Valid | 012/300 ] loss = 1.23980, acc = 0.56741
[ Valid | 012/300 ] loss = 1.23980, acc = 0.56741 -> best
Best model found at fold 1 epoch 12, acc=0.56741, saving model


T: 013/300:  96%|█████████▌| 75/78 [01:03<00:02,  1.32it/s, lr=0.000138, b_loss=1.44, b_acc=0.547, loss=1.27, acc=0.555]

In [None]:
test_dir = "food-11/test"
test_set = FoodDataset(test_dir, tfm=test_tfm)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=num_cpu, pin_memory=True)

# Testing and generate prediction CSV

In [None]:
models = []
for i in range(test_fold):
    fold = i + 1
    model_best = Classifier(Residual_Block, num_layers).to(device)
    model_best.load_state_dict(torch.load(f"Fold_{fold}_best.ckpt"))
    model_best.eval()
    models.append(model_best)

prediction = []            
with torch.no_grad():
    for data,_ in test_loader:
        test_preds = [] 
        for model_best in models:
            test_preds.append(model_best(data.to(device)).cpu().data.numpy())
        test_preds = sum(test_preds)
        test_label = np.argmax(test_preds, axis=1)
        prediction += test_label.squeeze().tolist()

In [None]:
#create test csv
def pad4(i):
    return "0"*(4-len(str(i)))+str(i)
df = pd.DataFrame()
df["Id"] = [pad4(i) for i in range(1,len(test_set)+1)]
df["Category"] = prediction
df.to_csv("prediction_CNN3.csv",index = False)