In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import shutil as sh
import os
import time
import random

# Report up front whether PyTorch can see a CUDA device
print(f"Cuda available: {torch.cuda.is_available()}")

Configuration of hyperparams and other variables that will be used later

In [None]:
img_size = 256  # Preprocessed images are resized to img_size x img_size pixels

# Autoencoder settings
batch_size = 64
learning_rate = 1e-3
autoencoder_epochs = 10
autoencoder_dropout = 0.1
convolutional_kernel = 4
convolutional_stride = 2
convolutional_padding = 1
max_pool_kernel = 2
amount_of_pictures_to_show = 10  # Random reconstructions displayed after each validation phase
encoder_name = "models/encoder_V2.pth"
decoder_name = "models/decoder_V2.pth"
autoencoder_name = "models/autoencoder_V2.pth"

# Classifier settings
classifier_dropout = 0.1
classification_batch_size = 64
classification_learning_rate = 1e-3
classification_epochs = 15
encoder_name_to_load = encoder_name  # The classifier starts from the encoder trained above
classifier_name = "models/classifier_V2.pth"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")

# Keep two cores free for the rest of the system. os.cpu_count() can return None
# and small machines have two cores or fewer, so clamp at 0 (0 means the data is
# loaded in the main process) instead of handing DataLoader a negative worker count.
workers = max((os.cpu_count() or 1) - 2, 0)
print(f"Using {workers} workers for loading the datasets")

### Fixing the food-101 dataset structure

In [None]:
def fixFood101(path: str):
    """Reorganise a raw Food-101 download into images/train and images/test.

    Reads meta/train.txt and meta/test.txt below ``path`` (one
    ``class_name/image_id`` entry per line) and moves every listed
    ``images/class_name/image_id.jpg`` into ``images/train/...`` or
    ``images/test/...``. Class folders left empty by the move are removed.

    Nothing is done when ``images/test/`` already exists; when ``path`` itself
    does not exist only a message is printed.

    Args:
        path: root folder of the Food-101 dataset.
    """
    if not os.path.exists(path):
        print("Couldn't find food-101")
        return

    if os.path.exists(path + "/images/test/"):
        print("The structure is already fixed!")
        return

    for split in ("train", "test"):
        with open(path + "/meta/" + split + ".txt") as meta_file:
            # strip() rather than dropping the last character: the final entry
            # may have no trailing newline, and blank lines are not entries
            entries = [line.strip() for line in meta_file if line.strip()]

        for entry in entries:
            class_name = entry.split("/")[0]
            os.makedirs(path + "/images/" + split + "/" + class_name, exist_ok=True)
            sh.move(path + "/images/" + entry + ".jpg", path + "/images/" + split + "/" + entry + ".jpg")

    # Best-effort clean-up of the class folders emptied by the moves.
    # os.rmdir refuses to delete non-empty folders, which is exactly what we want,
    # so only that expected OSError is ignored.
    for dirPath, _, _ in os.walk(path + "/images/"):
        try:
            os.rmdir(dirPath)
        except OSError:
            pass


fixFood101("./Datasets/food-101")

### Preprocessing function

In [5]:
def preprocess(org_path: str, train: bool):
    """Resize one image to img_size x img_size and store it as an RGB JPEG.

    The output path is built from ``org_path``: everything up to and including
    its ``Datasets`` component, then ``Cleaned_V2/train`` or
    ``Cleaned_V2/validation``, then the last two components of ``org_path``
    (class_name/file_name). Any extension other than .jpg/.jpeg is replaced by
    .jpg. If the output file already exists the image is skipped.

    Transparent images are flattened onto a white background; every other
    non-RGB mode (palette, grayscale, CMYK, ...) is converted to RGB so that it
    can always be written as JPEG.

    Args:
        org_path: "/"-separated path of the source image.
        train: True for the train split, False for the validation split.

    Raises:
        ValueError: if ``org_path`` has no ``Datasets`` component.
    """
    parts = org_path.split("/")
    dst_start = "/".join(parts[:parts.index("Datasets") + 1]) # Path to the Datasets folder
    split_name = "train" if train else "validation"
    dst = dst_start + "/Cleaned_V2/" + split_name + "/" + "/".join(parts[-2:]) # ./Datasets/Cleaned_V2/x/class_name/file_name.extension

    if not (dst.endswith(".jpg") or dst.endswith(".jpeg")):
        # splitext only looks at the file name, so dots elsewhere in the path
        # (e.g. a leading "./") cannot swallow the whole destination when the
        # file has no extension
        dst = os.path.splitext(dst)[0] + ".jpg" # ./Datasets/Cleaned_V2/x/class_name/file_name.jpg

    if os.path.exists(dst):
        print("File already preprocessed")
        return

    # resize() returns an independent image, so the source file can be closed right away
    with Image.open(org_path) as source:
        image = source.resize((img_size, img_size), Image.Resampling.HAMMING) # Hamming is a resampling filter that produces good quality outputs

    has_alpha = image.mode in ("RGBA", "LA") or (image.mode == "P" and "transparency" in image.info)
    if has_alpha: # If the image has transparency, get rid of it
        image = image.convert("RGBA") # Uniform layout: the alpha band is always the last one
        background = Image.new("RGB", image.size, (255, 255, 255)) # White image that acts as the background
        background.paste(image, mask=image.split()[-1]) # The background stays visible where the image is transparent
        image = background
    elif image.mode != "RGB":
        image = image.convert("RGB") # JPEG cannot store palette images; keep every output 3-channel

    os.makedirs(os.path.dirname(dst), exist_ok=True) # exist_ok=True: an already existing class folder is not an error
    image.save(dst, "JPEG")

In [6]:
# Preprocess both Food-101 splits: images/train feeds the training set
# (train=True) and images/test the validation set (train=False).
try:
    for split_root, is_train in (
        ("./Datasets/food-101/images/train/", True),
        ("./Datasets/food-101/images/test/", False),
    ):
        for dirPath, _, files in os.walk(split_root):
            for file in files:
                preprocess(dirPath+"/"+file, is_train)
except FileNotFoundError:
    print("Couldn't find an image. Problably the dataset doesn't exist or its path is wrong")

# Residual Autoencoder

Encodes the images first and then it decodes them, so it learns the most important patterns needed to recreate the image. <br>
This second version has residual connections inside the residual blocks, and skip connections in the forward function of the ResidualAutoencoder class

In [5]:
class ResidualEncoderBlock(nn.Module): # The blocks follow a structure similar to ResNet V2
    """Pre-activation residual block that downsamples its input.

    Main path: BatchNorm, ReLU, Dropout, strided Conv2d, then BatchNorm, ReLU,
    Dropout, 3x3 Conv2d. The block input is added to the result through a
    shortcut. Whenever the main path changes the channel count or the spatial
    size, the shortcut is a strided 1x1 convolution so both tensors have the
    same shape; otherwise it is the identity.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output feature map.
        kernel_size: kernel of the first convolution (default: ``convolutional_kernel``).
        stride: stride of the first convolution and of the shortcut (default: ``convolutional_stride``).
        padding: padding of the first convolution (default: ``convolutional_padding``).
        dropout: dropout probability (default: ``autoencoder_dropout``).
    """
    def __init__(self, in_channels, out_channels, kernel_size=None, stride=None, padding=None, dropout=None):
        super(ResidualEncoderBlock, self).__init__()
        # Fall back to the notebook-level configuration for anything not given explicitly
        kernel_size = convolutional_kernel if kernel_size is None else kernel_size
        stride = convolutional_stride if stride is None else stride
        padding = convolutional_padding if padding is None else padding
        dropout = autoencoder_dropout if dropout is None else dropout

        self.bn1 = nn.BatchNorm2d(in_channels)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(p=dropout)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(p=dropout)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)

        # The shortcut must be projected whenever the main path changes the shape.
        # That includes a pure stride change with equal channel counts, which
        # would otherwise fail at the addition in forward().
        needs_projection = in_channels != out_channels or stride != 1
        self.adjust = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False) if needs_projection else None

    def forward(self, x):
        """Return main_path(x) + shortcut(x)."""
        shortcut = x if self.adjust is None else self.adjust(x)

        out = self.bn1(x)
        out = self.relu1(out)
        out = self.dropout1(out)
        out = self.conv1(out)
        out = self.bn2(out)
        out = self.relu2(out)
        out = self.dropout2(out)
        out = self.conv2(out)

        return out + shortcut

class ResidualEncoder(nn.Module):
    """Three residual encoder blocks with a max-pooling layer between consecutive blocks."""

    def __init__(self):
        super().__init__()
        # Channel progression: 3, 64, 128, 256
        self.block1 = ResidualEncoderBlock(3, 64)
        self.pool1 = nn.MaxPool2d(max_pool_kernel)
        self.block2 = ResidualEncoderBlock(64, 128)
        self.pool2 = nn.MaxPool2d(max_pool_kernel)
        self.block3 = ResidualEncoderBlock(128, 256)

    def forward(self, x):
        """Run the input through every stage in order and return the last block's output.

        Only the classifier uses this method. The autoencoder calls the layers
        one by one itself because it has to collect the skip connections.
        """
        stages = (self.block1, self.pool1, self.block2, self.pool2, self.block3)
        for stage in stages:
            x = stage(x)
        return x
    
class ResidualDecoderBlock(nn.Module):
    """Pre-activation residual block that upsamples its input.

    Main path: BatchNorm, ReLU, Dropout, strided ConvTranspose2d, then
    BatchNorm, ReLU, Dropout, 3x3 Conv2d. The block input is added to the result
    through a shortcut. Whenever the main path changes the channel count or the
    spatial size, the shortcut is a strided 1x1 transposed convolution;
    otherwise it is the identity.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output feature map.
        kernel_size: kernel of the transposed convolution (default: ``convolutional_kernel``).
        stride: stride of the transposed convolution and of the shortcut (default: ``convolutional_stride``).
        padding: padding of the transposed convolution (default: ``convolutional_padding``).
        dropout: dropout probability (default: ``autoencoder_dropout``).
    """
    def __init__(self, in_channels, out_channels, kernel_size=None, stride=None, padding=None, dropout=None):
        super(ResidualDecoderBlock, self).__init__()
        # Fall back to the notebook-level configuration for anything not given explicitly
        kernel_size = convolutional_kernel if kernel_size is None else kernel_size
        stride = convolutional_stride if stride is None else stride
        padding = convolutional_padding if padding is None else padding
        dropout = autoencoder_dropout if dropout is None else dropout

        self.bn1 = nn.BatchNorm2d(in_channels)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(p=dropout)
        self.conv1 = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(p=dropout)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)

        # The shortcut must be projected whenever the main path changes the shape,
        # including a pure stride change with equal channel counts.
        # output_padding = stride - 1 makes the 1x1 transposed convolution produce
        # exactly stride * input_size (this is 1 for the default stride of 2).
        needs_projection = in_channels != out_channels or stride != 1
        self.adjust = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=1, stride=stride, output_padding=stride - 1, bias=False) if needs_projection else None

    def forward(self, x):
        """Return main_path(x) + shortcut(x)."""
        shortcut = x if self.adjust is None else self.adjust(x)

        out = self.bn1(x)
        out = self.relu1(out)
        out = self.dropout1(out)
        out = self.conv1(out)
        out = self.bn2(out)
        out = self.relu2(out)
        out = self.dropout2(out)
        out = self.conv2(out)

        return out + shortcut


class ResidualDecoder(nn.Module):
    """Decoder half of the residual autoencoder.

    Three residual decoder blocks with a bilinear upsampling layer between
    consecutive blocks. block2 and block3 receive their input concatenated
    (along the channel axis) with a skip tensor coming from the encoder, which
    is why their input channel counts are doubled.
    """
    def __init__(self):
        super(ResidualDecoder, self).__init__()
        self.block1 = ResidualDecoderBlock(256, 128)
        self.up1 = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
        self.block2 = ResidualDecoderBlock(128*2, 64) # *2 as it has the concat of the skip connection
        self.up2 = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True)
        self.block3 = ResidualDecoderBlock(64*2, 3) # same

    def forward(self, latent, skip_connections):
        """Decode ``latent`` with the help of the encoder skip tensors.

        Performs the same decoding steps as ResidualAutoencoder.forward, so the
        decoder can also be used on its own (e.g. after loading its saved weights).

        Args:
            latent: output of the encoder's last block.
            skip_connections: pair ``(skip_block1, skip_block2)`` holding the
                outputs of the encoder's block1 and block2 (before pooling),
                in that order.

        Returns:
            The reconstructed image tensor.
        """
        skip_block1, skip_block2 = skip_connections

        x = self.block1(latent)
        x = self.up1(x)

        x = torch.cat([x, skip_block2], dim=1) # Deepest skip first: it matches the current resolution
        x = self.block2(x)
        x = self.up2(x)

        x = torch.cat([x, skip_block1], dim=1)
        return self.block3(x)


class ResidualAutoencoder(nn.Module):
    """Encoder and decoder joined by two skip connections."""

    def __init__(self):
        super().__init__()
        self.encoder = ResidualEncoder()
        self.decoder = ResidualDecoder()

    def forward(self, x):
        """Encode and reconstruct ``x``; returns ``(latent, reconstructed)``.

        The encoder and decoder layers are called one by one so that the outputs
        of the first two encoder blocks can be concatenated to the decoder
        features. Shape comments assume a 256x256x3 input.
        """
        enc, dec = self.encoder, self.decoder

        # Encoder path, keeping the block outputs that feed the skip connections
        skip_outer = enc.block1(x)                       # 128x128x64
        skip_inner = enc.block2(enc.pool1(skip_outer))   # pool: 64x64x64, block: 32x32x128
        latent = enc.block3(enc.pool2(skip_inner))       # pool: 16x16x128, block: 8x8x256

        # Decoder path; each concat doubles the channel count
        features = dec.up1(dec.block1(latent))                     # block: 16x16x128, up: 32x32x128
        features = torch.cat([features, skip_inner], dim=1)        # 32x32x256
        features = dec.up2(dec.block2(features))                   # block: 64x64x64, up: 128x128x64
        features = torch.cat([features, skip_outer], dim=1)        # 128x128x128
        reconstructed = dec.block3(features)                       # 256x256x3

        return latent, reconstructed

# Classifier

Uses the encoder from the autoencoder to extract the important patterns of an image, and uses those patterns to classify it

In [6]:
class Classifier(nn.Module):
    """Encoder followed by a three-layer fully connected head with 101 outputs."""

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder
        # Every encoder weight stays trainable, so the whole encoder is fine-tuned
        for weight in self.encoder.parameters():
            weight.requires_grad = True

        self.flatten = nn.Flatten()

        # First hidden layer: 256*8*8 = 16384 flattened features down to 2048
        self.fc1 = nn.Linear(256*8*8, 256*8)
        self.bn1 = nn.BatchNorm1d(256 * 8)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(p=classifier_dropout)

        # Second hidden layer: 2048 down to 512
        self.fc2 = nn.Linear(256*8, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(p=classifier_dropout)

        # Output layer: one value per class
        self.fc3 = nn.Linear(512, 101)

    def forward(self, x):
        """Return the 101 raw class scores for a batch of images."""
        x = self.flatten(self.encoder(x))

        head = (
            self.fc1, self.bn1, self.relu1, self.dropout1,
            self.fc2, self.bn2, self.relu2, self.dropout2,
            self.fc3,
        )
        for layer in head:
            x = layer(x)
        return x

# Loading of the datasets

In [7]:
# Both pipelines end with the same normalisation: (x - 0.5) / 0.5 per channel,
# which maps pixel values from [0, 1] to [-1, 1]
channel_mean = (0.5, 0.5, 0.5)
channel_std = (0.5, 0.5, 0.5)

# Training images are randomly augmented before being converted and normalised
transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(30),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1), # Has a little less hue change than the V1
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# Validation images are only converted and normalised
transform_validation = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

train_dataset = ImageFolder('./Datasets/Cleaned_V2/train', transform=transform_train)
validation_dataset = ImageFolder('./Datasets/Cleaned_V2/validation', transform=transform_validation)

# Only the training loader shuffles; everything else is shared
loader_options = dict(batch_size=batch_size, num_workers=workers)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
validation_loader = DataLoader(validation_dataset, shuffle=False, **loader_options)
dataloaders = {"train": train_loader, "validation": validation_loader}

### Function to display images

In [13]:
def show_image(original, reconstructed, epoch):
    """Plot an image and its reconstruction side by side.

    Args:
        original: image tensor laid out as (C, H, W) with values in [-1, 1].
        reconstructed: reconstruction of ``original``, same layout and range.
        epoch: zero-based epoch index; the title shows ``epoch + 1``.
    """
    def to_displayable(tensor):
        # (C, H, W) tensor to (H, W, C) array, mapped from [-1, 1] back to [0, 1]
        array = tensor.cpu().numpy().transpose(1, 2, 0)
        return np.clip((array + 1) / 2, 0, 1)

    panels = (
        (to_displayable(original), "Original Image"),
        (to_displayable(reconstructed), f"Reconstructed Image (Epoch {epoch+1})"),
    )

    plt.figure(figsize=(10, 5))
    for position, (pixels, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.imshow(pixels)
        plt.title(title)
        plt.axis('off')

    plt.show()

def get_selected_image(loader=None):
    """Return the images of one randomly chosen batch of a data loader.

    Args:
        loader: iterable of ``(images, labels)`` batches that supports ``len()``.
            Defaults to ``dataloaders["validation"]``.

    Returns:
        The ``images`` element of the chosen batch (the whole batch, not a
        single image).

    Raises:
        ValueError: if the loader contains no batches.
    """
    if loader is None:
        loader = dataloaders["validation"]

    # randrange(len(loader)) can select every batch, including the last one, and
    # also works for a loader with a single batch (randrange(0, len - 1) raised
    # ValueError in that case).
    batch_index = random.randrange(len(loader))

    iterator = iter(loader)
    for _ in range(batch_index):  # Skip the batches in front of the chosen one
        next(iterator)
    images, _ = next(iterator)
    return images

# Drawn once here: selected_image[0] is the fixed image whose reconstruction is shown after every autoencoder epoch
selected_image = get_selected_image()

# Autoencoder training

In [None]:
def _show_reconstructions(model, epoch):
    """Show the fixed image and some random validation images next to their reconstructions."""
    validation_set = dataloaders["validation"].dataset

    with torch.no_grad():
        # First show always the same image for a continuous evaluation
        fixed_image = selected_image[0].unsqueeze(0).to(device)
        _, reconstructed_image = model(fixed_image)
        show_image(fixed_image[0], reconstructed_image[0], epoch)

        # Then some random images to see how other classes are doing.
        # The dataset is indexed directly: re-iterating the DataLoader up to a
        # random batch would load (and discard) many batches for every picture.
        for _ in range(amount_of_pictures_to_show):
            image, _label = validation_set[random.randrange(len(validation_set))]
            image = image.unsqueeze(0).to(device)
            _latent, reconstructed_image = model(image)
            show_image(image[0], reconstructed_image[0], epoch)


def train_autoencoder(model, criterion, optimizer, num_epochs):
    """Train the autoencoder and visualise its reconstructions after every epoch.

    Every epoch runs a training pass and a validation pass over the module-level
    ``dataloaders``. After the validation pass the reconstruction of
    ``selected_image[0]`` and of ``amount_of_pictures_to_show`` random validation
    images is displayed. Once all epochs are done the encoder, decoder and full
    autoencoder weights are written to ``encoder_name``, ``decoder_name`` and
    ``autoencoder_name``, and the loss curves are plotted.

    Args:
        model: module whose forward returns ``(latent, reconstruction)`` and
            which exposes ``encoder`` and ``decoder`` sub-modules.
        criterion: loss called as ``criterion(reconstruction, inputs)``.
        optimizer: optimizer updating the model parameters.
        num_epochs: number of epochs to run.
    """
    train_loss = []
    validation_loss = []

    since = time.time()
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        for phase in ['train', 'validation']:
            if phase == 'train':
                model.train()  # Training mode
            else:
                model.eval()  # Evaluating mode

            running_loss = 0.0
            total_samples = 0

            with tqdm(total=len(dataloaders[phase]), desc=f"{phase} phase") as pbar:
                # The labels are ignored: the reconstruction target is the input itself
                for inputs, _ in dataloaders[phase]:
                    inputs = inputs.to(device)

                    # Forward pass (gradients are only tracked while training)
                    with torch.set_grad_enabled(phase == 'train'):
                        _, outputs = model(inputs)
                        loss = criterion(outputs, inputs)

                        # Backward pass and optimization only in training
                        if phase == 'train':
                            optimizer.zero_grad()
                            loss.backward()
                            optimizer.step()

                    # loss is a batch mean, so weight it by the batch size
                    running_loss += loss.item() * inputs.size(0)
                    total_samples += inputs.size(0)

                    pbar.set_postfix({"Loss": f"{running_loss / total_samples:.4f}"})
                    pbar.update(1)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            print(f"{phase} Loss: {epoch_loss:.4f}")

            if phase == "train":
                train_loss.append(float(epoch_loss))
            else:
                validation_loss.append(float(epoch_loss))
                # The model is still in eval mode here
                _show_reconstructions(model, epoch)

        time_elapsed = time.time() - since
        print(f'Epoch finished at {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')

    # Saving the model. The target folders are created first so that a missing
    # "models" directory does not throw away the finished training run.
    for module, file_name in ((model.encoder, encoder_name), (model.decoder, decoder_name), (model, autoencoder_name)):
        folder = os.path.dirname(file_name)
        if folder:
            os.makedirs(folder, exist_ok=True)
        torch.save(module.state_dict(), file_name)

    plt.plot(range(len(train_loss)), train_loss, label="Train")
    plt.plot(range(len(validation_loss)), validation_loss, label="Validation")
    plt.title("Loss")
    plt.legend(loc="upper right")  # loc must be passed by keyword; a positional string is read as the labels
    plt.show()

In [None]:
# Autoencoder on the target device, with its reconstruction loss and optimizer
model = ResidualAutoencoder().to(device)
print(f"Device for model: {next(model.parameters()).device}")

criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# Show the fixed reference image once before training starts
# (the image is passed as its own "reconstruction"; epoch=-2 makes the title read "Epoch -1")
fixed_image = selected_image[0].unsqueeze(0).to(device)
show_image(fixed_image[0], fixed_image[0], -2)

try:
    train_autoencoder(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        num_epochs=autoencoder_epochs,
    )
except KeyboardInterrupt:
    # A manual stop of the cell is reported without a traceback
    print("Interrupted")
except Exception as e:
    # Anything else is reported and then re-raised
    print("Other exception " + str(e))
    raise e

In [None]:
# Rebuild the encoder architecture and restore the weights saved by the autoencoder training.
# Only the encoder is needed, so there is no reason to instantiate a full autoencoder.
pretrained_encoder = ResidualEncoder()
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine as well
pretrained_encoder.load_state_dict(torch.load(encoder_name_to_load, map_location=device))
pretrained_encoder = pretrained_encoder.to(device)

model = Classifier(pretrained_encoder).to(device)

# Recreate the loaders with the batch size configured for classification
train_loader = DataLoader(train_dataset, batch_size=classification_batch_size, shuffle=True, num_workers=workers)
validation_loader = DataLoader(validation_dataset, batch_size=classification_batch_size, shuffle=False, num_workers=workers)
dataloaders = {"train": train_loader, "validation": validation_loader}


# Classifier training

In [None]:
def train_classifier(model, criterion, optimizer, scheduler, num_epochs):
    """Train the classifier and plot its accuracy and loss curves.

    Every epoch runs a training pass and a validation pass over the module-level
    ``dataloaders``. After the last epoch the weights are written to
    ``classifier_name`` and the per-epoch accuracy and loss of both phases are
    plotted.

    Args:
        model: module mapping a batch of images to one score per class.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer updating the model parameters.
        scheduler: scheduler stepped once per epoch with the training loss.
        num_epochs: number of epochs to run.

    Returns:
        The trained model (the same object that was passed in).
    """
    since = time.time()
    train_acc = []
    train_loss = []
    validation_acc = []
    validation_loss = []
    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'validation']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            seen_samples = 0

            # Iterate over data.
            with tqdm(total=len(dataloaders[phase]), desc=f"{phase} phase") as pbar:
                for inputs, labels in dataloaders[phase]:
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            # zero the parameter gradients
                            optimizer.zero_grad()
                            loss.backward()
                            optimizer.step()

                    # statistics (the number of hits is computed once per batch)
                    batch_corrects = (preds == labels).sum().item()
                    running_loss += loss.item() * inputs.size(0)  # loss is a batch mean
                    running_corrects += batch_corrects
                    seen_samples += inputs.size(0)

                    batch_acc = (batch_corrects / inputs.size(0)) * 100
                    # Running average of the loss, like the autoencoder progress bar,
                    # instead of an ever-growing sum
                    pbar.set_postfix({"Loss": f"{running_loss / seen_samples:.4f}", "Accuracy": f"{batch_acc:.2f}%"})
                    pbar.update(1)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects / len(dataloaders[phase].dataset)

            if phase == 'train':
                # NOTE(review): the scheduler follows the training loss; plateau
                # schedulers are more commonly driven by the validation loss - confirm this is intended
                scheduler.step(epoch_loss)
                train_acc.append(float(epoch_acc))
                train_loss.append(float(epoch_loss))
            else:
                validation_acc.append(float(epoch_acc))
                validation_loss.append(float(epoch_loss))

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

        print()

        time_elapsed = time.time() - since
        print(f'Epoch finished at {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')

    # Create the target folder first so that a missing "models" directory does
    # not throw away the finished training run
    folder = os.path.dirname(classifier_name)
    if folder:
        os.makedirs(folder, exist_ok=True)
    torch.save(model.state_dict(), classifier_name)

    plt.subplot(1, 2, 1)
    plt.plot(range(len(train_acc)), train_acc, label="Train")
    plt.plot(range(len(validation_acc)), validation_acc, label="Validation")
    plt.legend(loc="lower right")
    plt.title("Accuracy")

    plt.subplot(1, 2, 2)
    plt.plot(range(len(train_loss)), train_loss, label="Train")
    plt.plot(range(len(validation_loss)), validation_loss, label="Validation")
    plt.legend(loc="lower right")  # loc must be passed by keyword; a positional string is read as the labels
    plt.title("Loss")

    plt.show()

    return model

In [None]:
# Loss, optimizer and learning-rate schedule for the classifier
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(
    model.parameters(),
    lr=classification_learning_rate,
    weight_decay=0.01,
)
# factor=0.5 with mode='min': the learning rate is halved once the monitored
# value has stopped decreasing for longer than `patience` epochs
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=3,
)

try:
    model = train_classifier(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        num_epochs=classification_epochs,
    )
except KeyboardInterrupt:
    # A manual stop of the cell is reported without a traceback
    print("Interrupted")
except Exception as e:
    # Anything else is reported and then re-raised
    print("Other exception: " + str(e))
    raise e

The image predictor class is in the Flask app