# Modelo ConvNeXt

% Explicacion de lo que se hará y cómo se hará

### Preparación de los datos

Entrenaremos al modelo en el conjunto de datos CIFAR-10. Este dataset consiste de 60000 imágenes a color en 10 clases distintas, donde no hay intersección entre las distintas clases. Se puede acceder al dataset mediante las herramientas de la paquetería de pytorch, o también en la página oficial: https://www.cs.toronto.edu/~kriz/cifar.html

In [1]:
# Importamos las paqueterías necesarias para el notebook
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler

# De ser posible utilizaremos GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def data_loader(data_dir,
                batch_size,
                random_seed=42,
                valid_size=0.1,
                shuffle=True,
                test=False):
    """
    Función para cargar los datos de CIFAR-10
    """
    
    # Definimos el transform para normalizar los datos con pytorch
    # Los valores fueron obtenidos en el notebook de datos "data_extraction.ipynb"
    normalize = transforms.Normalize(  
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # Definimos el transform para preporcesar los datos
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),  # Tomo imágenes de 34x34 para evitar una excepción en pytorch al normalizar la última capa
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        normalize
    ])
    
    # Obtener los datos del conjunto de prueba
    if test:
        dataset = datasets.CIFAR10(
          root=data_dir, train=False,
          download=True, transform=transform_test,
        )

        data_loader = torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, shuffle=shuffle
        )

        return data_loader

    # Cargamos una copia de los datos de entrenamiento
    train_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=transform_train,
    )
    
    # Cargamos una copia extra de los datos de entrenamiento para dividirlo después en el conjunto de validación
    valid_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=transform_train,
    )
    
    # Separamos los datos de entrenamiento y validación mediante índices
    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        np.random.seed(42)
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)
    
    # Finalmente, definimos los conjuntos de entrenamiento y validación
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler)

    return (train_loader, valid_loader)


# Aplicamos la función para cargar los datos de CIFAR-10, los guardamos en el directorio actual
train_loader, valid_loader = data_loader(data_dir='./data',
                                         batch_size=64)

test_loader = data_loader(data_dir='./data',
                              batch_size=64,
                              test=True)    
cifar10_classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


### Función de entrenamiento

Con la siguiente función obtendremos los resultados de cada modelo, explicar que los datos salen de la primera parte de modificación del paper de convnext

In [3]:
import gc
def entrenamiento(model, epocas):
    
    model = model.to(device)
    
    # variables para guardar los resultados
    accuracy_training_epochs = []
    accuracy_validation_epochs = []
    loss_epoch = []
    test_accuracy = []
    
    # parámetros de entrenamiento
    num_epochs = epocas

    optimizer = optim.AdamW(model.parameters(),
                            lr=0.004,
                            betas=(0.9, 0.999),
                            weight_decay=0.05
                            )
    
    criterion = nn.CrossEntropyLoss()

    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epocas)

    
    # entrenamiento
    for epoch in range(num_epochs):
        start_time = time.time()
        for i, (images, labels) in enumerate(train_loader):

            # Mover a los tensores a GPU de ser posible
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Ahorro de memoria
            del images, labels, outputs
            torch.cuda.empty_cache()
            gc.collect()

        loss_epoch.append(loss.item()) # Guardar la información del loss de esta época
        lr_scheduler.step() # Implementación de learning rate decay

        # Medición de la exactitud en el conjunto de validación
        with torch.no_grad():
            correct = 0
            total = 0
            for images, labels in valid_loader:
                images = images.to(device)
                labels = labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                del images, labels, outputs
            val_accuracy = correct/total
            accuracy_validation_epochs.append(val_accuracy)

        # Medición de la exactitud sobre todo el conjunto de entrenamiento
        with torch.no_grad():
            correct = 0
            total = 0
            for images, labels in train_loader:
                images = images.to(device)
                labels = labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                del images, labels, outputs
            train_accuracy = correct/total
            accuracy_training_epochs.append(train_accuracy)

        # Medición de la exactitud en el conjunto de prueba
        with torch.no_grad():
            correct = 0
            total = 0
            for images, labels in test_loader:
                images = images.to(device)
                labels = labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
                del images, labels, outputs
            t_acc = correct/total
            test_accuracy.append(t_acc)


        # Imprimir la pérdida, la exactitud en la validación y la exactitud en los datos de entrenamiento, de esta época.
        print(f"Epoch [{epoch+1}/{num_epochs}], Training accuracy: {round(train_accuracy,3)}, Validation accuracy: {round(val_accuracy,3)}, loss = {round(loss_epoch[-1],3)}")
        print(f"Time spent on epoch {epoch+1}: {round((time.time()-start_time)/60,2)}min")
        
   # return final model, training accuracy, validation accuracy, test accuracy, loss. Info de todas las épocas.     
    return [model,
            accuracy_training_epochs,
            accuracy_validation_epochs,
            test_accuracy,
            loss_epoch]  


### ResNet-50

El paper parte de una ResNet-50 como modelo base al que irá modificndo.
La explicación del código se encuentra en el notebook de ResNet50

In [4]:
class utilConv(nn.Sequential):
    """
    Capa de utilidad compuesta por una capa de convolución seguida de una de normalización y luego una de activación.
    """
    def __init__(self, in_features, out_features, kernel_size, stride = 1, norm = nn.BatchNorm2d, act = nn.ReLU, bias=True):
        super().__init__(
            nn.Conv2d(in_features, out_features, kernel_size=kernel_size ,padding=kernel_size // 2, stride=stride, bias=bias),
            norm(out_features),
            act()
        )
        
class BottleNeckBlock(nn.Module):
    def __init__(self,in_features, out_features, reduction = 4, stride = 1):
        super().__init__()
        reduced_features = out_features // reduction
        self.block = nn.Sequential(
            # Reducción de canales
            utilConv(in_features, reduced_features, kernel_size=1, stride=stride, bias=False), # el stride puede ser 2 para aplicar downsampling
            # El número de canales se mantiene fijo
            utilConv(reduced_features, reduced_features, kernel_size=3, bias=False),
            # Aumento de canales
            utilConv(reduced_features, out_features, kernel_size=1, bias=False, act=nn.Identity), 
        )
        
        # self.shortcut es utilizado para transformar al input a las dimensiones correctas para poder sumarlo a la salida del bloque
        if in_features != out_features:
            self.shortcut =nn.Sequential(utilConv(in_features, out_features, kernel_size=1, stride=stride, bias=False))
        else:
            self.shortcut = nn.Identity()

        self.act = nn.ReLU()

    def forward(self, x):
        res = x
        x = self.block(x)
        res = self.shortcut(res)
        x += res
        x = self.act(x)
        return x
    
class Stage(nn.Sequential):
    """
    Esta capa define al "stage", el cual es un conjunto de bloques residuales
    depth es el número de bloques residuales
    in_features el numero de canales con que empieza
    out_features el numer de canales con que termina
    """
    def __init__(self, in_features, out_features, depth, stride = 2):  # in_features y out_features deben ser distintos, sino se aplicará downsampling y el Bottleneck no aplicará la identidad
        super().__init__(
            
            BottleNeckBlock(in_features, out_features, stride=stride), # Aquí se lleva a cabo el downsampling
            *[BottleNeckBlock(out_features, out_features) for _ in range(depth - 1)]
        )
        
        
class Stem(nn.Sequential):
    def __init__(self, in_features, out_features):
        super().__init__(
            utilConv(in_features, out_features, kernel_size=3, stride=1),  # en el caso de ImageNet, el kernel es de tamaño 7
#             nn.MaxPool2d(kernel_size=3, stride=2, padding=1)  
        )
        
class Encoder(nn.Module):
    
    """
    widths es una lista con el número de canales respectivos al final de cada "stage"
    depths es una lista con el numero de bloques residuales que tendrá cada "stage"
    stem_features es el número de canales que resulta de la primera capa de downsampling
    
    """
    def __init__(self, in_channels, stem_features, depths, widths):  # 
        super().__init__()
        self.stem = Stem(in_channels, stem_features)

        in_out_widths = list(zip(widths, widths[1:]))

        
        self.stages = nn.ModuleList() # lista de pytorch con los stages
        
        self.stages.append(Stage(stem_features, widths[0], depths[0], stride=1)) # se puede inferir de la figura 1 del artículo que el primer bloque del stage1 tiene stride 1
        
        for (in_features, out_features), depth in zip(in_out_widths, depths[1:]):
            # añadir cada uno de los stages
            self.stages.append(Stage(in_features, out_features, depth))
            

    def forward(self, x):
        x = self.stem(x)
        for stage in self.stages:

            x = stage(x)
        return x
    

    
class Decoder(nn.Module):
    """
    Capa para clasificar los datos
    """
    def __init__(self, in_features, n_classes):
        super().__init__()
        self.avg = nn.AdaptiveAvgPool2d((1, 1))
        self.decoder = nn.Linear(in_features, n_classes)

    def forward(self, x):
        x = self.avg(x)
        x = x.view(x.size(0), -1)
        x = self.decoder(x)
        x = F.softmax(x, dim=1)
        return x

Con esto podemos definir nuestro modelo base

In [5]:
class ResNet(nn.Module):
    
    def __init__(self, in_channels, n_classes, stem_features, depths, widths ):
        super().__init__()
        self.encoder = Encoder(in_channels=in_channels, stem_features=stem_features, depths=depths, widths=widths)
        self.decoder = Decoder(widths[-1], n_classes)
        
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [None]:
# repetimos 3 veces el experimento 

model1 = ResNet(in_channels=3, n_classes = 10, stem_features=64, depths=[3,4,6,3], widths=[64, 128, 256,512]).to(device)
model2 = ResNet(in_channels=3, n_classes = 10, stem_features=64, depths=[3,4,6,3], widths=[64, 128, 256,512]).to(device)
model3 = ResNet(in_channels=3, n_classes = 10, stem_features=64, depths=[3,4,6,3], widths=[64, 128, 256,512]).to(device)

model1, training1, validation1, test1, loss1 = entrenamiento(model1, 200)
model2, training2, validation2, test2, loss2 = entrenamiento(model2, 200)
model3, training3, validation3, test3, loss3 = entrenamiento(model3, 200)

Epoch [1/200], Training accuracy: 0.252, Validation accuracy: 0.257, loss = 2.228
Time spent on epoch 1: 3.13min
Epoch [2/200], Training accuracy: 0.275, Validation accuracy: 0.271, loss = 2.324
Time spent on epoch 2: 3.25min
Epoch [3/200], Training accuracy: 0.321, Validation accuracy: 0.319, loss = 2.2
Time spent on epoch 3: 3.33min
Epoch [4/200], Training accuracy: 0.342, Validation accuracy: 0.335, loss = 2.332
Time spent on epoch 4: 3.22min
Epoch [5/200], Training accuracy: 0.375, Validation accuracy: 0.369, loss = 2.23
Time spent on epoch 5: 3.05min
Epoch [6/200], Training accuracy: 0.39, Validation accuracy: 0.392, loss = 2.17
Time spent on epoch 6: 3.1min
Epoch [7/200], Training accuracy: 0.41, Validation accuracy: 0.405, loss = 2.076
Time spent on epoch 7: 3.02min
Epoch [8/200], Training accuracy: 0.423, Validation accuracy: 0.419, loss = 2.204
Time spent on epoch 8: 3.02min
Epoch [9/200], Training accuracy: 0.469, Validation accuracy: 0.459, loss = 1.752
Time spent on epoch 9

Epoch [73/200], Training accuracy: 0.701, Validation accuracy: 0.692, loss = 2.187
Time spent on epoch 73: 3.0min
Epoch [74/200], Training accuracy: 0.723, Validation accuracy: 0.71, loss = 2.156
Time spent on epoch 74: 3.0min
Epoch [75/200], Training accuracy: 0.718, Validation accuracy: 0.699, loss = 1.67
Time spent on epoch 75: 3.02min
Epoch [76/200], Training accuracy: 0.725, Validation accuracy: 0.714, loss = 1.549
Time spent on epoch 76: 3.0min
Epoch [77/200], Training accuracy: 0.721, Validation accuracy: 0.714, loss = 1.729
Time spent on epoch 77: 3.0min
Epoch [78/200], Training accuracy: 0.725, Validation accuracy: 0.722, loss = 2.076
Time spent on epoch 78: 3.0min
Epoch [79/200], Training accuracy: 0.722, Validation accuracy: 0.72, loss = 1.711
Time spent on epoch 79: 3.0min
Epoch [80/200], Training accuracy: 0.733, Validation accuracy: 0.719, loss = 1.82
Time spent on epoch 80: 3.02min
Epoch [81/200], Training accuracy: 0.727, Validation accuracy: 0.716, loss = 2.013
Time sp

Epoch [144/200], Training accuracy: 0.815, Validation accuracy: 0.792, loss = 1.717
Time spent on epoch 144: 3.21min
Epoch [145/200], Training accuracy: 0.821, Validation accuracy: 0.791, loss = 1.461
Time spent on epoch 145: 3.21min
Epoch [146/200], Training accuracy: 0.828, Validation accuracy: 0.806, loss = 1.67
Time spent on epoch 146: 3.22min
Epoch [147/200], Training accuracy: 0.832, Validation accuracy: 0.804, loss = 1.586
Time spent on epoch 147: 3.21min
Epoch [148/200], Training accuracy: 0.83, Validation accuracy: 0.805, loss = 1.596
Time spent on epoch 148: 3.24min
Epoch [149/200], Training accuracy: 0.828, Validation accuracy: 0.801, loss = 1.603
Time spent on epoch 149: 3.27min
Epoch [150/200], Training accuracy: 0.825, Validation accuracy: 0.806, loss = 1.69
Time spent on epoch 150: 3.22min
Epoch [151/200], Training accuracy: 0.832, Validation accuracy: 0.817, loss = 1.633
Time spent on epoch 151: 3.26min
Epoch [152/200], Training accuracy: 0.832, Validation accuracy: 0.8

Epoch [15/200], Training accuracy: 0.576, Validation accuracy: 0.569, loss = 1.763
Time spent on epoch 15: 3.04min
Epoch [16/200], Training accuracy: 0.586, Validation accuracy: 0.569, loss = 2.089
Time spent on epoch 16: 3.02min
Epoch [17/200], Training accuracy: 0.593, Validation accuracy: 0.582, loss = 1.948
Time spent on epoch 17: 3.02min
Epoch [18/200], Training accuracy: 0.589, Validation accuracy: 0.583, loss = 2.11
Time spent on epoch 18: 3.02min
Epoch [19/200], Training accuracy: 0.597, Validation accuracy: 0.585, loss = 1.956
Time spent on epoch 19: 3.02min
Epoch [20/200], Training accuracy: 0.604, Validation accuracy: 0.594, loss = 1.955
Time spent on epoch 20: 3.02min
Epoch [21/200], Training accuracy: 0.621, Validation accuracy: 0.61, loss = 1.828
Time spent on epoch 21: 3.02min
Epoch [22/200], Training accuracy: 0.623, Validation accuracy: 0.607, loss = 1.77
Time spent on epoch 22: 3.02min
Epoch [23/200], Training accuracy: 0.621, Validation accuracy: 0.611, loss = 2.001


Epoch [87/200], Training accuracy: 0.742, Validation accuracy: 0.731, loss = 1.844
Time spent on epoch 87: 2.9min
Epoch [88/200], Training accuracy: 0.738, Validation accuracy: 0.721, loss = 1.709
Time spent on epoch 88: 2.89min
Epoch [89/200], Training accuracy: 0.756, Validation accuracy: 0.742, loss = 1.908
Time spent on epoch 89: 2.87min
Epoch [90/200], Training accuracy: 0.756, Validation accuracy: 0.745, loss = 1.596
Time spent on epoch 90: 2.88min
Epoch [91/200], Training accuracy: 0.749, Validation accuracy: 0.741, loss = 1.552
Time spent on epoch 91: 2.88min
Epoch [92/200], Training accuracy: 0.752, Validation accuracy: 0.741, loss = 1.869
Time spent on epoch 92: 2.87min
Epoch [93/200], Training accuracy: 0.741, Validation accuracy: 0.726, loss = 1.837
Time spent on epoch 93: 2.88min
Epoch [94/200], Training accuracy: 0.763, Validation accuracy: 0.749, loss = 1.612
Time spent on epoch 94: 2.98min


In [None]:
#guardar resultados
results_dict1 = {"loss": loss1,
    'Train':training1,
     'Validation': validation1,
     "Test":test1}
results_dict2 = {"loss": loss2,
    'Train':training2,
     'Validation': validation2,
     "Test":test2}
results_dict3 = {"loss": loss3,
    'Train':training3,
     'Validation': validation3,
     "Test":test3}

results1_base = pd.DataFrame(results_dict1)
results2_base = pd.DataFrame(results_dict2)
results3_base = pd.DataFrame(results_dict3)

In [None]:
accuracy = (results1_base["Test"].max() + results2_base["Test"].max() + results3_base["Test"].max())/3
print(f"Accuracy del modelo base: {accuracy}")

In [None]:
results1_base.to_csv("./results/results_convnext_base_1.csv",index=False)
results2_base.to_csv("./results/results_convnext_base_2.csv",index=False)
results3_base.to_csv("./results/results_convnext_base_3.csv",index=False)

% Changing stage compute ratio

% Changing stem to patchify

% resnextify y aumento de canales originalmente de 64 a 96, tambien hacer el depthwise en el bloque de bottleneck

% inverted bottleneck

% Large Kernel Sizes

% Micro Design