In [1]:
!pip install --upgrade transformers
!pip install torchvision



In [2]:
import torch
import numpy as np
import csv
from google.colab import drive
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from tqdm import trange
import random
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModel
import torch
import torchvision
import torchvision.transforms as transforms
import os

# Release cached GPU memory (useful when this cell is re-run in the same kernel).
torch.cuda.empty_cache()

# Use the GPU when one is available; otherwise fall back to the CPU so that
# `device` is always defined for the cells below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def getData(bs, transform_train, transform_test):
  """Create the CIFAR-10 train/test datasets and their data loaders.

  Args:
    bs: batch size used by the training loader.
    transform_train: transform applied to the training images.
    transform_test: transform applied to the test images.

  Returns:
    Tuple ``(trainset, trainloader, testset, testloader, num_classes)``.
    The test loader always uses a batch size of 100 and does not shuffle.
  """
  cifar10 = torchvision.datasets.CIFAR10
  make_loader = torch.utils.data.DataLoader

  # Training split: downloaded into ./data if needed, shuffled on every pass.
  train_split = cifar10(root='./data', train=True, download=True, transform=transform_train)
  train_batches = make_loader(train_split, batch_size=bs, shuffle=True, num_workers=2)

  # Test split: fixed order, fixed batch size.
  test_split = cifar10(root='./data', train=False, download=True, transform=transform_test)
  test_batches = make_loader(test_split, batch_size=100, shuffle=False, num_workers=2)

  # The number of classes is hard-coded to 10.
  return train_split, train_batches, test_split, test_batches, 10

In [4]:
def getAugmentation(img_size):
  """Build the image transforms for training and for evaluation.

  Args:
    img_size: side length (in pixels) of the square images produced.

  Returns:
    Tuple ``(transform_train, transform_test)``. The training transform
    resizes, random-crops with a padding of ``img_size // 8``, randomly flips
    horizontally and converts to a tensor; the test transform only resizes
    and converts to a tensor.
  """
  target_size = (img_size, img_size)

  # Normalisation is currently disabled in both pipelines; the statistics that
  # were used in the disabled step are kept here for reference:
  #   transforms.Normalize((0.49139968, 0.48215841, 0.44653091), (0.24703223, 0.24348513, 0.26158784))
  train_steps = [
      transforms.Resize(target_size),
      transforms.RandomCrop(img_size, padding=img_size // 8),
      transforms.RandomHorizontalFlip(),
      transforms.ToTensor(),
  ]
  test_steps = [
      transforms.Resize(target_size),
      transforms.ToTensor(),
  ]

  return transforms.Compose(train_steps), transforms.Compose(test_steps)

In [5]:
# Input geometry: side length passed to getAugmentation, and number of channels.
img_size, img_channels = 224, 3

# Training parameters
bs = 32             # batch size
epochs = 100        # total training epochs (reassigned later, next to the optimizer setup)
load_check = False  # whether to load a checkpoint
patch_size = 16     # patch size (square)
d_model = 768       # dimensionality of the transformer representation

In [6]:
# Build the transforms for the configured image size, then the datasets and loaders.
transform_train, transform_test = getAugmentation(img_size)
(trainset, trainloader,
 testset, testloader,
 num_classes) = getData(bs, transform_train, transform_test)

Files already downloaded and verified
Files already downloaded and verified


In [7]:
# Statistics of the raw CIFAR-10 training images (no transform applied).
cifar_trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True)
data = cifar_trainset.data / 255  # data is a numpy array

# Reduce over the first three axes, which leaves one value per remaining (last) axis entry.
reduce_axes = (0, 1, 2)
mean = data.mean(axis=reduce_axes)
std = data.std(axis=reduce_axes)
print(f"Mean : {mean}   STD: {std}")  # Mean : [0.491 0.482 0.446]   STD: [0.247 0.243 0.261]

Files already downloaded and verified
Mean : [0.49139968 0.48215841 0.44653091]   STD: [0.24703223 0.24348513 0.26158784]


In [8]:
# Display the first item of the training dataset (the full tensor is printed).
trainset[0]

(tensor([[[0.0000, 0.0000, 0.0000,  ..., 0.6196, 0.6157, 0.6118],
          [0.0000, 0.0000, 0.0000,  ..., 0.6000, 0.5961, 0.5922],
          [0.0000, 0.0000, 0.0000,  ..., 0.5804, 0.5765, 0.5725],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
 
         [[0.0000, 0.0000, 0.0000,  ..., 0.5176, 0.5137, 0.5098],
          [0.0000, 0.0000, 0.0000,  ..., 0.4941, 0.4902, 0.4863],
          [0.0000, 0.0000, 0.0000,  ..., 0.4667, 0.4627, 0.4627],
          ...,
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
 
         [[0.0000, 0.0000, 0.0000,  ..., 0.4235, 0.4196, 0.4157],
          [0.0000, 0.0000, 0.0000,  ..., 0.3922, 0.3882, 0.3882],
          [0.0000, 0.0000, 0.0000,  ...,

In [9]:
from transformers import AutoImageProcessor

# Identifier of the pretrained checkpoint, shared by the image processor and the backbone.
model_name_or_path = 'google/vit-base-patch16-224-in21k'

# Image processor and pretrained backbone loaded from the same checkpoint.
feature_extractor = AutoImageProcessor.from_pretrained(model_name_or_path)
model = AutoModel.from_pretrained(model_name_or_path)

# Freeze every backbone weight so that it is excluded from gradient computation.
for weight in model.parameters():
    weight.requires_grad_(False)

In [10]:
from transformers import AutoModel, AutoConfig
# Load the configuration of the same checkpoint and take the representation
# size from it; this overwrites the d_model value set in the parameters cell.
config = AutoConfig.from_pretrained(model_name_or_path)
d_model = config.hidden_size

# Report the value that will be used to size the classification head.
print("El valor de d_model es:", d_model)

El valor de d_model es: 768


In [11]:
class modelViT(nn.Module):
    """Classifier made of a backbone followed by a LayerNorm + Linear head.

    The backbone output's "last_hidden_state" is averaged over dimension 1
    (presumably the token axis of a (batch, tokens, d_model) tensor) and the
    result is fed to the head, which produces ``num_classes`` scores.
    """

    def __init__(self, ViT, d_model, num_classes):
        """Store the backbone and build the classification head.

        Args:
            ViT: backbone; it is called on the input batch and its result is
                indexed with the key "last_hidden_state".
            d_model: size of the last dimension of the backbone output.
            num_classes: number of scores produced by the head.
        """
        super().__init__()
        self.d_model, self.num_classes = d_model, num_classes

        self.ViT = ViT
        head_layers = [
            nn.LayerNorm(d_model),
            nn.Linear(d_model, num_classes),
        ]
        self.mlp_head = nn.Sequential(*head_layers)

    def forward(self, data):
        """Return the head's scores for the batch ``data``."""
        token_states = self.ViT(data)["last_hidden_state"]
        pooled = token_states.mean(dim=1)  # average over dimension 1
        return self.mlp_head(pooled)


In [12]:
# Wrap the backbone with the classification head and move the result to the
# selected device. NOTE: `model` is rebound here, so re-running this cell
# without re-running the loading cell would wrap the already wrapped model.
model = modelViT(model, d_model, num_classes).to(device)

In [13]:
# Create the optimizer over the model's parameters.
optimizer = optim.Adam(model.parameters(), lr=5e-5)
# Create the loss function (cross-entropy, despite the variable's name).
mse_loss = nn.CrossEntropyLoss()
# Number of epochs actually trained, and the best accuracy seen so far.
epochs = 4
best_acc = 0.0
# Cosine annealing of the learning rate over `epochs` steps, down to 1e-5.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-5)

In [14]:
# Train the model for one epoch
def train():
  """Run one training pass over ``trainloader`` and update the model.

  Uses the notebook-level ``model``, ``trainloader``, ``device``,
  ``feature_extractor``, ``mse_loss`` and ``optimizer``.

  Returns:
    Tuple ``(mean loss per batch, accuracy in percent)``; ``(0.0, 0.0)`` when
    the loader yields no batches.
  """
  model.train()

  total_loss = 0
  correct = 0
  total = 0
  num_batches = 0

  # Iterate over the batches
  for step, batch in enumerate(trainloader):
    data, labels = batch
    data = data.to(device)
    labels = labels.to(device)
    # Run the image processor; rescaling is disabled via do_rescale=False.
    features = feature_extractor(data, return_tensors='pt', do_rescale=False)
    features = features['pixel_values']
    features = features.to(device)

    # Clear gradients BEFORE the backward pass so that gradients left over
    # from an interrupted run cannot leak into this update.
    optimizer.zero_grad()

    # Get the model predictions and take one optimisation step
    preds = model(features)
    loss = mse_loss(preds, labels)
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    _, predicted = preds.max(1)

    total += labels.size(0)
    correct += predicted.eq(labels).sum().item()
    num_batches = step + 1

    print('\r %d %d -- Loss: %.3f | Acc: %.2f%%' % (step+1, len(trainloader), total_loss/(step+1), 100.*correct/total), end="")

  # Guard the averages so an empty loader does not raise.
  mean_loss = total_loss/num_batches if num_batches else 0.0
  acc = 100.*correct/total if total else 0.0
  return mean_loss, acc

In [15]:
# Para entrenar el modelo
def test():
  global best_acc
  model.eval()

  test_loss = 0
  correct = 0
  total = 0

  with torch.no_grad():
    # Se itera sobre los batches
    for step,batch in enumerate(testloader):
      data, labels = batch
      data = data.to(device)
      labels = labels.to(device)
      features = feature_extractor(data,  return_tensors='pt', do_rescale=False)  # extract features
      features=features['pixel_values']
      features= features.to(device)

      # Se obtienen las predicciones del modelo
      preds = model(features)
      loss = mse_loss(preds, labels)

      test_loss += loss.item()
      _, predicted = preds.max(1)

      total += labels.size(0)
      correct += predicted.eq(labels).sum().item()

      print('\r %d %d -- Loss: %.3f | Acc: %.2f%%' % (step+1, len(testloader), test_loss/(step+1), 100.*correct/total), end="")

  # Save checkpoint.
    acc = 100.*correct/total
    if acc > best_acc:
        print("")
        print('Saving checkpoint..')
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': test_loss,
            'acc': acc}, './checkpoint/vit-ckpt.t7')
        best_acc = acc
  return test_loss/(step+1),100.*correct/total

In [16]:
for epoch in range(epochs):
    print('\n============ Epoch: %d ==============' % epoch)
    print()

    # Training pass: report the learning rate in effect for this epoch.
    print("Training, lr= %f" % optimizer.param_groups[0]['lr'])
    trainloss, acc = train()
    print("")

    # Validation pass: report the best accuracy reached before this epoch.
    print("Validation, best acc=%f" % best_acc)
    val_loss, acc = test()
    print("")

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()



Training, lr= 0.000050
 1563 1563 -- Loss: 0.542 | Acc: 88.80%
Validation, best acc=0.000000
 100 100 -- Loss: 0.221 | Acc: 95.34%
Saving checkpoint..



Training, lr= 0.000044
 1563 1563 -- Loss: 0.151 | Acc: 96.44%
Validation, best acc=95.340000
 100 100 -- Loss: 0.158 | Acc: 95.97%
Saving checkpoint..



Training, lr= 0.000030
 1563 1563 -- Loss: 0.122 | Acc: 96.70%
Validation, best acc=95.970000
 100 100 -- Loss: 0.141 | Acc: 96.29%
Saving checkpoint..



Training, lr= 0.000016
 1563 1563 -- Loss: 0.112 | Acc: 96.91%
Validation, best acc=96.290000
 100 100 -- Loss: 0.136 | Acc: 96.35%
Saving checkpoint..

