### Importing the model

In [None]:
!pip install transformers

In [1]:
from transformers import ViTImageProcessor, ViTForImageClassification
from PIL import Image
import requests

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
# model predicts one of the 1000 ImageNet classes
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])


  from .autonotebook import tqdm as notebook_tqdm


Predicted class: Egyptian cat


## Configuring the model


In [None]:
model

In [2]:
from torchvision.datasets import ImageFolder
from torchvision import transforms
from torch.utils.data import DataLoader
import os 
import torch.nn as nn
import torch.optim as optim

import torch
from torcheval.metrics import MulticlassF1Score, MulticlassPrecision, MulticlassRecall, MulticlassConfusionMatrix


In [None]:
model.classifier.out_features

In [3]:
# Creating our classification layer

# freeze the layers
for param in model.parameters():
    param.requires_grad = False

# Replaces the FC layer for our classes
num_ftrs = model.classifier.in_features
model.classifier = nn.Linear(num_ftrs, 3)



In [4]:
# Now the model has 3 output features instead of 1000 in the classifier layer

model

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermed

## Training 

In [5]:
# Using our functions

def dataload(path):


    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()]) # I removed the normalize part because of the tokenizer from the ViT

    path_test = os.path.join(path, "test")
    path_train = os.path.join(path, "train")
    path_eval = os.path.join(path, "eval")

    test = ImageFolder(root=path_test, transform=transform) # Automatically classifies the folder order as 0, 1 and 2 respectively
    train = ImageFolder(root=path_train, transform=transform)
    eval = ImageFolder(root=path_eval, transform=transform)



    train_dataloader = DataLoader(train, batch_size=32, num_workers=4,shuffle=True)
    test_dataloader = DataLoader(test, batch_size=32, num_workers=4,shuffle=True)
    eval_dataloader = DataLoader(eval, batch_size=32, num_workers=4,shuffle=True)

    return train_dataloader, test_dataloader, eval_dataloader

train_dataloader, test_dataloader, eval_dataloader = dataload("C:/Users/Lucas/Documents/GitHub/OrangeDetect/data/processed")

In [6]:
epochs =1 
device ='cuda'

In [7]:
import torch.optim as optim

# Configuração do otimizador
optimizer = optim.AdamW(
    model.parameters(),
    lr=3e-4,  # learning rate padrão usado no paper
    weight_decay=0.3,  # weight decay para regularização
    betas=(0.9, 0.999)  # valores padrão do Adam
)

# Learning rate scheduler (opcional, mas recomendado)
scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=epochs,  # número total de épocas
    eta_min=1e-6   # learning rate mínimo
)

In [8]:
criterion = nn.CrossEntropyLoss()

In [None]:
for batch, (X, y) in enumerate(train_dataloader):
    test = processor(X, return_tensors="pt", do_rescale=False)
    model(**test)
    break

In [9]:
model.to(device)
model.train()

for epoch in range(epochs):
    print(f'\n------ Epoch {epoch + 1} ------')
    
    # Training phase
    for batch, (X, y) in enumerate(train_dataloader):
        X, y = X.to(device), y.to(device)
        X = processor(X, return_tensors="pt", do_rescale=False).to(device)
        
        # Forward pass
        optimizer.zero_grad()
        pred = model(**X)
        loss = criterion(pred['logits'], y)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        if batch % 100 == 0:
            print(f'Loss: {loss.item():.4f}')



------ Epoch 1 ------
Loss: 1.2162
Loss: 0.1423
Loss: 0.0762


In [11]:
print('\nEvaluating...')

# Evaluation phase
model.eval()
correct = 0
total_loss = 0
y_correct = []
y_pred = []

# Initialize metrics
f1score = MulticlassF1Score(num_classes=3)
precision_metric = MulticlassPrecision(num_classes=3)
recall_metric = MulticlassRecall(num_classes=3)

metrics = {
    'loss': [],
    'correct': [],
    'f1': [],
    'precision': [],
    'recall': []
}

with torch.no_grad():
    for X_test, y_test in eval_dataloader:
        X_test, y_test = X_test.to(device), y_test.to(device)
        pred = model(X_test)
        #pred = pred['logits'],
        loss = criterion(pred['logits'], y_test).item()
        total_loss += loss

        predicted = pred['logits'].argmax(dim=1)
        correct_batch = (predicted == y_test).sum().item()
        correct += correct_batch

        y_pred.append(predicted)
        y_correct.append(y_test)

        # Update metrics
        f1score.update(y_test, predicted)
        precision_metric.update(y_test, predicted)
        recall_metric.update(y_test, predicted)

        metrics['loss'].append(loss)
        metrics['correct'].append(correct_batch)

# Compute final metrics
metrics['f1'] = f1score.compute().tolist()
metrics['precision'] = precision_metric.compute().tolist()
metrics['recall'] = recall_metric.compute().tolist()

accuracy = correct / len(eval_dataloader.dataset)
mean_loss = sum(metrics['loss']) / len(metrics['loss'])

print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Mean Loss: {mean_loss:.4f}')
print(f'F1 Score: {metrics["f1"]:.4f}')
print(f'Precision: {metrics["precision"]:.4f}')
print(f'Recall: {metrics["recall"]:.4f}')


Evaluating...
Accuracy: 96.00%
Mean Loss: 0.1826
F1 Score: 0.9600
Precision: 0.9600
Recall: 0.9600


In [12]:
metrics

{'loss': [0.15992891788482666,
  0.2399546504020691,
  0.21071556210517883,
  0.10720616579055786,
  0.17166641354560852,
  0.3222771883010864,
  0.15104974806308746,
  0.16683505475521088,
  0.13099372386932373,
  0.16512082517147064],
 'correct': [32, 31, 30, 32, 30, 27, 31, 31, 32, 12],
 'f1': 0.9599999785423279,
 'precision': 0.9599999785423279,
 'recall': 0.9599999785423279}

In [None]:
pred['logits'].shape