## Importing the model

In [2]:
!pip install transformers




[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from transformers import ViTImageProcessor, ViTForImageClassification
from PIL import Image
import requests

url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)

processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs)
logits = outputs.logits
# model predicts one of the 1000 ImageNet classes
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])


  from .autonotebook import tqdm as notebook_tqdm


Predicted class: Egyptian cat


## Configuring the model


In [4]:
model

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermed

In [5]:
from torchvision.datasets import ImageFolder
from torchvision import transforms
from torch.utils.data import DataLoader
import os 
import torch.nn as nn
import torch.optim as optim

import torch
from torcheval.metrics import MulticlassF1Score, MulticlassPrecision, MulticlassRecall, MulticlassConfusionMatrix


In [6]:
model.classifier.out_features

1000

In [7]:
# Creating our classification layer

# freeze the layers
for param in model.parameters():
    param.requires_grad = False

# Replaces the FC layer for our classes
num_ftrs = model.classifier.in_features
model.classifier = nn.Linear(num_ftrs, 3)



In [8]:
# Now the model has 3 output features instead of 1000 in the classifier layer

model

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermed

## Training 

In [9]:
# Using our functions

def dataload(path):


    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()]) # I removed the normalize part because of the tokenizer from the ViT

    path_test = os.path.join(path, "test")
    path_train = os.path.join(path, "train")
    path_eval = os.path.join(path, "eval")

    test = ImageFolder(root=path_test, transform=transform) # Automatically classifies the folder order as 0, 1 and 2 respectively
    train = ImageFolder(root=path_train, transform=transform)
    eval = ImageFolder(root=path_eval, transform=transform)



    train_dataloader = DataLoader(train, batch_size=32, num_workers=4,shuffle=True)
    test_dataloader = DataLoader(test, batch_size=32, num_workers=4,shuffle=True)
    eval_dataloader = DataLoader(eval, batch_size=32, num_workers=4,shuffle=True)

    return train_dataloader, test_dataloader, eval_dataloader

train_dataloader, test_dataloader, eval_dataloader = dataload("C:/Users/Lucas/Documents/GitHub/OrangeDetect/data/processed")

In [10]:
epochs =1 
device ='cuda'

In [11]:
import torch.optim as optim


optimizer = optim.AdamW(
    model.parameters(),
    lr=3e-4,  
    weight_decay=0.3,  
    betas=(0.9, 0.999)  
)

# Learning rate scheduler 
scheduler = optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=epochs,  
    eta_min=1e-6   #
)

In [12]:
criterion = nn.CrossEntropyLoss()

In [13]:
for batch, (X, y) in enumerate(train_dataloader):
    test = processor(X, return_tensors="pt", do_rescale=False)
    model(**test)
    break

In [14]:
model.to(device)
model.train()

for epoch in range(epochs):
    print(f'\n------ Epoch {epoch + 1} ------')
    
    # Training phase
    for batch, (X, y) in enumerate(train_dataloader):
        X, y = X.to(device), y.to(device)
        X = processor(X, return_tensors="pt", do_rescale=False).to(device)
        
        # Forward pass
        optimizer.zero_grad()
        pred = model(**X)
        loss = criterion(pred['logits'], y)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        if batch % 100 == 0:
            print(f'Loss: {loss.item():.4f}')



------ Epoch 1 ------
Loss: 1.3606
Loss: 0.1581
Loss: 0.1262


In [15]:
print('\nEvaluating...')

# Evaluation phase
model.eval()
correct = 0
total_loss = 0
y_correct = []
y_pred = []

# Initialize metrics
f1score = MulticlassF1Score(num_classes=3)
precision_metric = MulticlassPrecision(num_classes=3)
recall_metric = MulticlassRecall(num_classes=3)

metrics = {
    'loss': [],
    'correct': [],
    'f1': [],
    'precision': [],
    'recall': []
}

with torch.no_grad():
    for X_test, y_test in eval_dataloader:
        X_test, y_test = X_test.to(device), y_test.to(device)
        pred = model(X_test)
        #pred = pred['logits'],
        loss = criterion(pred['logits'], y_test).item()
        total_loss += loss

        predicted = pred['logits'].argmax(dim=1)
        correct_batch = (predicted == y_test).sum().item()
        correct += correct_batch

        y_pred.append(predicted)
        y_correct.append(y_test)

        # Update metrics
        f1score.update(y_test, predicted)
        precision_metric.update(y_test, predicted)
        recall_metric.update(y_test, predicted)

        metrics['loss'].append(loss)
        metrics['correct'].append(correct_batch)

# Compute final metrics
metrics['f1'] = f1score.compute().tolist()
metrics['precision'] = precision_metric.compute().tolist()
metrics['recall'] = recall_metric.compute().tolist()

accuracy = correct / len(eval_dataloader.dataset)
mean_loss = sum(metrics['loss']) / len(metrics['loss'])

print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Mean Loss: {mean_loss:.4f}')
print(f'F1 Score: {metrics["f1"]:.4f}')
print(f'Precision: {metrics["precision"]:.4f}')
print(f'Recall: {metrics["recall"]:.4f}')


Evaluating...
Accuracy: 96.67%
Mean Loss: 0.1657
F1 Score: 0.9667
Precision: 0.9667
Recall: 0.9667


In [16]:
metrics

{'loss': [0.18581648170948029,
  0.19089628756046295,
  0.13942116498947144,
  0.124359130859375,
  0.18860331177711487,
  0.09645327925682068,
  0.2514939606189728,
  0.19607584178447723,
  0.10955896973609924,
  0.1739671677350998],
 'correct': [30, 31, 31, 32, 30, 32, 29, 32, 31, 12],
 'f1': 0.9666666388511658,
 'precision': 0.9666666388511658,
 'recall': 0.9666666388511658}

In [17]:
pred['logits'].shape

torch.Size([12, 3])

## Testing

In [20]:
img_tensor

tensor([[[[0.3333, 0.3373, 0.4078,  ..., 0.4902, 0.2471, 0.0118],
          [0.3569, 0.3373, 0.3412,  ..., 0.4980, 0.2471, 0.0118],
          [0.3608, 0.3373, 0.3333,  ..., 0.5059, 0.2667, 0.0118],
          ...,
          [0.5333, 0.5451, 0.5843,  ..., 0.1725, 0.2510, 0.3412],
          [0.5294, 0.5294, 0.5451,  ..., 0.3490, 0.3686, 0.4039],
          [0.5490, 0.5451, 0.5294,  ..., 0.2824, 0.3020, 0.3490]],

         [[0.4118, 0.4118, 0.4000,  ..., 0.5843, 0.2941, 0.0196],
          [0.4314, 0.4118, 0.4000,  ..., 0.5843, 0.2980, 0.0235],
          [0.4353, 0.4039, 0.4000,  ..., 0.5961, 0.3176, 0.0275],
          ...,
          [0.6431, 0.6588, 0.7020,  ..., 0.2000, 0.2980, 0.4039],
          [0.6510, 0.6510, 0.6588,  ..., 0.4078, 0.4431, 0.4784],
          [0.6706, 0.6667, 0.6471,  ..., 0.3412, 0.3647, 0.4039]],

         [[0.2980, 0.3216, 0.3373,  ..., 0.3412, 0.1647, 0.0118],
          [0.2980, 0.3020, 0.3098,  ..., 0.3412, 0.1647, 0.0118],
          [0.2745, 0.2706, 0.2902,  ..., 0

In [43]:
import cv2

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),  # Convert to tensor first
])

img = cv2.imread(r'C:\Users\Lucas\Documents\GitHub\OrangeDetect\ck1.jpg')
img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
img_tensor = transform(img).unsqueeze(0).to(device='cuda')
result = model(img_tensor)['logits'].argmax()

# Going to change this to Image.open



In [44]:
result


tensor(0, device='cuda:0')