In [2]:
%pip install opencv-python

Note: you may need to restart the kernel to use updated packages.




In [3]:
%pip install tqdm

Note: you may need to restart the kernel to use updated packages.




In [43]:
import numpy as np
import cv2
import os

from transformers import ViTImageProcessor, TrainingArguments, ViTForImageClassification, Trainer
import torch 

from string import ascii_uppercase

from tqdm import tqdm

from datasets import Dataset, load_metric, ClassLabel

from PIL import Image

## 1) Carregando imagens

In [5]:
# Definir classes (letras do alfabeto, excluindo H, J, K, X e Z)
classes = [letter for letter in ascii_uppercase if letter not in {'H', 'J', 'K', 'X', 'Z'}]

num_classes = len(np.unique(classes))

print(f'Classes: {classes}')
print(f'Número de classes: {num_classes}')

Classes: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y']
Número de classes: 21


In [6]:
def load_images_from_folder(folder):
    images = []
    labels = []
    for label in os.listdir(folder):
        label_path = os.path.join(folder, label)
        if os.path.isdir(label_path):
            for filename in tqdm(os.listdir(label_path), desc=f"Loading {label} images"):
                img_path = os.path.join(label_path, filename)
                if img_path.endswith(".jpg") or img_path.endswith(".png"):
                    img = cv2.imread(img_path, cv2.IMREAD_COLOR)

                    # Converter a matriz de volta para a imagem original
                    image = Image.fromarray(img.astype('uint8'))
                    images.append(image)
                    labels.append(label)
                    
    return images, labels

In [7]:
X_train_images, y_train_labels = load_images_from_folder('../libras_dataset/train')
X_test_images, y_test_labels = load_images_from_folder('../libras_dataset/test')

Loading A images: 100%|██████████| 1686/1686 [00:00<00:00, 3973.34it/s]
Loading B images: 100%|██████████| 1662/1662 [00:00<00:00, 3995.18it/s]
Loading C images: 100%|██████████| 1686/1686 [00:00<00:00, 4022.45it/s]
Loading D images: 100%|██████████| 1650/1650 [00:00<00:00, 3989.77it/s]
Loading E images: 100%|██████████| 1670/1670 [00:00<00:00, 3985.67it/s]
Loading F images: 100%|██████████| 1647/1647 [00:00<00:00, 3891.89it/s]
Loading G images: 100%|██████████| 1650/1650 [00:00<00:00, 4004.85it/s]
Loading I images: 100%|██████████| 1650/1650 [00:00<00:00, 3998.27it/s]
Loading L images: 100%|██████████| 1650/1650 [00:00<00:00, 3975.88it/s]
Loading M images: 100%|██████████| 1650/1650 [00:00<00:00, 4018.08it/s]
Loading N images: 100%|██████████| 1650/1650 [00:00<00:00, 4014.57it/s]
Loading O images: 100%|██████████| 1650/1650 [00:00<00:00, 3985.51it/s]
Loading P images: 100%|██████████| 1650/1650 [00:00<00:00, 4021.23it/s]
Loading Q images: 100%|██████████| 1650/1650 [00:00<00:00, 3966.

In [31]:
# Supondo que 'labels' seja a lista de rótulos
labels = classes

# Criar um objeto ClassLabel
class_label = ClassLabel(names=labels)

In [40]:
train_data = {"image": X_train_images, "label": y_train_labels}
test_data = {"image": X_test_images, "label": y_test_labels}

train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)

train_dataset, test_dataset

(Dataset({
     features: ['image', 'label'],
     num_rows: 34714
 }),
 Dataset({
     features: ['image', 'label'],
     num_rows: 11548
 }))

In [41]:
num_classes = len(set(train_dataset["label"]))
labels = train_dataset.features['label']
num_classes, labels

(21, Value(dtype='string', id=None))

In [42]:
train_dataset[0]

{'image': [[[143, 142, 138],
   [143, 143, 138],
   [143, 143, 136],
   [142, 142, 135],
   [140, 140, 134],
   [140, 140, 134],
   [139, 139, 134],
   [140, 140, 134],
   [139, 138, 134],
   [137, 136, 132],
   [137, 136, 132],
   [137, 137, 133],
   [136, 137, 131],
   [136, 137, 130],
   [137, 137, 130],
   [137, 137, 130],
   [137, 137, 130],
   [136, 136, 129],
   [136, 135, 129],
   [134, 133, 127],
   [135, 134, 128],
   [133, 133, 127],
   [131, 131, 125],
   [130, 131, 126],
   [130, 131, 126],
   [130, 129, 125],
   [129, 128, 124],
   [127, 127, 121],
   [128, 128, 121],
   [129, 129, 122],
   [130, 129, 122],
   [129, 127, 121],
   [128, 127, 120],
   [125, 125, 119],
   [125, 125, 119],
   [124, 124, 118],
   [125, 124, 117],
   [124, 124, 117],
   [123, 123, 115],
   [121, 121, 112],
   [121, 121, 111],
   [121, 122, 113],
   [120, 121, 112],
   [120, 121, 112],
   [119, 120, 110],
   [118, 120, 108],
   [118, 120, 108],
   [118, 119, 108],
   [120, 120, 109],
   [118, 11

In [12]:
train_dataset[0]['image']

[[[143, 142, 138],
  [143, 143, 138],
  [143, 143, 136],
  [142, 142, 135],
  [140, 140, 134],
  [140, 140, 134],
  [139, 139, 134],
  [140, 140, 134],
  [139, 138, 134],
  [137, 136, 132],
  [137, 136, 132],
  [137, 137, 133],
  [136, 137, 131],
  [136, 137, 130],
  [137, 137, 130],
  [137, 137, 130],
  [137, 137, 130],
  [136, 136, 129],
  [136, 135, 129],
  [134, 133, 127],
  [135, 134, 128],
  [133, 133, 127],
  [131, 131, 125],
  [130, 131, 126],
  [130, 131, 126],
  [130, 129, 125],
  [129, 128, 124],
  [127, 127, 121],
  [128, 128, 121],
  [129, 129, 122],
  [130, 129, 122],
  [129, 127, 121],
  [128, 127, 120],
  [125, 125, 119],
  [125, 125, 119],
  [124, 124, 118],
  [125, 124, 117],
  [124, 124, 117],
  [123, 123, 115],
  [121, 121, 112],
  [121, 121, 111],
  [121, 122, 113],
  [120, 121, 112],
  [120, 121, 112],
  [119, 120, 110],
  [118, 120, 108],
  [118, 120, 108],
  [118, 119, 108],
  [120, 120, 109],
  [118, 118, 106],
  [116, 116, 106],
  [115, 116, 107],
  [116, 117,

In [28]:
train_dataset[0]['label'] 

# labels.names[train_dataset[0]['label']]

'B'

## 2) Rede com ViT

### 2.1) Modelo vit-base-patch16-224 (https://huggingface.co/google/vit-base-patch16-224) 

In [19]:
model_name_or_path = "google/vit-base-patch16-224"

feature_extractor = ViTImageProcessor .from_pretrained(
    model_name_or_path
)

In [20]:
feature_extractor

ViTImageProcessor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [11]:
example = feature_extractor(
    train_dataset['image'][0],
    return_tensors="pt"
)

example

In [None]:
print("Shape da Imagem original: ", train_dataset['image'][0].shape, "\n")

print("Shape da Imagem com resize do pytorch: ", example['pixel_values'].shape)	 

Shape da Imagem original:  (64, 64, 3) 

Shape da Imagem com resize do pytorch:  torch.Size([1, 3, 224, 224])


In [None]:
print(torch.cuda.is_available())

False


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device 

device(type='cpu')

## 3) Fine-tuning do modelo

In [None]:
def preprocess(batch): 
    inputs = feature_extractor(
        batch['image'], 
        return_tensors="pt"
    )

    inputs['label'] = batch['label']

    return inputs 

### 3.1) Adquirindo batchs de dados

In [None]:
prepared_train = train_dataset.with_transform(preprocess)
prepared_test = test_dataset.with_transform(preprocess)

In [None]:
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['label'] for x in batch])
    }

In [None]:
metric = load_metric("accuracy")
def compute_metrics(p):
    return metric.compute(
        predictions=p.predictions.argmax(axis=1),
        references=p.label_ids
    )

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


### 3.2) Argumentos do treinamento

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    num_train_epochs=4,
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    load_best_model_at_end=True,
    # logging_dir="./logs",
    # do_train=True,
    # do_eval=True,
    # metric_for_best_model="accuracy"
)

### 3.3) Adquirindo modelo pré-treinado

In [None]:
labels = classes

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    ignore_mismatched_sizes=True
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([21]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([21, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=7

### 3.4) Preparando o modelo pré-treinado para um novo treinamento

In [None]:
trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=collate_fn,
        compute_metrics=compute_metrics,
        train_dataset=prepared_train,
        eval_dataset=prepared_test,
        tokenizer=feature_extractor
    )

### 3.5) Novo Treinamento do modelo pré-treinado

In [None]:
train_results = trainer.train()

trainer.save_model()
trainer.log_metrics("train.json", train_results.metrics)
trainer.save_metrics("train.json", train_results.metrics)

trainer.save_state()

  0%|          | 0/8680 [00:00<?, ?it/s]

ValueError: Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray, but got <class 'list'>.

### 3.6) Testando modelo 

In [None]:
metrics = trainer.evaluate(prepared_test)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

### 3.7) Visualizando predição com uma imagem especifica do conjunto de teste

In [None]:
image = test_dataset['image'][0].resize((224, 224))
image

NameError: name 'test_dataset' is not defined

In [None]:
actual_label = test_dataset['label'][0]
labels = test_dataset.features['label']

actual_label, labels.names[actual_label]

In [None]:
trainer.predict(image)

# with torch.no_grad():
#     logits = model(**inputs).logits