# Vision Transformers

source : https://www.philschmid.de/image-classification-huggingface-transformers-keras

In [84]:
import os
import random
import datasets
from transformers import DefaultDataCollator,ViTFeatureExtractor
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.metrics import accuracy_score

In [85]:
def create_image_folder_dataset(root_path, subset_size=None):
    """creates `Dataset` from image folder structure"""

    # get class names by folders names
    _CLASS_NAMES = [folder for folder in os.listdir(root_path) if not folder.startswith('.')]
    # defines `datasets` features`
    features = datasets.Features({
        "img": datasets.Image(),
        "label": datasets.features.ClassLabel(names=_CLASS_NAMES),
    })
    # temp list holding datapoints for creation
    img_data_files = []
    label_data_files = []
    # load images into list for creation
    for img_class in _CLASS_NAMES:
        class_path = os.path.join(root_path, img_class)
        for img in os.listdir(class_path):
            if img.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):  # Check if it's an image file
                path_ = os.path.join(class_path, img)
                img_data_files.append(path_)
                label_data_files.append(img_class)
    # Shuffle the data
    combined = list(zip(img_data_files, label_data_files))
    random.shuffle(combined)
    img_data_files[:], label_data_files[:] = zip(*combined)
    # Select a subset if specified
    if subset_size is not None:
        img_data_files = img_data_files[:subset_size]
        label_data_files = label_data_files[:subset_size]
    # create dataset
    ds = datasets.Dataset.from_dict({"img": img_data_files, "label": label_data_files}, features=features)
    return ds

In [86]:
train_folder = "Group_6_water_smooth/subsample_train"
valid_folder = "Group_6_water_smooth/valid"
test_folder = "Group_6_water_smooth/test"

train_dataset = create_image_folder_dataset(train_folder)
val_dataset = create_image_folder_dataset(valid_folder)
test_dataset = create_image_folder_dataset(test_folder)

In [87]:
train_dataset

Dataset({
    features: ['img', 'label'],
    num_rows: 6042
})

In [88]:
len(train_dataset),len(val_dataset),len(test_dataset)

(6042, 2460, 2977)

In [89]:
img_class_labels = train_dataset.features["label"].names
img_class_labels

['water_concrete_smooth',
 'water_gravel',
 'water_gravel 2',
 'water_concrete_smooth 2',
 'water_asphalt_smooth']

In [90]:
model_id = "google/vit-base-patch16-224-in21k"

feature_extractor = ViTFeatureExtractor.from_pretrained(model_id)

data_augmentation = keras.Sequential(
    [
        layers.Resizing(feature_extractor.size, feature_extractor.size),
        layers.Rescaling(1./255)
    ]
)
# use keras image data augementation processing
def augmentation(examples):
    # print(examples["img"])
    examples["pixel_values"] = [data_augmentation(image) for image in examples["img"]]
    return examples


# basic processing (only resizing)
def process(examples):
    examples.update(feature_extractor(examples['img'], ))
    return examples

# we are also renaming our label col to labels to use `.to_tf_dataset` later
train_dataset = train_dataset.rename_column("label", "labels")
val_dataset = val_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

In [91]:
train_dataset = train_dataset.map(process, batched=True)

Map:   0%|          | 0/6042 [00:00<?, ? examples/s]

In [92]:
val_dataset = val_dataset.map(process, batched=True)
test_dataset = test_dataset.map(process, batched=True)

Map:   0%|          | 0/2460 [00:00<?, ? examples/s]

Map:   0%|          | 0/2977 [00:00<?, ? examples/s]

In [93]:
id2label = {str(i): label for i, label in enumerate(img_class_labels)}
label2id = {v: k for k, v in id2label.items()}
id2label,label2id

({'0': 'water_concrete_smooth',
  '1': 'water_gravel',
  '2': 'water_gravel 2',
  '3': 'water_concrete_smooth 2',
  '4': 'water_asphalt_smooth'},
 {'water_concrete_smooth': '0',
  'water_gravel': '1',
  'water_gravel 2': '2',
  'water_concrete_smooth 2': '3',
  'water_asphalt_smooth': '4'})

In [94]:
# Hyperparameters

num_train_epochs = 1
train_batch_size = 32
eval_batch_size = 32
learning_rate = 3e-5
weight_decay_rate=0.01
num_warmup_steps=0


In [95]:
# Data collator that will dynamically pad the inputs received, as well as the labels.
data_collator = DefaultDataCollator(return_tensors="tf")

# converting our train dataset to tf.data.Dataset
tf_train_dataset = train_dataset.to_tf_dataset(
   columns=['pixel_values'],
   label_cols=["labels"],
   shuffle=True,
   batch_size=train_batch_size,
   collate_fn=data_collator)

# converting our test dataset to tf.data.Dataset
tf_eval_dataset = val_dataset.to_tf_dataset(
   columns=['pixel_values'],
   label_cols=["labels"],
   shuffle=True,
   batch_size=eval_batch_size,
   collate_fn=data_collator)

tf_test_dataset = test_dataset.to_tf_dataset(
   columns=['pixel_values'],
   label_cols=["labels"],
   shuffle=True,
   batch_size=eval_batch_size,
   collate_fn=data_collator)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [96]:
from transformers import TFViTForImageClassification, create_optimizer
import tensorflow as tf

# create optimizer wight weigh decay
num_train_steps = len(tf_train_dataset) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=learning_rate,
    num_train_steps=num_train_steps,
    weight_decay_rate=weight_decay_rate,
    num_warmup_steps=num_warmup_steps,
)

# load pre-trained ViT model
model = TFViTForImageClassification.from_pretrained(
    model_id,
    num_labels=len(img_class_labels),
    id2label=id2label,
    label2id=label2id,
)

# define loss
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# define metrics
metrics=[
    tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
    tf.keras.metrics.SparseTopKCategoricalAccuracy(3, name="top-3-accuracy"),
]

# compile model
model.compile(optimizer=optimizer,
              loss=loss,
              metrics=metrics
              )

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing TFViTForImageClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFViTForImageClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFViTForImageClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [97]:
history = model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_train_epochs,
)

2024-03-21 13:04:27.539412: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-03-21 14:08:50.885508: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




In [98]:
predictions = model.predict(tf_test_dataset)

2024-03-21 14:11:12.196551: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




IOStream.flush timed out




In [99]:
import numpy as np

y_test = list(map(lambda x: x[1], tf_test_dataset))
true_labels = np.concatenate([y.numpy() for y in y_test])


In [100]:
predictions.logits.argmax(axis=1).shape

(2977,)

In [101]:
true_labels.shape

(2977,)

In [102]:
test_accuracy = accuracy_score(true_labels, predictions.logits.argmax(axis=1))
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.5677


In [103]:
model.save("model/vit1", save_format="tf")

INFO:tensorflow:Assets written to: model/vit1/assets


INFO:tensorflow:Assets written to: model/vit1/assets
