## Fine tune pretrained model

Initialise the model and feature extractor

In [2]:
from transformers import ViTImageProcessor

model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTImageProcessor.from_pretrained(model_name_or_path, proxies={'https': 'proxy-ir.intel.com:912'})

In [3]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import tensorflow as tf
import pandas as pd
import os
import numpy as np

In [4]:
from transformers import ViTForImageClassification


# stat_df = pd.read_csv("data/TRAIN_images_metadata.csv")

# stat_df = stat_df.sample(20, random_state=8, ignore_index=True)

def process_image(image_file):
    img_pil = Image.open(os.path.join("I:/TRAIN_IMAGES/", image_file)).convert("RGB")
    inp_img_enc = feature_extractor(img_pil, return_tensors='pt')
    return inp_img_enc['pixel_values']

# stat_df['pixel_values'] = stat_df['image_name'].map(process_image)


In [4]:
from src.util import get_label_map

label_col = 'POA_attribution'

labels_lst = get_label_map()[label_col]
# stat_df[label_col] = stat_df[label_col].map(lambda el:labels_lst[el])

### From Torch ImageFolder 

In [51]:
import evaluate
import numpy as np
import torch
from src.util import get_data_set

# train_data = stat_df[['pixel_values', label_col]].loc[:14].to_dict(orient='records')
# valid_data = stat_df[['pixel_values', label_col]].loc[15:].to_dict(orient='records')

train_data = get_data_set(os.path.join("TRAIN_IMAGES/", label_col), sample_type="train", transform=feature_extractor)
valid_data = get_data_set(os.path.join("TRAIN_IMAGES/", label_col), sample_type="validation", transform=feature_extractor)

def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        # 'labels': torch.tensor([x[label_col] for x in batch])
        'labels': torch.tensor([x['labels'] for x in batch])
    }



In [1]:
from datasets import load_dataset

dataset = load_dataset("imagefolder", data_dir=os.path.join("TRAIN_IMAGES/", label_col), drop_labels=False)

# dataset = load_dataset("imagefolder", data_dir="I:/TRAIN_IMAGES/", split="train")
# dataset[0]


Failed!


In [9]:
from src.dataset import AIECVDataSet
import random
import matplotlib.pyplot as plt
train_dataset = AIECVDataSet(csv_file="data/TRAIN_images_metadata.csv", root_dir="I:\Images", label_col=label_col, transform=feature_extractor)

# plt.figure(figsize=(12, 6))
# for i in range(10):
#     idx = random.randint(0, len(train_dataset))
#     image, class_name = train_dataset[idx]
#     ax=plt.subplot(2,5,i+1) # create an axis
#     ax.title.set_text(class_name) # create a name of the axis based on the img name
#     #The final tensor arrays will be of the form (C * H * W), instead of the original (H * W * C), 
#     # hence use permute to change the order
#     plt.imshow(image.permute(1, 2, 0)) # show the img
len(train_dataset)

100029

In [13]:
print(train_dataset)

<src.dataset.AIECVDataSet object at 0x0000026382A8B280>


In [48]:

def transform_image(image_files):
    inputs = feature_extractor([x.convert("RGB") for x in image_files['image']], return_tensors='pt')
    inputs['labels'] = image_files['label']
    return inputs

prep_ds = dataset.with_transform(transform_image)

In [None]:
dataset["train"][0]

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=2351x15950>,
 'label': 0}

In [22]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    # return accuracy_score(y_true = labels, y_pred = predictions)
    return {
            "f1": float(
                f1_score(y_true = labels, y_pred = predictions)
            )
        }

In [54]:
from transformers import ViTForImageClassification

Image.MAX_IMAGE_PIXELS = None

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels_lst),
    id2label={v : k for k,v in labels_lst.items()},
    label2id = labels_lst,
    proxies={'https': 'proxy-ir.intel.com:912'}
)


from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./vit-base-AIE-sample",
  per_device_train_batch_size=16,
  evaluation_strategy="steps",
  num_train_epochs=10,
  # fp16=True,
  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=1e-6,
  save_total_limit=2,
  remove_unused_columns=False,
  push_to_hub=False,
  report_to='tensorboard',
  load_best_model_at_end=True,
)


from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=prep_ds["train"],
    eval_dataset=prep_ds["validation"],
    tokenizer=feature_extractor,
)

loading configuration file config.json from cache at C:\Users\bhegde/.cache\huggingface\hub\models--google--vit-base-patch16-224-in21k\snapshots\7cbdb7ee3a6bcdf99dae654893f66519c480a0f8\config.json
Model config ViTConfig {
  "_name_or_path": "google/vit-base-patch16-224-in21k",
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "No",
    "1": "Yes"
  },
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "No": 0,
    "Yes": 1
  },
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.26.1"
}

loading weights file pytorch_model.bin from cache at C:\Users\bhegde/.cache\huggingface\hub\models--google--vit-base-patch16-224-in21k\snapshots\7cbdb7ee3a6bcd

In [55]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

***** Running training *****
  Num examples = 88
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 60
  Number of trainable parameters = 85800194


  0%|          | 0/60 [00:00<?, ?it/s]



{'loss': 0.6958, 'learning_rate': 8.333333333333333e-07, 'epoch': 1.67}
{'loss': 0.6907, 'learning_rate': 6.666666666666666e-07, 'epoch': 3.33}
{'loss': 0.679, 'learning_rate': 5e-07, 'epoch': 5.0}
{'loss': 0.6753, 'learning_rate': 3.333333333333333e-07, 'epoch': 6.67}
{'loss': 0.673, 'learning_rate': 1.6666666666666665e-07, 'epoch': 8.33}




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./vit-base-AIE-sample
Configuration saved in ./vit-base-AIE-sample\config.json


{'loss': 0.671, 'learning_rate': 0.0, 'epoch': 10.0}
{'train_runtime': 1200.9146, 'train_samples_per_second': 0.733, 'train_steps_per_second': 0.05, 'train_loss': 0.6807907978693645, 'epoch': 10.0}


Model weights saved in ./vit-base-AIE-sample\pytorch_model.bin
Image processor saved in ./vit-base-AIE-sample\preprocessor_config.json


***** train metrics *****
  epoch                    =       10.0
  train_loss               =     0.6808
  train_runtime            = 0:20:00.91
  train_samples_per_second =      0.733
  train_steps_per_second   =       0.05


In [58]:
metrics = trainer.evaluate(prep_ds["test"])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** Running Evaluation *****
  Num examples = 56
  Batch size = 8


  0%|          | 0/7 [00:00<?, ?it/s]

***** eval metrics *****
  epoch                   =       10.0
  eval_f1                 =     0.0769
  eval_loss               =     0.7544
  eval_runtime            = 0:00:32.71
  eval_samples_per_second =      1.712
  eval_steps_per_second   =      0.214
