## Fine tune pretrained model

Initialise the model and feature extractor

In [38]:
from transformers import ViTImageProcessor
from transformers import ViTConfig

model_name_or_path = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTImageProcessor.from_pretrained(model_name_or_path, proxies={'https': 'proxy-ir.intel.com:912'})

vit_conf = ViTConfig.from_pretrained(model_name_or_path, proxies={'https': 'proxy-ir.intel.com:912'})
print(feature_extractor)
vit_conf.attention_probs_dropout_prob = 0.1
vit_conf.hidden_dropout_prob = 0.1
vit_conf.image_size = 2240
# vit_conf.patch_size = 160
# vit_conf.encoder_stride = 160

ViTImageProcessor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}



In [4]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import os
import numpy as np


In [None]:
# from transformers import ViTForImageClassification


# # stat_df = pd.read_csv("data/TRAIN_images_metadata.csv")

# # stat_df = stat_df.sample(20, random_state=8, ignore_index=True)

# def process_image(image_file):
#     img_pil = Image.open(os.path.join("I:/TRAIN_IMAGES/", image_file)).convert("RGB")
#     inp_img_enc = feature_extractor(img_pil, return_tensors='pt')
#     return inp_img_enc['pixel_values']

# # stat_df['pixel_values'] = stat_df['image_name'].map(process_image)


In [5]:
from src.util import get_label_map

label_col = 'activity_type'

labels_lst = get_label_map()[label_col]
# stat_df[label_col] = stat_df[label_col].map(lambda el:labels_lst[el])

### From Torch ImageFolder 

In [13]:
# # import evaluate
# import numpy as np
# import torch
# from src.util import get_data_set

# # train_data = stat_df[['pixel_values', label_col]].loc[:14].to_dict(orient='records')
# # valid_data = stat_df[['pixel_values', label_col]].loc[15:].to_dict(orient='records')

# train_data = get_data_set(os.path.join("TRAIN_IMAGES/", label_col), sample_type="train", transform=feature_extractor)
# valid_data = get_data_set(os.path.join("TRAIN_IMAGES/", label_col), sample_type="validation", transform=feature_extractor)

def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        # 'labels': torch.tensor([x[label_col] for x in batch])
        'labels': torch.tensor([x['labels'] for x in batch])
    }



In [6]:
from datasets import load_dataset

data_dir = os.path.join("TRAIN_IMAGES_50/", label_col)
print(data_dir)
dataset = load_dataset("imagefolder", data_dir=data_dir, drop_labels=False)
print(dataset["train"].features)
print(labels_lst)
# dataset = load_dataset("imagefolder", data_dir="I:/TRAIN_IMAGES/", split="train")
# dataset = dataset.train_test_split(test_size=0.1)


TRAIN_IMAGES_50/activity_type


Resolving data files:   0%|          | 0/334 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/275 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/620 [00:00<?, ?it/s]

Found cached dataset imagefolder (C:/Users/bhegde/.cache/huggingface/datasets/imagefolder/default-649c247ffd244c21/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)


  0%|          | 0/3 [00:00<?, ?it/s]

{'image': Image(decode=True, id=None), 'label': ClassLabel(names=['Billboard-Transit', 'Collateral', 'IndustryPartner.com', 'Magazine-Newspaper', 'Member.com', 'Misc', 'NonPartner.com', 'Online Display'], id=None)}
{'NonPartner.com': 0, 'Member.com': 1, 'Online Display': 2, 'Magazine-Newspaper': 3, 'Billboard-Transit': 4, 'Collateral': 5, 'Misc': 6, 'IndustryPartner.com': 7}


In [None]:
# from src.dataset import AIECVDataSet
# import random
# import matplotlib.pyplot as plt
# train_dataset = AIECVDataSet(csv_file="data/TRAIN_images_metadata.csv", root_dir="/home/jovyan/TRAIN_IMAGES", label_col=label_col, transform=feature_extractor)

# # plt.figure(figsize=(12, 6))
# # for i in range(10):
# #     idx = random.randint(0, len(train_dataset))
# #     image, class_name = train_dataset[idx]
# #     ax=plt.subplot(2,5,i+1) # create an axis
# #     ax.title.set_text(class_name) # create a name of the axis based on the img name
# #     #The final tensor arrays will be of the form (C * H * W), instead of the original (H * W * C), 
# #     # hence use permute to change the order
# #     plt.imshow(image.permute(1, 2, 0)) # show the img
# len(train_dataset)

In [10]:

from torchvision.transforms import RandomResizedCrop, RandomHorizontalFlip, Compose, Normalize, ToTensor, Resize, CenterCrop

normalise = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
feature_size = (
    feature_extractor.size["shortest_edge"]
    if "shortest_edge" in feature_extractor.size
    else (feature_extractor.size["height"], feature_extractor.size["width"])
)
size = (vit_conf.image_size, vit_conf.image_size)

preprocess_train = Compose(
    [
        RandomResizedCrop(size),
        RandomHorizontalFlip(),
        ToTensor(),
        normalise,
    ]
)
preprocess_val = Compose(
    [
        Resize(size),
        CenterCrop(size),
        ToTensor(),
        normalise,
    ]
)

# def transform_image(image_files):
#     inputs = feature_extractor([x.convert("RGB") for x in image_files['image']], return_tensors='pt')
#     inputs['labels'] = image_files['label']
#     return inputs
def train_transform_image(image_files):
    inputs = {}
    inputs['pixel_values'] = [preprocess_train(x.convert("RGB")) for x in image_files['image']]
    inputs['labels'] = image_files['label']
    return inputs

def val_transform_image(image_files):
    inputs = {}
    inputs['pixel_values'] = [preprocess_val(x.convert("RGB")) for x in image_files['image']]
    inputs['labels'] = image_files['label']
    return inputs

train_prep_ds = dataset["train"].with_transform(train_transform_image)
valid_prep_ds = dataset["validation"].with_transform(val_transform_image)

print(train_prep_ds)

Dataset({
    features: ['image', 'label'],
    num_rows: 333
})


In [11]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    # return accuracy_score(y_true = labels, y_pred = predictions)
    return {
            "f1": float(
                f1_score(y_true = labels, y_pred = predictions)
            )
        }

In [18]:
from transformers import ViTForImageClassification

Image.MAX_IMAGE_PIXELS = None

vit_conf .num_labels = len(labels_lst)
vit_conf.id2label = {v : k for k,v in labels_lst.items()}
vit_conf.label2id = labels_lst

model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    proxies={'https': 'proxy-ir.intel.com:912'},
    config= vit_conf,
)


from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./vit-base-AIE-sample",
  per_device_train_batch_size=16,
  evaluation_strategy="steps",
  num_train_epochs=10,
  # fp16=True,
  save_steps=100,
  eval_steps=100,
  logging_steps=10,
  learning_rate=1e-6,
  save_total_limit=2,
  remove_unused_columns=False,
  push_to_hub=False,
  report_to='tensorboard',
  load_best_model_at_end=True,
)


from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    train_dataset=train_prep_ds,
    eval_dataset=valid_prep_ds,
    tokenizer=feature_extractor,
)

RuntimeError: Error(s) in loading state_dict for ViTForImageClassification:
	size mismatch for embeddings.position_embeddings: copying a param with shape torch.Size([1, 197, 768]) from checkpoint, the shape in current model is torch.Size([1, 19601, 768]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

In [None]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

In [None]:
metrics = trainer.evaluate(prep_ds["test"])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

In [None]:
os.getcwd()
# %load_ext tensorboard


In [None]:
from transformers import ViTConfig

model_name_or_path = 'google/vit-base-patch16-224-in21k'
# feature_extractor = ViTImageProcessor.from_pretrained()
vc = ViTConfig.from_pretrained(model_name_or_path, proxies={'https': 'proxy-ir.intel.com:912'})
vc.attention_probs_dropout_prob = 0.1
vc.hidden_dropout_prob = 0.1
vc.image_size = 2240
vc.patch_size = 160
vc.encoder_stride = 160
print(vc)

In [37]:
from transformers import (
    Trainer,
    TrainingArguments,
    LevitFeatureExtractor,
    LevitForImageClassificationWithTeacher,
)

feature_ext = LevitFeatureExtractor.from_pretrained(
    "facebook/levit-128", proxies={"https": "proxy-ir.intel.com:912"}
)
print(feature_ext)

LevitFeatureExtractor {
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "feature_extractor_type": "LevitFeatureExtractor",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "LevitFeatureExtractor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}



