In [1]:
import torch
from datasets import load_dataset
import random
from PIL import ImageDraw, ImageFont, Image
from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer
import numpy as np
from evaluate import load
import pandas as pd
from datasets import Dataset
import os

In [66]:
file_path = "StanfordCars/stanford_cars_with_class_names.xlsx"
train_df = pd.read_excel(file_path, sheet_name='train')

train_folder = "StanfordCars/cars_train/cars_train"

# Get image files
train_image_files = [f for f in os.listdir(train_folder) if f.lower().endswith(('png', 'jpg', 'jpeg', 'bmp', 'tiff'))]

# Prepare dataset components
image_paths = []
images = []
labels1 = []
label_ids1 = []
count = 0
label_to_id = {}  # To dynamically assign new label IDs
id_to_label = {}
next_label_id = 0

for image_file in train_image_files:
    image_path = os.path.join(train_folder, image_file)
    count += 1
    if count > 200:
        break
    try:
        with Image.open(image_path) as img:
            metadata = train_df.loc[train_df['image'] == image_file]
            if not metadata.empty:
                label_name = metadata['ture_class_name'].values[0]
                
                # Dynamically assign a new label ID if label_name is new
                if label_name not in label_to_id:
                    label_to_id[label_name] = next_label_id
                    id_to_label[next_label_id] = label_name
                    next_label_id += 1
                
                
                image_paths.append(image_path)
                images.append(img.copy())  # Use .copy() to avoid closing the image later
                #labels1.append(metadata['ture_class_name'].values[0])
                #label_ids1.append(metadata['class'].values[0])
                labels1.append(label_name)
                label_ids1.append(label_to_id[label_name])
    except ValueError as ve:
        print(f"ValueError encountered with image: {image_file}, skipping it.\n")

# Create a dictionary to match the Hugging Face dataset format
data_dict = {
    "image_file_path": image_paths,
    "image": images,
    "label_name": labels1,
    "labels": label_ids1,
}

# Load into a Hugging Face Dataset
train_dataset = Dataset.from_dict(data_dict)

# Save the dataset locally or explore it
#train_dataset.save_to_disk("stanford_cars_train_dataset")
print(train_dataset[0])

{'image_file_path': 'StanfordCars/cars_train/cars_train/04354.jpg', 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=720x480 at 0x347717740>, 'label_name': 'Jeep Wrangler SUV 2012', 'labels': 0}


In [67]:
test_df = pd.read_excel(file_path, sheet_name='train')

test_folder = "StanfordCars/cars_test/cars_test"

# Get image files
test_image_files = [f for f in os.listdir(test_folder) if f.lower().endswith(('png', 'jpg', 'jpeg', 'bmp', 'tiff'))]

# Prepare dataset components
image_paths = []
images = []
labels2 = []
label_ids2 = []
count = 0



for image_file in test_image_files:
    image_path = os.path.join(train_folder, image_file)
    count += 1
    if count > 200:
        break
    try:
        with Image.open(image_path) as img:
            metadata = train_df.loc[test_df['image'] == image_file]
            if not metadata.empty:
                label_name = metadata['ture_class_name'].values[0]
                
                # Dynamically assign a new label ID if label_name is new
                if label_name not in label_to_id:
                    label_to_id[label_name] = next_label_id
                    id_to_label[next_label_id] = label_name
                    next_label_id += 1
                
                image_paths.append(image_path)
                images.append(img.copy())  # Use .copy() to avoid closing the image later
                labels2.append(label_name)
                label_ids2.append(label_to_id[label_name])
    except ValueError as ve:
        print(f"ValueError encountered with image: {image_file}, skipping it.\n")

# Create a dictionary to match the Hugging Face dataset format
data_dict = {
    "image_file_path": image_paths,
    "image": images,
    "label_name": labels2,
    "labels": label_ids2,
}

# Load into a Hugging Face Dataset
test_dataset = Dataset.from_dict(data_dict)

# Save the dataset locally or explore it
#train_dataset.save_to_disk("stanford_cars_train_dataset")
print(test_dataset[0])

{'image_file_path': 'StanfordCars/cars_train/cars_train/04354.jpg', 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=720x480 at 0x3478476B0>, 'label_name': 'Jeep Wrangler SUV 2012', 'labels': 0}


In [68]:
model_name_or_path = 'google/vit-base-patch16-224-in21k'
processor = ViTImageProcessor.from_pretrained(model_name_or_path)
#processor.size = {'height': 180, 'width': 180}
processor

ViTImageProcessor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [69]:
def transform(example_batch):
    #print(example_batch)
    # Take a list of PIL images and turn them to pixel values
    inputs = processor([x for x in example_batch['image']], return_tensors='pt')

    # Don't forget to include the labels!
    inputs['labels'] = example_batch['labels']
    return inputs

prepared_train_ds = train_dataset.with_transform(transform)
prepared_test_ds = test_dataset.with_transform(transform)
prepared_train_ds[0]

{'pixel_values': tensor([[[ 0.4039,  0.4118,  0.4118,  ...,  1.0000,  1.0000,  1.0000],
          [ 0.4118,  0.4039,  0.4118,  ...,  1.0000,  1.0000,  1.0000],
          [ 0.4118,  0.4039,  0.4118,  ...,  1.0000,  1.0000,  1.0000],
          ...,
          [-0.0824, -0.3412, -0.3490,  ...,  0.2627,  0.3647,  0.3098],
          [-0.1294, -0.1294, -0.1765,  ...,  0.4588,  0.4510,  0.3647],
          [-0.2157, -0.1137,  0.0431,  ...,  0.4667,  0.4980,  0.4824]],
 
         [[ 0.4196,  0.4275,  0.4275,  ...,  1.0000,  1.0000,  1.0000],
          [ 0.4275,  0.4196,  0.4275,  ...,  1.0000,  1.0000,  1.0000],
          [ 0.4275,  0.4196,  0.4275,  ...,  1.0000,  1.0000,  1.0000],
          ...,
          [-0.1059, -0.3647, -0.3569,  ...,  0.2471,  0.3098,  0.2471],
          [-0.1216, -0.1137, -0.1451,  ...,  0.4196,  0.3569,  0.2863],
          [-0.1922, -0.1137,  0.0824,  ...,  0.4118,  0.4196,  0.3882]],
 
         [[ 0.5137,  0.5294,  0.5373,  ...,  0.9843,  0.9843,  0.9843],
          [ 

In [70]:
def collate_fn(batch):
    return {
        'pixel_values': torch.stack([x['pixel_values'] for x in batch]),
        'labels': torch.tensor([x['labels'] for x in batch])
    }

metric = load("accuracy")
def compute_metrics(p):
    return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)

#all_labels = labels1 + labels2
#all_label_ids = label_ids1 + label_ids2
#label_to_id = {label: int(label_id) for label, label_id in zip(all_labels, all_label_ids)}
#id_to_label = {int(label_id): label for label, label_id in zip(all_labels, all_label_ids)}
#print(label_to_id)

labels = list(label_to_id.keys())
#print(labels)
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label=id_to_label,
    label2id=label_to_id
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [71]:
training_args = TrainingArguments(
    output_dir="./vit-model",
    per_device_train_batch_size=16,
    evaluation_strategy="steps",
    num_train_epochs=4,
    fp16=False,
    no_cuda=True,
    save_steps=100,
    eval_steps=100,
    logging_steps=10,
    learning_rate=2e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to='tensorboard',
    load_best_model_at_end=True,
)



In [72]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=prepared_train_ds,
    eval_dataset=prepared_test_ds,
    tokenizer=processor,
    data_collator=collate_fn,
)

  trainer = Trainer(


In [73]:
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

Step,Training Loss,Validation Loss


***** train metrics *****
  epoch                    =        4.0
  total_flos               = 57802267GF
  train_loss               =     4.5595
  train_runtime            = 0:01:49.29
  train_samples_per_second =       7.32
  train_steps_per_second   =      0.476


In [74]:
metrics = trainer.evaluate(prepared_test_ds)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =        4.0
  eval_accuracy           =        0.9
  eval_loss               =      4.096
  eval_runtime            = 0:00:11.32
  eval_samples_per_second =     17.654
  eval_steps_per_second   =      2.207
