In [1]:
!pip install datasets transformers accelerate torch scikit-learn matplotlib



In [2]:
import torch
device='cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
from datasets import load_dataset
dataset = load_dataset('cats_vs_dogs')
dataset

README.md:   0%|          | 0.00/8.16k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/330M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/391M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/23410 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'labels'],
        num_rows: 23410
    })
})

In [4]:
dataset['train'].features

{'image': Image(mode=None, decode=True, id=None),
 'labels': ClassLabel(names=['cat', 'dog'], id=None)}

In [5]:
# Split the training data into train and test (let's say 10% for the test set)
train_test_split = dataset['train'].train_test_split(test_size=0.1)

# Further split the training set to get a validation set (e.g., 10% of the training set)
train_val_split = train_test_split['train'].train_test_split(test_size=0.1)

# Combine the splits into a new DatasetDict
final_dataset = {
    'train': train_val_split['train'],
    'val': train_val_split['test'],  
    'test': train_test_split['test']  
}

final_dataset

{'train': Dataset({
     features: ['image', 'labels'],
     num_rows: 18962
 }),
 'val': Dataset({
     features: ['image', 'labels'],
     num_rows: 2107
 }),
 'test': Dataset({
     features: ['image', 'labels'],
     num_rows: 2341
 })}

In [6]:
train_ds = final_dataset["train"]
val_ds = final_dataset["val"]
test_ds = final_dataset["test"]

In [9]:
train_ds

Dataset({
    features: ['image', 'labels'],
    num_rows: 18962
})

In [12]:
id2label = {id: label for id, label in enumerate(train_ds.features["labels"].names)}
label2id = {label: id for id, label in id2label.items()}
id2label,label2id

({0: 'cat', 1: 'dog'}, {'cat': 0, 'dog': 1})

In [13]:
from transformers import ViTImageProcessor

model_name = "google/vit-large-patch16-224"
processor = ViTImageProcessor.from_pretrained(model_name)
processor

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

ViTImageProcessor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [14]:
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    ToTensor,
    Resize,
)

# Get configurations from ViT processor
image_mean, image_std = processor.image_mean, processor.image_std
size = processor.size["height"]

# Normalizes the image pixels by subtracting the mean and dividing by the std from the pretrained model configurations
normalize = Normalize(mean=image_mean, std=image_std)

# Compose: Combines a series of image transformations into one pipeline.
train_transforms = Compose(
    [
        RandomResizedCrop(size),
        RandomHorizontalFlip(),
        ToTensor(),
        normalize,
    ]
)
val_transforms = Compose(
    [
        Resize(size),
        CenterCrop(size),
        ToTensor(),
        normalize,
    ]
)
test_transforms = Compose(
    [
        Resize(size),
        CenterCrop(size),
        ToTensor(),
        normalize,
    ]
)

In [15]:
def apply_train_transforms(examples):
    examples["pixel_values"] = [train_transforms(image.convert("RGB")) for image in examples["image"]]
    return examples


def apply_val_transforms(examples):
    examples["pixel_values"] = [val_transforms(image.convert("RGB")) for image in examples["image"]]
    return examples


def apply_test_transforms(examples):
    examples["pixel_values"] = [val_transforms(image.convert("RGB")) for image in examples["image"]]
    return examples

train_ds.set_transform(apply_train_transforms)
val_ds.set_transform(apply_val_transforms)
test_ds.set_transform(apply_test_transforms)

In [17]:
from transformers import ViTForImageClassification

labels  = dataset['train'].features['labels'].names

model = ViTForImageClassification.from_pretrained(
    model_name, 
    num_labels = len(labels),
    id2label=id2label, 
    label2id=label2id, 
    ignore_mismatched_sizes=True
)

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-large-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from transformers import TrainingArguments, Trainer
import numpy as np

train_args = TrainingArguments(
    output_dir="./output_models",
  per_device_train_batch_size=16,
  evaluation_strategy="epoch",
    save_strategy="epoch",
  num_train_epochs=3,
  fp16=True,
  logging_steps=10,
  learning_rate=2e-4,
  save_total_limit=2,
  remove_unused_columns=False,
  push_to_hub=False,
  load_best_model_at_end=True
)

In [24]:
import torch
from torch.utils.data import DataLoader


def collate_fn(examples):
    # Stacks the pixel values of all examples into a single tensor and collects labels into a tensor
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    labels = torch.tensor([example["labels"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}

# Create a DataLoader for the training dataset, with custom collation and a batch size of 4
train_dl = DataLoader(train_ds, collate_fn=collate_fn, batch_size=4)

In [25]:
trainer = Trainer(
    model,
    train_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collate_fn,
    tokenizer=processor,
)

  trainer = Trainer(


In [26]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,0.1905,0.112126


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


KeyboardInterrupt: 

In [38]:
model = ViTForImageClassification.from_pretrained("/kaggle/working/output_models/checkpoint-593")
from transformers import pipeline, ViTForImageClassification, AutoImageProcessor

# Load the model
model = ViTForImageClassification.from_pretrained("/kaggle/working/output_models/checkpoint-593")

# Load the image processor
image_processor = AutoImageProcessor.from_pretrained("/kaggle/working/output_models/checkpoint-593")
pipe = pipeline('image-classification', model=model, image_processor=image_processor)



Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [42]:
res=pipe('/kaggle/input/bbbbnnnn/OIP.jpg')
res

[{'label': 'cat', 'score': 0.9905894994735718},
 {'label': 'dog', 'score': 0.011865032836794853}]

In [44]:
dic={}
dic[res[0]['label']]=res[0]['score']
dic[res[1]['label']]=res[1]['score']
dic

{'cat': 0.9905894994735718, 'dog': 0.011865032836794853}