# File Structure

```text
patchcamelyon_subset/
├── tumor/
│   ├── img00001.png
│   ├── img00002.png
│   └── ...
└── normal/
    ├── img00001.png
    ├── img00002.png
    └── ...
```

## Imports

In [None]:

# For dataset setup
from datasets import load_dataset
from typing import Any

# For loading the model
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig

# For fine-tuning
from peft import LoraConfig


## Set Up

In [None]:

# ---------------------- Set Up ---------------------- #

# Number of examples assigned to each split.
train_size = 9000
validation_size = 1000

# "./patchcamelyon_subset" is a local folder holding a subset of the
# PatchCamelyon dataset (first 10K images), organized into "normal" and
# "tumor" sub-folders.
full_dataset = load_dataset("./patchcamelyon_subset", split="train")

# Shuffle with a fixed seed so the split is reproducible across runs.
split_options = {
    "train_size": train_size,
    "test_size": validation_size,
    "shuffle": True,
    "seed": 42,
}
data = full_dataset.train_test_split(**split_options)

# The held-out split comes back under the key 'test'; store it as 'validation'.
data["validation"] = data.pop("test")

# ------------ Optional: display dataset details ------------ #
# Overview of the splits and their row counts.
print(data)

# Fetch the first training example once; it is a dict with 'image' and
# 'label' keys.
first_example = data['train'][0]
print(f"data['train'][0]: {first_example}")

# Pull out the example's image and its label.
image, label = first_example['image'], first_example['label']

# Write the image to disk so it can be inspected outside the notebook.
image.save("sample_image.png")
print("Image saved to sample_image.png")

print(label)
# Definition of the 'label' feature (shows the class names).
print(data['train'].features['label'])
# ----------------------------------------------------------- #

```text
DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 9000
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 1000
    })
})
data['train'][0]: {'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=96x96 at 0x7FB494250CA0>, 'label': 0}
Image saved to sample_image.png
0
ClassLabel(names=['normal', 'tumor'], id=None)
```