In [55]:
from datasets import load_dataset, Dataset
import os
from collections import defaultdict
import json 
from PIL import Image, ImageDraw
from functools import partial
import numpy as np
from transformers.image_transforms import center_to_corners_format
import torch

## Create dataset

In [56]:
# Root directory of the tiled drone imagery. The annotation file and the image
# folder are derived from it so the location only has to be edited in one place.
data_dir = "/home/taheera.ahmed/data/reindeerdrone/tiles"
annotation_file = f"{data_dir}/new_annotations.json"
img_dir = f"{data_dir}/images"

In [73]:

# Load only the `annotations` field of the JSON annotation file as a Hugging Face
# dataset (one record per entry of that field); the `train` split is used below.
datasets = load_dataset('json', data_files=annotation_file, field='annotations')

def process_image_id(image_id):
    """Turn an image file name into an integer id.

    The fragments "DSC", "_tile" and ".png" are removed (in that order) and the
    remaining text is parsed with `int`, e.g. "DSC05473_tile5.png" -> 54735.

    Args:
        image_id (str): file name of the image tile.

    Returns:
        int: the digits left over after stripping the fragments.

    Raises:
        ValueError: if the remaining text is not a valid integer literal.
    """
    # NOTE(review): the photo number and the tile number end up concatenated,
    # so different names could map to the same integer — confirm uniqueness.
    numeric_part = image_id
    for fragment in ("DSC", "_tile", ".png"):
        numeric_part = numeric_part.replace(fragment, "")
    return int(numeric_part)

def combine_bboxes(examples):
    """Group per-annotation records into one record per image.

    Each image file is opened exactly once (on the first annotation that refers
    to it) instead of once per annotation, which avoids piling up redundant
    open file handles for images that carry many boxes.

    Args:
        examples: iterable of dicts with the keys 'image_id' (file name relative
            to the module-level `img_dir`), 'bbox', 'area' and 'category_id'.

    Returns:
        list[dict]: one dict per distinct image, in order of first appearance,
        with the keys "image_id" (integer from `process_image_id`), "image"
        (the opened PIL image), "width", "height" and "objects". "objects"
        holds the parallel lists 'id', 'area', 'category' and 'bbox', where
        'id' is the position of the annotation in `examples`.
    """
    combined = {}

    for count, example in enumerate(examples):
        image_id = example['image_id']
        bbox = example['bbox']
        area = example['area']
        category_id = example['category_id']

        entry = combined.get(image_id)
        if entry is None:
            # First annotation for this image: open it once and record its size.
            image = Image.open(os.path.join(img_dir, image_id))
            width, height = image.size
            entry = {
                "image_id": image_id,
                "image": image,
                "width": width,
                "height": height,
                "objects": {
                    'id': [],
                    'area': [],
                    'category': [],
                    'bbox': []
                }
            }
            combined[image_id] = entry

        # Append this annotation to the image's parallel object lists.
        objects = entry["objects"]
        objects["bbox"].append(bbox)
        objects["category"].append(category_id)
        objects["area"].append(area)
        objects["id"].append(count)

    # Flatten into a list of records, converting the file name to an integer id.
    return [
        {
            "image_id": process_image_id(entry["image_id"]),
            "image": entry["image"],
            "width": entry["width"],
            "height": entry["height"],
            "objects": entry["objects"]
        }
        for entry in combined.values()
    ]

# Group the per-annotation records by image, then build a dataset with one row
# per image from the resulting list of dicts.
combined_annotations = combine_bboxes(datasets['train'])
dataset = Dataset.from_list(combined_annotations)
dataset

Dataset({
    features: ['image_id', 'image', 'width', 'height', 'objects'],
    num_rows: 251
})

In [58]:
# Hold out 15% of the rows for validation; the seed is fixed for the split.
split = dataset.train_test_split(test_size=0.15, seed=1337)

# NOTE: `dataset` is rebound here from a Dataset to a plain dict of splits, so
# this cell cannot be re-run on its own afterwards (a dict has no
# `train_test_split`); re-run the cell that builds `dataset` first.
dataset = {
    'train': split['train'],
    'validation': split['test']
}

print("Training Set:")
print(dataset['train'])
print("Validation Set:")
print(dataset['validation'])

# Show one training record to inspect its fields.
dataset["train"][2]


Training Set:
Dataset({
    features: ['image_id', 'image', 'width', 'height', 'objects'],
    num_rows: 213
})
Validation Set:
Dataset({
    features: ['image_id', 'image', 'width', 'height', 'objects'],
    num_rows: 38
})


{'image_id': 54735,
 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1024x1024>,
 'width': 1024,
 'height': 1024,
 'objects': {'area': [12957.285532965,
   2582.2501702644,
   10277.7077077713,
   6468.324913521],
  'bbox': [[7.5581192042000005, 908.4531502426, 143.1929283019, 90.4883061379],
   [7.1121505028, 947.2585858125, 59.9725403355, 43.057208446],
   [615.6783560146, 946.6784266368, 132.9216059728, 77.3215733632],
   [676.1825675824, 955.737002753, 94.7559464773, 68.262997247]],
  'category': [0, 1, 0, 1],
  'id': [410, 411, 412, 413]}}

## Sanity check: do the bounding boxes line up with the image?

In [59]:
# read json file and get categories 
with open(annotation_file, 'r') as file:
    annotations = json.load(file)
    
categories = annotations['categories']
categories = [{"id": category['id'], "name": category['name']} for category in categories]
categories

[{'id': 0, 'name': 'Adult'}, {'id': 1, 'name': 'Calf'}]

In [60]:
# Draw the boxes of one training sample on its image as a visual check.
# NOTE: `annotations` is rebound here to the sample's box list, replacing the
# parsed JSON that was stored under the same name earlier.
sample = dataset["train"][2]  # fetch the record once instead of once per field
annotations = sample["objects"]["bbox"]
image = sample["image"]
draw = ImageDraw.Draw(image)

for box in annotations:
    # Each box is unpacked as (x_min, y_min, width, height); the rectangle call
    # needs the two opposite corners instead.
    xmin, ymin, width, height = box
    xmax = xmin + width
    ymax = ymin + height
    draw.rectangle([xmin, ymin, xmax, ymax], outline="red", width=3)

# Render the image inline as the cell output. `image.show()` would instead hand
# it to an external viewer program, which may not exist on a remote machine.
image

## Pre-process data

In [61]:
from transformers import AutoImageProcessor

# Side length (pixels) used for both the resize limit and the padded canvas.
MAX_SIZE = 1024
# Hub checkpoint that supplies the image processor configuration.
MODEL_NAME = "facebook/detr-resnet-50"

# Image processor with resizing and padding enabled, both bounded by MAX_SIZE.
# NOTE(review): `max_height`/`max_width` is presumably an aspect-preserving
# limit and `pad_size` the final canvas — confirm against the transformers docs.
image_processor = AutoImageProcessor.from_pretrained(
    MODEL_NAME,
    do_resize=True,
    size={"max_height": MAX_SIZE, "max_width": MAX_SIZE},
    do_pad=True,
    pad_size={"height": MAX_SIZE, "width": MAX_SIZE},
)

In [62]:
import albumentations as A

# Training pipeline: only the horizontal flip is active; the other augmentations
# are currently disabled. Boxes are declared in "coco" format and the "category"
# list is registered as a label field of the boxes.
# NOTE(review): `clip=True` / `min_area=25` presumably clip boxes to the image and
# drop boxes below 25 px^2 — confirm against the Albumentations BboxParams docs.
train_augment_and_transform = A.Compose(
    [
        #A.Perspective(p=0.1),
        A.HorizontalFlip(p=0.5),
        #A.RandomBrightnessContrast(p=0.5),
        #A.HueSaturationValue(p=0.1),
    ],
    bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25),
)

# Validation pipeline: no image change (NoOp), same box format, no `min_area`.
validation_transform = A.Compose(
    [A.NoOp()],
    bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True),
)

def format_image_annotations_as_coco(image_id, categories, areas, bboxes):
    """Pack the boxes of a single image into a COCO-style annotation dict.

    Args:
        image_id: identifier of the image; copied unchanged into every annotation
        categories (List[int]): class label of each bounding box
        areas (List[float]): area of each bounding box
        bboxes (List[Tuple[float]]): bounding boxes in COCO format
            ([x_min, y_min, width, height] in absolute coordinates)

    The three lists are walked in parallel with `zip`, so iteration stops at
    the shortest one.

    Returns:
        dict: {
            "image_id": image id,
            "annotations": list with one dict per box (keys "image_id",
                "category_id", "iscrowd" (always 0), "area", "bbox")
        }
    """
    per_box_records = [
        {
            "image_id": image_id,
            "category_id": label,
            "iscrowd": 0,
            "area": box_area,
            "bbox": list(box),
        }
        for label, box_area, box in zip(categories, areas, bboxes)
    ]
    return {"image_id": image_id, "annotations": per_box_records}

def augment_and_transform_batch(examples, transform, image_processor, return_pixel_mask=False):
    """Apply augmentations and format annotations in COCO format for object detection task

    Args:
        examples: batch dict with the parallel lists "image_id", "image" (PIL
            images) and "objects" (dicts with at least "bbox" and "category").
        transform: callable invoked as
            `transform(image=..., bboxes=..., category=...)` that returns a dict
            with the keys "image", "bboxes" and "category".
        image_processor: callable invoked with `images`, `annotations` and
            `return_tensors="pt"`; its result is returned.
        return_pixel_mask (bool): keep the "pixel_mask" entry of the result when
            True; it is removed otherwise.

    Returns:
        The image processor output for the augmented batch.
    """

    images = []
    annotations = []
    for image_id, image, objects in zip(examples["image_id"], examples["image"], examples["objects"]):
        image = np.array(image.convert("RGB"))

        # apply augmentations
        output = transform(image=image, bboxes=objects["bbox"], category=objects["category"])
        images.append(output["image"])

        # The transform may drop or shrink boxes, and it only keeps "bboxes" and
        # "category" in sync. Reusing the original `objects["area"]` list would
        # pair surviving boxes with the areas of other boxes, so the areas are
        # recomputed as width * height of the boxes that actually came back.
        areas = [float(box[2]) * float(box[3]) for box in output["bboxes"]]

        # format annotations in COCO format
        formatted_annotations = format_image_annotations_as_coco(
            image_id, output["category"], areas, output["bboxes"]
        )
        annotations.append(formatted_annotations)

    # Apply the image processor transformations: resizing, rescaling, normalization
    result = image_processor(images=images, annotations=annotations, return_tensors="pt")

    if not return_pixel_mask:
        result.pop("pixel_mask", None)

    return result

In [63]:

# Bind the augmentation pipeline and the image processor so that the resulting
# callables only take the batch of examples.
train_transform_batch = partial(
    augment_and_transform_batch, transform=train_augment_and_transform, image_processor=image_processor
)
validation_transform_batch = partial(
    augment_and_transform_batch, transform=validation_transform, image_processor=image_processor
)

# Attach the callables to the splits with `with_transform`.
dataset["train"] = dataset["train"].with_transform(train_transform_batch)
dataset["validation"] = dataset["validation"].with_transform(validation_transform_batch)

# Inspect one transformed training record.
dataset["train"][2]

{'pixel_values': tensor([[[-0.3541, -0.3369, -0.4226,  ..., -0.4397, -0.3541, -0.3027],
          [-0.4397, -0.4568, -0.6109,  ..., -0.4568, -0.3541, -0.2856],
          [-0.6281, -0.6965, -0.8335,  ..., -0.5596, -0.5253, -0.4911],
          ...,
          [ 0.5707,  0.6563,  0.9303,  ...,  0.3823,  0.1768,  0.0912],
          [ 0.5536,  0.6392,  0.8789,  ...,  0.3138,  0.1254,  0.1768],
          [ 0.5193,  0.5536,  0.6734,  ...,  0.1597, -0.1828, -0.0458]],
 
         [[-0.2150, -0.1975, -0.2325,  ..., -0.3200, -0.3200, -0.2850],
          [-0.3025, -0.3200, -0.4601,  ..., -0.3375, -0.3025, -0.2325],
          [-0.4776, -0.5476, -0.6877,  ..., -0.4426, -0.4251, -0.3901],
          ...,
          [ 0.5728,  0.6604,  0.9405,  ...,  0.6254,  0.4153,  0.3277],
          [ 0.5728,  0.6604,  0.8880,  ...,  0.5553,  0.3627,  0.4153],
          [ 0.5378,  0.5728,  0.7304,  ...,  0.4328,  0.0476,  0.1877]],
 
         [[-0.5495, -0.5321, -0.6193,  ..., -0.5495, -0.4798, -0.4450],
          [-

In [64]:
def collate_fn(batch):
    """Merge a list of per-sample dicts into one batch dict.

    Args:
        batch: non-empty list of dicts, each with a "pixel_values" tensor and a
            "labels" entry, and optionally a "pixel_mask" tensor.

    Returns:
        dict: "pixel_values" stacked along a new first dimension, "labels" as a
        plain list (one item per sample) and, only if the first sample has it,
        "pixel_mask" stacked the same way as "pixel_values".
    """
    collated = {
        "pixel_values": torch.stack([sample["pixel_values"] for sample in batch]),
        "labels": [sample["labels"] for sample in batch],
    }
    # The first sample decides whether the whole batch carries masks.
    if "pixel_mask" in batch[0]:
        collated["pixel_mask"] = torch.stack([sample["pixel_mask"] for sample in batch])
    return collated

## Preparing the function to compute mAP

In [65]:
def convert_bbox_yolo_to_pascal(boxes, image_size):
    """
    Map relative center-format boxes to absolute corner-format boxes.

    Input boxes are in YOLO format (x_center, y_center, width, height) with
    values in [0, 1]; the result is in Pascal VOC format
    (x_min, y_min, x_max, y_max) in absolute coordinates.

    Args:
        boxes (torch.Tensor): Bounding boxes in YOLO format
        image_size (Tuple[int, int]): Image size in format (height, width)

    Returns:
        torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max)
    """
    # center/size -> corner representation, still in relative units
    corners = center_to_corners_format(boxes)

    # x coordinates scale with the width, y coordinates with the height
    img_height, img_width = image_size
    scale = torch.tensor([[img_width, img_height, img_width, img_height]])
    return corners * scale

In [66]:
from dataclasses import dataclass
from torchmetrics.detection.mean_ap import MeanAveragePrecision


@dataclass
class ModelOutput:
    """Minimal container exposing raw predictions as `logits` and `pred_boxes` attributes."""
    # class scores produced by the model
    logits: torch.Tensor
    # predicted boxes produced by the model
    pred_boxes: torch.Tensor
    
@torch.no_grad()
def compute_average_precision(evaluation_results, image_processor, threshold=0.0, id2label=None):
    """
    Compute Average Precision (AP) for each class in an object detection task.

    Args:
        evaluation_results (EvalPrediction): Predictions and targets from evaluation.
            Both are iterated batch by batch; each target needs the keys
            "orig_size", "boxes" and "class_labels".
        image_processor: Object providing `post_process_object_detection`, used to
            turn raw model outputs into absolute boxes, scores and labels.
        threshold (float, optional): Threshold to filter predicted boxes by confidence. Defaults to 0.0.
        id2label (Optional[dict], optional): Mapping from class id to class name. Defaults to None.

    Returns:
        Mapping[str, float]: {"ap_<class_name>": AP rounded to 4 decimals, ...}
        plus "eval_ap", the unrounded mean of the per-class AP values
        (0.0 when the metric reports no class at all).
    """

    predictions, targets = evaluation_results.predictions, evaluation_results.label_ids

    image_sizes = []
    post_processed_targets = []
    post_processed_predictions = []

    # Collect targets in the required format for metric computation
    for batch in targets:
        batch_image_sizes = torch.tensor(np.array([x["orig_size"] for x in batch]))
        image_sizes.append(batch_image_sizes)
        for image_target in batch:
            boxes = torch.tensor(image_target["boxes"])
            boxes = convert_bbox_yolo_to_pascal(boxes, image_target["orig_size"])
            labels = torch.tensor(image_target["class_labels"])
            post_processed_targets.append({"boxes": boxes, "labels": labels})

    # Collect predictions in the required format for metric computation
    for batch, target_sizes in zip(predictions, image_sizes):
        # presumably each prediction batch is (loss, logits, pred_boxes) — verify against the model output
        batch_logits, batch_boxes = batch[1], batch[2]
        output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes))
        post_processed_output = image_processor.post_process_object_detection(
            output, threshold=threshold, target_sizes=target_sizes
        )
        post_processed_predictions.extend(post_processed_output)

    # Compute metrics using MeanAveragePrecision (this will provide class-level AP)
    metric = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
    metric.update(post_processed_predictions, post_processed_targets)
    metrics = metric.compute()

    # Extract per-class AP values. Flatten to 1-D so the loop below also works
    # if the metric hands back 0-d tensors (e.g. when only one class is present).
    classes = metrics.pop("classes").reshape(-1)
    ap_per_class = metrics.pop("map_per_class").reshape(-1)  # This gives AP per class

    # Format results as {class_name: AP_value}
    ap_results = {}
    ap_values = []
    for class_id, class_ap in zip(classes, ap_per_class):
        class_name = id2label[class_id.item()] if id2label is not None else f"class_{class_id.item()}"
        ap_results[f"ap_{class_name}"] = round(class_ap.item(), 4)
        ap_values.append(class_ap.item())  # Collect AP values

    # Calculate general AP (mean AP across all classes); guard against an empty
    # class list so evaluation does not crash with a ZeroDivisionError.
    ap_results["eval_ap"] = sum(ap_values) / len(ap_values) if ap_values else 0.0

    return ap_results


In [67]:
# Lookup tables between numeric category ids and category names.
id2label = {category["id"]: category["name"] for category in categories}
label2id = {category["name"]: category["id"] for category in categories}

# Metric callable with everything except the evaluation results pre-bound.
eval_compute_metrics_fn = partial(
    compute_average_precision, image_processor=image_processor, id2label=id2label, threshold=0.0
)

## Training the model

In [68]:
from transformers import AutoModelForObjectDetection

# Load the pretrained checkpoint with this project's label maps.
# NOTE(review): `ignore_mismatched_sizes=True` presumably lets the classification
# head be re-initialised for the new number of classes — confirm in the docs.
model = AutoModelForObjectDetection.from_pretrained(
    MODEL_NAME,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)


[1900957:7:0927/103112.433734:ERROR:command_buffer_proxy_impl.cc(132)] ContextResult::kTransientFailure: Failed to send GpuControl.CreateCommandBuffer.
Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequen

In [69]:
from transformers import TrainingArguments

# Training configuration: evaluate and checkpoint once per epoch, and reload the
# checkpoint with the highest "eval_ap" when training finishes.
training_args = TrainingArguments(
    output_dir="detr_finetuned_reindeerdrone",
    num_train_epochs=30,
    fp16=False,
    per_device_train_batch_size=8,
    dataloader_num_workers=4,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    weight_decay=1e-4,
    # NOTE(review): very tight clipping; the logged grad_norm is ~43 — confirm this value is intended.
    max_grad_norm=0.01,
    # "eval_ap" is the key written by compute_average_precision.
    metric_for_best_model="eval_ap",
    greater_is_better=True,
    load_best_model_at_end=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # The batch transform reads the raw "image_id", "image" and "objects" columns.
    # presumably the Trainer would otherwise drop them — verify in the docs.
    remove_unused_columns=False,
    # compute_average_precision iterates over predictions/targets batch by batch.
    eval_do_concat_batches=False,
    push_to_hub=False,
)

In [70]:
from transformers import Trainer

# Wire model, data splits, collator and metric function together.
# NOTE(review): the image processor is passed through the `tokenizer` argument;
# newer transformers releases may expect a different argument name — confirm
# against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=image_processor,
    data_collator=collate_fn,
    compute_metrics=eval_compute_metrics_fn,
)

# Run the fine-tuning loop (evaluates after every epoch per `training_args`).
trainer.train()

  3%|▎         | 27/810 [14:18<6:55:04, 31.81s/it]

[A
[A
[A
[A
[A

                                                
[A                                             
  3%|▎         | 27/810 [04:54<05:21,  2.43it/s]
[A

{'eval_ap': 0.0053120015654712915, 'eval_loss': 1.9330289363861084, 'eval_ap_Adult': 0.0061, 'eval_ap_Calf': 0.0045, 'eval_runtime': 2.8081, 'eval_samples_per_second': 13.532, 'eval_steps_per_second': 1.781, 'epoch': 1.0}



[A
[A
[A
[A
[A

                                                
[A                                             
  3%|▎         | 27/810 [05:11<05:21,  2.43it/s]
[A

{'eval_ap': 0.01002857880666852, 'eval_loss': 1.9049028158187866, 'eval_ap_Adult': 0.0141, 'eval_ap_Calf': 0.006, 'eval_runtime': 2.9238, 'eval_samples_per_second': 12.997, 'eval_steps_per_second': 1.71, 'epoch': 2.0}



[A
[A
[A
[A
[A

                                                
[A                                             
  3%|▎         | 27/810 [05:27<05:21,  2.43it/s]
[A

{'eval_ap': 0.01703260187059641, 'eval_loss': 1.8919119834899902, 'eval_ap_Adult': 0.024, 'eval_ap_Calf': 0.0101, 'eval_runtime': 2.807, 'eval_samples_per_second': 13.538, 'eval_steps_per_second': 1.781, 'epoch': 3.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [05:44<05:21,  2.43it/s]
[A

{'eval_ap': 0.016353588551282883, 'eval_loss': 1.8861632347106934, 'eval_ap_Adult': 0.0217, 'eval_ap_Calf': 0.011, 'eval_runtime': 2.8864, 'eval_samples_per_second': 13.165, 'eval_steps_per_second': 1.732, 'epoch': 4.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [06:00<05:21,  2.43it/s]
[A

{'eval_ap': 0.02145358733832836, 'eval_loss': 1.8292330503463745, 'eval_ap_Adult': 0.0429, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.7491, 'eval_samples_per_second': 13.822, 'eval_steps_per_second': 1.819, 'epoch': 5.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [06:17<05:21,  2.43it/s]
[A

{'eval_ap': 0.012529902160167694, 'eval_loss': 1.8950300216674805, 'eval_ap_Adult': 0.0251, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.7984, 'eval_samples_per_second': 13.579, 'eval_steps_per_second': 1.787, 'epoch': 6.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [06:33<05:21,  2.43it/s]
[A

{'eval_ap': 0.008339914493262768, 'eval_loss': 2.131045341491699, 'eval_ap_Adult': 0.0167, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.8844, 'eval_samples_per_second': 13.174, 'eval_steps_per_second': 1.733, 'epoch': 7.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [06:50<05:21,  2.43it/s]
[A

{'eval_ap': 0.022186482325196266, 'eval_loss': 1.8248848915100098, 'eval_ap_Adult': 0.0444, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.8245, 'eval_samples_per_second': 13.454, 'eval_steps_per_second': 1.77, 'epoch': 8.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [07:07<05:21,  2.43it/s]
[A

{'eval_ap': 0.02563758217729628, 'eval_loss': 1.7182996273040771, 'eval_ap_Adult': 0.0506, 'eval_ap_Calf': 0.0007, 'eval_runtime': 2.8184, 'eval_samples_per_second': 13.483, 'eval_steps_per_second': 1.774, 'epoch': 9.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [07:23<05:21,  2.43it/s]
[A

{'eval_ap': 0.043873610207811, 'eval_loss': 1.703630805015564, 'eval_ap_Adult': 0.0871, 'eval_ap_Calf': 0.0007, 'eval_runtime': 2.8539, 'eval_samples_per_second': 13.315, 'eval_steps_per_second': 1.752, 'epoch': 10.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [07:40<05:21,  2.43it/s]
[A

{'eval_ap': 0.03462309390306473, 'eval_loss': 1.6803460121154785, 'eval_ap_Adult': 0.0692, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.9132, 'eval_samples_per_second': 13.044, 'eval_steps_per_second': 1.716, 'epoch': 11.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [07:56<05:21,  2.43it/s]
[A

{'eval_ap': 0.02890661545097828, 'eval_loss': 1.6154876947402954, 'eval_ap_Adult': 0.0578, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.8092, 'eval_samples_per_second': 13.527, 'eval_steps_per_second': 1.78, 'epoch': 12.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [08:13<05:21,  2.43it/s]
[A

{'eval_ap': 0.03201860189437866, 'eval_loss': 1.5900242328643799, 'eval_ap_Adult': 0.064, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.8614, 'eval_samples_per_second': 13.28, 'eval_steps_per_second': 1.747, 'epoch': 13.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [08:30<05:21,  2.43it/s]
[A

{'eval_ap': 0.03248976171016693, 'eval_loss': 1.5625640153884888, 'eval_ap_Adult': 0.065, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.8734, 'eval_samples_per_second': 13.225, 'eval_steps_per_second': 1.74, 'epoch': 14.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [08:46<05:21,  2.43it/s]
[A

{'eval_ap': 0.035272435983642936, 'eval_loss': 1.5558669567108154, 'eval_ap_Adult': 0.0704, 'eval_ap_Calf': 0.0002, 'eval_runtime': 2.8787, 'eval_samples_per_second': 13.201, 'eval_steps_per_second': 1.737, 'epoch': 15.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [09:03<05:21,  2.43it/s]
[A

{'eval_ap': 0.03525000101944897, 'eval_loss': 1.5247539281845093, 'eval_ap_Adult': 0.0704, 'eval_ap_Calf': 0.0001, 'eval_runtime': 3.025, 'eval_samples_per_second': 12.562, 'eval_steps_per_second': 1.653, 'epoch': 16.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [09:20<05:21,  2.43it/s]
[A

{'eval_ap': 0.034030631446512416, 'eval_loss': 1.5826334953308105, 'eval_ap_Adult': 0.068, 'eval_ap_Calf': 0.0001, 'eval_runtime': 2.7777, 'eval_samples_per_second': 13.68, 'eval_steps_per_second': 1.8, 'epoch': 17.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [09:37<05:21,  2.43it/s]
[A

{'eval_ap': 0.03262975066900253, 'eval_loss': 1.539825439453125, 'eval_ap_Adult': 0.0653, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.8977, 'eval_samples_per_second': 13.114, 'eval_steps_per_second': 1.726, 'epoch': 18.0}


                                                
  3%|▎         | 27/810 [09:45<05:21,  2.43it/s] 

{'loss': 1.8295, 'grad_norm': 43.13096618652344, 'learning_rate': 1.5995556879882246e-05, 'epoch': 18.52}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [09:53<05:21,  2.43it/s]
[A

{'eval_ap': 0.04172024130821228, 'eval_loss': 1.5074447393417358, 'eval_ap_Adult': 0.0834, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.7064, 'eval_samples_per_second': 14.041, 'eval_steps_per_second': 1.848, 'epoch': 19.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [10:11<05:21,  2.43it/s]
[A

{'eval_ap': 0.043391164392232895, 'eval_loss': 1.418958306312561, 'eval_ap_Adult': 0.0868, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.9677, 'eval_samples_per_second': 12.805, 'eval_steps_per_second': 1.685, 'epoch': 20.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [10:27<05:21,  2.43it/s]
[A

{'eval_ap': 0.04329508584487485, 'eval_loss': 1.4482594728469849, 'eval_ap_Adult': 0.0863, 'eval_ap_Calf': 0.0002, 'eval_runtime': 2.7253, 'eval_samples_per_second': 13.943, 'eval_steps_per_second': 1.835, 'epoch': 21.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [10:44<05:21,  2.43it/s]
[A

{'eval_ap': 0.0444614322623238, 'eval_loss': 1.4630017280578613, 'eval_ap_Adult': 0.0879, 'eval_ap_Calf': 0.001, 'eval_runtime': 2.9395, 'eval_samples_per_second': 12.927, 'eval_steps_per_second': 1.701, 'epoch': 22.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [11:01<05:21,  2.43it/s]
[A

{'eval_ap': 0.05424254751233093, 'eval_loss': 1.4154812097549438, 'eval_ap_Adult': 0.1085, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.8831, 'eval_samples_per_second': 13.18, 'eval_steps_per_second': 1.734, 'epoch': 23.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [11:18<05:21,  2.43it/s]
[A

{'eval_ap': 0.053653314820621745, 'eval_loss': 1.3808332681655884, 'eval_ap_Adult': 0.1073, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.7862, 'eval_samples_per_second': 13.639, 'eval_steps_per_second': 1.795, 'epoch': 24.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [11:35<05:21,  2.43it/s]
[A

{'eval_ap': 0.052218448370695114, 'eval_loss': 1.3928431272506714, 'eval_ap_Adult': 0.1044, 'eval_ap_Calf': 0.0, 'eval_runtime': 3.0074, 'eval_samples_per_second': 12.636, 'eval_steps_per_second': 1.663, 'epoch': 25.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [11:52<05:21,  2.43it/s]
[A

{'eval_ap': 0.05166024714708328, 'eval_loss': 1.414638876914978, 'eval_ap_Adult': 0.1033, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.8699, 'eval_samples_per_second': 13.241, 'eval_steps_per_second': 1.742, 'epoch': 26.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [12:09<05:21,  2.43it/s]
[A

{'eval_ap': 0.05174162611365318, 'eval_loss': 1.4169679880142212, 'eval_ap_Adult': 0.1035, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.861, 'eval_samples_per_second': 13.282, 'eval_steps_per_second': 1.748, 'epoch': 27.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [12:26<05:21,  2.43it/s]
[A

{'eval_ap': 0.05253266484942287, 'eval_loss': 1.4110380411148071, 'eval_ap_Adult': 0.1041, 'eval_ap_Calf': 0.001, 'eval_runtime': 2.8524, 'eval_samples_per_second': 13.322, 'eval_steps_per_second': 1.753, 'epoch': 28.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [12:42<05:21,  2.43it/s]
[A

{'eval_ap': 0.05214706435799599, 'eval_loss': 1.407902479171753, 'eval_ap_Adult': 0.1043, 'eval_ap_Calf': 0.0, 'eval_runtime': 2.7596, 'eval_samples_per_second': 13.77, 'eval_steps_per_second': 1.812, 'epoch': 29.0}



[A
[A
[A
[A
[A

                                                
[A                                              
  3%|▎         | 27/810 [13:00<05:21,  2.43it/s]
[A

{'eval_ap': 0.05209895223379135, 'eval_loss': 1.4079298973083496, 'eval_ap_Adult': 0.1042, 'eval_ap_Calf': 0.0, 'eval_runtime': 3.0501, 'eval_samples_per_second': 12.459, 'eval_steps_per_second': 1.639, 'epoch': 30.0}


                                                
100%|██████████| 810/810 [08:23<00:00,  1.61it/s]

{'train_runtime': 503.803, 'train_samples_per_second': 12.684, 'train_steps_per_second': 1.608, 'train_loss': 1.6614487259476274, 'epoch': 30.0}





TrainOutput(global_step=810, training_loss=1.6614487259476274, metrics={'train_runtime': 503.803, 'train_samples_per_second': 12.684, 'train_steps_per_second': 1.608, 'total_flos': 5.002340284130918e+18, 'train_loss': 1.6614487259476274, 'epoch': 30.0})