In [None]:
from ultralytics import YOLO

# Weights file used to initialise the YOLOv9c detector. To build the same
# architecture from scratch instead, construct it from the config file:
#     model = YOLO("yolov9c.yaml")
YOLO_WEIGHTS = "yolov9c.pt"

# Build a YOLOv9c model from the pretrained weights.
model = YOLO(YOLO_WEIGHTS)

# Optional: show model information (kept as the last expression of the cell).
model.info()

In [None]:
# Run the YOLOv9c model on the 'bus.jpg' sample image and call show() on
# every result it returns.
SAMPLE_IMAGE = "../bus.jpg"
results = model(SAMPLE_IMAGE)
for detection in results:
    detection.show()

In [None]:
# Dataset definition handed to model.train().
dspath = "/workspaces/til24-cv-trainer/data/til24ufo/dataset.yaml"

# All training options are gathered in one mapping and forwarded unchanged as
# keyword arguments.
# NOTE(review): Ultralytics' default optimizer="auto" is documented to pick
# its own learning rate, in which case lr0 below would be ignored - confirm,
# and set `optimizer` explicitly if this rate is intended.
# NOTE(review): `augment` is documented as a prediction-time option - confirm
# it has the intended effect when passed to train().
train_options = dict(
    # dataset and schedule
    data=dspath,
    epochs=80,
    patience=10,
    batch=16,
    imgsz=1440,
    save_period=1,
    # data loading and hardware
    cache="ram",
    device=0,
    workers=12,
    # model setup and run control
    freeze=20,
    seed=42,
    deterministic=False,
    profile=True,
    # learning-rate settings
    lr0=1e-6,
    lrf=1e-4,
    warmup_epochs=8,
    # extras
    plots=True,
    augment=True,
)
results = model.train(**train_options)

In [None]:
import torch
from PIL import Image
import open_clip

# Load the CLIP model, its preprocessing transform and its tokenizer from the
# Hugging Face Hub repository named in `hf_repo`.
#
# NOTE: this rebinds `model`, which the YOLO cells earlier in this notebook
# use for the detector; reload the detector before re-running those cells.
#
# Alternative loader, kept for reference:
#     model, _, preprocess = open_clip.create_model_and_transforms(
#         "ViT-H-14-quickgelu", pretrained="dfn5b"
#     )
#     tokenizer = open_clip.get_tokenizer("ViT-H-14-quickgelu")
hf_repo = "hf-hub:Interpause/ViT-H-14-quickgelu-dfn5b-til24id"
clip_load_options = {"precision": "fp16", "jit": True}
model, preprocess = open_clip.create_model_from_pretrained(
    hf_repo, **clip_load_options
)
tokenizer = open_clip.get_tokenizer(hf_repo)

In [None]:
# Image input: open the sample picture, run it through the model's
# preprocessing transform and add a new leading dimension (presumably the
# batch axis expected by the encoder).
img_path = "../bus.jpg"
pil_image = Image.open(img_path)
image = preprocess(pil_image).unsqueeze(0)

# Text input: candidate labels, tokenized in this exact order. The order
# matters because the scores are later zipped back onto `text`.
other_labels = ["diagram", "dog", "cat", "bee", "truck"]
bus_labels = [
    "bus",
    "school bus",
    "white school bus",
    "black and yellow school bus",
]
text = other_labels + bus_labels
toks = tokenizer(text)

In [None]:
# Move the model to the GPU; the scoring cell that follows feeds it inputs
# that have been moved with .cuda() as well.
model.cuda()

In [None]:
# Encode the image and the candidate labels, then turn their scaled
# similarities into a probability distribution over the labels.
with torch.no_grad(), torch.autocast("cuda"), torch.inference_mode():
    image_features = model.encode_image(image.cuda())
    text_features = model.encode_text(toks.cuda())

    # Scale each feature vector to unit length. This is done in place on
    # purpose so the feature tensors keep their own dtype.
    image_norm = image_features.norm(dim=-1, keepdim=True)
    image_features.div_(image_norm)
    text_norm = text_features.norm(dim=-1, keepdim=True)
    text_features.div_(text_norm)

    similarity = 100.0 * image_features @ text_features.T
    text_probs = similarity.softmax(dim=-1)

# One probability per label for the first (only) image, keyed by label text.
label_probs = dict(zip(text, text_probs[0].tolist()))
print("Label probs:", label_probs)

In [None]:
import open_clip

MODEL_ARCH = "ViT-H-14-quickgelu"
# Checkpoints to be merged (presumably the base and the fine-tuned weights,
# going by the constant names).
MODEL_BASE = "/workspaces/til24-cv-trainer/notebooks/archive/artifacts/dfn5b.bin"
MODEL_FT = "/workspaces/til24-cv-trainer/notebooks/archive/artifacts/v2_e28_fp16.bin"


def _load_clip_checkpoint(weights_path):
    """Create a MODEL_ARCH model initialised from ``weights_path``.

    Both checkpoints go through this one helper so they are always created
    with identical architecture, precision and image options.

    Args:
        weights_path: Checkpoint path passed to open_clip as ``pretrained``.

    Returns:
        The created model. The second value returned by
        ``open_clip.create_model_from_pretrained`` is discarded.
    """
    clip_model, _ = open_clip.create_model_from_pretrained(
        MODEL_ARCH,
        pretrained=weights_path,
        precision="fp16",
        image_resize_mode="longest",
        image_interpolation="bicubic",
    )
    return clip_model


model1 = _load_clip_checkpoint(MODEL_BASE)
model2 = _load_clip_checkpoint(MODEL_FT)

In [None]:
import pickle

import torch

# Endpoints of the interpolation: model1's weights (theta0) and model2's
# weights (theta1).
# NOTE: model1 is overwritten with the blend below, so re-running this cell
# without first reloading model1 blends the already-blended weights again.
theta0 = model1.state_dict()
theta1 = model2.state_dict()
alpha = 0.6

# make sure checkpoints are compatible
key_mismatch = set(theta0.keys()) ^ set(theta1.keys())
assert not key_mismatch, f"checkpoints differ in keys: {sorted(key_mismatch)}"


def _interpolate(start, end, weight):
    """Return ``(1 - weight) * start + weight * end`` for one state-dict entry.

    Floating-point tensors are blended in at least float32 and cast back to
    the dtype of ``start``. model1 and model2 are created with
    precision="fp16", and doing the two multiplications and the addition
    directly in half precision rounds three times instead of once.

    Args:
        start: Tensor from the first checkpoint (weight ``1 - weight``).
        end: Tensor from the second checkpoint (weight ``weight``).
        weight: Mixing coefficient; 0 keeps ``start``, 1 keeps ``end``.

    Returns:
        The blended tensor. Floating-point inputs keep the dtype of
        ``start``; any other dtype is blended with the plain expression.
    """
    if not torch.is_floating_point(start):
        return (1 - weight) * start + weight * end
    compute_dtype = torch.promote_types(start.dtype, torch.float32)
    blended = (1 - weight) * start.to(compute_dtype) + weight * end.to(compute_dtype)
    return blended.to(start.dtype)


# interpolate between checkpoints with mixing coefficient alpha
theta = {key: _interpolate(theta0[key], theta1[key], alpha) for key in theta0.keys()}
model1.load_state_dict(theta)

# Persist the merged weights as a plain state dict.
torch.save(
    theta,
    "/workspaces/til24-cv-trainer/notebooks/archive/artifacts/wiseft-a0.6.bin",
    pickle_protocol=pickle.HIGHEST_PROTOCOL,
)

In [None]:
# Export model1 as TorchScript: move it to the CPU, switch it to eval mode,
# script it, and write the scripted module to disk.
JIT_EXPORT_PATH = (
    "/workspaces/til24-cv-trainer/notebooks/archive/artifacts/wiseft-a0.6-jit.bin"
)
export_model = model1.cpu().eval()
scripted_model = torch.jit.script(export_model)
scripted_model.save(JIT_EXPORT_PATH)