In [1]:
import os
import sys
import argparse
import IPython 
from PIL import Image

import torch
import torchvision
import numpy as np
import pandas as pd
import skimage
from scipy import sparse
import matplotlib.pyplot as plt 
import torchxrayvision as xrv

from dinov2.data import SamplerType, make_data_loader, make_dataset
from dinov2.data.datasets import NIHChestXray
from dinov2.data.transforms import make_xray_classification_eval_transform, make_classification_eval_transform
from dinov2.eval.setup import setup_and_build_model
from dinov2.eval.utils import ModelWithNormalize, evaluate, extract_features
from dinov2.MLkNN import MLkNN 
from dinov2.utils import show_image_from_tensor

In [2]:
# Evaluation settings, bundled into a Namespace and handed to
# setup_and_build_model below.
# NOTE(review): the log output recorded under this cell reports a vitl14
# config/weights, while the values here point at vits14 -- the saved output
# looks stale; re-run to confirm.
args = argparse.Namespace(
    config_file='dinov2/configs/eval/vits14_pretrain.yaml',
    pretrained_weights='models/dinov2_vits14_pretrain.pth',
    output_dir='results/NIH/dinov2_vits14/knn',
    opts=[],
    train_dataset_str='NIHChestXray:split=TRAIN:root=/mnt/d/data/NIH/train',
    val_dataset_str='NIHChestXray:split=TEST:root=/mnt/d/data/NIH/test',
    nb_knn=[5, 20, 50, 100, 200],
    temperature=0.07,
    gather_on_cpu=False,
    batch_size=8,
    n_per_class_list=[-1],
    n_tries=1,
    ngpus=1,
    nodes=1,
    timeout=2800,
    partition='learnlab',
    use_volta32=False,
    comment='',
    exclude='',
)

# setup_and_build_model returns the model together with the dtype that is
# later passed to autocast during feature extraction.
model, autocast_dtype = setup_and_build_model(args)
# Wrap the backbone in ModelWithNormalize (presumably normalizes the output
# features -- confirm in dinov2.eval.utils).
model = ModelWithNormalize(model)

I20230803 23:56:06 177 dinov2 config.py:60] git:
  sha: 23d2684f7f158936bb17423f684f8127288ff76b, status: has uncommitted changes, branch: main

I20230803 23:56:06 177 dinov2 config.py:61] batch_size: 8
comment: 
config_file: dinov2/configs/eval/vitl14_pretrain.yaml
exclude: 
gather_on_cpu: False
n_per_class_list: [-1]
n_tries: 1
nb_knn: [5, 20, 50, 100, 200]
ngpus: 1
nodes: 1
opts: ['train.output_dir=/mnt/c/Users/user/Desktop/dinov2/results/NIH/dinov2_vitl14/knn']
output_dir: /mnt/c/Users/user/Desktop/dinov2/results/NIH/dinov2_vitl14/knn
partition: learnlab
pretrained_weights: models/dinov2_vitl14_pretrain.pth
temperature: 0.07
timeout: 2800
train_dataset_str: NIHChestXray:split=TRAIN:root=/mnt/d/data/NIH/train
use_volta32: False
val_dataset_str: NIHChestXray:split=TEST:root=/mnt/d/data/NIH/test
I20230803 23:56:06 177 dinov2 config.py:27] sqrt scaling learning rate; base: 0.004, new: 0.001
I20230803 23:56:06 177 dinov2 config.py:34] MODEL:
  WEIGHTS: ''
compute_precision:
  grad_scale

In [3]:
# One shared evaluation transform for both splits.
# NOTE(review): make_xray_classification_eval_transform is imported but not
# used here -- confirm the generic transform is the intended one.
transform = make_classification_eval_transform()

# Build the train split first, then the validation split, from the dataset
# strings configured in `args`.
train_dataset, val_dataset = (
    make_dataset(dataset_str=split_str, transform=transform)
    for split_str in (args.train_dataset_str, args.val_dataset_str)
)

I20230803 23:56:19 177 dinov2 loaders.py:89] using dataset: "NIHChestXray:split=TRAIN:root=/mnt/d/data/NIH/train"
I20230803 23:56:48 177 dinov2 nih_chest_xray.py:67] 0 x-ray's are missing from TRAIN set
I20230803 23:56:48 177 dinov2 loaders.py:94] # of dataset samples: 86,524
I20230803 23:56:48 177 dinov2 loaders.py:89] using dataset: "NIHChestXray:split=TEST:root=/mnt/d/data/NIH/test"
I20230803 23:56:54 177 dinov2 nih_chest_xray.py:67] 0 x-ray's are missing from TEST set
I20230803 23:56:54 177 dinov2 loaders.py:94] # of dataset samples: 25,596


In [4]:
# Pull the settings used by the extraction cell out of `args` into plain names.
train_dataset_str, val_dataset_str = args.train_dataset_str, args.val_dataset_str
batch_size, gather_on_cpu = args.batch_size, args.gather_on_cpu
# 0 is passed through to extract_features as the data-loader worker count.
num_workers = 0

In [5]:
# Switch the model to evaluation mode BEFORE extracting anything, so the
# training and validation features are produced under the same module mode.
# (Originally eval() was only called between the two extractions.)
model.eval()

# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory
# error on a 2 GiB GPU; if that recurs, try gather_on_cpu=True or a smaller
# batch_size -- TODO confirm which one relieves it.
with torch.cuda.amp.autocast(dtype=autocast_dtype):
    train_features, train_labels = extract_features(
        model, train_dataset, batch_size, num_workers, gather_on_cpu=gather_on_cpu
    )
    val_features, val_labels = extract_features(
        model, val_dataset, batch_size, num_workers, gather_on_cpu=gather_on_cpu
    )

# Bring everything to host memory as NumPy arrays for the classifier cells below.
train_features = train_features.cpu().numpy()
train_labels = train_labels.cpu().numpy()
val_features = val_features.cpu().numpy()
val_labels = val_labels.cpu().numpy()

I20230803 23:56:54 177 dinov2 loaders.py:164] sampler: none
I20230803 23:56:54 177 dinov2 loaders.py:211] using PyTorch data loader
I20230803 23:56:54 177 dinov2 loaders.py:224] # of batches: 10,816


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 2.00 GiB total capacity; 1.61 GiB already allocated; 0 bytes free; 1.68 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## MLkNN

In [None]:
import numpy as np
import sklearn.metrics

# For every neighbourhood size, fit MLkNN on the training features and report
# multi-label metrics on the held-out validation split.
for k in args.nb_knn:
    classifier = MLkNN(k)
    classifier.fit(train_features, train_labels)

    # Score on the validation split: predicting on the same samples the
    # classifier was fitted to (as the original cell did) does not measure
    # generalization and left val_features / val_labels unused.
    results = classifier.predict(val_features).toarray()

    print(f"### for {k}-NN ###")
    print("### Hamming loss: %f" % sklearn.metrics.hamming_loss(val_labels, results))
    print("### Accuracy score: %f" % sklearn.metrics.accuracy_score(val_labels, results))
    # NOTE(review): roc_auc_score is fed the hard predictions here, not scores;
    # if this MLkNN exposes predict_proba, AUC should be computed from those
    # probabilities instead -- TODO confirm against dinov2.MLkNN.
    print("### mAUC score combined: %f" % sklearn.metrics.roc_auc_score(val_labels, results, average="weighted"))
    print("### F1 score: %f" % sklearn.metrics.f1_score(val_labels, results, average="micro"))

    # Disease-specific scores: one label column per entry of class_names.
    disease_results = {"AUC": {}, "Accuracy": {}, "F1": {}}
    for index, disease in enumerate(train_dataset.class_names):
        y_true = val_labels[:, index]
        y_pred = results[:, index]
        disease_results["AUC"][disease] = sklearn.metrics.roc_auc_score(y_true, y_pred)
        disease_results["Accuracy"][disease] = sklearn.metrics.accuracy_score(y_true, y_pred)
        disease_results["F1"][disease] = sklearn.metrics.f1_score(y_true, y_pred)
    # The table holds AUC, accuracy and F1, so the header no longer says "AUC" only.
    print("## Disease-specific scores")
    print(pd.DataFrame(disease_results))