In [1]:
import timm
import torch
import os
import pandas as pd
import config
from utils.dataset import chest_xray_datasplit, ChestXrayDataset, ChestXrayDatasetWithMask
from utils.train import train, validate, compute_pos_weight

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained CoAtNet-2 with head removed
model = timm.create_model("coatnet_2_rw_224", pretrained=True, num_classes=0)
model = model.to(device)
model.eval()

# Freeze weights
for param in model.parameters():
    param.requires_grad = False

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
test_path = os.path.join(config.DATASET_DIR, 'miccai2023_nih-cxr-lt_labels_test.csv')
train_path = os.path.join(config.DATASET_DIR, 'miccai2023_nih-cxr-lt_labels_train.csv')
val_path = os.path.join(config.DATASET_DIR, 'miccai2023_nih-cxr-lt_labels_val.csv')

# Load all CSVs
df_train = pd.read_csv(train_path)
df_val = pd.read_csv(val_path)
df_test = pd.read_csv(test_path)

# Combine them
full_df = pd.concat([df_train, df_val, df_test], ignore_index=True)

In [3]:
columns = ['Pneumothorax', 'Emphysema', 'Subcutaneous Emphysema', 'Pneumoperitoneum', 'Pneumomediastinum']
classes_df = full_df[columns]
classes_df = classes_df[classes_df.sum(axis=1) > 0]
columns.extend(['id', 'No Finding', 'subj_id'])
# Filter full_df based on the index of the filtered classes_df
group1_df = full_df.loc[classes_df.index, columns]
#group1_df = full_df[columns]

In [4]:
from torch.utils.data import DataLoader
full_dataset = ChestXrayDataset(dataframe=group1_df, img_dir=os.path.join(config.DATASET_DIR, 'cxr', 'images'))

train_dataset, val_dataset, test_dataset = chest_xray_datasplit(group1_df, full_dataset, dataset_dir=os.path.join(config.DATASET_DIR, 'cxr', 'images'))

train_loader = DataLoader(train_dataset, batch_size=96, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=96, shuffle=False, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=96, shuffle=False, num_workers=4, pin_memory=True)

In [5]:
import numpy as np

def extract_features(model, dataloader):
    features = []
    labels = []

    with torch.no_grad():
        for images, targets in dataloader:
            images = images.to(device)
            outputs = model(images)
            outputs = outputs.view(outputs.size(0), -1).cpu().numpy()  # Flatten features
            features.append(outputs)
            labels.append(targets.cpu().numpy())

    return np.vstack(features), np.vstack(labels)

X_train, y_train = extract_features(model, train_loader)
X_val, y_val = extract_features(model, val_loader)
X_test, y_test = extract_features(model, test_loader)

In [6]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score

# Create base classifier
base_model = xgb.XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    use_label_encoder=False,
    n_jobs=-1
)

# Wrap in multi-label classifier
clf = MultiOutputClassifier(base_model)
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)
print("Macro F1:", f1_score(y_test, y_pred, average='macro'))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Macro F1: 0.22607104825275895
