In [None]:
import torch
import mlflow
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score

from model import BoneAgeModel
from dataset_eval import BoneAgeEvalDataset

# Point MLflow at the SQLite tracking store "mlflow.db" (relative path, so it
# resolves against the notebook's working directory).
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")

# Make "Male_Test_Evaluation" the active experiment for subsequent runs.
# Kept as the cell's last expression so the notebook displays the returned
# Experiment object.
mlflow.set_experiment(experiment_name="Male_Test_Evaluation")


2026/01/18 16:07:15 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.schemas
2026/01/18 16:07:15 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.tables
2026/01/18 16:07:15 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.types
2026/01/18 16:07:15 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.constraints
2026/01/18 16:07:15 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.defaults
2026/01/18 16:07:15 INFO alembic.runtime.plugins: setup plugin alembic.autogenerate.comments
2026/01/18 16:07:18 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/18 16:07:18 INFO mlflow.store.db.utils: Updating database tables
2026/01/18 16:07:18 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/18 16:07:18 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/18 16:07:20 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/18 16:07:20 INFO alembic.runtime

<Experiment: artifact_location='file:d:/pw2/pw_male/mlruns/1', creation_time=1768652793416, experiment_id='1', last_update_time=1768652793416, lifecycle_stage='active', name='Male_Test_Evaluation', tags={}>

: 

In [5]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Instantiate the network on the chosen device and restore its trained weights.
model = BoneAgeModel().to(device)

# map_location remaps tensors that were saved on another device onto `device`.
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot run arbitrary code while being loaded. The
# result is passed straight to load_state_dict, so a state_dict is all we need.
state_dict = torch.load(
    "male_boneage_model.pth",
    map_location=device,
    weights_only=True,
)
model.load_state_dict(state_dict)

# Evaluation mode: layers such as dropout / batch-norm (if the model has any)
# behave deterministically during inference.
model.eval()

print("✅ Model loaded correctly")


Using device: cpu




✅ Model loaded correctly


In [6]:
# Evaluation dataset built from the test CSV and the test image directory.
# NOTE(review): argument meaning is inferred from the path names — confirm
# against BoneAgeEvalDataset's signature.
test_ds = BoneAgeEvalDataset("test/test.csv", "test/images")

# Batches of 32 in dataset order (no shuffling), so predictions line up with
# the rows of the dataset.
test_loader = DataLoader(
    dataset=test_ds,
    batch_size=32,
    shuffle=False,
)


In [None]:
# Display names for the age-group classes, listed in class-index order.
# NOTE(review): assumes the dataset encodes groups as 0..3 in this order, which
# is what the argmax over the logits produces — confirm against the dataset.
class_names = ["0–2", "2–6", "6–10", "10+"]
class_ids = list(range(len(class_names)))

# Ground-truth and predicted class ids accumulated over the whole test set.
y_true, y_pred = [], []

with mlflow.start_run(run_name="male_test_eval"):
    # Inference only: no gradients needed.
    with torch.no_grad():
        # Each batch yields (image, group label, <unused third item>).
        for img, grp, _ in test_loader:
            img = img.to(device)
            # The model returns two outputs; only the first (class logits) is
            # used here.
            logits, _ = model(img)
            preds = torch.argmax(logits, 1)

            y_true.extend(grp.numpy())
            y_pred.extend(preds.cpu().numpy())

    acc = accuracy_score(y_true, y_pred)
    mlflow.log_metric("accuracy", acc)

    # Passing `labels` pins the report to all four classes even when one of
    # them never occurs in y_true / y_pred; without it the number of detected
    # classes can differ from len(target_names).
    report = classification_report(
        y_true, y_pred,
        labels=class_ids,
        target_names=class_names,
        output_dict=True,
    )
    # Persist the per-class precision / recall / F1 with the run instead of
    # computing the report and discarding it.
    mlflow.log_dict(report, "classification_report.json")

    # `labels` keeps the matrix 4x4 so it always matches `display_labels`.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(cm, display_labels=class_names)
    disp.plot(cmap="Blues", values_format="d")
    plt.title("Male Test Confusion Matrix")
    # Save before show(): the figure is written to disk, attached to the run,
    # and then rendered inline.
    plt.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png")
    plt.show()


: 