In [None]:
from digilut.dataset import NpyDataset

# Locations handed to NpyDataset: the labels CSV and the embeddings folder
csv_labels = "../patches/labels_balanced_test.csv"
folder_embeddings = "../embeddings/"

npy_dataset = NpyDataset(csv_labels, folder_embeddings)

# Sanity check: report the shape of the feature matrix X
features_shape = npy_dataset.X.shape
print(f"Shape of X: {features_shape}")

In [None]:
import os

import joblib

# Replace with the path to your saved model file
models_folder = "models/"

# Sorted so the load order (and the log below) is identical on every run;
# sub-directories are skipped because joblib.load expects a file path.
model_names = sorted(
    name
    for name in os.listdir(models_folder)
    if os.path.isfile(os.path.join(models_folder, name))
)

# Load the saved models
# NOTE: joblib.load unpickles its input, which can execute arbitrary code —
# only point models_folder at files you trust.
models = []
for model_name in model_names:
    # os.path.join keeps working even if models_folder loses its trailing slash
    model = joblib.load(os.path.join(models_folder, model_name))
    models.append(model)
    print(f"Model loaded from {model_name}")

In [None]:
# Run predictions

import numpy as np

X = npy_dataset.X

# One predict_proba output per loaded model
y_preds = [model.predict_proba(X) for model in models]

# Average the outputs across models, then threshold column 1 at 0.5
# (presumably column 1 is the positive class — confirm against the models' classes_)
mean_proba = np.mean(np.array(y_preds), axis=0)
y_pred = mean_proba[:, 1] > 0.5
y_pred.astype(int)

In [None]:
# Convert preds into dataframe

import pandas as pd

# One row per tile: identifiers from the dataset plus the 0/1 prediction
tile_predictions = {
    "patientID": npy_dataset.patient_ids,
    "slideID": npy_dataset.slide_ids,
    "slidePath": npy_dataset.names,
    "preds": y_pred.astype(int),
}
df_preds = pd.DataFrame(tile_predictions)

# Keep only the tiles predicted as 1
is_positive = df_preds["preds"] == 1
df_preds = df_preds.loc[is_positive]
df_preds

In [None]:
# Persist the filtered predictions. The DataFrame index is written as the
# first CSV column (pandas default), so it can be restored with index_col=0.
df_preds.to_csv("../patches/predictions.csv")

We have predictions for each tile.

Let's create bounding boxes from 2D predictions.

# Prepare data


In [None]:
import pandas as pd

In [None]:
# Reload the saved predictions, using the first CSV column as the index.
preds = pd.read_csv("../patches/predictions.csv", index_col=0)
preds

In [None]:
# Split slide paths into columns to isolate coords
#
# Each slidePath is cut into the slide name (the folder between "patches/" and
# "/patches/") and the "_"-separated fields of the tile file name.
coords = []
for slide_path in preds["slidePath"]:
    stem = slide_path.split(".jpg")[0]
    slide_dir, coord_fields = stem.split("/patches/")
    path_name = slide_dir.split("patches/")[1]
    coords.append((path_name, *coord_fields.split("_")))

# Reuse preds' index: preds keeps the index it was saved with, which need not be
# 0..n-1. With a default RangeIndex here, the axis=1 concat below would align on
# index labels, pair coordinates with the wrong tiles and add NaN-filled rows.
df_coords = pd.DataFrame(
    coords,
    columns=[
        "pathName",
        "patch_id_x",
        "patch_id_y",
        "x_pos",
        "y_pos",
        "patch_level",
        "patch_width",
        "patch_height",
    ],
    index=preds.index,
)

df_full = pd.concat([preds, df_coords], axis=1)
df_full

In [None]:
# Attach the per-slide max_x / max_y values to every predicted tile
slide_extents = pd.read_csv("../data/train_cleaned.csv", index_col=0)
max_coords = slide_extents[["slideName", "max_x", "max_y"]].drop_duplicates()

# Default (inner) merge: tiles whose pathName has no matching slideName are dropped
df_final = pd.merge(df_full, max_coords, left_on="pathName", right_on="slideName")
df_final

In [None]:
# Keep only the slide name and the tile position, casting the positions to
# numbers. `.assign` builds a new frame, which avoids the SettingWithCopyWarning
# raised when columns are assigned on a column-subset of another DataFrame.
df_final = df_final[["pathName", "x_pos", "y_pos"]].assign(
    x_pos=lambda frame: pd.to_numeric(frame["x_pos"]),
    y_pos=lambda frame: pd.to_numeric(frame["y_pos"]),
)
df_final

# Cluster


In [None]:
# Number of bounding boxes expected for each slide, keyed by the file name
# with everything from ".tif" onwards removed.
# NOTE(review): the frame is called df_test but is read from train.csv — confirm
# this is the intended file.
df_test = pd.read_csv("../data/train.csv")
counts_by_filename = df_test["filename"].value_counts().to_dict()

bboxes_per_slide_dict = {}
for filename, n_boxes in counts_by_filename.items():
    bboxes_per_slide_dict[filename.split(".tif")[0]] = n_boxes
bboxes_per_slide_dict

In [None]:
from sklearn.cluster import DBSCAN

from digilut.create_bbox import get_n_most_recurrent_from_dict, plot_clusters

# Get slide names unique
unique_path_names = set(df_final.pathName)
final_predictions = []

# Clustering settings (same values as before, named so they are easy to tune)
DBSCAN_EPS = 5000
DBSCAN_MIN_SAMPLES = 3
# Added to the largest x_pos / y_pos of a cluster to close its box.
# NOTE(review): presumably the tile side length in pixels — confirm.
PATCH_SIZE = 256

# NOTE(review): only slides with at least one row in df_final are visited, so a
# slide listed in bboxes_per_slide_dict but absent from df_final gets no rows at
# all (not even null boxes) — confirm this is acceptable downstream.
for path_name in unique_path_names:
    # Extract the predictions for this slide (x_pos / y_pos only)
    pixels = df_final[df_final.pathName == path_name].drop(columns=["pathName"])
    print(path_name)
    print(pixels.values.shape)

    # Perform DBSCAN clustering on the positions, before cluster_id is added
    X = pixels.values
    db = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES).fit(X)
    labels = db.labels_
    pixels["cluster_id"] = labels

    # Plots
    # plot_clusters(X, labels, path_name)

    # Count nb positives patches per cluster (except the -1 one, outliers)
    occurences = pd.Series(labels).value_counts().to_dict()
    print("Occurences per cluster:", occurences)
    # Default of None: a slide without any outlier has no -1 key, and a bare
    # pop(-1) would raise KeyError for it.
    occurences.pop(-1, None)

    # Keep the n biggest clusters, k is the nb of expected clusters
    k = bboxes_per_slide_dict[path_name]
    clusters_to_keep = get_n_most_recurrent_from_dict(occurences, k)
    print("most occurent:", clusters_to_keep)

    # Get bounding box for each cluster
    for cluster_id in clusters_to_keep:
        cluster_pixels = pixels[pixels["cluster_id"] == cluster_id]
        final_predictions.append(
            {
                "filename": path_name,
                "x1": cluster_pixels["x_pos"].min(),
                "x2": cluster_pixels["x_pos"].max() + PATCH_SIZE,
                "y1": cluster_pixels["y_pos"].min(),
                "y2": cluster_pixels["y_pos"].max() + PATCH_SIZE,
            },
        )

    # If not enough bounding boxes predicted, patch with null boxes.
    # The number missing is k - found; the reverse difference is never positive
    # in that situation, so no padding would ever be added.
    for _ in range(k - len(clusters_to_keep)):
        final_predictions.append(
            {"filename": path_name, "x1": 0, "x2": 0, "y1": 0, "y2": 0},
        )

# Explicit columns keep sort_values working even when no box was produced
df_final_predictions = pd.DataFrame(
    final_predictions, columns=["filename", "x1", "x2", "y1", "y2"]
).sort_values("filename")
df_final_predictions

In [None]:
# Save the predicted boxes; the DataFrame index is written as the first CSV
# column (pandas default).
df_final_predictions.to_csv("bboxes_predicted.csv")

# Format bboxes file for submission

Goal: Last step, match predicted bboxes with trustii_id

The idea is to sort the two dataframes by filename so that their rows line up one-to-one.

Assumption:
- We expect them to have the same number of bboxes for each filename

In [None]:
import pandas as pd

# Submission template ordered by filename, without its x1/x2/y1/y2 columns
submission_template = (
    pd.read_csv("../data/submission_sample.csv")
    .sort_values("filename")
    .drop(columns=["x1", "x2", "y1", "y2"])
)

# Predicted boxes; the first CSV column is restored as the index
bboxes_predicted = pd.read_csv("bboxes_predicted.csv", index_col=0)

# Show both shapes so the row counts can be compared
for frame in (submission_template, bboxes_predicted):
    print(frame.shape)

In [None]:
# Both frames need the same number of rows to be paired up row by row
n_template_rows = len(submission_template)
n_predicted_rows = len(bboxes_predicted)
print("Expected (true):", n_template_rows == n_predicted_rows)

In [None]:
# Pair the two frames row by row, in their current (filename-sorted) order.
# pd.concat(axis=1) aligns on index labels, and both frames still carry the
# labels they had before sorting; without resetting them the concat would undo
# the sort and pair unrelated rows.
final_submission = pd.concat(
    [
        submission_template.reset_index(drop=True),
        bboxes_predicted.reset_index(drop=True),
    ],
    axis=1,
)
# Display only: the sorted copy is not assigned, final_submission keeps its order
final_submission.sort_values("trustii_id")

In [None]:
# Write the submission file. With pandas defaults the DataFrame index is also
# written, as an unnamed first column.
# NOTE(review): confirm the submission format accepts that extra column; if it
# does not, index=False would omit it.
final_submission.to_csv("submission.csv")