In [None]:
from pathlib import Path

import pandas as pd

In [None]:
# Folder holding every label CSV used in this notebook (relative path).
DATA_FOLDER = "../data"

# One path per label file, all located directly under DATA_FOLDER.
LABEL_TRAIN = Path(DATA_FOLDER) / "train.csv"
LABEL_VAL = Path(DATA_FOLDER) / "validation.csv"
LABEL_SUB = Path(DATA_FOLDER) / "submission_sample.csv"
LABEL_PRESENCE = Path(DATA_FOLDER) / "presence_of_lesion.csv"

# Train label dataset

In [None]:
# Load the training labels.
# Expected columns: filename, the 4 box coordinates, and the shape.

df_train = pd.read_csv(filepath_or_buffer=LABEL_TRAIN)
df_train

In [None]:
# Summary statistics of the box extents along x and y, shown side by side.
# Each column is labelled with the expression it comes from, so the two
# summaries cannot be confused (the previous bare prints were unlabelled).
pd.DataFrame(
    {
        "x2 - x1": df_train.x2 - df_train.x1,
        "y2 - y1": df_train.y2 - df_train.y1,
    }
).describe()

In [None]:
# Number of label rows per filename in the training set.
df_train["filename"].value_counts()

# Validation dataset

In [None]:
# Validation labels: this is the file inference will be run on.
# Expected columns: filename, shape, and ID (1-indexed).

df_val = pd.read_csv(filepath_or_buffer=LABEL_VAL)
df_val

# Submission file

In [None]:
# Blank submission file provided by the organizers.
# It shows the expected prediction format, which is comma separated.

df_submission = pd.read_csv(LABEL_SUB, sep=",")
df_submission

In [None]:
# A filename appears on several rows of the submission file:
# the row count per filename is the number of boxes the organizers expect.

df_submission["filename"].value_counts()

# Check intersection between datasets

In [None]:
# Unique filenames referenced by each label file.
set_files_train = set(df_train.filename)
set_files_val = set(df_val.filename)
set_files_sub = set(df_submission.filename)

print("Count of unique files per dataset:")
print(f"- Train: {len(df_train):,} rows, {len(set_files_train):,} unique filenames")
print(f"- Val: {len(df_val):,} rows, {len(set_files_val):,} unique filenames")
print(f"- Sub: {len(df_submission):,} rows, {len(set_files_sub):,} unique filenames\n")

# Confirmation that the filenames of the submission file are indeed the ones of the val set
print("Are val and submission sets identical? (Expected: True):")
print(set_files_val == set_files_sub, "\n")

# Check leakage between train and val: any filename present in both sets is a leak
print("Check leakage between train and val")
print("train VS val :", len(set_files_train & set_files_val), "common elements")

# Presence of lesion

The organizers provided an extra CSV file that tells us whether or not each slide contains any boxes.

Thus, we can separate labels into 2 categories:
- the "bounding boxes" are strong labels, high quality, but only on 25% of the dataset
- the "presence of lesion" labels are weak labels: they contain less information, but cover 100% of the dataset.

From it, I see one way of leveraging the two categories:
1. Pretrain a base model in a self-supervised fashion, with contrastive losses, to return good tile embeddings
2. Fine-tune this base model on the bounding box dataset

In [None]:
# Weak labels: the presence-of-lesion file is semicolon-separated.
df_presence = pd.read_csv(LABEL_PRESENCE, delimiter=";")
# Display it ordered by file name (the stored frame itself is left unsorted).
df_presence.sort_values(by="file_name")

In [None]:
# Split the presence file into the file names flagged positive (1) and
# negative (0); these sets are used to cross-check the slides of LABEL_TRAIN.
set_files_pos = set(df_presence.loc[df_presence["presence_of_lesion"].eq(1), "file_name"])
set_files_neg = set(df_presence.loc[df_presence["presence_of_lesion"].eq(0), "file_name"])

In [None]:
# In the presence of lesion file, the names have no suffixes: drop everything
# after the first "_" of a train filename and re-append ".tif" before the lookup.
# A filename without any "_" is looked up unchanged (appending ".tif" to it
# would produce "x.tif.tif", which can never match).

files = []
# sorted(): a set of strings has no stable iteration order across kernel
# restarts, so sort to keep the content of `files` reproducible.
for file in sorted(set_files_train):
    name = file.split("_")[0] + ".tif" if "_" in file else file
    if name not in set_files_pos:
        files.append(file)


# NOTE(review): the test is "not listed as positive", so the count below also
# includes names that are absent from the presence file altogether, not only
# those explicitly labelled negative -- confirm against set_files_neg if needed.
print("Number of files in train.csv, with bboxes (so they are positive)")
print("but they are labelled as negative in the presence of lesion file:", len(files))
print("If we use these weak labels, we should fix them first.")

# See https://app.trustii.io/datasets/1526/forums/148/messages