In [1]:
import sys
# Jupyter notebook setup: make the project importable and keep it fresh.

# Add our code: append the parent directory to the module search path so the
# `src` package imported in the next cell can be found. The path is relative,
# so it resolves against the kernel's current working directory.
sys.path.append("../")

# Make sure imports are updated constantly: autoreload mode 2 reloads modules
# before executing each cell, so edits to the project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch

import joblib
import pandas as pd

from src.data.dataset import APTOSDataset
from src.preprocess.pipeline import Pipeline
from src.data.sampling import ImbalancedAPTOSDatasetSampler

In [3]:
# CSV file that is read into a DataFrame, and the directory handed to
# APTOSDataset together with it. Both are relative to the working directory.
DATA_FRAME = "../data/aptos2019-blindness-detection/train.csv"
DATA_DIRECTORY = "../data/aptos2019-blindness-detection/train_images"

# Pipeline to debug. Each entry is a 2-tuple; presumably (stage name, keyword
# arguments for that stage) -- confirm against src.preprocess.pipeline.Pipeline.
resize_stage = ("resize", {"width": 100, "height": 100})
PIPELINE_STAGES = [resize_stage]

In [4]:
# Seed for torch's global RNG so the random subset drawn below is identical on
# every Restart & Run All.
SEED = 42
# Upper bound on how many samples are kept for this debugging session.
SUBSET_SIZE = 1000

df = pd.read_csv(DATA_FRAME)

# Start from an empty on-disk cache; presumably so that changes to
# PIPELINE_STAGES are not masked by stale cached results -- confirm against
# how APTOSDataset uses the cache.
cache = joblib.Memory("/tmp/cachedir", verbose=0)
cache.clear()

pipeline = Pipeline(PIPELINE_STAGES)

dataset = APTOSDataset(df, DATA_DIRECTORY, pipeline, cache)

# Wrap the dataset up as it is in train.py
concat_dataset = torch.utils.data.ConcatDataset([dataset])

# random_split draws its permutation from torch's global RNG, so seed it
# immediately beforehand to make the subset reproducible.
torch.manual_seed(SEED)

# Clamp the subset size so a dataset with fewer than SUBSET_SIZE rows does not
# make random_split raise on a negative remainder length.
subset_size = min(SUBSET_SIZE, len(concat_dataset))
# The remainder is bound to a real name rather than `_`, which IPython uses
# for the last cell output.
dataset_subset, dataset_rest = torch.utils.data.random_split(
    concat_dataset, [subset_size, len(concat_dataset) - subset_size]
)

# Create a sampler that more frequently presents samples from the less represented classes
sampler = ImbalancedAPTOSDatasetSampler(dataset_subset)



In [9]:
# Same batch size for both loaders so the per-class counts are comparable.
BATCH_SIZE = 100
# Number of batches to print per loader (minimum 1). With 1, only the first
# batch of each loader is inspected.
BATCHES_TO_SHOW = 1

# Baseline loader: no sampler given, so DataLoader's default ordering is used.
data_loader = torch.utils.data.DataLoader(dataset_subset, batch_size=BATCH_SIZE)
# Loader that draws indices through the class-balancing sampler instead.
balanced_data_loader = torch.utils.data.DataLoader(
    dataset_subset, batch_size=BATCH_SIZE, sampler=sampler
)

# Print the distinct diagnosis values and their counts for the first
# batch(es) of each loader, to compare the class balance side by side.
for label, loader in (
    ("Default sampler", data_loader),
    ("ImbalancedAPTOSDatasetSampler sampler", balanced_data_loader),
):
    # Each batch unpacks into (image, diagnosis, id_); only diagnosis is used here.
    for idx, (image, diagnosis, id_) in enumerate(loader):
        print(label)
        print(torch.unique(diagnosis, return_counts=True))

        # Stop once the requested number of batches has been shown.
        if idx + 1 >= BATCHES_TO_SHOW:
            break



Default sampler
(tensor([0, 1, 2, 3, 4]), tensor([50, 10, 26,  3, 11]))
ImbalancedAPTOSDatasetSampler sampler
(tensor([0, 1, 2, 3, 4]), tensor([21, 17, 19, 24, 19]))
