Weight adjustment for an imbalanced dataset

Methods:
- BCEWithLogitsLoss with a positive-class weight (`pos_weight`): https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html
- Oversampling of the minority class

In [None]:
import torch as ch
import pandas as pd
import os
from prediction.short_term_outcome_prediction.timeseries_decomposition import decompose_and_label_timeseries

In [None]:
# Absolute path to the saved train/validation splits file (.pth) that is
# loaded further down. NOTE(review): machine-specific path — adjust locally.
data_path = '/Users/jk1/temp/opsum_end/preprocessing/gsu_Extraction_20220815_prepro_08062024_083500/early_neurological_deterioration_train_data_splits/train_data_splits_early_neurological_deterioration_ts0.8_rs42_ns5.pth'

In [None]:
# True only when a CUDA device is available to torch on this machine.
use_gpu = ch.cuda.is_available()
# Forwarded as `target_time_to_outcome` to decompose_and_label_timeseries.
# NOTE(review): units are not visible here — presumably hours; confirm.
target_time_to_outcome = 6

In [None]:
# Load the precomputed splits; each entry unpacks further down as
# (X_train, X_val, y_train, y_val).
# NOTE(review): torch.load deserializes with pickle — only open files that
# come from a trusted source.
splits = ch.load(data_path)

In [None]:
# Recommended weight adjustment: the negative/positive ratio of the flattened
# subsequence labels, computed separately for the train and validation part of
# every split. This is the ratio the PyTorch docs suggest for the `pos_weight`
# argument of BCEWithLogitsLoss.
eps = 1e-6  # only guards the division when a split has no positive samples
weight_rows = []
for idx, split in enumerate(splits):
    X_train, X_val, y_train, y_val = split
    train_map, train_flat_labels = decompose_and_label_timeseries(X_train, y_train, target_time_to_outcome=target_time_to_outcome)
    val_map, val_flat_labels = decompose_and_label_timeseries(X_val, y_val, target_time_to_outcome=target_time_to_outcome)

    # Keep the counts exact; the epsilon is applied to the ratio's denominator
    # only, so the reported counts and percentages are not skewed by it.
    n_pos_train = sum(train_flat_labels)
    n_neg_train = len(train_flat_labels) - n_pos_train
    n_pos_val = sum(val_flat_labels)
    n_neg_val = len(val_flat_labels) - n_pos_val

    # float() keeps the resulting DataFrame columns numeric even if the label
    # container yields tensor/numpy scalars.
    train_weight = float(n_neg_train / (n_pos_train + eps))
    val_weight = float(n_neg_val / (n_pos_val + eps))

    print(f'For split {idx}:')
    print(f'Number of positive samples in train: {n_pos_train} ({n_pos_train/len(train_flat_labels):.2%})')
    print(f'Number of negative samples in train: {n_neg_train} ({n_neg_train/len(train_flat_labels):.2%})')
    print(f'Number of positive samples in val: {n_pos_val} ({n_pos_val/len(val_flat_labels):.2%})')
    print(f'Number of negative samples in val: {n_neg_val} ({n_neg_val/len(val_flat_labels):.2%})')

    print(f'Weight adjustment for train: {train_weight}')
    print(f'Weight adjustment for val: {val_weight}')

    weight_rows.append([idx, train_weight, val_weight])

# Build the table once instead of concatenating onto an empty frame inside the
# loop (quadratic, and yields object-dtype columns).
recommended_weight_adjustment_df = pd.DataFrame(weight_rows, columns=['split', 'train', 'val'])

In [None]:
# Display the per-split table with columns 'split', 'train' and 'val'.
recommended_weight_adjustment_df

In [None]:
# Column-wise median across splits for 'train' and 'val', followed by the
# median of those two values. NOTE: despite the variable name, this is a
# median, not a mean.
overall_average = recommended_weight_adjustment_df.loc[:, ['train', 'val']].median()
overall_average.median()

Test BucketBatchSampler

In [None]:
from prediction.short_term_outcome_prediction.timeseries_decomposition import prepare_subsequence_dataset


# One prepare_subsequence_dataset result per split; each result unpacks as
# (train_dataset, val_dataset) further down.
all_datasets = [
    prepare_subsequence_dataset(split_data, use_gpu=use_gpu)
    for split_data in splits
]

In [None]:
# Work with the datasets of the first split only.
train_dataset, val_dataset = all_datasets[0]

In [None]:
from prediction.short_term_outcome_prediction.timeseries_decomposition import BucketBatchSampler
from torch.utils.data import DataLoader
# Training loader driven by a BucketBatchSampler.
batch_size = 128

train_bucket_sampler = BucketBatchSampler(
    train_dataset.idx_to_len_map,
    batch_size,
    labels=train_dataset.targets,  # target labels are handed to the sampler
    # NOTE(review): a factor of 1 presumably means "no oversampling" — confirm
    # against the BucketBatchSampler implementation.
    oversampling_factor=1,
)

# Batches come from the batch sampler, which is also where shuffling is done,
# so the loader's own shuffle/drop_last options stay disabled.
train_loader = DataLoader(
    train_dataset,
    batch_sampler=train_bucket_sampler,
    shuffle=False,
    drop_last=False,
)

In [None]:
# Loop through train_loader and report the number of positive and negative
# samples in each batch, then over all batches.
n_total_pos = 0
n_total_neg = 0

for batch in train_loader:
    # the second element of each batch holds the labels
    labels = ch.as_tensor(batch[1])
    # Reduce on the tensor and convert to a plain Python number: the builtin
    # sum() would iterate element by element and leave tensor objects in the
    # printed counts.
    n_pos = labels.sum().item()
    n_neg = len(labels) - n_pos
    n_total_pos += n_pos
    n_total_neg += n_neg
    print(f'Number of positive samples in batch: {n_pos} ({n_pos/len(labels):.2%})')
    print(f'Number of negative samples in batch: {n_neg} ({n_neg/len(labels):.2%})')

# Guard the overall ratio: an empty loader would otherwise divide by zero.
if n_total_pos + n_total_neg > 0:
    print(f'Number of positive samples in train: {n_total_pos} ({n_total_pos/(n_total_pos+n_total_neg):.2%})')
    print(f'Number of negative samples in train: {n_total_neg} ({n_total_neg/(n_total_pos+n_total_neg):.2%})')
else:
    print('No samples were yielded by train_loader')


In [None]:
# Validation loader, built the same way as the training loader.
batch_size = 128

val_bucket_sampler = BucketBatchSampler(
    val_dataset.idx_to_len_map,
    batch_size,
    labels=val_dataset.targets,  # target labels are handed to the sampler
    # NOTE(review): a factor of 1 presumably means "no oversampling" — confirm
    # against the BucketBatchSampler implementation.
    oversampling_factor=1,
)

# Batches come from the batch sampler, which is also where shuffling is done,
# so the loader's own shuffle/drop_last options stay disabled.
val_loader = DataLoader(
    val_dataset,
    batch_sampler=val_bucket_sampler,
    shuffle=False,
    drop_last=False,
)

In [None]:


# Sum of the targets in the validation and training datasets (equals the
# number of positives if targets are 0/1 — TODO confirm encoding).
val_dataset.targets.sum(), train_dataset.targets.sum()

In [None]:
# Loop through val_loader and report the number of positive and negative
# samples in each batch, then over all batches.
n_total_pos = 0
n_total_neg = 0
for batch in val_loader:
    # the second element of each batch holds the labels
    labels = ch.as_tensor(batch[1])
    # Reduce on the tensor and convert to a plain Python number: the builtin
    # sum() would iterate element by element and leave tensor objects in the
    # printed counts.
    n_pos = labels.sum().item()
    n_neg = len(labels) - n_pos
    n_total_pos += n_pos
    n_total_neg += n_neg
    print(f'Number of positive samples in batch: {n_pos} ({n_pos/len(labels):.2%})')
    print(f'Number of negative samples in batch: {n_neg} ({n_neg/len(labels):.2%})')

# Guard the overall ratio: an empty loader would otherwise divide by zero.
if n_total_pos + n_total_neg > 0:
    print(f'Number of positive samples in val: {n_total_pos} ({n_total_pos/(n_total_pos+n_total_neg):.2%})')
    print(f'Number of negative samples in val: {n_total_neg} ({n_total_neg/(n_total_pos+n_total_neg):.2%})')
else:
    print('No samples were yielded by val_loader')

In [None]:
# Unpack the raw arrays of the first split for inspection.
X_train, X_val, y_train, y_val = splits[0]

In [None]:
# Shapes of the validation and training targets of the first split.
y_val.shape, y_train.shape

In [None]:
# Shapes of the training and validation features of the first split.
X_train.shape, X_val.shape