- imbalance: (True, False) => whether to make the data imbalanced intentionally
- split: (random, skf_prop, skf_major) => data split strategy
- split_rs: (1~5) => random_state of the split
- split_fnum: (0~4) => fold number of the split
- resampling_strategy: (x2, cluster, cnt) => resampling strategy
- aug_strategy: ('none', (0.0, 1.0]) => copy-paste probability



In [2]:
import random
import numpy as np
from PIL import Image
from torch.utils.data import Dataset

class VOC_ProbDataset(Dataset):
    """Dataset yielding ``(image, mask, id)`` triples with optional copy-paste.

    Rows are described by ``meta_df`` (columns ``id``, ``image_loc`` and
    ``mask_loc``). When ``vc_col`` is set, the index passed to
    ``__getitem__`` is ignored: a value of ``vc_col`` is drawn from ``vc_df``
    using its ``weight`` column, and one row of ``meta_df`` carrying that
    value is then picked uniformly at random.

    With probability ``copypaste_prop`` the sample goes through ``copyblob``.
    If ``copyblob`` reports success, ``augmentation`` is skipped and only
    ``preprocessing`` is applied; otherwise ``augmentation`` and then
    ``preprocessing`` run (each only when provided).

    Args:
        meta_df: table with one row per sample.
        augmentation: callable invoked as ``f(image=..., mask=...)`` that
            returns a mapping with ``'image'`` and ``'mask'`` keys.
        preprocessing: callable with the same calling convention.
        copypaste_prop: probability of attempting copy-paste; a falsy value
            disables the attempt altogether.
        upsample_list, label2num, pre_aug, post_aug: forwarded unchanged to
            ``copyblob``.
        vc_col: column of ``meta_df`` used for weighted sampling, or a falsy
            value to read rows by index.
        vc_df: table holding the ``vc_col`` values and a ``weight`` column.
    """

    def __init__(
            self,
            meta_df,
            augmentation=None,
            preprocessing=None,
            copypaste_prop = 0.5,
            upsample_list = None,
            pre_aug=None,
            post_aug=None,
            label2num=None,
            vc_col=None,
            vc_df=None,
    ):
        # Sample table and the three columns read on the index-based path.
        self.meta_df = meta_df
        self.ids = meta_df['id']
        self.images = meta_df['image_loc']
        self.masks = meta_df['mask_loc']

        # Transforms.
        self.augmentation = augmentation
        self.preprocessing = preprocessing

        # Copy-paste configuration, handed to copyblob as-is.
        self.copypaste_prop = copypaste_prop
        self.upsample_list = upsample_list
        self.pre_aug = pre_aug
        self.post_aug = post_aug
        self.label2num = label2num

        # Weighted-sampling configuration.
        self.vc_col = vc_col
        self.vc_df = vc_df

    def _draw_weighted_row(self):
        """Draw one row via ``vc_df`` weights; return (id, image path, mask path)."""
        chosen = random.choices(self.vc_df[self.vc_col], self.vc_df['weight'], k=1)[0]
        candidates = self.meta_df[self.meta_df[self.vc_col] == chosen].index
        cur_id = random.choice(list(candidates))
        row = self.meta_df[self.meta_df.index == cur_id]
        return (
            row['id'].values[0],
            row['image_loc'].values[0],
            row['mask_loc'].values[0],
        )

    def __getitem__(self, i):
        """Return ``(image, mask, id)``; ``i`` is ignored when ``vc_col`` is set."""
        if self.vc_col:
            dst_id, img_path, mask_path = self._draw_weighted_row()
        else:
            dst_id = self.ids.iloc[i]
            img_path = self.images.iloc[i]
            mask_path = self.masks.iloc[i]

        dst_img = np.array(Image.open(img_path))
        dst_mask = np.array(Image.open(mask_path))

        # The random draw only happens when copy-paste is enabled at all.
        pasted = False
        if self.copypaste_prop and random.random() < self.copypaste_prop:
            result, image, mask = copyblob(
                dst_img, dst_mask,
                self.meta_df, self.upsample_list,
                self.label2num, self.pre_aug, self.post_aug,
            )
            if result:
                dst_img, dst_mask = image, mask
                pasted = True

        # A successfully pasted sample bypasses the regular augmentation.
        if not pasted and self.augmentation:
            augmented = self.augmentation(image=dst_img, mask=dst_mask)
            dst_img, dst_mask = augmented['image'], augmented['mask']

        if self.preprocessing:
            processed = self.preprocessing(image=dst_img, mask=dst_mask)
            dst_img, dst_mask = processed['image'], processed['mask']

        return dst_img, dst_mask, dst_id

    def __len__(self):
        """Number of rows in ``meta_df``."""
        return len(self.ids)

    
def get_vc_df(cur_train_df, col):
    """Build a sampling-weight table for the values of ``col``.

    Each distinct value gets a weight proportional to ``1 / log(count)``, so
    rarer values are sampled more often; the weights are normalised to sum
    to 1.

    Args:
        cur_train_df: DataFrame containing the column ``col``.
        col: name of the column whose values are counted.

    Returns:
        DataFrame with columns ``[col, 'cnt', 'weight']``, one row per
        distinct value, in ``value_counts`` order.
    """
    cnt_vc = cur_train_df[col].value_counts().reset_index()
    cnt_vc.columns = [col, 'cnt']
    # log(1) == 0 would yield an infinite weight, and normalising by an
    # infinite sum turns the table into NaN/0 weights that random.choices
    # cannot use. Treat a value seen once like one seen twice; counts >= 2
    # keep exactly the same weight as before.
    inverse_log = 1 / np.log(cnt_vc['cnt'].clip(lower=2))
    cnt_vc['weight'] = inverse_log / inverse_log.sum()
    return cnt_vc

In [3]:
import random
import os 
import logging
import pickle

import numpy as np
import pandas as pd
import segmentation_models_pytorch as smp

import torch
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import ConcatDataset

# Move the working directory up two levels, presumably so that the `src.*`
# imports and the relative './data/...' paths used below resolve.
# NOTE: the path is relative, so re-running this cell moves up two more levels.
os.chdir('../../')

from src.utils.augmentation import copyblob
from src.utils.util import get_labedict
from src.utils.dataset import VOCDataset, VOC_CopyPasteDataset
from src.utils.train import TrainEpoch, ValidEpoch
from src.utils.metrics import Multiclass_IoU_Dice
from src.utils.augmentation import get_training_augmentation, \
                                get_validation_augmentation, \
                                get_preaug, get_postaug, \
                                get_preprocessing

In [4]:

# Label <-> class-number lookup tables.
labeldict = get_labedict()
label2num, num2label = labeldict['label2num'], labeldict['num2label']

# Training metadata; fold-assignment columns are collected by name prefix.
train_df = pd.read_csv('./data/train_df.csv', index_col=0)
kf_cols = [name for name in train_df.columns if name.startswith('kf')]
skf_cols = [name for name in train_df.columns if name.startswith('skf')]

cnt_df = pd.read_csv('./data/imbalance_df.csv', index_col=0)

# Labels whose 'after' count is below 60 are repeated int(100 / count) times,
# so a label with a smaller count appears more often in upsample_list.
rare_after = cnt_df.loc[cnt_df['after'] < 60, 'after']
upsample_dict = dict(rare_after.apply(lambda after_cnt: int(100 / after_cnt)))
upsample_list = [
    label
    for label, num in upsample_dict.items()
    for _ in range(num)
]


In [5]:
# IMBALANCE = True
# SPLIT = 'skf_major' # 
# SPLIT_RS = 1
# SPLIT_FNUM = 0
# RESAMPLING_STRATEGY = 'cnt'
# AUG_STRATEGY = 1.0
# copy_paste_prob = None if AUG_STRATEGY == 'none' else AUG_STRATEGY
# copy_paste_prob


In [6]:
def check_sampling(train_df, split, resample_rs):
    """Compare the class distribution produced by two sampling modes.

    Builds two ``VOC_ProbDataset`` instances over the training part of the
    chosen split: ``dataset_a`` reads rows by index, ``dataset_b`` uses the
    weighted sampling selected by ``resample_rs``. Every item of both
    datasets and of their concatenation is fetched once, and the value of the
    grouping column of the returned sample id is counted.

    Args:
        train_df: metadata table; must contain ``use_image``, ``id``, the
            split column ``f'{split}_rstate1_fold0'`` and the grouping column
            used by the chosen strategy.
        split: prefix of the split column (e.g. ``'random'``, ``'skf_prop'``).
        resample_rs: resampling strategy, one of ``'x2'`` (no weighted
            sampling), ``'cluster'`` (weights per ``cluster_num_16``) or
            ``'cnt'`` (weights per ``major_label``).

    Returns:
        DataFrame indexed by the grouping value (``major_label`` for
        ``'x2'``) with the columns ``vc_a``, ``vc_b`` and ``vc_concat``.

    Raises:
        ValueError: if ``resample_rs`` is not a known strategy.
    """
    # Fail fast on an unknown strategy instead of hitting an unbound
    # vc_col / vc_df after the datasets have already been half built.
    strategy_to_col = {
        'x2': None,
        'cluster': 'cluster_num_16',
        'cnt': 'major_label',
    }
    if resample_rs not in strategy_to_col:
        raise ValueError(
            f'unknown resampling strategy {resample_rs!r}; '
            f'expected one of {sorted(strategy_to_col)}'
        )
    vc_col = strategy_to_col[resample_rs]

    from tqdm.notebook import tqdm

    IMBALANCE = True
    SPLIT = split
    SPLIT_RS = 1
    SPLIT_FNUM = 0
    AUG_STRATEGY = 1.0
    # 'none' switches copy-paste off; any other value is used as probability.
    copy_paste_prob = None if AUG_STRATEGY == 'none' else AUG_STRATEGY

    if IMBALANCE:
        print(f'before imbalance {len(train_df)}')
        train_df = train_df[train_df['use_image']==True]
        print(f'after imbalance {len(train_df)}')

    cur_split = f'{SPLIT}_rstate{SPLIT_RS}_fold{SPLIT_FNUM}'
    cur_train_df = train_df[train_df[cur_split] == 'train']

    # Only needed by the preprocessing hook that is commented out below.
    ENCODER = 'resnet101'
    ENCODER_WEIGHTS = 'imagenet'
    preprocessing_fn = smp.encoders.get_preprocessing_fn(ENCODER, ENCODER_WEIGHTS)

    vc_df = get_vc_df(cur_train_df, vc_col) if vc_col is not None else None

    def build_dataset(**sampling_kwargs):
        # Fresh augmentation objects are created for every dataset.
        # NOTE(review): pre_aug receives the get_preaug function itself while
        # post_aug receives the result of get_postaug() -- confirm that this
        # is what copyblob expects.
        return VOC_ProbDataset(cur_train_df,
                               augmentation=get_training_augmentation(),
                               #preprocessing=get_preprocessing(preprocessing_fn),
                               copypaste_prop=copy_paste_prob,
                               upsample_list=upsample_list,
                               pre_aug=get_preaug,
                               post_aug=get_postaug(),
                               label2num=label2num,
                               **sampling_kwargs)

    dataset_a = build_dataset()
    dataset_b = build_dataset(vc_col=vc_col, vc_df=vc_df)
    train_dataset = ConcatDataset([dataset_a, dataset_b])

    def check_dataset(cur_dataset, cur_train_df, vc_col):
        """Fetch every item once and count the grouping value of its id."""
        if vc_col is None:
            vc_col = 'major_label'
        rows = []
        # Each cur_dataset[i] runs the full __getitem__; only the returned
        # id (third element) is used here.
        for i in tqdm(range(len(cur_dataset))):
            rows.extend(cur_train_df[cur_train_df.id == cur_dataset[i][2]].values)
        new_df = pd.DataFrame(rows)
        new_df.columns = cur_train_df.columns
        return pd.DataFrame(new_df[vc_col].value_counts())

    vc_a = check_dataset(dataset_a, cur_train_df, vc_col)
    vc_b = check_dataset(dataset_b, cur_train_df, vc_col)
    vc_concat = check_dataset(train_dataset, cur_train_df, vc_col)

    vc_a.columns = ['vc_a']
    vc_b.columns = ['vc_b']
    vc_concat.columns = ['vc_concat']

    return pd.concat(
        [vc_a.sort_index(), vc_b.sort_index(), vc_concat.sort_index()],
        axis=1,
    )



In [7]:
# 'random' split with the 'x2' strategy: no weight table is built, so
# dataset_b reads rows by index just like dataset_a.
random_x2 = check_sampling(train_df,'random', 'x2')
random_x2

before imbalance 1464
after imbalance 993


  0%|          | 0/801 [00:00<?, ?it/s]

  0%|          | 0/801 [00:00<?, ?it/s]

  0%|          | 0/1602 [00:00<?, ?it/s]

Unnamed: 0,vc_a,vc_b,vc_concat
aeroplane,13,13,26
bicycle,5,5,10
bird,17,17,34
boat,57,57,114
bottle,6,6,12
bus,56,56,112
car,63,63,126
cat,18,18,36
chair,9,9,18
cow,47,47,94


In [8]:
# 'skf_prop' split with the 'cluster' strategy: dataset_b samples rows via
# weights computed per cluster_num_16 value.
# NOTE: the variable name is spelled 'clsuter' (sic).
skf_prop_clsuter = check_sampling(train_df,'skf_prop', 'cluster')
skf_prop_clsuter

before imbalance 1464
after imbalance 993


  0%|          | 0/782 [00:00<?, ?it/s]

  0%|          | 0/782 [00:00<?, ?it/s]

  0%|          | 0/1564 [00:00<?, ?it/s]

Unnamed: 0,vc_a,vc_b,vc_concat
0,390,20,413
1,62,39,90
2,13,56,55
3,13,64,64
4,41,37,89
5,32,47,71
6,44,38,85
7,19,59,69
8,22,40,67
9,39,33,72


In [9]:

# 'skf_major' split with the 'cnt' strategy: dataset_b samples rows via
# weights computed per major_label value.
skf_major_cnt= check_sampling(train_df,'skf_major', 'cnt')
skf_major_cnt

before imbalance 1464
after imbalance 993


  0%|          | 0/797 [00:00<?, ?it/s]

  0%|          | 0/797 [00:00<?, ?it/s]

  0%|          | 0/1594 [00:00<?, ?it/s]

Unnamed: 0,vc_a,vc_b,vc_concat
aeroplane,14,41,65
bicycle,4,86,88
bird,14,46,58
boat,55,37,83
bottle,4,71,86
bus,57,32,83
car,64,19,89
cat,18,42,46
chair,8,47,71
cow,46,34,77


# old resampling strategy

In [None]:


def get_resampled_kmeans_df(cur_train_df):
    """Oversample clusters so the result has roughly twice as many rows.

    The most frequent ``cluster_num_16`` value is kept once. Every other
    cluster is repeated ``int(max_cnt / cnt) + 1`` times, where ``max_cnt`` is
    the largest count among those other clusters, and the repeat counts are
    then nudged by ``adjust_cluster_sample`` towards ``2 * len(cur_train_df)``
    rows in total.

    Args:
        cur_train_df: DataFrame with a ``cluster_num_16`` column.

    Returns:
        DataFrame made of whole-cluster copies of ``cur_train_df`` rows.
    """
    cluster_vc = cur_train_df.cluster_num_16.value_counts().reset_index()
    cluster_vc.columns = ['cluster_num', 'cnt']
    max_cluster_num = cluster_vc.iloc[0]['cluster_num']

    target_num = len(cur_train_df) * 2
    remain_num = target_num - cluster_vc.cnt.max()

    # Exclude only the single reference cluster. Filtering on the count
    # instead would also drop clusters tied for the maximum, leaving them
    # with a NaN sample_num after the merge below.
    new_vc = cluster_vc[cluster_vc.cluster_num != max_cluster_num].copy()
    new_vc['sample_num'] = (new_vc.cnt.max() / new_vc['cnt']).astype(int) + 1

    new_vc = adjust_cluster_sample(new_vc, remain_num)

    # Left merge on the shared columns; the reference cluster has no match
    # and is given a repeat count of 1 explicitly.
    cluster_vc = pd.merge(cluster_vc, new_vc, how='left')
    cluster_vc.loc[cluster_vc.cluster_num == max_cluster_num, 'sample_num'] = 1.0

    temp_dfs = []
    for c_num, sample_cnt in zip(cluster_vc['cluster_num'], cluster_vc['sample_num']):
        temp_df = cur_train_df[cur_train_df.cluster_num_16 == c_num]
        temp_dfs.extend([temp_df] * int(sample_cnt))

    return pd.concat(temp_dfs)


def adjust_cluster_sample(new_vc, remain_num, batch_size = 8, max_iter = 1000):
    """Nudge per-cluster repeat counts until the total is close to a target.

    While ``sum(sample_num * cnt)`` differs from ``remain_num`` by
    ``batch_size`` or more, one of the last four rows of ``new_vc`` is chosen
    at random and its ``sample_num`` is raised or lowered by one.

    Args:
        new_vc: DataFrame with ``cluster_num``, ``cnt`` and ``sample_num``
            columns; modified in place.
        remain_num: target value for ``sum(sample_num * cnt)``.
        batch_size: tolerance; the loop stops once the absolute difference
            is below it.
        max_iter: upper bound on the number of adjustments, so that counts
            which can only step over the tolerance window cannot loop forever.

    Returns:
        The same ``new_vc`` object.
    """
    import warnings

    # Nothing to adjust (e.g. the data holds a single cluster).
    if new_vc.empty:
        return new_vc

    # Row order never changes inside the loop, so the candidates are fixed.
    tail_clusters = list(new_vc.iloc[-4:]['cluster_num'])

    def remaining():
        return remain_num - np.sum(new_vc.sample_num * new_vc.cnt)

    for _ in range(max_iter):
        diff = remaining()
        if abs(diff) < batch_size:
            return new_vc
        step = -1 if diff < 0 else 1
        c_num = random.choice(tail_clusters)
        new_vc.loc[new_vc.cluster_num == c_num, 'sample_num'] += step

    if abs(remaining()) >= batch_size:
        warnings.warn(
            f'adjust_cluster_sample stopped after {max_iter} iterations '
            f'without getting within {batch_size} of the target'
        )
    return new_vc


# For every skf_prop split (5 random states x 5 folds), oversample the
# training rows per cluster and print how far the result is from the target
# of twice the original size. The steps mirror get_resampled_kmeans_df.
for rs in range(5):
    for fn in range(5):
        SPLIT_RS, SPLIT_FNUM = rs, fn
        # Split columns are numbered from rstate1, hence the +1.
        cur_split = f'skf_prop_rstate{SPLIT_RS+1}_fold{SPLIT_FNUM}'
        cur_train_df = train_df[train_df[cur_split] == 'train']

        cluster_vc = cur_train_df.cluster_num_16.value_counts().reset_index()
        cluster_vc.columns = ['cluster_num','cnt']
        max_cluster_num = cluster_vc.iloc[0]['cluster_num']

        target_num = len(cur_train_df) * 2
        remain_num = target_num - cluster_vc.cnt.max()

        # Exclude only the single reference cluster; filtering on the count
        # would also drop clusters tied for the maximum and leave them with
        # a NaN sample_num after the merge.
        new_vc = cluster_vc[cluster_vc.cluster_num != max_cluster_num].copy()
        new_vc['sample_num'] = (new_vc.cnt.max() / new_vc['cnt']).astype(int) + 1

        # adjust_cluster_sample is the helper defined in this file; the
        # previously called name adjust_sample is not defined here.
        new_vc = adjust_cluster_sample(new_vc, remain_num)

        cluster_vc = pd.merge(cluster_vc, new_vc, how='left')
        cluster_vc.loc[cluster_vc.cluster_num == max_cluster_num,'sample_num'] = 1.0

        temp_dfs = []
        for c_num, sample_cnt in zip(cluster_vc['cluster_num'], cluster_vc['sample_num']):
            temp_df = cur_train_df[cur_train_df.cluster_num_16 == c_num]
            temp_dfs.extend([temp_df] * int(sample_cnt))

        new_train_df = pd.concat(temp_dfs)
        print(target_num - len(new_train_df))
