In [1]:
import json
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
coco_path = "/home/minyong-voyagerx/Mignon/Min/bb-detection/data/train_annotations.json"

with open(coco_path, 'r') as f:
    ann_json = json.load(f)

In [3]:
def make_train_valid_json(valid_image_list, fold):
    new_train_ann_json = defaultdict(list)
    new_train_ann_json["info"] = ann_json["info"]
    new_train_ann_json["categories"] = ann_json["categories"]

    new_valid_ann_json = defaultdict(list)
    new_valid_ann_json["info"] = ann_json["info"]

    for ann in tqdm(ann_json["images"]):
        if ann["id"] not in valid_image_list:
            new_train_ann_json['images'].append(ann)
        else:
            new_valid_ann_json['images'].append(ann)

    train_nid = 0
    valid_nid = 0
    for ann in tqdm(ann_json["annotations"]):
        if ann["image_id"] not in valid_image_list:
            ann["id"] = train_nid
            train_nid += 1
            new_train_ann_json['annotations'].append(ann)
        else:
            ann["id"] = valid_nid
            valid_nid += 1
            new_valid_ann_json['annotations'].append(ann)


    new_train_ann_json["categories"] = ann_json["categories"]
    new_valid_ann_json["categories"] = ann_json["categories"]
    

    with open(f"../data/train_fold{fold}.json", 'w') as f:
        json.dump(new_train_ann_json, f)
    
    with open(f"../data/valid_fold{fold}.json", 'w') as f:
        json.dump(new_valid_ann_json, f)

In [4]:
def get_train_valid_img_list(kfold, df):
    df_folds = df[['image_id']].copy()

    df_folds.loc[:, 'bbox_count'] = 1

    df_folds = df_folds.groupby('image_id').count()

    df_folds.loc[:, 'object_count'] = df.groupby('image_id')['class_id'].nunique()

    df_folds.loc[:, 'stratify_group'] = np.char.add(
        df_folds['object_count'].values.astype(str),
        df_folds['bbox_count'].apply(lambda x: f'_{x // 15}').values.astype(str)
    )

    skf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=777)
    df_folds.loc[:, 'fold'] = 0
    for fold_number, (train_index, val_index) in enumerate(skf.split(X=df_folds.index, y=df_folds['stratify_group'])):
        df_folds.loc[df_folds.iloc[val_index].index, 'fold'] = fold_number

    df_folds.reset_index(inplace=True)
    for fold in range(kfold):
        df_train = pd.merge(df, df_folds[df_folds['fold'] != fold], on='image_id')
        df_valid = pd.merge(df, df_folds[df_folds['fold'] == fold], on='image_id')

        print(df_train['class_id'].value_counts())
        print(df_valid['class_id'].value_counts())

        valid_image_list = list(df_valid.image_id.unique())

        make_train_valid_json(valid_image_list, fold)


In [5]:
df = pd.DataFrame({"image_id": [0] * 80218, "class_id": [0] * 80218})

for idx, (ann) in enumerate(ann_json["annotations"]):
    df.loc[idx, "image_id"] = ann["image_id"]
    df.loc[idx, "class_id"] = ann["category_id"]

df

Unnamed: 0,image_id,class_id
0,130874,1
1,117880,1
2,117880,4
3,113386,4
4,148678,4
...,...,...
80213,122410,1
80214,151392,3
80215,127363,1
80216,127363,4


In [6]:
get_train_valid_img_list(5, df)

1    31184
4    15517
3     9586
2     7855
Name: class_id, dtype: int64
1    7774
4    3981
3    2416
2    1905
Name: class_id, dtype: int64


100%|██████████| 62622/62622 [00:55<00:00, 1126.49it/s]
100%|██████████| 80218/80218 [01:13<00:00, 1084.89it/s]


1    31137
4    15649
3     9613
2     7796
Name: class_id, dtype: int64
1    7821
4    3849
3    2389
2    1964
Name: class_id, dtype: int64


100%|██████████| 62622/62622 [00:58<00:00, 1067.35it/s]
100%|██████████| 80218/80218 [01:13<00:00, 1089.37it/s]


1    31155
4    15630
3     9573
2     7817
Name: class_id, dtype: int64
1    7803
4    3868
3    2429
2    1943
Name: class_id, dtype: int64


100%|██████████| 62622/62622 [00:55<00:00, 1119.66it/s]
100%|██████████| 80218/80218 [01:12<00:00, 1112.54it/s]


1    31144
4    15595
3     9644
2     7798
Name: class_id, dtype: int64
1    7814
4    3903
3    2358
2    1962
Name: class_id, dtype: int64


100%|██████████| 62622/62622 [00:57<00:00, 1095.44it/s]
100%|██████████| 80218/80218 [01:11<00:00, 1114.28it/s]


1    31212
4    15601
3     9592
2     7774
Name: class_id, dtype: int64
1    7746
4    3897
3    2410
2    1986
Name: class_id, dtype: int64


100%|██████████| 62622/62622 [00:57<00:00, 1083.78it/s]
100%|██████████| 80218/80218 [01:11<00:00, 1122.55it/s]


## Check Train-Valid box distribution

In [None]:
coco_path = "/home/minyong-voyagerx/Mignon/Min/bb-detection/train_fold0.json"

with open(coco_path, 'r') as f:
    ann_json = json.load(f)

result = [0] * 4
img_box = defaultdict(int)

for ann in ann_json["annotations"]:
    result[ann["category_id"] - 1] += 1
    img_box[ann["image_id"]] += 1

print("Each class box distribution: ", result)
print("Total box number: ", sum(result))
print("Total image number: ", len(ann_json["images"]))



coco_path = "/home/minyong-voyagerx/Mignon/Min/bb-detection/valid_fold0.json"

with open(coco_path, 'r') as f:
    ann_json = json.load(f)

result = [0] * 4
img_box = defaultdict(int)

for ann in ann_json["annotations"]:
    result[ann["category_id"] - 1] += 1
    img_box[ann["image_id"]] += 1

print("Each class box distribution: ", result)
print("Total box number: ", sum(result))
print("Total image number: ", len(ann_json["images"]))