In [None]:
import os 
import numpy as np 
import pandas as pd 
from config import config
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Make sure the output directory for the per-fold CSVs exists. exist_ok=True
# replaces the separate exists()/mkdir() pair, so there is no window in which
# another process can create the directory between the check and the create.
os.makedirs("./Train_Val_MultiLabel_KFold", exist_ok=True)

def getMultiCsv(files_info):
    """
    Read several label CSVs (official and external) into one DataFrame.

    Parameters
    ----------
    files_info : iterable of dict
        Each dict must provide the keys ``"file_name"`` (path of a CSV that
        has an ``"Id"`` column), ``"prefix"`` and ``"suffix"`` (strings that
        are prepended / appended to every ``Id`` of that CSV).

    Returns
    -------
    pandas.DataFrame
        All CSVs concatenated in the given order with a fresh 0..n-1 index;
        the ``"Id"`` column holds ``prefix + Id + suffix``.

    Raises
    ------
    ValueError
        If ``files_info`` is empty (``pd.concat`` has nothing to concatenate).
    """
    res = []
    for file_info in files_info:
        files = pd.read_csv(file_info["file_name"])
        # Rewrite the whole column at once. The former per-row chained
        # assignment (files["Id"][i] = ...) is slow and does not modify the
        # frame at all when pandas Copy-on-Write is active.
        files["Id"] = file_info["prefix"] + files["Id"] + file_info["suffix"]
        res.append(files)
    res = pd.concat(res, ignore_index=True)
    return res

def getAugCsv(file_list, augN):
    """
    Replicate a file list once per augmentation type.

    For every ``augType`` in ``range(augN)`` a copy of ``file_list`` is made
    and ``str(augType)`` is appended to each ``Id`` (file_id -> file_id0,
    file_id1, ...). The copies are stacked, so the result has ``augN`` times
    as many rows as the input. The input frame is not modified.

    Parameters
    ----------
    file_list : pandas.DataFrame
        Frame with a string ``"Id"`` column.
    augN : int
        Number of augmentation variants to generate.

    Returns
    -------
    pandas.DataFrame
        ``augN`` stacked copies of ``file_list`` with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``augN`` is 0 (``pd.concat`` has nothing to concatenate).
    """
    res = []
    for augType in range(augN):
        files = file_list.copy()
        tag = str(augType)
        ids = files["Id"]
        # An Id whose last character already equals the tag is left as is.
        # NOTE(review): this guard is kept from the original implementation;
        # it means an Id that naturally ends in that digit is NOT tagged for
        # this augType -- confirm that is intended.
        needs_tag = ids.str[-1] != tag
        # Single vectorized column write instead of per-row chained
        # assignment (files["Id"][index] = ...), which is slow and does not
        # modify the frame when pandas Copy-on-Write is active.
        files["Id"] = ids.mask(needs_tag, ids + tag)
        res.append(files)
    res = pd.concat(res, ignore_index=True)
    return res

def toNumpyY(target_list, num_classes=28):
    """
    Convert label strings to multi-hot rows for MultilabelStratifiedKFold.

    Example (num_classes=28):
    "1 3" -> [0, 1, 0, 1, 0, 0, 0, ...., 0, 0]
    "0 4" -> [1, 0, 0, 0, 1, 0, 0, ...., 0, 0]

    Parameters
    ----------
    target_list : iterable of str
        Each item is a whitespace-separated list of integer class indices.
    num_classes : int, optional
        Length of every output row. Defaults to 28, the value that was
        previously hard-coded.

    Returns
    -------
    list of list of int
        One 0/1 row per input item (plain Python lists, not a numpy array).

    Raises
    ------
    ValueError
        If a token is not an integer.
    IndexError
        If a class index is >= ``num_classes``.
    """
    res = []
    for line in target_list:
        temp = [0] * num_classes
        # split() without an argument tolerates repeated / leading / trailing
        # whitespace and yields no tokens for an empty string, where
        # split(" ") produced '' tokens that crashed int().
        for target in line.split():
            temp[int(target)] = 1
        res.append(temp)
    return res

# Label CSVs to merge: the official training set and the external HPAv18 set.
# "prefix" / "suffix" are prepended / appended to every Id by getMultiCsv.
train_file_info = [
    {"file_name": "../input/train.csv", "prefix": "train/", "suffix": ".png"},
    {"file_name": "../input/HPAv18/HPAv18RBGY_wodpl.csv", "prefix": "HPAv18/png_gray/", "suffix": ".png"},
]
all_files = getMultiCsv(train_file_info)
y = toNumpyY(all_files["Target"])

# Number of augmentation variants written per sample (see getAugCsv).
aug_n = 8

# shuffle=True is needed for random_state to be accepted: scikit-learn >= 0.24
# raises ValueError when a random_state is given while shuffle is False.
# NOTE(review): fold membership may differ from CSVs produced with the old
# non-shuffled call on an older scikit-learn -- regenerate all folds together.
mskf = MultilabelStratifiedKFold(n_splits=config.kfoldN, shuffle=True, random_state=0)

# X is only used for its length / indexing; the multi-hot y drives the
# stratification.
for iterKF, (train_index, val_index) in enumerate(mskf.split(all_files, y)):
    train_data_list = all_files.iloc[train_index]
    val_data_list = all_files.iloc[val_index]

    print("iterKF{0} train_aug".format(iterKF))
    train_data_list = getAugCsv(train_data_list, aug_n)
    # index=False is the documented way to omit the row index from the CSV.
    train_data_list.to_csv('./Train_Val_MultiLabel_KFold/Train_KFold_%s_%s.csv'%(config.kfoldN, iterKF), index=False)

    print("iterKF{0} val_aug".format(iterKF))
    val_data_list = getAugCsv(val_data_list, aug_n)
    val_data_list.to_csv('./Train_Val_MultiLabel_KFold/Val_KFold_%s_%s.csv'%(config.kfoldN, iterKF), index=False)
    print("iterKF{0} aug_end".format(iterKF))

iterKF0 train_aug
iterKF0 val_aug
iterKF0 aug_end
iterKF1 train_aug
iterKF1 val_aug
iterKF1 aug_end
iterKF2 train_aug
iterKF2 val_aug
iterKF2 aug_end
iterKF3 train_aug
