### Import libraries

In [1]:
import sys
sys.path.append("../")

In [2]:
import os
from os import environ
import numpy as np
from random import choices
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
import skimage.io as io

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.transforms as mtrans
%matplotlib inline

In [5]:
from preprocess.common import load_nii

In [6]:
from shutil import copyfile
from sklearn.model_selection import GroupKFold

### Generate all data

In [7]:
# Resolve the dataset root from the MMsCardiac_DATA_PATH environment variable.
# Fail right here when it is missing: every later cell builds its paths from
# MMs_DATA_PATH, so carrying on would only resurface as a NameError further down.
if environ.get('MMsCardiac_DATA_PATH') is None:
    raise RuntimeError(
        "Please set the environment variable MMsCardiac_DATA_PATH. Read the README!"
    )
MMs_DATA_PATH = environ['MMsCardiac_DATA_PATH']

In [8]:
# Load the table that lists every training sample; later cells read its
# "Vendor", "External code", "Slice", "Phase" and "Labeled" columns.
train_csv_path = "../utils/data/train.csv"
df = pd.read_csv(train_csv_path)

In [9]:
# Sub-folder of MMs_DATA_PATH under which the per-vendor train/val folders are written.
vendor_save_path = "byVendor"

In [10]:
# Make sure every vendor has both a train<Vendor> and a val<Vendor> output
# folder; exist_ok=True keeps the cell safe to re-run.
vendor_root = os.path.join(MMs_DATA_PATH, vendor_save_path)
for c_vendor in df["Vendor"].unique():
    for folder_template in ("train{}", "val{}"):
        os.makedirs(os.path.join(vendor_root, folder_template.format(c_vendor)), exist_ok=True)

In [11]:
# Split the samples into 7 folds grouped by "External code", so rows sharing a
# code never end up on both sides of the split. Fold 0 is used as validation.
skf = GroupKFold(n_splits=7)
target = df["Vendor"]

# Get current fold data
for fold_indx, (train_index, val_index) in enumerate(skf.split(np.zeros(len(target)), target, groups=df["External code"])):
    if fold_indx == 0:  # If current iteration is the desired fold, take it!
        # split() yields positional indices, so select with .iloc; .loc would
        # only be equivalent while df keeps its default RangeIndex.
        df_train = df.iloc[train_index]
        df_val = df.iloc[val_index]
        # Stop here so fold_indx / train_index / val_index keep describing the
        # fold that was actually selected instead of the last one generated.
        break

In [12]:
# Number of training rows per vendor.
df_train["Vendor"].value_counts()

A    1560
B    1312
C     436
Name: Vendor, dtype: int64

In [13]:
# Number of validation rows per vendor.
df_val["Vendor"].value_counts()

B    234
A    178
C    138
Name: Vendor, dtype: int64

In [14]:
# External codes present in both splits; an empty array means train and
# validation share no code.
np.intersect1d(df_train["External code"].unique(), df_val["External code"].unique())

array([], dtype=object)

---------------------------------------------------------------------------------------------------

In [15]:
# Per-vendor running counter, started at zero; it supplies the numeric file
# names used when the training samples are copied.
vendor_count = dict.fromkeys(df["Vendor"].unique(), 0)

In [16]:
# Copy every training sample to
#   <MMs_DATA_PATH>/<vendor_save_path>/train<Vendor>/<n>.npy
# where <n> is a per-vendor counter starting at 0.
#
# The counter is reset inside this cell so that re-running the cell on its own
# restarts the numbering instead of continuing from the previous run.
vendor_count = {i: 0 for i in df["Vendor"].unique()}

for index, row in df_train.iterrows():

    external_code = row["External code"]
    current_vendor, current_slice, current_phase = row["Vendor"], row["Slice"], row["Phase"]

    filename = "{}.npy".format(vendor_count[current_vendor])

    # Labeled and unlabeled samples are read from different source folders.
    if row['Labeled']:
        data_parent = os.path.join("Training-corrected", "Labeled_npy")
    else:
        data_parent = os.path.join("Training-corrected", "Unlabeled_npy")

    img_path = os.path.join(
        MMs_DATA_PATH, data_parent,
        external_code, "{}_slice{}_phase{}.npy".format(external_code, current_slice, current_phase)
    )

    # Raise explicitly rather than `assert False`: assertions are removed when
    # Python runs with -O, which would silently disable this check.
    if not os.path.exists(img_path):
        raise FileNotFoundError("Error! Path not exists for '{}'".format(img_path))

    dst = os.path.join(
        MMs_DATA_PATH, vendor_save_path, "train{}".format(current_vendor), filename
    )
    copyfile(img_path, dst)

    vendor_count[current_vendor] += 1

In [17]:
# Restart the per-vendor counter at zero so validation files are numbered
# from 0 independently of the training files.
vendor_count = dict.fromkeys(df["Vendor"].unique(), 0)

In [18]:
# Copy every validation sample to
#   <MMs_DATA_PATH>/<vendor_save_path>/val<Vendor>/<n>.npy
# where <n> is a per-vendor counter starting at 0.
#
# The counter is reset inside this cell so that re-running the cell on its own
# restarts the numbering instead of continuing from the previous run.
vendor_count = {i: 0 for i in df["Vendor"].unique()}

for index, row in df_val.iterrows():

    external_code = row["External code"]
    current_vendor, current_slice, current_phase = row["Vendor"], row["Slice"], row["Phase"]

    filename = "{}.npy".format(vendor_count[current_vendor])

    # Labeled and unlabeled samples are read from different source folders.
    if row['Labeled']:
        data_parent = os.path.join("Training-corrected", "Labeled_npy")
    else:
        data_parent = os.path.join("Training-corrected", "Unlabeled_npy")

    img_path = os.path.join(
        MMs_DATA_PATH, data_parent,
        external_code, "{}_slice{}_phase{}.npy".format(external_code, current_slice, current_phase)
    )

    # Raise explicitly rather than `assert False`: assertions are removed when
    # Python runs with -O, which would silently disable this check.
    if not os.path.exists(img_path):
        raise FileNotFoundError("Error! Path not exists for '{}'".format(img_path))

    dst = os.path.join(
        MMs_DATA_PATH, vendor_save_path, "val{}".format(current_vendor), filename
    )
    copyfile(img_path, dst)

    vendor_count[current_vendor] += 1