In [1]:
import pandas as pd
import numpy as np
from glob import glob

In [2]:
# Label columns of the study-level CSV, used as the stratification targets
# when the cross-validation folds are built.
target_columns = [
    "Negative for Pneumonia",
    "Typical Appearance",
    "Indeterminate Appearance",
    "Atypical Appearance",
]

In [3]:
# Load the study-level label table. Later cells read its `id` column and the
# four label columns, and add a `cv` fold column to it.
df = pd.read_csv('/workspace/data/train_study_level.csv')

In [4]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Split the studies into 5 folds, stratifying on the label columns.
# shuffle + a fixed random_state keeps the assignment repeatable.
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=516)

# -1 is a "not assigned yet" placeholder; the loop below overwrites it.
df["cv"] = -1
cv_col = df.columns.get_loc("cv")

for cv, (train_index, test_index) in enumerate(mskf.split(df, df.loc[:, target_columns])):
    print("TRAIN:", train_index, "TEST:", test_index)
    # split() yields positional row numbers, so they must be written with
    # .iloc. Using .loc would treat them as index labels, which is only
    # correct while `df` still has its default 0..n-1 index.
    df.iloc[test_index, cv_col] = cv

# Every row belongs to the test part of exactly one fold, so no placeholder
# value may survive the loop.
assert (df["cv"] >= 0).all(), "some rows were not assigned to a fold"

TRAIN: [   0    1    2 ... 6051 6052 6053] TEST: [  15   17   20 ... 6043 6047 6050]
TRAIN: [   0    1    2 ... 6051 6052 6053] TEST: [   6   13   14 ... 6037 6044 6049]
TRAIN: [   0    1    3 ... 6049 6050 6052] TEST: [   2    7    9 ... 6045 6051 6053]
TRAIN: [   2    3    4 ... 6051 6052 6053] TEST: [   0    1    8 ... 6030 6039 6042]
TRAIN: [   0    1    2 ... 6050 6051 6053] TEST: [   3    4    5 ... 6046 6048 6052]




In [5]:
# Preview the first rows to check that the `cv` fold column was filled in.
df.head()

Unnamed: 0,id,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,cv
0,00086460a852_study,0,1,0,0,3
1,000c9c05fd14_study,0,0,0,1,3
2,00292f8c37bd_study,1,0,0,0,2
3,005057b3f880_study,1,0,0,0,4
4,0051d9b12e72_study,0,0,0,1,4


In [7]:
# List the column names of the label table, including the added `cv` column.
df.columns

Index(['id', 'Negative for Pneumonia', 'Typical Appearance',
       'Indeterminate Appearance', 'Atypical Appearance', 'cv'],
      dtype='object')

In [8]:
# Collect every DICOM file under the train directory. The first wildcard is
# the directory level that later cells use as the study id; the second is the
# sub-directory below it; the third is the file name.
# NOTE(review): glob does not promise a sorted result, so the order of this
# list (and of the rows derived from it) may differ between machines.
image_path_list = glob('/workspace/data/train/*/*/*.dcm')

In [9]:
# Look at one path to see the directory layout that the id parsing relies on.
image_path_list[0]

'/workspace/data/train/eeecfd50b220/0fd96597c559/13131b0c3db4.dcm'

In [10]:
# Spot check of the id parsing: component 4 of the "/"-split absolute path is
# the study directory, and appending "_study" gives the format of the `id`
# column. Index 1000 is an arbitrary sample and needs at least 1001 files.
image_path_list[1000].split("/")[4]+"_study"

'386aa41785fa_study'

In [11]:
# Per-image results, all aligned by position: the study id, the four label
# values of that study, and the DICOM path.
id_list = []
neg = []
typ = []
indete = []
atyp = []
dicom_path_list = []

# Paths of images whose study id has no row in `df`.
error = []

label_columns = [
    'Negative for Pneumonia',
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
]

# Build the id -> labels lookup once instead of filtering `df` for every
# image, which made the loop quadratic in the number of rows.
labels_by_id = {
    study_id: labels
    for study_id, *labels in df[["id", *label_columns]].itertuples(index=False, name=None)
}

for image_path in image_path_list:
    # Layout is .../<study>/<sub-directory>/<file>.dcm, so the study directory
    # is the third component from the end. Counting from the end keeps this
    # correct if the data root moves to a different depth.
    study = image_path.split("/")[-3]
    study_id = f"{study}_study"

    labels = labels_by_id.get(study_id)
    if labels is None:
        # A boolean filter on `df` never raises for a missing id, so the old
        # bare `except` could not catch this case; record it explicitly.
        error.append(image_path)
        continue

    neg_value, typ_value, indete_value, atyp_value = labels

    id_list.append(study_id)
    neg.append(neg_value)
    typ.append(typ_value)
    indete.append(indete_value)
    atyp.append(atyp_value)
    dicom_path_list.append(image_path)

In [12]:
# One row per DICOM file: pair each path with its study id, then pull in that
# study's labels and fold from `df` with a left join on `id`. Ids missing
# from `df` would keep their row with NaN in the joined columns.
df_train = pd.merge(
    pd.DataFrame(dict(id=id_list, dicom_path=dicom_path_list)),
    df,
    how='left',
    on='id',
)

In [13]:
# Show the image-level table (pandas truncates long frames in the display).
df_train

Unnamed: 0,id,dicom_path,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,cv
0,eeecfd50b220_study,/workspace/data/train/eeecfd50b220/0fd96597c55...,0,1,0,0,3
1,a89c1f3470e1_study,/workspace/data/train/a89c1f3470e1/c23cc53bdbf...,0,0,1,0,4
2,9cc2f64f0c58_study,/workspace/data/train/9cc2f64f0c58/2eebbe88278...,0,1,0,0,2
3,7b6c49da06db_study,/workspace/data/train/7b6c49da06db/c1fd5829f05...,0,1,0,0,2
4,d14080fd6f2a_study,/workspace/data/train/d14080fd6f2a/9d6098d515e...,0,1,0,0,0
...,...,...,...,...,...,...,...
6329,6bb38a2b98f0_study,/workspace/data/train/6bb38a2b98f0/70960f315ac...,0,1,0,0,4
6330,8501ffeadc53_study,/workspace/data/train/8501ffeadc53/0c640403081...,0,1,0,0,0
6331,c1ba4d912111_study,/workspace/data/train/c1ba4d912111/5d1e7b4f209...,0,1,0,0,4
6332,7e4059d6a0f9_study,/workspace/data/train/7e4059d6a0f9/ac80ee2496b...,1,0,0,0,2


In [14]:
# Number of images per fold. Folds were assigned per study, so image counts
# need not be exactly equal.
# NOTE(review): presumably some studies contain more than one image - confirm.
df_train.cv.value_counts()

2    1273
3    1271
4    1264
1    1264
0    1262
Name: cv, dtype: int64

In [16]:
# Save the image-level table with labels and folds; index=False leaves the
# row index out of the CSV.
df_train.to_csv('/workspace/data/df_train_study_level.csv', index=False)