# PREPARE CSV FILE

In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_excel('../data/NOH/NOH Data.xlsx')
df['Bethesda'] = df['FNA biopsy '].apply(lambda x: re.findall(r'Bethesda (\w+)',x)[0])
df

Unnamed: 0,Patient #,FNA biopsy,Surgery diagnosis,Bethesda
0,1,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V
1,2,Benign (Bethesda II),Benign,II
2,3,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V
3,4,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V
4,5,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V
...,...,...,...,...
127,128,Atypia of undetermine (Bethesda III),Benign thyroid follicular neoplasm,III
128,129,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V
129,130,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V
130,131,Thyroid papillary carcinoma (Bethesda VI),Thyroid papillary carcinoma,VI


In [3]:
df['Bethesda'].value_counts()

Bethesda
V      52
VI     48
II     12
III     8
IV      6
I       6
Name: count, dtype: int64

In [4]:
latin = {
    'I': 1,
    'II': 2,
    'III': 3,
    'IV': 4,
    'V': 5,
    'VI': 6
}

In [5]:
df['Bethesda_num'] = df['Bethesda'].apply(lambda x:latin[x])
df['label'] = df['Bethesda_num'] - 1
df

Unnamed: 0,Patient #,FNA biopsy,Surgery diagnosis,Bethesda,Bethesda_num,label
0,1,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V,5,4
1,2,Benign (Bethesda II),Benign,II,2,1
2,3,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V,5,4
3,4,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V,5,4
4,5,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V,5,4
...,...,...,...,...,...,...
127,128,Atypia of undetermine (Bethesda III),Benign thyroid follicular neoplasm,III,3,2
128,129,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V,5,4
129,130,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V,5,4
130,131,Thyroid papillary carcinoma (Bethesda VI),Thyroid papillary carcinoma,VI,6,5


In [6]:
df.to_csv('../metadata/NOH_groundtruth.csv')

Prepare Train/Val/Test dataset

In [7]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df['label'], test_size=0.3)

In [8]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
skf = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
skf.get_n_splits(df['label'],df['label'])
for i, (train_index, test_index) in enumerate(skf.split(df['label'],df['label'])):
    df.loc[train_index,f'fold_{i}'] = 'train'
    df.loc[test_index,f'fold_{i}'] = 'test'

In [9]:
df

Unnamed: 0,Patient #,FNA biopsy,Surgery diagnosis,Bethesda,Bethesda_num,label,fold_0,fold_1,fold_2,fold_3,fold_4
0,1,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V,5,4,test,train,train,train,test
1,2,Benign (Bethesda II),Benign,II,2,1,train,train,test,train,train
2,3,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V,5,4,test,train,train,test,train
3,4,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V,5,4,train,test,train,train,train
4,5,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V,5,4,train,train,train,train,test
...,...,...,...,...,...,...,...,...,...,...,...
127,128,Atypia of undetermine (Bethesda III),Benign thyroid follicular neoplasm,III,3,2,test,test,train,test,train
128,129,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V,5,4,train,test,train,train,train
129,130,Thyroid papillary carcinoma (Bethesda V),Thyroid papillary carcinoma,V,5,4,train,train,train,test,test
130,131,Thyroid papillary carcinoma (Bethesda VI),Thyroid papillary carcinoma,VI,6,5,train,test,train,test,train


In [10]:
df.to_csv('../metadata/NOH_patient_dataset.csv')

Read image files

In [11]:
import os
import pathlib

In [12]:
p = pathlib.Path('../data/NOH')
all_folders = list(p.iterdir())
all_folders = [i for i in all_folders if i.is_dir()]
all_folders[:3]

[PosixPath('../data/NOH/042. NTH.R'),
 PosixPath('../data/NOH/098. DTG'),
 PosixPath('../data/NOH/096. BTHH.R')]

In [13]:
rows = []
for f in all_folders:
    patient_id = int(re.findall(r'(\d+).',f.name)[0])
    all_imgs = list(f.iterdir())
    all_imgs = [i for i in all_imgs if i.name.startswith('IMG') and i.name.endswith('.jpg')]
    for img in all_imgs:
        rows.append((patient_id, str(img)))

In [14]:
image_dataset = pd.DataFrame(rows, columns=['patient_id', 'image_path'])
image_dataset

Unnamed: 0,patient_id,image_path
0,42,../data/NOH/042. NTH.R/IMG_20230119_092341.jpg
1,42,../data/NOH/042. NTH.R/IMG_20230119_092349.jpg
2,42,../data/NOH/042. NTH.R/IMG_20230119_092307.jpg
3,42,../data/NOH/042. NTH.R/IMG_20230119_091227.jpg
4,42,../data/NOH/042. NTH.R/IMG_20230119_092238.jpg
...,...,...
2043,41,../data/NOH/041. NTH.L/IMG_20230119_090927.jpg
2044,41,../data/NOH/041. NTH.L/IMG_20230119_091023.jpg
2045,41,../data/NOH/041. NTH.L/IMG_20230119_090844.jpg
2046,41,../data/NOH/041. NTH.L/IMG_20230119_090904.jpg


In [15]:
image_dataset.to_csv('../metadata/NOH_image_path.csv')