# PREPARE CSV FILE

In [1]:
import pandas as pd
import re

In [2]:
df = pd.read_csv('../metadata/TAN_THYROID_summary_surgery.csv', index_col=0)
df

Unnamed: 0.1,Unnamed: 0,Scan ID,Cytology no.,Diagnosis/Bethesda system,Bethesda actual,Histopathology,Surgery diagnosis in number,Present,Present (Manual)
0,0,TAN001,C-11-22,Malignant,6,Anaplastic carcinoma,1.0,1,exist
1,1,TAN002,C-20-22,Benign nodule,2,Simple Nodular Goitre,0.0,1,exist
2,2,TAN003,C-53-23,Benign nodule,2,Simple Nodular Goitre,0.0,1,exist
3,3,TAN004,C-36-23,Benign nodule,2,Simple Nodular Goitre,0.0,1,exist
4,4,TAN005,C-373-20,Benign nodule,2,Simple Nodular Goitre,0.0,1,exist
...,...,...,...,...,...,...,...,...,...
100,100,TAN101,C-076-23,Carcinoma,6,No biopsy,,1,exist
101,101,TAN102,C-285-23,Goitre/thyroiditis,2,Cystic goitre,0.0,1,exist
102,102,TAN103,CP-01-23,Carcinoma,6,Anaplastic carcinoma,1.0,1,exist
103,103,TAN104,C-302-23,Colloid goitre,2,Colloid goitre,0.0,1,exist


In [3]:
df['Surgery diagnosis in number'].value_counts(dropna=False)

Surgery diagnosis in number
0.0    90
1.0    10
NaN     5
Name: count, dtype: int64

In [4]:
df = df[df['Surgery diagnosis in number'].notnull()].reset_index(drop=True)

In [5]:
df['Surgery diagnosis in number'].value_counts(dropna=False)

Surgery diagnosis in number
0.0    90
1.0    10
Name: count, dtype: int64

In [6]:
def get_label(x):
    return x

In [7]:
df['label'] = df['Surgery diagnosis in number'].apply(lambda x: get_label(x))

In [8]:
df['Bethesda actual'].value_counts().sort_index()

Bethesda actual
1     1
2    77
3     2
4    11
6     9
Name: count, dtype: int64

In [9]:
df['label'].value_counts().sort_index()

label
0.0    90
1.0    10
Name: count, dtype: int64

In [10]:
df.to_csv('../metadata/TAN_surgery_groundtruth.csv')

Prepare Train/Val/Test dataset

In [11]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df['label'], test_size=0.3)

In [12]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
skf = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
skf.get_n_splits(df['label'],df['label'])
for i, (train_index, test_index) in enumerate(skf.split(df['label'],df['label'])):
    df.loc[train_index,f'fold_{i}'] = 'train'
    df.loc[test_index,f'fold_{i}'] = 'test'

In [13]:
import re
df['patient_id'] = df['Scan ID'].apply(lambda x: int(re.findall(r'TAN(\d+)',x)[0]))

In [14]:
df

Unnamed: 0.1,Unnamed: 0,Scan ID,Cytology no.,Diagnosis/Bethesda system,Bethesda actual,Histopathology,Surgery diagnosis in number,Present,Present (Manual),label,fold_0,fold_1,fold_2,fold_3,fold_4,patient_id
0,0,TAN001,C-11-22,Malignant,6,Anaplastic carcinoma,1.0,1,exist,1.0,test,train,train,train,train,1
1,1,TAN002,C-20-22,Benign nodule,2,Simple Nodular Goitre,0.0,1,exist,0.0,train,train,test,train,test,2
2,2,TAN003,C-53-23,Benign nodule,2,Simple Nodular Goitre,0.0,1,exist,0.0,train,train,train,test,train,3
3,3,TAN004,C-36-23,Benign nodule,2,Simple Nodular Goitre,0.0,1,exist,0.0,train,test,train,train,train,4
4,4,TAN005,C-373-20,Benign nodule,2,Simple Nodular Goitre,0.0,1,exist,0.0,train,test,test,train,test,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,99,TAN100,C-071-23,Follicular neoplasm,4,Adenomatous goitre,0.0,1,exist,0.0,test,train,train,train,train,100
96,101,TAN102,C-285-23,Goitre/thyroiditis,2,Cystic goitre,0.0,1,exist,0.0,train,train,train,train,test,102
97,102,TAN103,CP-01-23,Carcinoma,6,Anaplastic carcinoma,1.0,1,exist,1.0,train,train,train,train,train,103
98,103,TAN104,C-302-23,Colloid goitre,2,Colloid goitre,0.0,1,exist,0.0,train,train,test,train,train,104


In [15]:
df.to_csv('../metadata/TAN_surgery_patient_dataset.csv')

Read image files

In [16]:
import os
import pathlib

In [17]:
p = pathlib.Path('../data/Tanzania-Data')
all_folders = list(p.iterdir())
all_folders = [i for i in all_folders if i.is_dir()]
all_folders[:3]

[PosixPath('../data/Tanzania-Data/TAN097'),
 PosixPath('../data/Tanzania-Data/TAN010'),
 PosixPath('../data/Tanzania-Data/TAN009')]

In [18]:
rows = []
for f in all_folders:
    if 'TAN' in f.name:
        patient_id = int(re.findall(r'TAN(\d+)',f.name)[0])
        all_imgs = list(f.iterdir())
        all_imgs = [i for i in all_imgs if i.name.startswith('IMG') and i.name.endswith('.jpg')]
        for img in all_imgs:
            rows.append((patient_id, str(img)))

In [19]:
image_dataset = pd.DataFrame(rows, columns=['patient_id', 'image_path'])
image_dataset

Unnamed: 0,patient_id,image_path
0,97,../data/Tanzania-Data/TAN097/IMG_2301251206156...
1,97,../data/Tanzania-Data/TAN097/IMG_2301251202141...
2,97,../data/Tanzania-Data/TAN097/IMG_2301251158260...
3,97,../data/Tanzania-Data/TAN097/IMG_2301251200592...
4,97,../data/Tanzania-Data/TAN097/IMG_2301251204265...
...,...,...
1192,101,../data/Tanzania-Data/TAN101/IMG_2302021504226...
1193,101,../data/Tanzania-Data/TAN101/IMG_2302021458365...
1194,101,../data/Tanzania-Data/TAN101/IMG_2302021505018...
1195,101,../data/Tanzania-Data/TAN101/IMG_2302021505310...


In [20]:
image_dataset.to_csv('../metadata/TAN_surgery_image_path.csv')

# PREPARE CSV FILE

In [21]:
import pandas as pd

In [22]:
df = pd.read_csv('../metadata/TAN_surgery_patient_dataset.csv',index_col=0)
df1 = pd.read_csv('../metadata/TAN_surgery_image_path.csv',index_col=0)

In [23]:
df

Unnamed: 0.1,Unnamed: 0,Scan ID,Cytology no.,Diagnosis/Bethesda system,Bethesda actual,Histopathology,Surgery diagnosis in number,Present,Present (Manual),label,fold_0,fold_1,fold_2,fold_3,fold_4,patient_id
0,0,TAN001,C-11-22,Malignant,6,Anaplastic carcinoma,1.0,1,exist,1.0,test,train,train,train,train,1
1,1,TAN002,C-20-22,Benign nodule,2,Simple Nodular Goitre,0.0,1,exist,0.0,train,train,test,train,test,2
2,2,TAN003,C-53-23,Benign nodule,2,Simple Nodular Goitre,0.0,1,exist,0.0,train,train,train,test,train,3
3,3,TAN004,C-36-23,Benign nodule,2,Simple Nodular Goitre,0.0,1,exist,0.0,train,test,train,train,train,4
4,4,TAN005,C-373-20,Benign nodule,2,Simple Nodular Goitre,0.0,1,exist,0.0,train,test,test,train,test,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,99,TAN100,C-071-23,Follicular neoplasm,4,Adenomatous goitre,0.0,1,exist,0.0,test,train,train,train,train,100
96,101,TAN102,C-285-23,Goitre/thyroiditis,2,Cystic goitre,0.0,1,exist,0.0,train,train,train,train,test,102
97,102,TAN103,CP-01-23,Carcinoma,6,Anaplastic carcinoma,1.0,1,exist,1.0,train,train,train,train,train,103
98,103,TAN104,C-302-23,Colloid goitre,2,Colloid goitre,0.0,1,exist,0.0,train,train,test,train,train,104


In [24]:
df1

Unnamed: 0,patient_id,image_path
0,97,../data/Tanzania-Data/TAN097/IMG_2301251206156...
1,97,../data/Tanzania-Data/TAN097/IMG_2301251202141...
2,97,../data/Tanzania-Data/TAN097/IMG_2301251158260...
3,97,../data/Tanzania-Data/TAN097/IMG_2301251200592...
4,97,../data/Tanzania-Data/TAN097/IMG_2301251204265...
...,...,...
1192,101,../data/Tanzania-Data/TAN101/IMG_2302021504226...
1193,101,../data/Tanzania-Data/TAN101/IMG_2302021458365...
1194,101,../data/Tanzania-Data/TAN101/IMG_2302021505018...
1195,101,../data/Tanzania-Data/TAN101/IMG_2302021505310...


In [25]:
df = df.merge(df1, how='inner', left_on='patient_id', right_on='patient_id')
df

Unnamed: 0.1,Unnamed: 0,Scan ID,Cytology no.,Diagnosis/Bethesda system,Bethesda actual,Histopathology,Surgery diagnosis in number,Present,Present (Manual),label,fold_0,fold_1,fold_2,fold_3,fold_4,patient_id,image_path
0,0,TAN001,C-11-22,Malignant,6,Anaplastic carcinoma,1.0,1,exist,1.0,test,train,train,train,train,1,../data/Tanzania-Data/TAN001/IMG_20220623_1400...
1,0,TAN001,C-11-22,Malignant,6,Anaplastic carcinoma,1.0,1,exist,1.0,test,train,train,train,train,1,../data/Tanzania-Data/TAN001/IMG_20220623_1413...
2,0,TAN001,C-11-22,Malignant,6,Anaplastic carcinoma,1.0,1,exist,1.0,test,train,train,train,train,1,../data/Tanzania-Data/TAN001/IMG_20220623_1349...
3,0,TAN001,C-11-22,Malignant,6,Anaplastic carcinoma,1.0,1,exist,1.0,test,train,train,train,train,1,../data/Tanzania-Data/TAN001/IMG_20220623_1349...
4,0,TAN001,C-11-22,Malignant,6,Anaplastic carcinoma,1.0,1,exist,1.0,test,train,train,train,train,1,../data/Tanzania-Data/TAN001/IMG_20220623_1413...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,104,TAN105,C-329-23,Follicular neoplasm,4,Follicular adenoma,0.0,1,exist,0.0,train,train,train,train,train,105,../data/Tanzania-Data/TAN105/IMG_2303311302112...
1115,104,TAN105,C-329-23,Follicular neoplasm,4,Follicular adenoma,0.0,1,exist,0.0,train,train,train,train,train,105,../data/Tanzania-Data/TAN105/IMG_2303311304087...
1116,104,TAN105,C-329-23,Follicular neoplasm,4,Follicular adenoma,0.0,1,exist,0.0,train,train,train,train,train,105,../data/Tanzania-Data/TAN105/IMG_2303311305261...
1117,104,TAN105,C-329-23,Follicular neoplasm,4,Follicular adenoma,0.0,1,exist,0.0,train,train,train,train,train,105,../data/Tanzania-Data/TAN105/IMG_2303311306152...


In [26]:
df[['patient_id', 'label']].drop_duplicates()['label'].value_counts().sort_index()

label
0.0    80
1.0    10
Name: count, dtype: int64

In [27]:
df['new_label'] = df['label'].astype(int)
df

Unnamed: 0.1,Unnamed: 0,Scan ID,Cytology no.,Diagnosis/Bethesda system,Bethesda actual,Histopathology,Surgery diagnosis in number,Present,Present (Manual),label,fold_0,fold_1,fold_2,fold_3,fold_4,patient_id,image_path,new_label
0,0,TAN001,C-11-22,Malignant,6,Anaplastic carcinoma,1.0,1,exist,1.0,test,train,train,train,train,1,../data/Tanzania-Data/TAN001/IMG_20220623_1400...,1
1,0,TAN001,C-11-22,Malignant,6,Anaplastic carcinoma,1.0,1,exist,1.0,test,train,train,train,train,1,../data/Tanzania-Data/TAN001/IMG_20220623_1413...,1
2,0,TAN001,C-11-22,Malignant,6,Anaplastic carcinoma,1.0,1,exist,1.0,test,train,train,train,train,1,../data/Tanzania-Data/TAN001/IMG_20220623_1349...,1
3,0,TAN001,C-11-22,Malignant,6,Anaplastic carcinoma,1.0,1,exist,1.0,test,train,train,train,train,1,../data/Tanzania-Data/TAN001/IMG_20220623_1349...,1
4,0,TAN001,C-11-22,Malignant,6,Anaplastic carcinoma,1.0,1,exist,1.0,test,train,train,train,train,1,../data/Tanzania-Data/TAN001/IMG_20220623_1413...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,104,TAN105,C-329-23,Follicular neoplasm,4,Follicular adenoma,0.0,1,exist,0.0,train,train,train,train,train,105,../data/Tanzania-Data/TAN105/IMG_2303311302112...,0
1115,104,TAN105,C-329-23,Follicular neoplasm,4,Follicular adenoma,0.0,1,exist,0.0,train,train,train,train,train,105,../data/Tanzania-Data/TAN105/IMG_2303311304087...,0
1116,104,TAN105,C-329-23,Follicular neoplasm,4,Follicular adenoma,0.0,1,exist,0.0,train,train,train,train,train,105,../data/Tanzania-Data/TAN105/IMG_2303311305261...,0
1117,104,TAN105,C-329-23,Follicular neoplasm,4,Follicular adenoma,0.0,1,exist,0.0,train,train,train,train,train,105,../data/Tanzania-Data/TAN105/IMG_2303311306152...,0


In [28]:
df[['patient_id', 'new_label']].drop_duplicates()['new_label'].value_counts().sort_index()

new_label
0    80
1    10
Name: count, dtype: int64

In [29]:
df.to_csv('../metadata/TAN_surgery_2c.csv')