# Preprocessing data for full-study models

## Imports

In [1]:
import pandas as pd
import numpy as np
import os
from multi_image_dataset import *
from balance import *

In [2]:
import torch
from torchvision import transforms
from transforms import *

## Information about annotations

In [3]:
# Load annotations
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# saved index — confirm whether downstream code relies on it before using index_col=0
df = pd.read_csv('train.csv')

In [4]:
# Preview the first 10 annotation rows
df.head(n=10)

Unnamed: 0,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,pe_present_on_image,negative_exam_for_pe,qa_motion,qa_contrast,flow_artifact,rv_lv_ratio_gte_1,rv_lv_ratio_lt_1,leftsided_pe,chronic_pe,true_filling_defect_not_pe,rightsided_pe,acute_and_chronic_pe,central_pe,indeterminate
0,6897fa9de148,2bfbb7fd2e8b,c0f3cb036d06,0,0,0,0,0,0,1,1,0,0,1,0,0,0
1,6897fa9de148,2bfbb7fd2e8b,f57ffd3883b6,0,0,0,0,0,0,1,1,0,0,1,0,0,0
2,6897fa9de148,2bfbb7fd2e8b,41220fda34a3,0,0,0,0,0,0,1,1,0,0,1,0,0,0
3,6897fa9de148,2bfbb7fd2e8b,13b685b4b14f,0,0,0,0,0,0,1,1,0,0,1,0,0,0
4,6897fa9de148,2bfbb7fd2e8b,be0b7524ffb4,0,0,0,0,0,0,1,1,0,0,1,0,0,0
5,6897fa9de148,2bfbb7fd2e8b,c7b99cb454d4,0,0,0,0,0,0,1,1,0,0,1,0,0,0
6,6897fa9de148,2bfbb7fd2e8b,e153deb813ed,0,0,0,0,0,0,1,1,0,0,1,0,0,0
7,6897fa9de148,2bfbb7fd2e8b,c6bbe08f2736,0,0,0,0,0,0,1,1,0,0,1,0,0,0
8,6897fa9de148,2bfbb7fd2e8b,09886998dc28,0,0,0,0,0,0,1,1,0,0,1,0,0,0
9,6897fa9de148,2bfbb7fd2e8b,5b5226ac7bac,0,0,0,0,0,0,1,1,0,0,1,0,0,0


## Keep only the available images

In [5]:
# IDs of the images present in the folder: each file name cut at its first '.'
available_images = [file_name.split('.')[0] for file_name in os.listdir('images/')]

# Keep only the annotation rows whose image ID is among the available ones
has_image = df['SOPInstanceUID'].isin(available_images)
df = df.loc[has_image]

## Create a balanced dataset

In [6]:
# Reduced prediction problem for one study: one row per study with its
# 'negative_exam_for_pe' label.
#
# The frame is built with an explicit column selection and .assign(), which returns
# a new DataFrame, instead of adding a column to a slice of `df` (that pattern can
# raise SettingWithCopyWarning). A missing column now fails right here with a KeyError.
df_predict_pe_study = (
    df[['StudyInstanceUID', 'negative_exam_for_pe']]
    # Duplicate the study ID under 'StudyUID' so it survives as a regular column
    # once 'StudyInstanceUID' becomes the index of the grouped frame.
    .assign(StudyUID=lambda frame: frame['StudyInstanceUID'])
    # max() collapses the image rows of a study into one row; the label column
    # takes the largest value found among the rows of that study.
    .groupby('StudyInstanceUID')
    .max()
)
# Class distribution at study level
df_predict_pe_study['negative_exam_for_pe'].value_counts()

1    139
0     90
Name: negative_exam_for_pe, dtype: int64

In [7]:
# Create the biggest balanced dataframe
# (the counts displayed below are 90 per class, the size of the smaller class above)
# NOTE(review): if balance_dataframe samples at random, no seed is set in this
# notebook — confirm that the selection is reproducible
df_balanced = balance_dataframe(df_predict_pe_study, 'negative_exam_for_pe')
# Check the class distribution after balancing
df_balanced['negative_exam_for_pe'].value_counts()

1    90
0    90
Name: negative_exam_for_pe, dtype: int64

In [8]:
# IDs of the studies kept in the balanced dataframe
studies = list(df_balanced['StudyUID'])

In [9]:
# Gather every image row of the selected studies, in the order given by `studies`.
# The per-study slices are collected first and concatenated once: DataFrame.append
# was removed in pandas 2.0, and appending inside the loop re-copied the growing
# frame on every iteration.
study_frames = [df.loc[df['StudyInstanceUID'] == s] for s in studies]

# pd.concat raises on an empty list, so fall back to the empty frame the loop
# version would have produced when there is no study to collect.
df_dataset = pd.concat(study_frames) if study_frames else pd.DataFrame()

df_dataset

Unnamed: 0,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,pe_present_on_image,negative_exam_for_pe,qa_motion,qa_contrast,flow_artifact,rv_lv_ratio_gte_1,rv_lv_ratio_lt_1,leftsided_pe,chronic_pe,true_filling_defect_not_pe,rightsided_pe,acute_and_chronic_pe,central_pe,indeterminate
124,013358b540bb,2805267980e7,c4cce735427f,0,1,0,0,0,0,0,0,0,0,0,0,0,0
125,013358b540bb,2805267980e7,f77e1fe61c1c,0,1,0,0,0,0,0,0,0,0,0,0,0,0
126,013358b540bb,2805267980e7,620a791e6a26,0,1,0,0,0,0,0,0,0,0,0,0,0,0
127,013358b540bb,2805267980e7,dd7cc25275d9,0,1,0,0,0,0,0,0,0,0,0,0,0,0
128,013358b540bb,2805267980e7,03a36b3e9e28,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,ffbf563099a4,517a0a9477bc,762f761ec1f3,0,1,0,0,0,0,0,0,0,0,0,0,0,0
5192,ffbf563099a4,517a0a9477bc,39291d570390,0,1,0,0,0,0,0,0,0,0,0,0,0,0
5193,ffbf563099a4,517a0a9477bc,44096749c6cf,0,1,0,0,0,0,0,0,0,0,0,0,0,0
5194,ffbf563099a4,517a0a9477bc,68833642b40a,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Count the image rows retained for each study
df_dataset['StudyInstanceUID'].value_counts()

bed6309efd9c    368
0dc057976e8d    313
de9e82cbd095    309
7cd26a554409    298
5a83a90b867b    293
               ... 
48a408c167d5     94
af234b50ae62     89
3845443d4ddb     86
1ae6a23952ff     69
9934197d1d7d     33
Name: StudyInstanceUID, Length: 180, dtype: int64

In [11]:
# Drop the study with the fewest images (33 in the per-study counts displayed above).
# NOTE(review): presumably it has too few images for the dataset built below — confirm
EXCLUDED_STUDY_UID = '9934197d1d7d'
df_dataset = df_dataset[df_dataset['StudyInstanceUID'] != EXCLUDED_STUDY_UID]
# (the former intermediate value_counts() call was removed: its result was never
# displayed because only the last expression of a cell is rendered)
df_dataset

Unnamed: 0,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,pe_present_on_image,negative_exam_for_pe,qa_motion,qa_contrast,flow_artifact,rv_lv_ratio_gte_1,rv_lv_ratio_lt_1,leftsided_pe,chronic_pe,true_filling_defect_not_pe,rightsided_pe,acute_and_chronic_pe,central_pe,indeterminate
124,013358b540bb,2805267980e7,c4cce735427f,0,1,0,0,0,0,0,0,0,0,0,0,0,0
125,013358b540bb,2805267980e7,f77e1fe61c1c,0,1,0,0,0,0,0,0,0,0,0,0,0,0
126,013358b540bb,2805267980e7,620a791e6a26,0,1,0,0,0,0,0,0,0,0,0,0,0,0
127,013358b540bb,2805267980e7,dd7cc25275d9,0,1,0,0,0,0,0,0,0,0,0,0,0,0
128,013358b540bb,2805267980e7,03a36b3e9e28,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,ffbf563099a4,517a0a9477bc,762f761ec1f3,0,1,0,0,0,0,0,0,0,0,0,0,0,0
5192,ffbf563099a4,517a0a9477bc,39291d570390,0,1,0,0,0,0,0,0,0,0,0,0,0,0
5193,ffbf563099a4,517a0a9477bc,44096749c6cf,0,1,0,0,0,0,0,0,0,0,0,0,0,0
5194,ffbf563099a4,517a0a9477bc,68833642b40a,0,1,0,0,0,0,0,0,0,0,0,0,0,0


## One dataset per channel approach

In [12]:
# Preprocessing steps, listed in the order in which Compose applies them
preprocessing_steps = [
    Rescale(256),
    Normalize(),
    ToTensor(),
]
t = transforms.Compose(preprocessing_steps)

# Build the dataset from the selected annotation rows and the image folder
multi_image_dataset = MultiImageDataset(df_dataset, 'images/', transform=t)

In [13]:
# Serialize the whole dataset object to disk.
# NOTE(review): torch.save pickles the object, so loading it back presumably requires
# the MultiImageDataset class to be importable, and 'datasets/' to exist — confirm
torch.save(multi_image_dataset, 'datasets/multi_image_dataset.pt')

In [14]:
# Sanity check: number of items in the dataset, then the shape of the first item's image
print(len(multi_image_dataset))
multi_image_dataset[0]['image'].shape

179


torch.Size([150, 256, 256])