# Preprocessing data for full-study models

## Imports

In [1]:
import pandas as pd
import numpy as np
import os
from multi_image_dataset import *
from balance import *

In [2]:
import torch
from torchvision import transforms
from transforms import *

## Information about the annotations

In [3]:
# Read the annotation table from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Preview the first ten annotation rows
df.head(10)

Unnamed: 0,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,pe_present_on_image,negative_exam_for_pe,qa_motion,qa_contrast,flow_artifact,rv_lv_ratio_gte_1,rv_lv_ratio_lt_1,leftsided_pe,chronic_pe,true_filling_defect_not_pe,rightsided_pe,acute_and_chronic_pe,central_pe,indeterminate
0,6897fa9de148,2bfbb7fd2e8b,c0f3cb036d06,0,0,0,0,0,0,1,1,0,0,1,0,0,0
1,6897fa9de148,2bfbb7fd2e8b,f57ffd3883b6,0,0,0,0,0,0,1,1,0,0,1,0,0,0
2,6897fa9de148,2bfbb7fd2e8b,41220fda34a3,0,0,0,0,0,0,1,1,0,0,1,0,0,0
3,6897fa9de148,2bfbb7fd2e8b,13b685b4b14f,0,0,0,0,0,0,1,1,0,0,1,0,0,0
4,6897fa9de148,2bfbb7fd2e8b,be0b7524ffb4,0,0,0,0,0,0,1,1,0,0,1,0,0,0
5,6897fa9de148,2bfbb7fd2e8b,c7b99cb454d4,0,0,0,0,0,0,1,1,0,0,1,0,0,0
6,6897fa9de148,2bfbb7fd2e8b,e153deb813ed,0,0,0,0,0,0,1,1,0,0,1,0,0,0
7,6897fa9de148,2bfbb7fd2e8b,c6bbe08f2736,0,0,0,0,0,0,1,1,0,0,1,0,0,0
8,6897fa9de148,2bfbb7fd2e8b,09886998dc28,0,0,0,0,0,0,1,1,0,0,1,0,0,0
9,6897fa9de148,2bfbb7fd2e8b,5b5226ac7bac,0,0,0,0,0,0,1,1,0,0,1,0,0,0


## Keep only the available images

In [5]:
# ID of every file in the images folder: the part of its name before the first '.'
available_images = [filename.split('.')[0] for filename in os.listdir('images/')]

# Keep only the annotation rows whose SOPInstanceUID matches a file on disk
has_image = df['SOPInstanceUID'].isin(available_images)
df = df.loc[has_image]

## Create a balanced dataset

In [6]:
# Reduced prediction problem for one study: keep only the study and image
# identifiers together with the two PE label columns
study_columns = ['StudyInstanceUID', 'SOPInstanceUID', 'pe_present_on_image', 'negative_exam_for_pe']
df_predict_pe_study = df[study_columns]

# Number of rows for each value of the exam-level label
df_predict_pe_study['negative_exam_for_pe'].value_counts()

1    27650
0    17350
Name: negative_exam_for_pe, dtype: int64

In [7]:
# Build the biggest balanced dataframe with respect to the exam-level label.
# NOTE(review): balance_dataframe comes from the project-local `balance` module;
# if it samples rows at random, confirm it is seeded so this cell is reproducible.
label_column = 'negative_exam_for_pe'
df_balanced = balance_dataframe(df_predict_pe_study, label_column)

# Check the resulting class counts
df_balanced[label_column].value_counts()

1    17350
0    17350
Name: negative_exam_for_pe, dtype: int64

## One dataset per channel approach

In [8]:
# Chain the project-local Rescale, Normalize and ToTensor transforms, in that order
pipeline_steps = [Rescale(256), Normalize(), ToTensor()]
t = transforms.Compose(pipeline_steps)

# Dataset built from the balanced annotations and the images folder,
# with the composed transform passed as `transform`
multi_image_dataset = MultiImageDataset(df_balanced, 'images/', transform=t)

In [9]:
# Persist the whole dataset object to disk.
# torch.save does not create missing parent directories, so make sure the
# output folder exists first; otherwise this cell fails on a fresh checkout.
dataset_path = 'datasets/multi_image_dataset.pt'
os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
# NOTE(review): this pickles the MultiImageDataset instance itself, so loading
# the file back requires the `multi_image_dataset` module to be importable.
torch.save(multi_image_dataset, dataset_path)

In [10]:
# Sanity check: shape of the 'image' entry of the first dataset item
first_item = multi_image_dataset[0]
first_item['image'].shape

torch.Size([150, 256, 256])