# Data preprocessing for the full-study model v5+

## Imports

In [1]:
import pandas as pd
import numpy as np
import os
from modules.multi_image_multi_label_dataset import *
from modules.balance import *

In [2]:
import torch
from torchvision import transforms
from transforms import *

## Information about the annotations

In [3]:
# Read the annotation CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Preview the first ten annotation rows
df.head(10)

Unnamed: 0,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,pe_present_on_image,negative_exam_for_pe,qa_motion,qa_contrast,flow_artifact,rv_lv_ratio_gte_1,rv_lv_ratio_lt_1,leftsided_pe,chronic_pe,true_filling_defect_not_pe,rightsided_pe,acute_and_chronic_pe,central_pe,indeterminate
0,6897fa9de148,2bfbb7fd2e8b,c0f3cb036d06,0,0,0,0,0,0,1,1,0,0,1,0,0,0
1,6897fa9de148,2bfbb7fd2e8b,f57ffd3883b6,0,0,0,0,0,0,1,1,0,0,1,0,0,0
2,6897fa9de148,2bfbb7fd2e8b,41220fda34a3,0,0,0,0,0,0,1,1,0,0,1,0,0,0
3,6897fa9de148,2bfbb7fd2e8b,13b685b4b14f,0,0,0,0,0,0,1,1,0,0,1,0,0,0
4,6897fa9de148,2bfbb7fd2e8b,be0b7524ffb4,0,0,0,0,0,0,1,1,0,0,1,0,0,0
5,6897fa9de148,2bfbb7fd2e8b,c7b99cb454d4,0,0,0,0,0,0,1,1,0,0,1,0,0,0
6,6897fa9de148,2bfbb7fd2e8b,e153deb813ed,0,0,0,0,0,0,1,1,0,0,1,0,0,0
7,6897fa9de148,2bfbb7fd2e8b,c6bbe08f2736,0,0,0,0,0,0,1,1,0,0,1,0,0,0
8,6897fa9de148,2bfbb7fd2e8b,09886998dc28,0,0,0,0,0,0,1,1,0,0,1,0,0,0
9,6897fa9de148,2bfbb7fd2e8b,5b5226ac7bac,0,0,0,0,0,0,1,1,0,0,1,0,0,0


## Keep only the available images from positive exams

In [5]:
# IDs of the images present in the folder: each file name up to its first '.'
available_images = [file_name.split('.')[0] for file_name in os.listdir('images/')]

# Keep a row only if its image is available locally AND its exam is not
# flagged as negative for PE.
has_image = df['SOPInstanceUID'].isin(available_images)
is_positive_exam = df['negative_exam_for_pe'] == 0
df = df.loc[has_image & is_positive_exam]

## Add an `acute_pe` column

In [6]:
# Reduced prediction problem for one study: keep the two identifier columns
# plus the label columns listed below.
# NOTE: Index.intersection silently skips any requested column that is missing
# from df, and the result keeps df's own column order (not the order below).
study_label_columns = ['StudyInstanceUID', 'SOPInstanceUID', 'rv_lv_ratio_gte_1', 'rv_lv_ratio_lt_1',
                       'leftsided_pe', 'rightsided_pe', 'central_pe',
                       'chronic_pe', 'acute_and_chronic_pe', 'pe_present_on_image']

# .copy() makes this an independent frame, so adding columns to it later cannot
# raise SettingWithCopyWarning or write through to df.
df_predict_labels_study = df.loc[:, df.columns.intersection(study_label_columns)].copy()

df_predict_labels_study

Unnamed: 0,StudyInstanceUID,SOPInstanceUID,pe_present_on_image,rv_lv_ratio_gte_1,rv_lv_ratio_lt_1,leftsided_pe,chronic_pe,rightsided_pe,acute_and_chronic_pe,central_pe
0,6897fa9de148,c0f3cb036d06,0,0,1,1,0,1,0,0
1,6897fa9de148,f57ffd3883b6,0,0,1,1,0,1,0,0
2,6897fa9de148,41220fda34a3,0,0,1,1,0,1,0,0
3,6897fa9de148,13b685b4b14f,0,0,1,1,0,1,0,0
4,6897fa9de148,be0b7524ffb4,0,0,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
83929,2b4a1c8c4573,b54a68d6af71,0,1,0,1,1,0,0,0
83930,2b4a1c8c4573,2a62c1cf15de,0,1,0,1,1,0,0,0
83931,2b4a1c8c4573,2a43a9e6d344,0,1,0,1,1,0,0,0
83932,2b4a1c8c4573,210cbd4cb77f,1,1,0,1,1,0,0,0


In [7]:
# Derive an `acute_pe` flag from the two chronicity labels.
# The raw expression 1 - chronic_pe + acute_and_chronic_pe evaluates to 2 for a
# row with chronic_pe == 0 and acute_and_chronic_pe == 1, which is not a valid
# binary label, so the result is capped at 1.
# NOTE(review): presumably the flag means "the study has an acute component"
# (0 only for purely chronic PE) — confirm against the labelling scheme.
# .assign builds a new frame instead of writing a column into a frame that was
# sliced out of df, which avoids SettingWithCopyWarning.
df_predict_labels_study = df_predict_labels_study.assign(
    acute_pe=lambda labels: (
        1 - labels['chronic_pe'] + labels['acute_and_chronic_pe']
    ).clip(upper=1)
)
df_predict_labels_study.head(10)

Unnamed: 0,StudyInstanceUID,SOPInstanceUID,pe_present_on_image,rv_lv_ratio_gte_1,rv_lv_ratio_lt_1,leftsided_pe,chronic_pe,rightsided_pe,acute_and_chronic_pe,central_pe,acute_pe
0,6897fa9de148,c0f3cb036d06,0,0,1,1,0,1,0,0,1
1,6897fa9de148,f57ffd3883b6,0,0,1,1,0,1,0,0,1
2,6897fa9de148,41220fda34a3,0,0,1,1,0,1,0,0,1
3,6897fa9de148,13b685b4b14f,0,0,1,1,0,1,0,0,1
4,6897fa9de148,be0b7524ffb4,0,0,1,1,0,1,0,0,1
5,6897fa9de148,c7b99cb454d4,0,0,1,1,0,1,0,0,1
6,6897fa9de148,e153deb813ed,0,0,1,1,0,1,0,0,1
7,6897fa9de148,c6bbe08f2736,0,0,1,1,0,1,0,0,1
8,6897fa9de148,09886998dc28,0,0,1,1,0,1,0,0,1
9,6897fa9de148,5b5226ac7bac,0,0,1,1,0,1,0,0,1


In [8]:
# Number of remaining rows per study, most frequent first
df_predict_labels_study.loc[:, 'StudyInstanceUID'].value_counts()

0dc057976e8d    313
de9e82cbd095    309
7cd26a554409    298
5a83a90b867b    293
f359418e021b    279
               ... 
a84c2373adef    100
d766f804c9fd     97
5364d189c001     96
af234b50ae62     89
1ae6a23952ff     69
Name: StudyInstanceUID, Length: 90, dtype: int64

In [9]:
# Drop every row of one specific study before building the dataset.
# NOTE(review): the reason this study is excluded is not recorded here — TODO document.
keep_rows = df_predict_labels_study['StudyInstanceUID'].ne('9934197d1d7d')
df_dataset = df_predict_labels_study.loc[keep_rows]

# Per-study row counts after the exclusion, then the resulting frame
print(df_dataset['StudyInstanceUID'].value_counts())
df_dataset

Unnamed: 0,StudyInstanceUID,SOPInstanceUID,pe_present_on_image,rv_lv_ratio_gte_1,rv_lv_ratio_lt_1,leftsided_pe,chronic_pe,rightsided_pe,acute_and_chronic_pe,central_pe,acute_pe
0,6897fa9de148,c0f3cb036d06,0,0,1,1,0,1,0,0,1
1,6897fa9de148,f57ffd3883b6,0,0,1,1,0,1,0,0,1
2,6897fa9de148,41220fda34a3,0,0,1,1,0,1,0,0,1
3,6897fa9de148,13b685b4b14f,0,0,1,1,0,1,0,0,1
4,6897fa9de148,be0b7524ffb4,0,0,1,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
83929,2b4a1c8c4573,b54a68d6af71,0,1,0,1,1,0,0,0,0
83930,2b4a1c8c4573,2a62c1cf15de,0,1,0,1,1,0,0,0,0
83931,2b4a1c8c4573,2a43a9e6d344,0,1,0,1,1,0,0,0,0
83932,2b4a1c8c4573,210cbd4cb77f,1,1,0,1,1,0,0,0,0


## One dataset per channel approach

In [10]:
# Preprocessing steps applied, in this order, to every sample of the dataset.
# NOTE(review): Rescale, Normalize, ToTensor and ToCategorical come from the
# project's star imports; their exact semantics (e.g. the meaning of the
# `True` flag passed to ToTensor) are not visible here — TODO confirm.
preprocessing_steps = [
    Rescale(256),
    Normalize(),
    ToTensor(True),
    ToCategorical(),
]
t = transforms.Compose(preprocessing_steps)

# Build the dataset from the filtered annotations and the image folder
multi_image_multi_label_dataset = MultiImageMultiLabelDataset(
    df_dataset,
    'images/',
    transform=t,
)

In [11]:
# torch.save does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs('datasets', exist_ok=True)

# Serialise the whole dataset object to disk.
# NOTE: torch.save pickles the object, so loading it back requires the
# dataset class to be importable; only load such files from trusted sources.
torch.save(multi_image_multi_label_dataset, 'datasets/multi_image_multi_label_dataset.pt')

In [12]:
# Sanity check: print the number of samples, then display the first one
dataset_size = len(multi_image_multi_label_dataset)
print(dataset_size)
multi_image_multi_label_dataset[0]

{'image': tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 4.6136e-05,
           1.9223e-04, 4.6136e-05],
          [4.6136e-05, 2.0377e-04, 3.8447e-05,  ..., 1.4610e-04,
           1.1534e-05, 2.6913e-05],
          [1.6532e-04, 7.6894e-06, 2.6913e-05,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          ...,
          [3.4987e-04, 1.2649e-03, 4.4983e-04,  ..., 1.1880e-03,
           3.4987e-04, 3.3449e-04],
          [1.1765e-03, 3.6140e-04, 3.4218e-04,  ..., 2.8451e-04,
           3.4218e-04, 1.0227e-03],
          [2.5375e-04, 3.1526e-04, 1.0111e-03,  ..., 3.9216e-04,
           1.1265e-03, 5.3441e-04]],
 
         [[4.2676e-04, 1.1265e-03, 5.4979e-04,  ..., 1.2226e-03,
           3.2295e-04, 2.9220e-04],
          [1.2188e-03, 3.1142e-04, 2.8066e-04,  ..., 5.1519e-04,
           5.0365e-04, 1.1572e-03],
          [3.4987e-04, 3.5755e-04, 1.0996e-03,  ..., 9.9962e-05,
           5.0750e-04, 1.4994e-04],
          ...,
          [1.3187e-03, 5.9208e-04, 5.2672e-04,  .