# Preprocessing data for the first model

## Imports

In [1]:
import pandas as pd
import numpy as np
import os
from full_image_dataset import *
from balance import *

In [2]:
import torch
from torchvision import transforms
from transforms import *

## Information about annotations

In [3]:
# Read the annotation table (one row per image) from the CSV file
df = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# Preview the first ten annotation rows
df.head(10)

Unnamed: 0,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,pe_present_on_image,negative_exam_for_pe,qa_motion,qa_contrast,flow_artifact,rv_lv_ratio_gte_1,rv_lv_ratio_lt_1,leftsided_pe,chronic_pe,true_filling_defect_not_pe,rightsided_pe,acute_and_chronic_pe,central_pe,indeterminate
0,6897fa9de148,2bfbb7fd2e8b,c0f3cb036d06,0,0,0,0,0,0,1,1,0,0,1,0,0,0
1,6897fa9de148,2bfbb7fd2e8b,f57ffd3883b6,0,0,0,0,0,0,1,1,0,0,1,0,0,0
2,6897fa9de148,2bfbb7fd2e8b,41220fda34a3,0,0,0,0,0,0,1,1,0,0,1,0,0,0
3,6897fa9de148,2bfbb7fd2e8b,13b685b4b14f,0,0,0,0,0,0,1,1,0,0,1,0,0,0
4,6897fa9de148,2bfbb7fd2e8b,be0b7524ffb4,0,0,0,0,0,0,1,1,0,0,1,0,0,0
5,6897fa9de148,2bfbb7fd2e8b,c7b99cb454d4,0,0,0,0,0,0,1,1,0,0,1,0,0,0
6,6897fa9de148,2bfbb7fd2e8b,e153deb813ed,0,0,0,0,0,0,1,1,0,0,1,0,0,0
7,6897fa9de148,2bfbb7fd2e8b,c6bbe08f2736,0,0,0,0,0,0,1,1,0,0,1,0,0,0
8,6897fa9de148,2bfbb7fd2e8b,09886998dc28,0,0,0,0,0,0,1,1,0,0,1,0,0,0
9,6897fa9de148,2bfbb7fd2e8b,5b5226ac7bac,0,0,0,0,0,0,1,1,0,0,1,0,0,0


## Keep only the available images

In [5]:
# IDs of the images present in the folder: each file name is cut at its first '.'
available_images = [file_name.split('.')[0] for file_name in os.listdir('images/')]

# Keep only the annotation rows whose SOPInstanceUID matches an available image
df = df[df['SOPInstanceUID'].isin(available_images)]

## Create a balanced dataset of 2 classes as a starter

In [6]:
# Reduced prediction problem for one image: keep only the image ID and its
# 'pe_present_on_image' label, then display how many rows fall in each class
df_predict_pe_image = df.loc[:, ['SOPInstanceUID', 'pe_present_on_image']]
df_predict_pe_image['pe_present_on_image'].value_counts()

0    41865
1     3135
Name: pe_present_on_image, dtype: int64

In [7]:
# Create the biggest balanced dataframe with respect to the 'pe_present_on_image' label
# (the recorded output shows the same number of rows in both classes).
# NOTE(review): if balance_dataframe samples rows randomly, a fixed seed would be
# needed for reproducible runs -- confirm against the balance module.
df_balanced = balance_dataframe(df_predict_pe_image, 'pe_present_on_image')

# Display the class sizes after balancing
df_balanced.loc[:, 'pe_present_on_image'].value_counts()

1    3135
0    3135
Name: pe_present_on_image, dtype: int64

## One dataset per channel approach

In [8]:
# Transform pipeline handed to the dataset, composed in this order:
# Rescale(256), Normalize(), ToTensor()
t = transforms.Compose([Rescale(256), Normalize(), ToTensor()])

# Dataset built from the balanced annotations and the image folder
full_image_dataset = FullImageDataset(df_balanced, 'images/', transform=t)

# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there)
os.makedirs('datasets', exist_ok=True)

# NOTE: this serializes the whole dataset object, so loading it back requires
# the FullImageDataset class and the transform classes to be importable
torch.save(full_image_dataset, 'datasets/full_image_dataset.pt')

In [9]:
# Inspect the first sample of the dataset (the recorded output is a dict with an 'image' tensor)
full_image_dataset[0]

{'image': tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [0.0000e+00, 1.5379e-05, 0.0000e+00,  ..., 1.1918e-04,
           0.0000e+00, 0.0000e+00],
          [1.2303e-04, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          ...,
          [1.6417e-03, 3.5948e-03, 1.8454e-03,  ..., 3.6794e-03,
           2.4683e-03, 1.9339e-03],
          [3.7870e-03, 2.2376e-03, 1.7532e-03,  ..., 0.0000e+00,
           0.0000e+00, 5.3825e-05],
          [1.1111e-03, 1.1265e-03, 3.2603e-03,  ..., 1.4072e-03,
           3.3756e-03, 1.8108e-03]],
 
         [[1.5686e-03, 3.5256e-03, 1.7570e-03,  ..., 3.7678e-03,
           2.4183e-03, 1.8800e-03],
          [3.7370e-03, 2.1838e-03, 1.7070e-03,  ..., 0.0000e+00,
           0.0000e+00, 5.3825e-05],
          [9.0350e-04, 9.8039e-04, 3.1219e-03,  ..., 1.5648e-03,
           3.5448e-03, 1.6609e-03],
          ...,
          [7.1126e-04, 0.0000e+00, 0.0000e+00,  .