In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpllimg
import seaborn as sns

In [2]:
# Set up plotting for the rest of the notebook
# Render matplotlib figures inline, in the output of the cell that draws them
%matplotlib inline
# Default figure size as (width, height), in inches
plt.rcParams['figure.figsize'] = 10, 8
# Ask the inline backend for 'retina' (high-resolution) figure output
%config InlineBackend.figure_format = 'retina'
# Apply seaborn's default theme to subsequent plots
# NOTE(review): this runs after the figsize assignment above — confirm the
# installed seaborn version does not reset figure.figsize here
sns.set()

In [3]:
# Read the training CSV into a dataframe and report its dimensions
df_train = pd.read_csv('../train.csv')
print(f"Train shape: {df_train.shape}")

# Same for the test CSV
df_test = pd.read_csv('../test.csv')
print(f"Test shape: {df_test.shape}")

Train shape: (1790594, 17)
Test shape: (146853, 3)


In [4]:
# Builds the list of image IDs available in the given directory
def listAvailableImages(directory, extension):
    """Return the IDs of the image files found in ``directory``.

    An ID is the file name with the trailing ``extension`` removed.

    Args:
        directory: Path of the directory to scan (not recursive).
        extension: File-name suffix identifying an image, e.g. ``'.jpg'``.
            An empty string matches every entry and strips nothing.

    Returns:
        A sorted list of IDs, one per directory entry whose name ends with
        ``extension``. Entries with any other suffix are skipped.

    Raises:
        OSError: If ``directory`` cannot be listed (propagated from
            ``os.listdir``).
    """
    suffix_len = len(extension)
    return sorted(
        # Strip only the trailing extension; `split(extension)[0]` would cut at
        # the first occurrence anywhere in the name.
        name[:-suffix_len] if suffix_len else name
        for name in os.listdir(directory)
        # Ignore unrelated files (e.g. hidden/system files) instead of
        # returning their full names as bogus IDs.
        if name.endswith(extension)
    )


# IDs of the '.jpg' files present in ../images
available_images = listAvailableImages(directory='../images', extension='.jpg')

In [5]:
# Only keep available images rows
def cleanUpDataframe(df, images=None):
    """Return the rows of ``df`` whose ``SOPInstanceUID`` is an available image.

    Args:
        df: DataFrame containing a ``'SOPInstanceUID'`` column.
        images: Optional list-like of image IDs to keep. When omitted, the
            notebook-level ``available_images`` is used.

    Returns:
        A new DataFrame (an independent copy) holding the matching rows of
        ``df``, with their original index labels.

    Raises:
        KeyError: If ``df`` has no ``'SOPInstanceUID'`` column.
    """
    if images is None:
        images = available_images
    # Build the mask from the frame being cleaned, not from the global
    # df_train, so the function works for any dataframe passed in.
    keep = df['SOPInstanceUID'].isin(images)
    # Explicit copy so later mutation of the result does not raise
    # SettingWithCopyWarning or depend on the source frame.
    return df.loc[keep].copy()
    

# Restrict the training dataframe to the rows kept by cleanUpDataframe, then display it
df_train_clean = df_train.pipe(cleanUpDataframe)
df_train_clean

Unnamed: 0,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,pe_present_on_image,negative_exam_for_pe,qa_motion,qa_contrast,flow_artifact,rv_lv_ratio_gte_1,rv_lv_ratio_lt_1,leftsided_pe,chronic_pe,true_filling_defect_not_pe,rightsided_pe,acute_and_chronic_pe,central_pe,indeterminate
0,6897fa9de148,2bfbb7fd2e8b,c0f3cb036d06,0,0,0,0,0,0,1,1,0,0,1,0,0,0
1,6897fa9de148,2bfbb7fd2e8b,f57ffd3883b6,0,0,0,0,0,0,1,1,0,0,1,0,0,0
2,6897fa9de148,2bfbb7fd2e8b,41220fda34a3,0,0,0,0,0,0,1,1,0,0,1,0,0,0
3,6897fa9de148,2bfbb7fd2e8b,13b685b4b14f,0,0,0,0,0,0,1,1,0,0,1,0,0,0
4,6897fa9de148,2bfbb7fd2e8b,be0b7524ffb4,0,0,0,0,0,0,1,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88507,9934197d1d7d,16f3c3d9851d,9a899e0be9a6,0,1,0,0,0,0,0,0,0,0,0,0,0,0
88508,9934197d1d7d,16f3c3d9851d,996873b9192b,0,1,0,0,0,0,0,0,0,0,0,0,0,0
88509,9934197d1d7d,16f3c3d9851d,f3d335a3fd4e,0,1,0,0,0,0,0,0,0,0,0,0,0,0
88510,9934197d1d7d,16f3c3d9851d,1144e3048221,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# Remove packed features

# TODO add asserts to ensure a good behavior before dropping
packed_columns = [
    'negative_exam_for_pe', 'qa_motion', 'qa_contrast', 'flow_artifact',
    'rv_lv_ratio_gte_1', 'rv_lv_ratio_lt_1',
    'leftsided_pe', 'rightsided_pe', 'central_pe',
    'chronic_pe', 'acute_and_chronic_pe',
    'true_filling_defect_not_pe', 'indeterminate',
    'SOPInstanceUID',
]

# Rebind the name instead of using `inplace=True`: df_train_clean is a filtered
# selection of df_train, and mutating it in place triggers
# SettingWithCopyWarning. `errors='ignore'` keeps the cell re-runnable, since
# columns already dropped by a previous run are simply skipped.
df_train_clean = df_train_clean.drop(columns=packed_columns, errors='ignore')

In [7]:
# Aggregate df_train_clean per (StudyInstanceUID, SeriesInstanceUID) pair by
# summing the other columns, then preview the first ten groups
group_keys = ['StudyInstanceUID', 'SeriesInstanceUID']
df2 = df_train_clean.groupby(group_keys).sum()
df2.head(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,pe_present_on_image
StudyInstanceUID,SeriesInstanceUID,Unnamed: 2_level_1
0045f113e031,454c8fdfb649,0
008626c99017,461fe63dec07,0
0112fd91fdd0,2b363f6cfd3d,0
013358b540bb,2805267980e7,0
01796cd8f2cd,9cfdde6c6987,0
018b5097a129,0729127097da,47
02aca64c04a9,ec4e2e757b0c,0
0358701cd26a,c8fa9c024b17,0
045a692f7143,de09eb5de208,5
07bd8cc56850,e580d22aa050,8


In [10]:
# Count how many rows of df2 share each pe_present_on_image value
pe_per_group = df2['pe_present_on_image']
pe_per_group.value_counts()

0      144
6        5
35       4
15       4
5        3
8        3
19       3
2        3
3        2
9        2
57       2
17       2
20       2
26       2
28       2
45       2
52       2
42       2
61       2
59       1
14       1
21       1
69       1
78       1
18       1
82       1
16       1
83       1
12       1
23       1
86       1
87       1
7        1
89       1
92       1
93       1
97       1
22       1
24       1
55       1
25       1
53       1
49       1
48       1
47       1
46       1
65       1
44       1
109      1
39       1
38       1
37       1
66       1
34       1
67       1
68       1
121      1
Name: pe_present_on_image, dtype: int64