### **download external packages**

In [1]:
HELPER_DIR = '/kaggle/input/pydicom-conda-helper/'

!conda install {HELPER_DIR+'libjpeg-turbo-2.1.0-h7f98852_0.tar.bz2'} -c conda-forge -y -q
!conda install {HELPER_DIR+'libgcc-ng-9.3.0-h2828fa1_19.tar.bz2'} -c conda-forge -y -q
!conda install {HELPER_DIR+'gdcm-2.8.9-py37h500ead1_1.tar.bz2'} -c conda-forge -y -q
!conda install {HELPER_DIR+'conda-4.10.1-py37h89c1867_0.tar.bz2'} -c conda-forge -y -q
!conda install {HELPER_DIR+'certifi-2020.12.5-py37h89c1867_1.tar.bz2'} -c conda-forge -y -q
!conda install {HELPER_DIR+'openssl-1.1.1k-h7f98852_0.tar.bz2'} -c conda-forge -y -q

Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done
Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done
Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done
Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done
Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done
Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done


### **import dependencies**

In [2]:
import os, zipfile
import cv2
import plotly.express as px
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from kaggle_secrets import UserSecretsClient
import pydicom
import wandb

from pathlib import Path

### **configuration and initialization**

In [3]:
SIIM_COVID19_DETECTION_DIR = '/kaggle/input/siim-covid19-detection/'

WORKING_DIR = '/kaggle/working/'
TEMP_DIR = '/kaggle/temp/'

INPUT_DIR = SIIM_COVID19_DETECTION_DIR+'train/'
OUTPUT_DIR = WORKING_DIR+'data/'

TRAIN_IMAGE_LEVEL_PATH = SIIM_COVID19_DETECTION_DIR+'train_image_level.csv'
TRAIN_STUDY_LEVEL_PATH = SIIM_COVID19_DETECTION_DIR+'train_study_level.csv'

IMG_SIZE = WIDTH = HEIGHT = 512
N_IMAGES_WANDB = 42


INTERPOLATION = cv2.INTER_LANCZOS4

In [4]:
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("WANDB_API_KEY")
os.environ['WANDB_API_KEY'] = secret_value_0

wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33madrien-loridan[0m (use `wandb login --relogin` to force relogin)


True

### **load csv file**

In [6]:
df_train_image_level = pd.read_csv(TRAIN_IMAGE_LEVEL_PATH)
df_train_study_level = pd.read_csv(TRAIN_STUDY_LEVEL_PATH)

### **first look**

In [7]:
df_train_image_level.sample(5)

Unnamed: 0,id,boxes,label,StudyInstanceUID
966,270bbfe1f9e5_image,"[{'x': 396.79994, 'y': 1913.60007, 'width': 13...",opacity 1 396.79994 1913.60007 1740.8000000000...,7e5a7b6193ed
2402,61e618dc7f9a_image,,none 1 0 0 1 1,7416b5cbc531
4919,c76d21aa7403_image,"[{'x': 463.91406, 'y': 1534.00976, 'width': 51...",opacity 1 463.91406 1534.00976 976.55859000000...,612f28c33ac0
3892,9ecb1253f647_image,"[{'x': 1694.02355, 'y': 295.15852, 'width': 59...",opacity 1 1694.02355 295.15852 2290.4262599999...,3da10090af63
2937,76c66ee8e58d_image,,none 1 0 0 1 1,c29970048923


In [8]:
df_train_image_level.describe()

Unnamed: 0,id,boxes,label,StudyInstanceUID
count,6334,4294,6334,6334
unique,6334,4294,4295,6054
top,0a5f36296cb0_image,"[{'x': 2812.39057, 'y': 1180.85304, 'width': 4...",none 1 0 0 1 1,0fd2db233deb
freq,1,1,2040,9


In [9]:
df_train_study_level.sample(5)

Unnamed: 0,id,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
1776,4c1f8d4c24ea_study,0,1,0,0
3735,9eabf871003c_study,0,0,1,0
503,159cbf48e73c_study,0,1,0,0
1320,38111e7353fb_study,0,1,0,0
4523,bec6fb240456_study,0,1,0,0


In [10]:
df_train_study_level.describe()

Unnamed: 0,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
count,6054.0,6054.0,6054.0,6054.0
mean,0.276842,0.471589,0.173274,0.078295
std,0.447475,0.499233,0.378515,0.268658
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0


### **merge df study/image, add path image**

In [11]:
df_train_image_level['id'] = df_train_image_level.apply(lambda row: row.id.split('_')[0], axis=1)
df_train_image_level['path'] = df_train_image_level.apply(lambda row: OUTPUT_DIR+row.id+'.jpg', axis=1)
df_train_image_level['image_level'] = df_train_image_level.apply(lambda row: row.label.split(' ')[0], axis=1)

df_train_study_level['id'] = df_train_study_level.apply(lambda row: row.id.split('_')[0], axis=1)
df_train_study_level.columns = ['StudyInstanceUID', 'Negative for Pneumonia', 'Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']

In [12]:
df = df_train_image_level.merge(df_train_study_level, on='StudyInstanceUID',how="left")
df.sample(3)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
301,0b55cd36e5cc,"[{'x': 158.60737, 'y': 588.96881, 'width': 117...",opacity 1 158.60737 588.96881 1330.14309 2579....,508614a9c9ff,/kaggle/working/data/0b55cd36e5cc.jpg,opacity,0,1,0,0
3506,8e3a2441aead,"[{'x': 857.10131, 'y': 450.99124, 'width': 100...",opacity 1 857.10131 450.99124 1862.52819 2083....,71d4e9b630fb,/kaggle/working/data/8e3a2441aead.jpg,opacity,0,1,0,0
2199,59710cda6113,,none 1 0 0 1 1,cf7914085784,/kaggle/working/data/59710cda6113.jpg,none,1,0,0,0


In [13]:
print(f"Number of images in trainset: {len(df)}")
print(f"Number of images in trainset ( without boxes): {df['boxes'].isna().sum()}")
print(f"Number of images in trainset ( with boxes): {len(df) - df['boxes'].isna().sum()}")

Number of images in trainset: 6334
Number of images in trainset ( without boxes): 2040
Number of images in trainset ( with boxes): 4294


In [14]:
labels = df[['Negative for Pneumonia','Typical Appearance','Indeterminate Appearance','Atypical Appearance']]

In [15]:
fig = px.bar(labels.sum(),
             title="<b>Distribution images by classes</b>",)
fig.update_layout(showlegend=False,
                  xaxis_title="",
                  yaxis_title="")


fig.show()

In [16]:
df['study_level'] = np.argmax(labels.values, axis=1)
df.sample(3)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,study_level
6266,fd7515610b5a,"[{'x': 436.49994, 'y': 168.75, 'width': 1155, ...",opacity 1 436.49994 168.75 1591.49994 2313.75 ...,f13c5fb56a73,/kaggle/working/data/fd7515610b5a.jpg,opacity,0,1,0,0,1
2440,636b2b26d2f8,,none 1 0 0 1 1,e598f9bfe550,/kaggle/working/data/636b2b26d2f8.jpg,none,1,0,0,0,0
5995,f2633be66b0d,"[{'x': 1831.25336, 'y': 436.93333, 'width': 73...",opacity 1 1831.25336 436.93333 2569.78669 2076...,dcc6632b56b4,/kaggle/working/data/f2633be66b0d.jpg,opacity,0,1,0,0,1


In [17]:
no_bb = df[df['boxes'].isna()].shape[0]
has_bb = df[df['boxes'].notna()].shape[0]

px.pie(names=["with boxes", "without boxes"],
       values=[has_bb, no_bb], 
       title="<b>Distribution images by boxes</b>")

In [18]:
no_bb = df[(df['boxes'].isna() & df['Negative for Pneumonia'] ==1)].shape[0]
has_bb = df[(df['boxes'].notna() & df['Negative for Pneumonia'] ==1)].shape[0]

px.pie(names=["with boxes", "without boxes"],
       values=[has_bb, no_bb], 
       title="<b>Distribution images by boxes for negative study</b>")

In [19]:
no_bb = df[(df['boxes'].isna() & df['Negative for Pneumonia'] ==0)].shape[0]
has_bb = df[(df['boxes'].notna() & df['Negative for Pneumonia'] ==0)].shape[0]

px.pie(names=["with boxes", "without boxes"],
       values=[has_bb, no_bb], 
       title="<b>Distribution images by boxes for positive study</b>")

In [20]:
label_to_class_id = {
    'Negative for Pneumonia': 0,
    'Typical Appearance': 1,
    'Indeterminate Appearance': 2,
    'Atypical Appearance': 3
}

class_id_to_label = {v: k for k, v in label_to_class_id.items()}

### **get path dicom files**

In [21]:
path_dicom_files = []

total = sum([len(f) for r, d, f in os.walk(INPUT_DIR)])

with tqdm(total=total) as pbar:
    for dirname, _, filenames in os.walk(INPUT_DIR):
        for file in filenames:
            path_dicom_files.append(Path(os.path.join(dirname, file)))
            pbar.update(1)

  0%|          | 0/6334 [00:00<?, ?it/s]

### **rescale all train images and save to IMG_SIZE=512x512px jpg / save original width and height then export df**

In [22]:
img=None
for p in tqdm(path_dicom_files):
    img_name = p.parts[-1][0:-4]
    if img_name =='039159f7b61b':
        print(True)
        dcm = pydicom.dcmread(p)
        img = dcm.pixel_array
        if dcm.PhotometricInterpretation == "MONOCHROME1":
            img = cv2.bitwise_not(img)
        img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8U)
        img = cv2.resize(img, (WIDTH, HEIGHT), interpolation = INTERPOLATION)

  0%|          | 0/6334 [00:00<?, ?it/s]

True



The length of the pixel data in the dataset (13262360 bytes) indicates it contains excess padding. 216296 bytes will be removed from the end of the data



In [23]:
df.loc[:,"width"] = np.nan
df.loc[:,"height"] = np.nan


for p in tqdm(path_dicom_files):
    dcm = pydicom.dcmread(p)
    img = dcm.pixel_array
    img_name = p.parts[-1][0:-4]
    
    index = df[df['id'].str.contains(img_name)].index
    df.loc[index, ['width']] = img.shape[0]
    df.loc[index, ['height']] = img.shape[1]

    if dcm.PhotometricInterpretation == "MONOCHROME1":
        img = cv2.bitwise_not(img)
    img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8U)
    img = cv2.resize(img, (WIDTH, HEIGHT), interpolation = INTERPOLATION)
    
    cv2.imwrite(OUTPUT_DIR+img_name+'.jpg', img)
    
#039159f7b61b image return error (or 920d7ef35702 )
    

  0%|          | 0/6334 [00:00<?, ?it/s]

In [24]:
df.to_csv(WORKING_DIR+'meta.csv', index = False)

### **df images with boxes**

In [25]:
opacity_df = df.dropna(subset = ["boxes"], inplace=False)
opacity_df = opacity_df.reset_index(drop=True)

In [26]:
opacity_df.sample(5)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,study_level,width,height
402,1843ab225632,"[{'x': 539.74635, 'y': 1342.18882, 'width': 13...",opacity 1 539.74635 1342.18882 1859.12635 2803...,4ae89e667a24,/kaggle/working/data/1843ab225632.jpg,opacity,0,1,0,0,1,3488.0,4256.0
1611,61dbca9280da,"[{'x': 1576.38667, 'y': 1028.79333, 'width': 5...",opacity 1 1576.38667 1028.79333 2151.9333 1606...,09dd3b736a69,/kaggle/working/data/61dbca9280da.jpg,opacity,0,1,0,0,1,2544.0,3056.0
362,149c73c79507,"[{'x': 1874.04946, 'y': 582.60854, 'width': 29...",opacity 1 1874.04946 582.60854 2165.57912 1216...,bc129d5dbba7,/kaggle/working/data/149c73c79507.jpg,opacity,0,0,0,1,3,2336.0,2836.0
4052,f2044befe4f9,"[{'x': 611.0948, 'y': 361.58786, 'width': 1102...",opacity 1 611.0948 361.58786 1713.496839999999...,b5e30871622c,/kaggle/working/data/f2044befe4f9.jpg,opacity,0,1,0,0,1,3488.0,4256.0
2978,b3faa4b8cd5d,"[{'x': 468.7981, 'y': 1146.78167, 'width': 871...",opacity 1 468.7981 1146.78167 1340.52143 2086....,272dfcc2c228,/kaggle/working/data/b3faa4b8cd5d.jpg,opacity,0,0,0,1,3,2539.0,3050.0


In [27]:
opacity_df.describe()

Unnamed: 0,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,study_level,width,height
count,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0
mean,0.0,0.664648,0.244294,0.091057,1.426409,2767.499301,3247.543549
std,0.0,0.472168,0.429718,0.287724,0.653298,598.85804,731.608369
min,0.0,0.0,0.0,0.0,1.0,1140.0,1140.0
25%,0.0,0.0,0.0,0.0,1.0,2336.0,2836.0
50%,0.0,1.0,0.0,0.0,1.0,2544.0,3032.0
75%,0.0,1.0,0.0,0.0,2.0,3480.0,4240.0
max,0.0,1.0,1.0,1.0,3.0,4891.0,4891.0


### **convert train image boxes to wandb image for visualization**

In [28]:
def get_bbox(row):
    bboxes = []
    bbox = []
    for i, l in enumerate(row.label.split(' ')):
        if (i % 6 == 0) | (i % 6 == 1):
            continue
        bbox.append(float(l))
        if i % 6 == 5:
            bboxes.append(bbox)
            bbox = []  
            
    return bboxes

In [29]:
def scale_bbox(row, bboxes):
    scale_x = IMG_SIZE/row.width
    scale_y = IMG_SIZE/row.height
    
    scaled_bboxes = []
    for bbox in bboxes:
        x = int(np.round(bbox[0]*scale_x, 4))
        y = int(np.round(bbox[1]*scale_y, 4))
        x1 = int(np.round(bbox[2]*(scale_x), 4))
        y1= int(np.round(bbox[3]*scale_y, 4))

        scaled_bboxes.append([x, y, x1, y1]) # xmin, ymin, xmax, ymax
    
    return scaled_bboxes

In [30]:
def wandb_bbox(image, bboxes, true_label, class_id_to_label):
    all_boxes = []
    for bbox in bboxes:
        box_data = {"position": {
                        "minX": bbox[0],
                        "minY": bbox[1],
                        "maxX": bbox[2],
                        "maxY": bbox[3]
                    },
                     "class_id" : int(true_label),
                     "box_caption": class_id_to_label[true_label],
                     "domain" : "pixel"}
        all_boxes.append(box_data)
    

    return wandb.Image(image, boxes={
        "ground_truth": {
            "box_data": all_boxes,
          "class_labels": class_id_to_label
        }
    })

In [31]:
sampled_opacity_df = opacity_df.sample(N_IMAGES_WANDB).reset_index(drop=True)

run = wandb.init(project='project8-kaggle-covid19')

wandb_bbox_list = []
for i in tqdm(range(sampled_opacity_df.shape[0])):
    row = sampled_opacity_df.loc[i]
    image = cv2.imread(row.path)
    bboxes = get_bbox(row)
    scale_bboxes = scale_bbox(row, bboxes)
    true_label = row.study_level
    wandb_bbox_list.append(wandb_bbox(image, 
                                      scale_bboxes, 
                                      true_label, 
                                      class_id_to_label))
    
wandb.log({"radiograph": wandb_bbox_list})

run.finish()

run

[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


  0%|          | 0/42 [00:00<?, ?it/s]

VBox(children=(Label(value=' 9.24MB of 9.24MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
_runtime,13
_timestamp,1623866775
_step,0


0,1
_runtime,▁
_timestamp,▁
_step,▁


### **ref** 

* https://www.kaggle.com/xhlulu
* https://www.kaggle.com/yujiariyasu
* https://www.kaggle.com/ayuraj
* https://www.kaggle.com/dschettler8845   
....