In [1]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import wandb
from tqdm import tqdm

**paramètres de configuration**

In [2]:
# Root folder of the dataset; the image folders keep a trailing slash because
# image paths are later built by plain string concatenation.
DATA_DIR = './data'
TRAIN_PATH = f'{DATA_DIR}/train/'
TEST_PATH = f'{DATA_DIR}/test/'

# Target side length (pixels) used when rescaling bounding-box coordinates.
IMG_SIZE = 512
# Number of annotated images sampled for the Weights & Biases visualisation.
NB_IMAGES_VISUALISATION = 42

**import des métadonnées**

In [3]:
# Image-level annotations (one row per image, with its bounding boxes).
df = pd.read_csv(f'{DATA_DIR}/train_image_level.csv')
# Study-level labels (one 0/1 indicator column per class).
label_df = pd.read_csv(f'{DATA_DIR}/train_study_level.csv')
# Original image dimensions and train/test split of every image.
meta_df = pd.read_csv(f'{DATA_DIR}/meta.csv')

**premières observations**

In [4]:
# Peek at 5 randomly drawn rows of the image-level annotation table.
df.sample(5)

Unnamed: 0,id,boxes,label,StudyInstanceUID
5499,dd3174ed1ffe_image,,none 1 0 0 1 1,8e696c58fe5a
1467,3bbe55d548e4_image,,none 1 0 0 1 1,cec70a25ea45
2908,75e68ff861bb_image,"[{'x': 393.68089, 'y': 175.27153, 'width': 218...",opacity 1 393.68089 175.27153 612.65696 806.35...,497096b56928
4674,bdf485c8b391_image,"[{'x': 1057.07163, 'y': 855.18855, 'width': 60...",opacity 1 1057.07163 855.18855 1658.8501899999...,c45123ecdd71
3867,9d8d469d6bf4_image,,none 1 0 0 1 1,7b794c082812


In [5]:
# Summary of the image-level table (count / unique / top / freq per column).
df.describe()

Unnamed: 0,id,boxes,label,StudyInstanceUID
count,6334,4294,6334,6334
unique,6334,4294,4295,6054
top,5c1e736c0af9_image,"[{'x': 704.51895, 'y': 1187.80541, 'width': 63...",none 1 0 0 1 1,0fd2db233deb
freq,1,1,2040,9


In [6]:
# Peek at 5 randomly drawn rows of the study-level label table.
label_df.sample(5)

Unnamed: 0,id,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
4387,b99ab479998f_study,0,1,0,0
5419,e4b50e7402c3_study,1,0,0,0
1982,55b4ef57b6d4_study,0,1,0,0
172,080c164369e7_study,0,1,0,0
4091,ad5c640225b4_study,0,1,0,0


In [7]:
# Descriptive statistics of the study-level indicator columns; since they only
# hold 0/1 values, each column mean is the share of studies in that class.
label_df.describe()

Unnamed: 0,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
count,6054.0,6054.0,6054.0,6054.0
mean,0.276842,0.471589,0.173274,0.078295
std,0.447475,0.499233,0.378515,0.268658
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0


In [8]:
# Peek at 5 randomly drawn rows of the image metadata table.
meta_df.sample(5)

Unnamed: 0,image_id,dim0,dim1,split
1727,d19332cb71d5,2801,2802,train
1006,c697b7d09124,2581,2566,train
576,a16b4c54153c,3488,4256,train
1368,30159ae10968,2320,2832,train
4946,c8cf39d50d34,2400,2880,train


In [9]:
# Descriptive statistics of the original image dimensions (dim0, dim1).
meta_df.describe()

Unnamed: 0,dim0,dim1
count,7597.0,7597.0
mean,2740.16467,3172.717125
std,566.462481,687.08893
min,846.0,1140.0
25%,2336.0,2836.0
50%,2544.0,3027.0
75%,3052.0,3408.0
max,4891.0,4891.0


**modifications des champs pour le merge des métadonnées**

In [10]:
# Vectorised string operations replace the row-wise `apply(axis=1)` calls:
# same result, much faster, and the cell can be re-run safely.

# Drop the '_image' suffix so that `id` is the bare image identifier.
df['id'] = df['id'].str.split('_').str[0]
# Path of the image file on disk.
df['path'] = TRAIN_PATH + df['id'] + '.jpg'
# First token of the label string ('none' or 'opacity' in the rows shown above).
df['image_level'] = df['label'].str.split(' ').str[0]

# Rename by name rather than overwriting every column positionally, then drop
# the '_study' suffix so the key matches df['StudyInstanceUID'].
label_df = label_df.rename(columns={'id': 'StudyInstanceUID'})
label_df['StudyInstanceUID'] = label_df['StudyInstanceUID'].str.split('_').str[0]


In [11]:
# Attach the study-level labels to every image. Several images may belong to
# one study, but each study must appear only once in `label_df`; `validate`
# raises a MergeError otherwise instead of silently duplicating image rows.
df = df.merge(label_df, on='StudyInstanceUID', how='left', validate='many_to_one')
df.sample(3)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance
3375,89ae9de3ada1,,none 1 0 0 1 1,299201b9ba40,./data/train/89ae9de3ada1.jpg,none,1,0,0,0
3345,882aa657e170,"[{'x': 340.43734, 'y': 1002.8014, 'width': 106...",opacity 1 340.43734 1002.8014 1401.17596 2234....,bae643d12bd6,./data/train/882aa657e170.jpg,opacity,0,1,0,0
2313,5e0c5bcc27c7,"[{'x': 2032.80003, 'y': 541.33333, 'width': 59...",opacity 1 2032.80003 541.33333 2630.1333600000...,785e985b3a22,./data/train/5e0c5bcc27c7.jpg,opacity,0,1,0,0


In [12]:
# Total number of training images, and how many of them carry no bounding box.
n_images = len(df)
n_images_without_boxes = df['boxes'].isna().sum()

print(f"Nombre d'image dans le trainset: {n_images}")
print(f"Nombre d'image dans le trainset n'ayant pas d'objets à detecter: {n_images_without_boxes}")

Nombre d'image dans le trainset: 6334
Nombre d'image dans le trainset n'ayant pas d'objets à detecter: 2040


In [13]:
# Keep only the four study-level indicator columns, in class-id order.
labels = df.loc[:, [
    'Negative for Pneumonia',
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
]]

In [49]:
# The label columns are 0/1 indicators, so each column sum is the number of
# images in that class.
class_counts = labels.sum()

fig = px.bar(class_counts, title="<b>Distribution des images par classes</b>")
fig.update_layout(xaxis_title="", yaxis_title="", showlegend=False)
fig.show()

In [16]:
# Collapse the four indicator columns into one integer class id: the position
# of the largest value in each row of `labels`.
df['study_level'] = labels.to_numpy().argmax(axis=1)
df.sample(3)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,study_level
4933,c7f5e09a2415,,none 1 0 0 1 1,cec0cef8568d,./data/train/c7f5e09a2415.jpg,none,1,0,0,0,0
5936,efe43aeaa2ee,"[{'x': 1229.33887, 'y': 674.71885, 'width': 12...",opacity 1 1229.33887 674.71885 2506.80127 3037...,5f893a168417,./data/train/efe43aeaa2ee.jpg,opacity,0,1,0,0,1
5758,e7f48029ca34,"[{'x': 2355.64909, 'y': 124.83056, 'width': 46...",opacity 1 2355.64909 124.83056 2822.6922999999...,830d41512494,./data/train/e7f48029ca34.jpg,opacity,0,1,0,0,1


In [17]:
# Class id -> study-level label; ids follow the order of the label columns.
class_id_to_label = dict(enumerate([
    'Negative for Pneumonia',
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
]))

# Reverse lookup: study-level label -> class id.
label_to_class_id = {label: class_id for class_id, label in class_id_to_label.items()}

In [18]:
# Keep the metadata of training images only and expose the image identifier
# under the same column name as in `df` ('id') so both frames can be merged.
# `rename` returns a new frame and renames by name, so it neither depends on
# column order nor modifies a slice of `meta_df`.
train_meta_df = (
    meta_df.loc[meta_df['split'] == 'train']
    .rename(columns={'image_id': 'id'})
)
train_meta_df.sample(3)

Unnamed: 0,id,dim0,dim1,split
4444,001bd15d1891,2800,3408,train
5016,1b98d395bdb5,3488,4256,train
3909,81c988c895a8,2800,3408,train


In [19]:
# Attach the original image dimensions (dim0, dim1) to every image.
# `validate` raises a MergeError if an image id appears more than once in the
# metadata, instead of silently duplicating rows of `df`.
df = df.merge(train_meta_df, on='id', how='left', validate='many_to_one')
df.sample(5)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,study_level,dim0,dim1,split
1461,3b8707263b78,,none 1 0 0 1 1,f7d8873478ac,./data/train/3b8707263b78.jpg,none,1,0,0,0,0,3480,4240,train
5893,edfa7e192393,"[{'x': 771.75717, 'y': 776.01363, 'width': 670...",opacity 1 771.75717 776.01363 1442.31253 1171....,805e6350549c,./data/train/edfa7e192393.jpg,opacity,0,1,0,0,1,2336,2836,train
1316,353d5dbe2de0,,none 1 0 0 1 1,4b1d718d562a,./data/train/353d5dbe2de0.jpg,none,1,0,0,0,0,2560,2350,train
710,1c96d9b08487,,none 1 0 0 1 1,ffcb4630f46f,./data/train/1c96d9b08487.jpg,none,0,1,0,0,1,2800,3408,train
2101,550f057ee0b0,,none 1 0 0 1 1,35521a34aebc,./data/train/550f057ee0b0.jpg,none,0,1,0,0,1,2848,2822,train


In [47]:
# True where an image has no bounding-box annotation.
missing_boxes = df['boxes'].isna()
no_bb = int(missing_boxes.sum())
has_bb = int((~missing_boxes).sum())

px.pie(
    names=["Avec Boxes", "Sans Boxes"],
    values=[has_bb, no_bb],
    title="<b>Distribution des images par boxes</b>",
)

In [20]:
# Images that have at least one annotated bounding box, re-indexed from 0.
opacity_df = df[df['boxes'].notna()].reset_index(drop=True)

In [21]:
# Peek at 5 randomly drawn images that have bounding boxes.
opacity_df.sample(5)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,study_level,dim0,dim1,split
3695,db516ead1471,"[{'x': 565.06623, 'y': 451.6139, 'width': 730....",opacity 1 565.06623 451.6139 1295.35645 2424.3...,193d3d99e496,./data/train/db516ead1471.jpg,opacity,0,1,0,0,1,2800,3245,train
4269,fe6a2dcda2da,"[{'x': 2128.61914, 'y': 1797.93075, 'width': 5...",opacity 1 2128.61914 1797.93075 2691.894039999...,7902e534869e,./data/train/fe6a2dcda2da.jpg,opacity,0,0,1,0,2,2544,3056,train
1635,6373fdd11ed0,"[{'x': 2066.33495, 'y': 170.40825, 'width': 27...",opacity 1 2066.33495 170.40825 2345.69262 259....,4986a30d2f81,./data/train/6373fdd11ed0.jpg,opacity,0,0,0,1,3,2436,3032,train
1593,6107ccde78c7,"[{'x': 330.32102, 'y': 1786.43797, 'width': 82...",opacity 1 330.32102 1786.43797 1154.8308499999...,f5f8f43fece0,./data/train/6107ccde78c7.jpg,opacity,0,0,1,0,2,3488,4256,train
1430,563f8a6c25fb,"[{'x': 1868.26377, 'y': 1038.97636, 'width': 4...",opacity 1 1868.26377 1038.97636 2340.94089 164...,1f89329fdcf2,./data/train/563f8a6c25fb.jpg,opacity,0,0,1,0,2,2610,2677,train


In [22]:
# Descriptive statistics of the numeric columns, restricted to images with boxes.
opacity_df.describe()

Unnamed: 0,Negative for Pneumonia,Typical Appearance,Indeterminate Appearance,Atypical Appearance,study_level,dim0,dim1
count,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0,4294.0
mean,0.0,0.664648,0.244294,0.091057,1.426409,2767.499301,3247.543549
std,0.0,0.472168,0.429718,0.287724,0.653298,598.85804,731.608369
min,0.0,0.0,0.0,0.0,1.0,1140.0,1140.0
25%,0.0,0.0,0.0,0.0,1.0,2336.0,2836.0
50%,0.0,1.0,0.0,0.0,1.0,2544.0,3032.0
75%,0.0,1.0,0.0,0.0,2.0,3480.0,4240.0
max,0.0,1.0,1.0,1.0,3.0,4891.0,4891.0


**diverses fonctions pour formatage des données wandb**

In [23]:
def get_bbox(row):
    """Extract the bounding boxes encoded in ``row.label``.

    The label is split on single spaces and read in groups of six tokens.
    The first two tokens of each group are skipped (a class name and a second
    value in the rows shown above) and the remaining four are converted to
    ``float``.

    Args:
        row: Any object with a string attribute ``label``.

    Returns:
        A list of ``[v0, v1, v2, v3]`` float lists, one per complete group of
        six tokens. A trailing incomplete group is still converted to float
        (so a non-numeric token there raises ``ValueError``) but is dropped.
    """
    tokens = row.label.split(' ')
    # Keep, in order, every token sitting at offset 2..5 within its group.
    coords = [float(tok) for pos, tok in enumerate(tokens) if pos % 6 >= 2]
    # Regroup four by four; a leftover of fewer than four values is discarded.
    return [coords[start:start + 4] for start in range(0, len(coords) - 3, 4)]

In [24]:
def scale_bbox(row, bboxes, img_size=None):
    """Rescale bounding boxes from original image coordinates to a square image.

    x coordinates are multiplied by ``img_size / row.dim1`` and y coordinates by
    ``img_size / row.dim0`` (so ``dim1`` is treated as the original width and
    ``dim0`` as the original height).

    Args:
        row: Any object with numeric attributes ``dim0`` and ``dim1``.
        bboxes: Iterable of ``[xmin, ymin, xmax, ymax]`` sequences.
        img_size: Side length of the target image in pixels. Defaults to the
            module-level ``IMG_SIZE`` when omitted, which is the original
            behaviour.

    Returns:
        A list of ``[xmin, ymin, xmax, ymax]`` lists of ``int``. Each scaled
        value is rounded to 4 decimals and then truncated by ``int()``, so
        3.95 becomes 3 while 3.99996 becomes 4.
    """
    if img_size is None:
        img_size = IMG_SIZE

    scale_x = img_size / row.dim1
    scale_y = img_size / row.dim0

    def _to_pixel(value, scale):
        # Rounding to 4 decimals first absorbs floating-point noise just below
        # an integer before the truncation.
        return int(np.round(value * scale, 4))

    scaled_bboxes = []
    for bbox in bboxes:
        scaled_bboxes.append([
            _to_pixel(bbox[0], scale_x),  # xmin
            _to_pixel(bbox[1], scale_y),  # ymin
            _to_pixel(bbox[2], scale_x),  # xmax
            _to_pixel(bbox[3], scale_y),  # ymax
        ])

    return scaled_bboxes

In [25]:
def wandb_bbox(image, bboxes, true_label, class_id_to_label):
    """Wrap an image and its boxes into a ``wandb.Image`` with a box overlay.

    Args:
        image: Image data passed as-is to ``wandb.Image``.
        bboxes: Iterable of ``[minX, minY, maxX, maxY]`` sequences, declared to
            W&B as pixel coordinates.
        true_label: Class id shared by all boxes of the image; it must be a key
            of ``class_id_to_label`` and convertible with ``int()``.
        class_id_to_label: Mapping from class id to caption.

    Returns:
        A ``wandb.Image`` whose boxes are stored under the ``"ground_truth"`` key.
    """
    def _box_entry(corners):
        # One W&B box record; every box carries the same class id and caption.
        return {
            "position": {
                "minX": corners[0],
                "minY": corners[1],
                "maxX": corners[2],
                "maxY": corners[3],
            },
            "class_id": int(true_label),
            "box_caption": class_id_to_label[true_label],
            "domain": "pixel",
        }

    ground_truth = {
        "box_data": [_box_entry(corners) for corners in bboxes],
        "class_labels": class_id_to_label,
    }
    return wandb.Image(image, boxes={"ground_truth": ground_truth})

**formatage et envoi sur serveur wandb pour visualisation**

In [None]:
# Log in to Weights & Biases before starting a run.
wandb.login()

In [None]:
# Random subset of annotated images to upload; capped at the number of
# available rows so that `sample` cannot fail on a small dataset.
n_samples = min(NB_IMAGES_VISUALISATION, len(opacity_df))
sampled_opacity_df = opacity_df.sample(n_samples).reset_index(drop=True)

run = wandb.init(project='project8-kaggle-covid19')
try:
    wandb_bbox_list = []
    for _, row in tqdm(sampled_opacity_df.iterrows(), total=len(sampled_opacity_df)):
        # cv2.imread does not raise on a missing or unreadable file, it returns None.
        image = cv2.imread(row.path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {row.path}")

        # Boxes are rescaled to IMG_SIZE.
        # NOTE(review): assumes the images on disk are already IMG_SIZE x IMG_SIZE - confirm.
        scaled_bboxes = scale_bbox(row, get_bbox(row))
        wandb_bbox_list.append(
            wandb_bbox(image, scaled_bboxes, row.study_level, class_id_to_label)
        )

    wandb.log({"radiograph": wandb_bbox_list})
finally:
    # Always close the run, even if reading or logging an image failed.
    run.finish()

run

**références**   
pour le redimensionnement du dataset initial : https://www.kaggle.com/xhlulu   
pour une fonction : https://www.kaggle.com/yujiariyasu   
pour le reste /eda/wandb : https://www.kaggle.com/ayuraj   
pour les informations du projet : https://www.kaggle.com/dschettler8845   