**import des librairies**

In [1]:
import os
import shutil
import zipfile

import cv2
import numpy as np
import pandas as pd
import wandb
import yaml
from sklearn.model_selection import train_test_split
from tqdm import tqdm

**paramètres de configuration**

In [2]:
# Root folder for the Kaggle data; the download cell only runs when this folder is absent.
DATA_DIR = './data/'
# Image folders under DATA_DIR (trailing slash kept because paths are built by string concatenation).
TRAIN_PATH = DATA_DIR + 'train/'
TEST_PATH = DATA_DIR + 'test/'
# Local checkout of the YOLOv5 repository.
YOLOV5_DIR = './yolov5/'
# Scratch folder that receives the images/ and labels/ trees used for training.
TEMP_DIR = './temp/'
# Kaggle dataset slug passed to `kaggle datasets download`.
KAGGLE_RESIZED_DATASET_NAME = 'xhlulu/siim-covid19-resized-to-512px-jpg'
# Side length (pixels) the bounding boxes are rescaled to.
# NOTE(review): presumably matches the 512px images of the resized dataset — confirm.
IMG_SIZE = 512

# Weights & Biases project name, passed as --project in the training command.
WANDB_PROJECT_NAME = 'project8-kaggle-covid19'
# Left empty; not referenced by any cell visible in this notebook.
WANDB_ENTITY_NAME = ''

In [3]:
# Training hyper-parameters interpolated into the train.py command line template.
BATCH_SIZE = 16
EPOCHS = 10

**téléchargement automatique des données**

In [4]:
if not os.path.isdir(DATA_DIR):
    !kaggle datasets download {KAGGLE_RESIZED_DATASET_NAME} --path {DATA_DIR} --unzip --quiet
    !kaggle competitions download -c "siim-covid19-detection" --file "sample_submission.csv" --path {DATA_DIR} --quiet
    !kaggle competitions download -c "siim-covid19-detection" --file "train_image_level.csv" --path {DATA_DIR} --quiet
    !kaggle competitions download -c "siim-covid19-detection" --file "train_study_level.csv" --path {DATA_DIR} --quiet
    
    file_to_extract = DATA_DIR+"/train_image_level.csv.zip"
    with zipfile.ZipFile(file_to_extract) as file:
        file.extract("train_image_level.csv", DATA_DIR)
    os.remove(file_to_extract)

**téléchargement du modèle YoloV5**

In [5]:
if not os.path.isdir(YOLOV5_DIR):
    !git clone https://github.com/ultralytics/yolov5
    %cd {YOLOV5_DIR}
    !pip install -qr requirements.txt
    %cd ../

**authentification wandb**

In [6]:
# Authenticate this session with Weights & Biases; the cell output reports the logged-in account.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33madrien-loridan[0m (use `wandb login --relogin` to force relogin)


True

**import des métadonnées**

In [7]:
# Image-level annotations (one row per image; provides the id, boxes and label columns).
df = pd.read_csv(DATA_DIR+'/train_image_level.csv')
# Study-level labels (one row per study).
label_df = pd.read_csv(DATA_DIR+'/train_study_level.csv')
# Per-image metadata; filtered on its `split` column and merged into df in the next cell.
meta_df = pd.read_csv(DATA_DIR+'/meta.csv')

**merge des annotations**

In [8]:
# Keep only the part of each image id that precedes the first underscore.
# Vectorized .str operations replace the row-wise apply(axis=1) calls.
df['id'] = df['id'].str.split('_').str[0]
# Location of the image file inside TRAIN_PATH.
df['path'] = TRAIN_PATH + df['id'] + '.jpg'
# First token of the label string, e.g. 'opacity' or 'none'.
df['image_level'] = df['label'].str.split(' ').str[0]

# Same id clean-up for the study-level table, then positional renaming of its columns.
label_df['id'] = label_df['id'].str.split('_').str[0]
label_df.columns = ['StudyInstanceUID', 'Negative for Pneumonia', 'Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']

# Metadata rows of the training split; .copy() so the column rename acts on an
# independent frame rather than on a slice of meta_df.
train_meta_df = meta_df.loc[meta_df['split'] == 'train'].copy()
train_meta_df.columns = ['id', 'dim0', 'dim1', 'split']

# Left join keeps every annotated image and attaches dim0, dim1 and split.
df = df.merge(train_meta_df, on='id', how="left")
df.sample(4)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level,dim0,dim1,split
6034,f42957f488ef,"[{'x': 1817.65592, 'y': 1347.56175, 'width': 7...",opacity 1 1817.65592 1347.56175 2605.39808 197...,3ca7c79b3eac,./data/train/f42957f488ef.jpg,opacity,2436,3032,train
3683,94dfaa1ebe4d,"[{'x': 691.93567, 'y': 142.77639, 'width': 683...",opacity 1 691.93567 142.77639 1375.02257999999...,b9aa4efa289f,./data/train/94dfaa1ebe4d.jpg,opacity,2416,2872,train
902,246ecfa577ee,,none 1 0 0 1 1,ded38a5051da,./data/train/246ecfa577ee.jpg,none,2457,2840,train
3176,807656417522,"[{'x': 2805.46667, 'y': 2077.4333, 'width': 40...",opacity 1 2805.46667 2077.4333 3211.80017 2533...,ae70fa66b983,./data/train/807656417522.jpg,opacity,3480,4240,train


In [9]:
#df = df.dropna(subset = ["boxes"], inplace=False)
#df = df.reset_index(drop=True)

In [10]:
# (rows, columns) before duplicate ids are removed.
df.shape

(6334, 9)

In [11]:
# Keep the first row for every image id and discard later repeats.
df = df.drop_duplicates(subset=['id'], keep='first')

In [12]:
# (rows, columns) after duplicate ids are removed.
df.shape

(6334, 9)

**échantillon**

In [13]:
# Work on a small subset of at most 128 images. random_state makes the draw
# reproducible across kernel restarts, and min() prevents a ValueError when
# fewer than 128 rows are available.
df = df.sample(n=min(128, len(df)), random_state=42)

**train test split**

In [14]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)

In [15]:
# Take explicit copies before the column assignment that follows
# (presumably to avoid pandas' SettingWithCopyWarning — confirm).
train_df = train_df.copy()
valid_df = valid_df.copy()

In [16]:
# Overwrite the 'split' column with the subset each row now belongs to.
train_df.loc[:, 'split'] = 'train'
valid_df.loc[:, 'split'] = 'valid'

# Recombine both subsets into one frame whose index is renumbered from 0.
df = pd.concat([train_df, valid_df]).reset_index(drop=True)
df.sample(4)

Unnamed: 0,id,boxes,label,StudyInstanceUID,path,image_level,dim0,dim1,split
41,6f08b0295244,,none 1 0 0 1 1,2edd69dd0934,./data/train/6f08b0295244.jpg,none,3006,3006,train
14,1567909b87d1,"[{'x': 252.3774, 'y': 1680.38239, 'width': 754...",opacity 1 252.3774 1680.38239 1007.01391 2886....,c7c53a0fcccc,./data/train/1567909b87d1.jpg,opacity,3488,4256,train
29,861e6b07c9d8,"[{'x': 1884.95248, 'y': 1217.28, 'width': 883....",opacity 1 1884.95248 1217.28 2768.32581 2257.0...,366c5acac35b,./data/train/861e6b07c9d8.jpg,opacity,2536,3048,train
68,f37d85d13cdd,"[{'x': 1862.83636, 'y': 1039.7215, 'width': 49...",opacity 1 1862.83636 1039.7215 2361.2268599999...,30c274c783a2,./data/train/f37d85d13cdd.jpg,opacity,2330,2846,train


**création des dossiers temporaires d'entrainement**

In [17]:
#assert(os.path.isdir(TEMP_DIR)==False)

In [18]:
# Populate the scratch tree only once: nothing happens when TEMP_DIR already exists.
if not os.path.isdir(TEMP_DIR):
    # images/ and labels/ each get a train/ and a valid/ sub-folder.
    for subset_dir in (TEMP_DIR+'images/train',
                       TEMP_DIR+'images/valid',
                       TEMP_DIR+'labels/train',
                       TEMP_DIR+'labels/valid'):
        os.makedirs(subset_dir, exist_ok=True)

    # Copy every image into the folder of its subset. Iterating the columns directly
    # avoids label-based df.loc[i] lookups driven by a positional counter.
    for img_id, src_path, subset in tqdm(zip(df['id'], df['path'], df['split']), total=len(df)):
        # Anything that is not 'train' goes to the validation folder.
        target = 'train' if subset == 'train' else 'valid'
        shutil.copy(src_path, f'{TEMP_DIR}images/{target}/{img_id}.jpg')

100%|███████████████████████████████████████████████████████████████████████████████| 128/128 [00:00<00:00, 131.52it/s]


**création du fichier de configuration YAML pour YOLOv5**

In [19]:
# Write the YOLOv5 dataset description once; an existing data.yaml is left untouched.
yaml_path = YOLOV5_DIR+'data/data.yaml'
if not os.path.isfile(yaml_path):
    # TEMP_DIR without its leading './', prefixed with '../' in the paths below.
    # NOTE(review): presumably resolved from inside the yolov5 directory — confirm.
    temp_rel = TEMP_DIR[2:]
    data_yaml = {
        'train': f'../{temp_rel}images/train',
        'val': f'../{temp_rel}images/valid',
        'nc': 2,                        # number of classes
        'names': ['none', 'opacity'],   # class index -> class name
    }

    with open(yaml_path, 'w') as outfile:
        yaml.dump(data_yaml, outfile, default_flow_style=True)

**diverses fonctions pour le formatage des bounding boxes**

In [20]:
def get_bbox(row):
    """Parse the coordinates of every box encoded in ``row.label``.

    The label is a whitespace-separated sequence of 6-token groups. The first
    two tokens of each group are skipped and the remaining four are read as
    floats, e.g. ``'opacity 1 10 20 30 40'`` -> ``[10.0, 20.0, 30.0, 40.0]``.

    Args:
        row: object exposing a ``label`` string attribute.

    Returns:
        A list with one ``[float, float, float, float]`` entry per complete
        6-token group. An incomplete trailing group is ignored.
    """
    # split() without an argument collapses runs of whitespace; split(' ') produced
    # empty tokens on doubled spaces, which shifted the 6-token grouping and
    # silently yielded wrong boxes.
    tokens = row.label.split()

    bboxes = []
    # Step through complete groups only: start, start+1 are skipped, start+2..start+5 are coordinates.
    for start in range(0, len(tokens) - 5, 6):
        bboxes.append([float(token) for token in tokens[start + 2:start + 6]])

    return bboxes

# Scale the bounding boxes according to the size of the resized image.
def scale_bbox(row, bboxes, img_size=None):
    """Rescale boxes from the original image dimensions to a square image.

    Args:
        row: object exposing ``dim0`` and ``dim1``. ``dim1`` divides the x
            coordinates and ``dim0`` divides the y coordinates.
        bboxes: iterable of ``[xmin, ymin, xmax, ymax]`` in original pixels.
        img_size: side length of the target image. Defaults to the
            module-level ``IMG_SIZE`` when omitted, which is the behaviour
            existing callers rely on.

    Returns:
        A list of ``[xmin, ymin, xmax, ymax]`` lists of ints in target pixels.
    """
    if img_size is None:
        img_size = IMG_SIZE

    # Get scaling factor
    scale_x = img_size / row.dim1
    scale_y = img_size / row.dim0

    def _to_pixel(value, scale):
        # Rounded to 4 decimals, then int() truncates toward zero (so 3.95 -> 3).
        return int(np.round(value * scale, 4))

    return [
        [
            _to_pixel(bbox[0], scale_x),  # xmin
            _to_pixel(bbox[1], scale_y),  # ymin
            _to_pixel(bbox[2], scale_x),  # xmax
            _to_pixel(bbox[3], scale_y),  # ymax
        ]
        for bbox in bboxes
    ]

# Convert the bounding boxes in YOLO format.
def get_yolo_format_bbox(img_w, img_h, bboxes):
    """Turn corner-style boxes into normalized YOLO strings.

    Args:
        img_w: image width used to normalize x centre and box width.
        img_h: image height used to normalize y centre and box height.
        bboxes: iterable of ``[xmin, ymin, xmax, ymax]``.

    Returns:
        One ``[x_center, y_center, width, height]`` list per box, each value
        formatted as a string with six decimals.
    """
    def _convert(box):
        xmin, ymin, xmax, ymax = box[0], box[1], box[2], box[3]
        width = xmax - xmin
        height = ymax - ymin
        # The half extents are rounded to whole pixels before being added to the corner.
        x_center = xmin + int(np.round(width / 2))
        y_center = ymin + int(np.round(height / 2))
        normalized = (x_center / img_w, y_center / img_h, width / img_w, height / img_h)
        return [f'{value:.6f}' for value in normalized]

    return [_convert(box) for box in bboxes]

**création des labels (fichiers .txt)**

In [21]:
# Generate the YOLO label files only when labels/train is still empty.
if not os.listdir(f'{TEMP_DIR}labels/train'):
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Label files mirror the image layout: labels/<subset>/<id>.txt
        subset = 'train' if row.split == 'train' else 'valid'
        file_name = f'{TEMP_DIR}labels/{subset}/{row.id}.txt'

        # Only images annotated with opacities get a label file; other rows write nothing.
        if row.image_level == 'opacity':
            bboxes = get_bbox(row)
            scale_bboxes = scale_bbox(row, bboxes)
            yolo_bboxes = get_yolo_format_bbox(IMG_SIZE, IMG_SIZE, scale_bboxes)

            with open(file_name, 'w') as f:
                for bbox in yolo_bboxes:
                    # One line per box: class index 1 followed by
                    # x_center y_center width height (already formatted strings).
                    f.write(' '.join(['1'] + bbox) + '\n')

100%|██████████████████████████████████████████████████████████████████████████████| 128/128 [00:00<00:00, 2063.81it/s]


In [22]:
# Switch the working directory to the YOLOv5 checkout; the training command refers to train.py by relative path.
%cd {YOLOV5_DIR}

C:\Users\adrie\Workspace\openclassrooms-iml-projects\project8\yolov5


In [23]:
""""!python train.py --img {IMG_SIZE} \
                 --batch-size {BATCH_SIZE} \
                 --epochs {EPOCHS} \
                 --data data/data.yaml \
                 --weights yolov5s.pt \
                 --save_period 1\
                 --project {WANDB_PROJECT_NAME}"""

'"!python train.py --img {IMG_SIZE}                  --batch-size {BATCH_SIZE}                  --epochs {EPOCHS}                  --data data/data.yaml                  --weights yolov5s.pt                  --save_period 1                 --project {WANDB_PROJECT_NAME}'

python train.py --img-size 512 --batch-size 16 --epochs 10 --data data.yaml --weights yolov5x.pt --save_period 1 --project project8-kaggle-covid19
