In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
!pip install -qU "python-gdcm" pydicom pylibjpeg "opencv-python-headless"

In [3]:
# kaggle API로 데이터를 다운로드합니다.
# kaggle token을 업로드해야합니다 -> 발급받아야함

from google.colab import files
kaggle_token = files.upload()

!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [4]:
# data 디렉토리 생성
!mkdir /content/data

# 캐글 데이터 다운로드
!kaggle competitions download -c rsna-pneumonia-detection-challenge

# 다운로드 받은 케글 데이터를 data 디렉토리로 이동
!mv rsna-pneumonia-detection-challenge.zip /content/data

# 캐글 데이터를 data 디렉토리에 압축풀기
!unzip -q /content/data/rsna-pneumonia-detection-challenge.zip -d /content/data

Downloading rsna-pneumonia-detection-challenge.zip to /content
100% 3.65G/3.66G [00:22<00:00, 226MB/s]
100% 3.66G/3.66G [00:22<00:00, 172MB/s]


In [7]:
# YOLOv5 깃클론

!git clone https://github.com/ultralytics/yolov5  # clone
%cd yolov5
%pip install -qr requirements.txt  # install

import torch
import utils
display = utils.notebook_init()  # checks

YOLOv5 🚀 v7.0-128-gb96f35c Python-3.9.16 torch-1.13.1+cu116 CUDA:0 (Tesla T4, 15102MiB)


Setup complete ✅ (2 CPUs, 12.7 GB RAM, 33.1/166.8 GB disk)


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import pydicom
import os
import glob
from tqdm import tqdm
import zipfile
from sklearn.model_selection import train_test_split
import yaml

In [9]:
train = pd.read_csv('/content/data/stage_2_train_labels.csv')
train

Unnamed: 0,patientId,x,y,width,height,Target
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0
2,00322d4d-1c29-4943-afc9-b6754be640eb,,,,,0
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,0
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1
...,...,...,...,...,...,...
30222,c1ec14ff-f6d7-4b38-b0cb-fe07041cbdc8,185.0,298.0,228.0,379.0,1
30223,c1edf42b-5958-47ff-a1e7-4f23d99583ba,,,,,0
30224,c1f6b555-2eb1-4231-98f6-50a963976431,,,,,0
30225,c1f7889a-9ea9-4acb-b64c-b737c929599a,570.0,393.0,261.0,345.0,1


In [10]:
# train csv에 각 이미지 경로를 추가함
image_root = '/content/data/stage_2_train_images'
paths = []
for k in tqdm(range(len(train))):
  row = train.iloc[k, :]
  path = os.path.join(image_root, row['patientId']) + '.dcm'
  paths.append(path)

train['path'] = paths

100%|██████████| 30227/30227 [00:13<00:00, 2165.24it/s]


In [11]:
train['patientId'].duplicated().sum(), len(train)

(3543, 30227)

In [30]:
# 데이터프레임을 넣으면 YOLO 형태의 txt가 생성되는 함수

def get_YOLO_txt(df):
  df = df.sort_values(by='patientId')

  for k in tqdm(range(len(df))):
    row = df.iloc[k, :]

    if row['Target'] == 0:
      with open(os.path.join('/content/yolov5/data/labels', row['patientId'] + '.txt'), 'a') as f:
        f.write('')

    else:
      x_min, y_min, w, h = row['x'], row['y'], row['width'], row['height']
      dw, dh = pydicom.dcmread(row['path']).pixel_array.astype(np.float32).shape
      dw, dh = 1/dw, 1/dh

      x_center = (x_min + w/2) * dw
      y_center = (y_min + h/2) * dh
      w = w * dw
      h = h * dh

      line = '0' + ' ' + ' '.join(map(str, [x_center, y_center, w, h]))

      with open(os.path.join('/content/yolov5/data/labels', row['patientId'] + '.txt'), 'a') as f:
        f.write(line + '\n')

In [31]:
# YOLO 디렉토리의 labels 채우기

!mkdir /content/yolov5/data/labels
get_YOLO_txt(train)

100%|██████████| 30227/30227 [03:03<00:00, 164.84it/s]


In [14]:
# 경로를 입력하면 dicom을 png로 바꾸는 함수
def dcm_to_png(dcm_path, destination):
    """
    Convert DICOM (.dcm) files to PNG (.png) files and save them in the specified destination folder.
    
    Parameters:
    dcm_path (list): List of file paths for the input DICOM files
    destination (str): Path to the output directory where the PNG files will be saved
    
    Returns:
    None
    """
    # Make sure destination folder exists
    if not os.path.exists(destination):
        os.makedirs(destination)
        
    # Iterate over DICOM files and convert to PNG
    for path in tqdm(dcm_path):
        # Load DICOM file
        dcm = pydicom.dcmread(path)
        
        # Convert pixel data to uint8 and normalize
        img = (dcm.pixel_array / np.max(dcm.pixel_array) * 255).astype(np.uint8)
        
        # Save as PNG file
        filename = os.path.basename(path).replace('.dcm', '.png')
        filepath = os.path.join(destination, filename)
        cv2.imwrite(filepath, img)

In [15]:
dcm_paths = glob.glob('/content/data/stage_2_train_images/*.dcm')
destination_path = '/content/yolov5/data/images'
dcm_to_png(dcm_paths, destination_path)

100%|██████████| 26684/26684 [19:24<00:00, 22.91it/s]


In [21]:
# png 변환이 오래걸려서, 구글드라이브에 압축해서 저장해놓으려고함

def compress_files(input_directory, output_zipfile):
    # input_directory 내의 모든 파일 목록 가져오기
    files = [f for f in os.listdir(input_directory) if os.path.isfile(os.path.join(input_directory, f))]
    num_files = len(files)

    # zip 파일 열기
    with zipfile.ZipFile(output_zipfile, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        # 모든 파일 압축하기
        for file in tqdm(files, desc='Compressing files'):
            file_path = os.path.join(input_directory, file)
            zipf.write(file_path, arcname=file)

    print(f'{num_files} files compressed successfully into {output_zipfile}')

# Example usage
input_directory = '/content/yolov5/data/images'
output_zipfile = '/content/drive/MyDrive/P_images.zip'
compress_files(input_directory, output_zipfile)

Compressing files: 100%|██████████| 26684/26684 [10:02<00:00, 44.27it/s]


26684 files compressed successfully into /content/drive/MyDrive/P_images.zip


In [32]:
# train, valid
images_paths = glob.glob('/content/yolov5/data/images/*.png')

train_path, valid_path = train_test_split(images_paths,
                                          test_size=0.1,
                                          random_state=777,
                                          shuffle=True)

with open('/content/train.txt', 'w') as f:
  f.write('\n'.join(train_path) + '\n')

with open('/content/valid.txt', 'w') as f:
  f.write('\n'.join(valid_path) + '\n')

In [33]:
# train.txt, val.txt 파일 경로
train_file = '/content/train.txt'
val_file = '/content/valid.txt'

# 클래스 이름 리스트
classes = ['pneumonia']

# data.yaml 파일 경로
data_file = '/content/data.yaml'

# data.yaml 파일 생성
data = dict(
    train=train_file,
    val=val_file,
    nc=len(classes),
    names=classes
)

# 한글 문자열 지원을 위한 설정
yaml.add_representer(str, lambda dumper, data: dumper.represent_scalar('tag:yaml.org,2002:str', data, style='"'))

with open(data_file, 'w', encoding='UTF-8') as f:
    yaml.dump(data, f, allow_unicode=True)

In [None]:
!python train.py --img 416 --batch 16 --epochs 5 --data /content/data.yaml --cfg /content/yolov5/models/yolov5m.yaml --weights yolov5m.pt --name /content/drive/MyDrive/Pneumonia_model1_5peochs

[34m[1mtrain: [0mweights=yolov5m.pt, cfg=/content/yolov5/models/yolov5m.yaml, data=/content/data.yaml, hyp=data/hyps/hyp.scratch-low.yaml, epochs=5, batch_size=16, imgsz=416, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=runs/train, name=/content/drive/MyDrive/Pneumonia_model1_5peochs, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[0], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest
[34m[1mgithub: [0mup to date with https://github.com/ultralytics/yolov5 ✅
YOLOv5 🚀 v7.0-128-gb96f35c Python-3.9.16 torch-1.13.1+cu116 CUDA:0 (Tesla T4, 15102MiB)

[34m[1mhyperparameters: [0mlr0=0.01, lrf=0.01, momentum=0.937, weight_decay=0.0005, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1, bo

In [None]:
!python train.py --img 416 --batch 16 --epochs 5 --data /content/data.yaml --cfg /content/yolov5/models/yolov5m.yaml --weights /content/drive/MyDrive/Pneumonia_model1_5peochs/weights/best.pt --name /content/drive/MyDrive/Pneumonia_model1_10peochs

In [None]:
!python train.py --img 416 --batch 16 --epochs 5 --data /content/data.yaml --cfg /content/yolov5/models/yolov5m.yaml --weights yolov5m.pt --name /content/drive/MyDrive/Pneumonia_model1_15peochs

In [None]:
!python train.py --img 416 --batch 16 --epochs 5 --data /content/data.yaml --cfg /content/yolov5/models/yolov5m.yaml --weights yolov5m.pt --name /content/drive/MyDrive/Pneumonia_model1_20peochs

# inference 

In [None]:
dcm_paths = glob.glob('/content/data/stage_2_test_images/*.dcm')
destination_path = '/content/data/test_images'
dcm_to_png(dcm_paths, destination_path)

In [None]:
!python detect.py --weights /content/yolov5/models/yolov5m.yaml --img 416 --conf 0.25 --source /content/data/test_images --save-txt