In [1]:
pip install pandas numpy opencv-python



In [8]:
import pandas as pd
import numpy as np
import cv2
import os
from tqdm import tqdm

# 데이터 파일 경로 설정
train_file = 'train.csv'
test_file = 'test.csv'
labels_file = 'sample_submission.csv'

# 이미지 저장 경로 설정
image_dir = 'url_images'
annotation_dir = 'url_annotations'
test_image_dir = 'test_url_images'
os.makedirs(image_dir, exist_ok=True)
os.makedirs(annotation_dir, exist_ok=True)
os.makedirs(test_image_dir, exist_ok=True)

# 이미지 크기 설정
img_width = 256
img_height = 32

# 라벨 매핑 (악성: 1, 정상: 0)
label_map = {'benign': 0, 'malicious': 1}

# URL을 이미지로 변환하는 함수
def url_to_image(url, width, height):
    """URL 문자열을 간단한 이미지로 변환합니다."""
    img = np.zeros((height, width, 1), dtype=np.uint8)
    for i, char in enumerate(url):
        if i < width:
            img[height // 2, i, 0] = ord(char) % 256  # ASCII 값을 이용하여 픽셀 값 할당
    return img

def generate_yolo_annotation(label, img_width, img_height):
    """YOLO 형식의 annotation 생성 (단일 객체 가정)."""
    # 객체가 이미지 중앙에 있다고 가정 (단순화)
    x_center = 0.5
    y_center = 0.5
    width = 1.0
    height = 1.0
    return f"{label} {x_center} {y_center} {width} {height}"

try:
    # 학습 데이터 로드 및 전처리
    train_df = pd.read_csv(train_file)
    if 'label' in train_df.columns:
        train_df['label'] = train_df['label'].map(label_map)
        # 'label' 컬럼에 NaN 값이 있는 행 제거
        train_df.dropna(subset=['label'], inplace=True)
        # 라벨을 정수형으로 변환
        train_df['label'] = train_df['label'].astype(int)
    else:
        print("주의: 'label' 컬럼이 train.csv에 없습니다. 라벨 정보 확인 필요.")

    print("학습 데이터 이미지 및 annotation 생성...")
    for index, row in tqdm(train_df.iterrows(), total=len(train_df)):
        # URL 컬럼 찾기
        url_column = None
        for col in ['url', 'URL', 'address']:
            if col in row:
                url_column = col
                break
        if url_column is None:
            print(f"오류: URL 컬럼을 찾을 수 없습니다. (인덱스: {index})")
            continue
        url = row[url_column]
        label = int(row['label'])
        image = url_to_image(url, img_width, img_height)
        image_filename = f"train_{index}.png"
        cv2.imwrite(os.path.join(image_dir, image_filename), image)

        annotation = generate_yolo_annotation(label, img_width, img_height)
        annotation_filename = f"train_{index}.txt"
        with open(os.path.join(annotation_dir, annotation_filename), 'w') as f:
            f.write(annotation)

    # 학습 데이터 이미지 및 annotation 목록 파일 생성 (YOLOv 학습에 필요)
    with open('train.txt', 'w') as f:
        for index in range(len(train_df)):
            image_filename = f"url_images/train_{index}.png"
            f.write(f"{os.path.abspath(image_filename)}\n")

    # 테스트 데이터 로드 및 이미지 생성
    test_df = pd.read_csv(test_file)
    print("테스트 데이터 이미지 생성...")
    for index, row in tqdm(test_df.iterrows(), total=len(test_df)):
        # URL 컬럼 찾기
        url_column = None
        for col in ['url', 'URL', 'address']:
            if col in row:
                url_column = col
                break
        if url_column is None:
            print(f"오류: 테스트 데이터에서 URL 컬럼을 찾을 수 없습니다. (인덱스: {index})")
            continue
        url = row[url_column]
        image = url_to_image(url, img_width, img_height)
        image_filename = f"test_{index}.png"
        cv2.imwrite(os.path.join(test_image_dir, image_filename), image)

    # 테스트 데이터 이미지 목록 파일 생성
    with open('test.txt', 'w') as f:
        for index in range(len(test_df)):
            image_filename = f"test_url_images/test_{index}.png"
            f.write(f"{os.path.abspath(image_filename)}\n")

except FileNotFoundError as e:
    print(f"파일을 찾을 수 없습니다: {e}")
except KeyError as e:
    print(f"KeyError 발생: {e}. CSV 파일에 해당 컬럼이 있는지 확인해주세요.")
except ValueError as e:
    print(f"ValueError 발생: {e}")

print("데이터 준비 완료. YOLOv 모델 학습을 진행할 수 있습니다.")
print("train.txt, test.txt, url_images, url_annotations 폴더를 확인하세요.")

학습 데이터 이미지 및 annotation 생성...


0it [00:00, ?it/s]


테스트 데이터 이미지 생성...


100%|██████████| 1747689/1747689 [07:13<00:00, 4032.07it/s]


데이터 준비 완료. YOLOv 모델 학습을 진행할 수 있습니다.
train.txt, test.txt, url_images, url_annotations 폴더를 확인하세요.


In [19]:
# ultralytics 패키지 설치 (이미 설치되어 있다면 생략 가능)
!pip install ultralytics

# YOLOv8 학습 스크립트 직접 실행
!python /path/to/yolov8/train.py --model yolov8s.yaml --data custom_data.yaml --imgsz 256 32 --epochs 100 --batch 32 --name url_classification

Collecting ultralytics
  Downloading ultralytics-8.3.112-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [20]:
!yolo train model=yolov8s.yaml data=custom_data.yaml imgsz=256 32 epochs=100 batch=32 name=url_classification

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
Traceback (most recent call last):
  File "/usr/local/bin/yolo", line 8, in <module>
    sys.exit(entrypoint())
             ^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ultralytics/cfg/__init__.py", line 909, in entrypoint
    check_dict_alignment(full_args_dict, {a: ""})
  File "/usr/local/lib/python3.11/dist-packages/ultralytics/cfg/__init__.py", line 499, in check_dict_alignment
    raise SyntaxError(string + CLI_HELP_MSG) from e
SyntaxError: '[31m[1m32[0m' is not a valid YOLO argument. 

    Arguments received: ['yolo', 'train', 'model=yolov8s.yaml', 'data=custom_data.yaml', 'imgsz=256', '32', 'epochs=100', 'batch=32', 'name=url_classification']. Ultralytic

In [21]:
!yolo train model=yolov8s.yaml data=custom_data.yaml imgsz=256 32 epochs=100 batch=32 name=url_classification

Traceback (most recent call last):
  File "/usr/local/bin/yolo", line 8, in <module>
    sys.exit(entrypoint())
             ^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ultralytics/cfg/__init__.py", line 909, in entrypoint
    check_dict_alignment(full_args_dict, {a: ""})
  File "/usr/local/lib/python3.11/dist-packages/ultralytics/cfg/__init__.py", line 499, in check_dict_alignment
    raise SyntaxError(string + CLI_HELP_MSG) from e
SyntaxError: '[31m[1m32[0m' is not a valid YOLO argument. 

    Arguments received: ['yolo', 'train', 'model=yolov8s.yaml', 'data=custom_data.yaml', 'imgsz=256', '32', 'epochs=100', 'batch=32', 'name=url_classification']. Ultralytics 'yolo' commands use the following syntax:

        yolo TASK MODE ARGS

        Where   TASK (optional) is one of frozenset({'obb', 'classify', 'segment', 'detect', 'pose'})
                MODE (required) is one of frozenset({'train', 'benchmark', 'track', 'export', 'predict', 'val'})
                ARGS (

In [22]:
!yolo train model=yolov8s.yaml data=custom_data.yaml imgsz=256,32 epochs=100 batch=32 name=url_classification

Ultralytics 8.3.112 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8s.yaml, data=custom_data.yaml, epochs=100, time=None, patience=100, batch=32, imgsz=(256, 32), save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=url_classification, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, sho

In [23]:
!yolo train model=yolov8s.yaml data=custom_data.yaml imgsz=256,32 epochs=100 batch=32 name=url_classification

Ultralytics 8.3.112 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8s.yaml, data=custom_data.yaml, epochs=100, time=None, patience=100, batch=32, imgsz=(256, 32), save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=url_classification2, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, sh

In [24]:
!yolo train model=yolov8s.yaml data=custom_data.yaml imgsz=256,32 epochs=100 batch=32 name=url_classification

Ultralytics 8.3.112 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8s.yaml, data=custom_data.yaml, epochs=100, time=None, patience=100, batch=32, imgsz=(256, 32), save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=url_classification3, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, sh

In [25]:
import os
print(os.getcwd())

/content


In [26]:
!yolo train model=yolov8s.yaml data=/content/custom_data.yaml imgsz=256,32 epochs=100 batch=32 name=url_classification

Ultralytics 8.3.112 🚀 Python-3.11.12 torch-2.6.0+cu124 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8s.yaml, data=/content/custom_data.yaml, epochs=100, time=None, patience=100, batch=32, imgsz=(256, 32), save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=url_classification4, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf