In [None]:
pip install -U -q trl bitsandbytes peft hf_xet tensorboard

In [None]:
huggingface-cli login

In [None]:
from datasets import load_dataset

# Hugging Face Hub identifier of the dataset used throughout this notebook
dataset_name = "HuggingFaceH4/llava-instruct-mix-vsft"

# Load the dataset by name; no split is requested here
dataset = load_dataset(path=dataset_name)

In [None]:
import os
import json
from PIL import Image
import requests
from io import BytesIO

# Local folder that receives the exported single-image dataset
save_dir = "./llava_single_image_data"
for folder in (save_dir, os.path.join(save_dir, "images")):
    os.makedirs(folder, exist_ok=True)

print(f"Dataset info: {dataset}")
print(f"Keys: {dataset.keys()}")

# Work on the training split
train_dataset = dataset['train']
print(f"Total training samples: {len(train_dataset)}")

# Peek at the first sample to see which keys it has and what type it is
if len(train_dataset) >= 1:
    sample = train_dataset[0]
    print(f"Sample keys: {sample.keys()}")
    print(f"Sample structure: {type(sample)}")

# Export only single-image samples, scanning at most this many dataset rows.
num_samples_to_save = 500  # adjust as needed
saved_count = 0
metadata = []


def _single_image_of(sample):
    """Return the sample's only image, or None if it is not a single-image sample.

    Supports both an ``image`` column (one image per row) and an ``images``
    column (a list per row); a list qualifies only when it holds exactly one
    item, so multi-image rows are filtered out.
    """
    image = sample.get('image')
    if image is not None:
        return image
    images = sample.get('images')
    if images is not None and len(images) == 1:
        return images[0]
    return None


def _load_pil_image(image):
    """Return *image* as a saveable image object, downloading http(s) URLs.

    Returns None when the value is neither an object with a ``save`` method
    nor an http(s) URL string. Raises on network or HTTP errors.
    """
    if hasattr(image, 'save'):
        return image
    if isinstance(image, str) and image.startswith('http'):
        # A timeout keeps one dead host from hanging the whole export, and
        # raise_for_status stops an error page from being parsed as an image.
        response = requests.get(image, timeout=30)
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    return None


def _save_as_jpeg(image, path):
    """Save *image* to *path*, converting modes JPEG cannot store (RGBA, P, ...)."""
    if getattr(image, 'mode', 'RGB') not in ('L', 'RGB', 'CMYK'):
        image = image.convert('RGB')
    image.save(path)


# The split may hold fewer rows than requested; report against the real total.
total_to_scan = min(num_samples_to_save, len(train_dataset))

for i in range(total_to_scan):
    sample = train_dataset[i]

    # Deliberately best-effort: one bad sample is reported and skipped so the
    # rest of the export still completes.
    try:
        source = _single_image_of(sample)
        image = _load_pil_image(source) if source is not None else None
        if image is not None:
            image_filename = f"image_{saved_count:05d}.jpg"
            image_path = os.path.join(save_dir, "images", image_filename)
            _save_as_jpeg(image, image_path)

            # The dialogue lives under 'conversations' or 'messages'
            # depending on the dataset schema; accept either.
            conversations = sample.get('conversations')
            if conversations is None:
                conversations = sample.get('messages', [])

            # Record metadata only after the image was written successfully,
            # so every metadata entry points at an existing file.
            metadata.append({
                "image_filename": image_filename,
                "conversations": conversations,
                "original_index": i,
            })
            saved_count += 1
    except Exception as e:
        print(f"Error saving image {i}: {e}")

    if i % 50 == 0:
        print(f"Processed {i}/{total_to_scan} samples, saved {saved_count} single-image samples")

# Write the collected metadata to a JSON file inside the output folder
metadata_path = os.path.join(save_dir, "metadata.json")
with open(metadata_path, 'w', encoding='utf-8') as fp:
    json.dump(metadata, fp, ensure_ascii=False, indent=2)

# Final summary: where things were written and how many samples were saved
print(
    "Single-image 데이터셋 저장 완료!",
    f"저장 위치: {save_dir}",
    f"이미지 폴더: {os.path.join(save_dir, 'images')}",
    f"저장된 single-image 샘플 수: {saved_count}",
    f"메타데이터 파일: {metadata_path}",
    sep="\n",
)
