In [2]:
import os
import numpy as np
import json

# Settings for the synthetic dataset.
NUM_SAMPLES = 10  # number of fake cases; 10 leaves at least one sample per split
DATA_ROOT = "./Data/data"
# Images and captions live in sibling sub-folders of DATA_ROOT.
IMAGE_DIR, TEXT_DIR = (
    os.path.join(DATA_ROOT, subfolder) for subfolder in ("images", "texts")
)

# 1. Make sure both output folders exist (a no-op when they already do).
for _folder in (IMAGE_DIR, TEXT_DIR):
    os.makedirs(_folder, exist_ok=True)
print(f"Created directories at {DATA_ROOT}")

# Initialize index with all three splits; each list holds
# {"image": ..., "text": ...} entries whose paths are relative to DATA_ROOT.
dataset_index = {
    "train": [],
    "validation": [],
    "test": [],
}

# Split boundaries for 70% train / 20% validation / 10% test.
# They are derived from NUM_SAMPLES (integer arithmetic, no float rounding)
# so the ratios keep holding when the sample count is changed; for
# NUM_SAMPLES = 10 this yields 7 / 2 / 1.
train_end = NUM_SAMPLES * 7 // 10
val_end = NUM_SAMPLES * 9 // 10

for i in range(NUM_SAMPLES):
    filename = f"fake_case_{i:03d}"

    # 2. Generate Fake Image (C, D, H, W) -> (1, 32, 256, 256)
    # Uniform random float32 values in [0, 1)
    image_data = np.random.rand(1, 32, 256, 256).astype(np.float32)

    # The "_rel" path goes into the index; the "_abs" path (DATA_ROOT joined
    # with the relative path) is where the file is actually written.
    image_path_rel = os.path.join("images", f"{filename}.npy")
    image_path_abs = os.path.join(DATA_ROOT, image_path_rel)

    np.save(image_path_abs, image_data)

    # 3. Generate Fake Text
    text_data = f"This is a random caption for {filename}. It shows a simulated medical scan for testing purposes."

    text_path_rel = os.path.join("texts", f"{filename}.txt")
    text_path_abs = os.path.join(DATA_ROOT, text_path_rel)

    # Explicit encoding so the caption file does not depend on the
    # platform's default text encoding.
    with open(text_path_abs, "w", encoding="utf-8") as f:
        f.write(text_data)

    # 4. Add to Index (Split Logic: 70% Train, 20% Val, 10% Test)
    entry = {
        "image": image_path_rel,
        "text": text_path_rel,
    }

    if i < train_end:
        split = "train"
    elif i < val_end:
        split = "validation"
    else:
        split = "test"
    dataset_index[split].append(entry)

    print(f"Generated: {filename}")

# 5. Write the split index as JSON at the top of the data folder.
json_path = os.path.join(DATA_ROOT, "my_dataset.json")
with open(json_path, "w") as f:
    f.write(json.dumps(dataset_index, indent=4))

# Report where the index went and how many entries each split received.
print(f"\nSuccess! Index saved to: {json_path}")
n_train, n_val, n_test = (
    len(dataset_index[split_name])
    for split_name in ("train", "validation", "test")
)
print(f"Train: {n_train}, Validation: {n_val}, Test: {n_test}")

Created directories at ./Data/data
Generated: fake_case_000
Generated: fake_case_001
Generated: fake_case_002
Generated: fake_case_003
Generated: fake_case_004
Generated: fake_case_005
Generated: fake_case_006
Generated: fake_case_007
Generated: fake_case_008
Generated: fake_case_009

Success! Index saved to: ./Data/data/my_dataset.json
Train: 7, Validation: 2, Test: 1
