# In [7]:
import json
import numpy as np
import random

def simulate_missing_modality(metadata_path, missing_type, missing_ratio, seed=2025):
    """
    Simulate missing modalities for a multimodal (text + image) dataset.

    A random subset of sample indices is drawn, and every drawn sample is
    flagged as missing either its text or its image.

    :param metadata_path: JSON *string* describing the samples. Despite the
        name, the value is parsed with ``json.loads`` and is not opened as a
        file. Only the number of top-level entries is used.
    :param missing_type: which modality to drop. One of "text" (alias
        "text_only"), "image" (alias "image_only") or "mixed" (each selected
        sample loses text or image depending on a random draw).
    :param missing_ratio: fraction of samples to flag, within [0, 1]. The
        number of flagged samples is ``int(n_samples * missing_ratio)``,
        i.e. it is truncated toward zero.
    :param seed: seed applied to both the global ``random`` state and the
        global NumPy random state, so repeated calls are reproducible.
        Note that this reseeds the process-wide generators.
    :return: dict with the keys
        "type" (the ``missing_type`` as passed in),
        "missing_ratio",
        "missing_indices" (list of selected indices, in draw order) and
        "missing_details" (index -> two-element tuple ``(text, image)`` where
        1 means available and 0 means missing).
    :raises ValueError: if ``missing_type`` is not a supported value or
        ``missing_ratio`` lies outside [0, 1].
    """
    data = json.loads(metadata_path)
    n_samples = len(data)

    # Validate everything before reseeding, so that a rejected call does not
    # disturb the global random state.
    if missing_type in ("text", "text_only"):
        mode = "text"
    elif missing_type in ("image", "image_only"):
        mode = "image"
    elif missing_type == "mixed":
        mode = "mixed"
    else:
        raise ValueError(
            "Invalid missing_type {!r}. Choose from 'text' (or 'text_only'), "
            "'image' (or 'image_only'), or 'mixed'.".format(missing_type)
        )

    if not 0 <= missing_ratio <= 1:
        raise ValueError(
            "missing_ratio must be between 0 and 1, got {!r}.".format(missing_ratio)
        )
    missing_count = int(n_samples * missing_ratio)

    # Seed both generators for reproducibility.
    random.seed(seed)
    np.random.seed(seed)

    # One draw shared by every mode; the RNG is consumed in the same order as
    # when each mode sampled separately, so results for a given seed are kept.
    missing_indices = random.sample(range(n_samples), missing_count)

    # Availability flags: first element is text, second is image
    # (1 = available, 0 = missing).
    text_missing = (0, 1)
    image_missing = (1, 0)

    missing_details = {}
    for idx in missing_indices:
        if mode == "text":
            missing_details[idx] = text_missing
        elif mode == "image":
            missing_details[idx] = image_missing
        elif random.random() > 0.5:
            missing_details[idx] = text_missing
        else:
            missing_details[idx] = image_missing

    return {
        "type": missing_type,
        "missing_ratio": missing_ratio,
        "missing_indices": missing_indices,
        "missing_details": missing_details,
    }

# Example usage
data_json = json.dumps(
    # One distinct record followed by seven identical copies of a second
    # record, giving 8 samples in total. Extend the list to add more samples.
    [{"id": 1, "text": "This is a sample text.", "image": "image1.jpg"}]
    + [{"id": 2, "text": "Another sample text.", "image": "image2.jpg"}] * 7
)

# Missing type accepted by simulate_missing_modality: "text", "image", or "mixed".
missing_type = "mixed"
# Fraction of samples to mark as missing.
missing_ratio = 0.3

# NOTE: despite its name, this holds the full config dict returned by the
# function (type, ratio, indices and per-index details), not just the indices.
missing_indices = simulate_missing_modality(
    data_json,
    missing_type=missing_type,
    missing_ratio=missing_ratio,
)

print("缺失数据的索引:", missing_indices)


# Output:
# 缺失数据的索引: {'type': 'mixed', 'missing_ratio': 0.3, 'missing_indices': [1, 5], 'missing_details': {1: (0, 1), 5: (0, 1)}}
