In [None]:
import re
import json

# 1) 定义若干“可能匹配到代码”的正则规则
#    可根据需要添加/删除/修改
code_block_pattern = re.compile(r'```[\s\S]*?```')
python_keyword_line_pattern = re.compile(
    r'(?:^|\n)\s*(?:from\s+\S+\s+import\s+\S+|import\s+\S+|def\s+\S+\s*\(.*\)\s*:|class\s+\S+\s*\(?.*\)?:|if\s+.*?:|elif\s+.*?:|else\s*:|for\s+.*?:|while\s+.*?:|try\s*:|except\s+.*?:)',
    flags=re.MULTILINE
)
assignment_pattern = re.compile(r'(?:^|\n)\s*\w+\s*=\s*[^=]+')
colon_pattern = re.compile(r'(?:^|\n)\s*(?:if|elif|else|for|while|try|except|def|class)\b[^:\n]*:')

# 将所有正则放在一个列表里，只要命中任意一个就视为含“代码”
patterns = [
    code_block_pattern,
    python_keyword_line_pattern,
    assignment_pattern,
    colon_pattern
]

# 2) 读取原始 JSON 文件
input_path = "/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-ReCap-558K/blip558k_stage1.5_finetune_w_prompt_abspath.json"
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# 3) 遍历 data，检测是否有“代码”
items_containing_code = []

for item in data:
    # 若没有 conversations 字段，则跳过
    if 'conversations' not in item:
        continue
    
    code_found = False  # 用于标记当前 item 是否检测到代码
    for conv in item['conversations']:
        text = conv.get('value', '')
        # 逐条正则检测，只要命中任意一个即可判定含代码
        for pattern in patterns:
            results = pattern.findall(text)
            if results:
                code_found = True
                break  # 这里 break 跳出正则循环
        if code_found:
            break  # 这里 break 跳出对话循环
    
    # 如果检测到代码，则将整条 item 收集起来
    if code_found:
        items_containing_code.append(item)

# # 4) 将包含代码的整条 item 写入新文件
# output_path = "items_with_code.json"
# with open(output_path, "w", encoding="utf-8") as f:
#     json.dump(items_containing_code, f, ensure_ascii=False, indent=2)

# print(f"匹配到含“代码”的 item 数量: {len(items_containing_code)}")
# print(f"结果已写入: {output_path}")


In [1]:
import os
base_path = "/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-OneVision-Mid-Data"  
directories = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]  

In [3]:
json_file_paths = []
for directory in directories:  
    json_file_path = os.path.join(base_path, directory, f"{directory}.json")  
    json_file_paths.append(json_file_path)
json_file_paths

['/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-OneVision-Mid-Data/.git/.git.json',
 '/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-OneVision-Mid-Data/evol_instruct/evol_instruct.json',
 '/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-OneVision-Mid-Data/synthdog_en/synthdog_en.json',
 '/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-OneVision-Mid-Data/synthdog_zh/synthdog_zh.json',
 '/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-OneVision-Mid-Data/ureader_tr/ureader_tr.json']

In [4]:
base_path = "/mnt/lingjiejiang/multimodal_code/data/llava_onevision/"  
directories = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]  
json_file_paths = []
for directory in directories:  
    json_file_path = os.path.join(base_path, directory, f"{directory}.json")  
    json_file_paths.append(json_file_path)
json_file_paths

['/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-NeXT-Data/LLaVA-NeXT-Data.json',
 '/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-OneVision-Data/LLaVA-OneVision-Data.json',
 '/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-OneVision-Mid-Data/LLaVA-OneVision-Mid-Data.json',
 '/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-Pretrain/LLaVA-Pretrain.json',
 '/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-ReCap-118K/LLaVA-ReCap-118K.json',
 '/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-ReCap-558K/LLaVA-ReCap-558K.json',
 '/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-ReCap-CC3M/LLaVA-ReCap-CC3M.json',
 '/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-Stage2-Si/LLaVA-Stage2-Si.json']

In [2]:
import yaml  
  
# 定义 YAML 文件的路径  
yaml_file_path = "/home/v-lingjiang/project/LLaVa_NeXT/scripts/train/mid_stage_mypath.yaml"  
  
# 读取 YAML 文件  
try:  
    with open(yaml_file_path, "r", encoding="utf-8") as file:  
        yaml_data = yaml.safe_load(file)  
except Exception as e:  
    print(f"读取 YAML 文件时出错: {e}")  
    exit(1)  
  
# 提取 json_path 信息  
json_paths = []  
datasets = yaml_data.get('datasets', [])  
for dataset in datasets:  
    json_path = dataset.get('json_path')  
    if json_path:  
        json_paths.append(json_path)  
  
# 输出所有 json_path  
print("提取到的 JSON 路径:")  
for path in json_paths:  
    print(path)  

提取到的 JSON 路径:
/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-ReCap-558K/blip558k_stage1.5_finetune_w_prompt_abspath.json
/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-ReCap-118K/coco118k_stage1.5_finetune_w_prompt_abspath.json
/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-ReCap-CC3M/cc3m_recap_data_prompt_v2_abspath.json
/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-OneVision-Mid-Data/ureader_tr/ureader_tr_processed_abspath.json
/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-OneVision-Mid-Data/evol_instruct/evol_instruct_processed.json
/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-OneVision-Mid-Data/synthdog_zh/synthdog_zh_processed_abs.json
/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-OneVision-Mid-Data/synthdog_en/synthdog_en_processed_abspath.json


In [1]:
import json
with open("/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-ReCap-558K/blip558k_stage1.5_finetune_w_prompt_abspath.json", "r") as f:
    data = json.load(f)
data[:2]

[{'id': '000000010',
  'image': '/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-ReCap-558K/blip558k_images/000000010.jpg',
  'conversations': [{'from': 'human',
    'value': '<image>\nPlease generate detailed descriptions of the given image.'},
   {'from': 'gpt',
    'value': 'The image shows a scene of a significant vehicle accident. A white car is severely damaged, with its body crumpled and parts of the vehicle scattered around. The car appears to have been involved in a high-speed impact, as suggested by the extensive damage and the disarray of the surrounding area.\n\nIn the foreground, there is a person wearing a high-visibility vest, which is typically worn by emergency personnel or workers to ensure they are easily seen in such situations. This individual is standing near the wreckage, possibly assessing the situation or preparing to provide assistance.\n\nThe setting appears to be an outdoor area with grass, and there are various items scattered around, including

In [None]:
import json  
  
# 读取原始数据  
with open("/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-Pretrain/blip_laion_cc_sbu_558k.json", 'r') as f:  
    data = json.load(f)  
  
# 定义转换函数  
def convert_conversations_to_messages(data):  
    for item in data:  
        messages = []  
        for conversation in item['conversations']:  
            role = 'user' if conversation['from'] == 'human' else 'assistant'  
            # 确保 <image> 在消息的开头  
            content = conversation['value']  
            if '<image>' in content:  
                content = '<image>' + content.replace('<image>', '').strip()  
            messages.append({  
                "content": content,  
                "role": role  
            })  
        # 用新格式替换旧的 conversations  
        item['messages'] = messages  
        item['images'] = [f"/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-Pretrain/{item.pop('image')}"]  
        del item['conversations']  
    return data  
  
# 执行转换  
new_data = convert_conversations_to_messages(data)  
  
# 打印转换后的前两个数据以验证结果  
print(json.dumps(new_data[:2], indent=2))  

In [3]:
import os  
import re  
import json  
from tqdm import tqdm

# 定义正则表达式模式  
code_block_pattern = re.compile(r'```[\s\S]*?```')  
python_keyword_line_pattern = re.compile(  
    r'(?:^|\n)\s*(?:from\s+\S+\s+import\s+\S+|import\s+\S+|def\s+\S+\s*\(.*\)\s*:|class\s+\S+\s*\(?.*\)?:|if\s+.*?:|elif\s+.*?:|else\s*:|for\s+.*?:|while\s+.*?:|try\s*:|except\s+.*?:)',  
    flags=re.MULTILINE  
)  
assignment_pattern = re.compile(r'(?:^|\n)\s*\w+\s*=\s*[^=]+')  
colon_pattern = re.compile(r'(?:^|\n)\s*(?:if|elif|else|for|while|try|except|def|class)\b[^:\n]*:')  
  
patterns = [  
    code_block_pattern,  
    python_keyword_line_pattern,  
    assignment_pattern,  
    colon_pattern  
]  
  
# 根目录路径  
base_path = "/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-OneVision-Mid-Data"  
  
# 获取根目录下所有子目录  
directories = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]  
  
# 用于存储所有包含代码的items  
all_items_with_code = []  
  
# # 遍历每个子目录  
# for directory in directories:  
#     json_file_path = os.path.join(base_path, directory, f"{directory}.json")  

for json_file_path in tqdm(json_paths):
    if not os.path.exists(json_file_path):  
        print(f"JSON 文件不存在: {json_file_path}")  
        continue  
  
    try:  
        # 读取 JSON 文件  
        with open(json_file_path, "r", encoding="utf-8") as f:  
            data = json.load(f)  
    except Exception as e:  
        print(f"读取文件时出错: {e}")  
        continue  
  
    # 检测当前 JSON 文件中的代码  
    items_containing_code = []  
    for item in data:  
        if 'conversations' not in item:  
            continue  
  
        code_found = False  
        for conv in item['conversations']:  
            text = conv.get('value', '')  
            for pattern in patterns:  
                if pattern.findall(text):  
                    code_found = True  
                    break  
            if code_found:  
                break  
  
        if code_found:  
            items_containing_code.append(item)  
      
    # 将当前文件的结果添加到总结果中  
    all_items_with_code.extend(items_containing_code)  
  
# 将所有包含代码的 items 写入一个新文件  
# output_path = os.path.join(base_path, "all_items_with_code.json")  
# try:  
#     with open(output_path, "w", encoding="utf-8") as f:  
#         json.dump(all_items_with_code, f, ensure_ascii=False, indent=2)  
#     print(f"匹配到含“代码”的 item 总数量: {len(all_items_with_code)}")  
#     print(f"结果已写入: {output_path}")  
# except Exception as e:  
#     print(f"写入文件时出错: {e}")  

  0%|          | 0/7 [00:00<?, ?it/s]

100%|██████████| 7/7 [03:52<00:00, 33.26s/it]


In [4]:
len(all_items_with_code)

37088

In [5]:
all_items_with_code[:2]

[{'id': '000004509',
  'image': '/mnt/lingjiejiang/multimodal_code/data/llava_onevision/LLaVA-ReCap-558K/blip558k_images/000004509.jpg',
  'conversations': [{'from': 'human',
    'value': '<image>\nPlease generate detailed descriptions of the given image.'},
   {'from': 'gpt',
    'value': 'The image displays a screenshot of a computer interface showing a SQL (Structured Query Language) command line example. The SQL command is written in a text box at the top of the image, which reads\n\n```sql\nSELECT Last_Name, First_Name\nFROM Addresses, First_Name\n```\n\nBelow the SQL command, there is a table with two columns, "Last_Name" and "First_Name". The table contains a list of names, with each row showing a combination of a last name and a first name. The names are presented in a structured format, with the last name on the left and the first name on the right.\n\nThe table includes names such as "Adams", "Aki", "Alleyson", "Angeles", "Aragon", "Armstrong", "Beckel", "Black", "Brennan", a

In [6]:
output_path = "/mnt/lingjiejiang/multimodal_code/data/llava_code_data/llava_midstage_code.json"
with open(output_path, "w", encoding="utf-8") as f:  
    json.dump(all_items_with_code, f, ensure_ascii=False, indent=0) 

In [1]:
import json
output_path = "/mnt/lingjiejiang/multimodal_code/data/llava_code_data/llava_midstage_code.json"
with open(output_path, "r", encoding="utf-8") as f:  
    data = json.load(f)

In [5]:
import json  
  
input_path = "/mnt/lingjiejiang/multimodal_code/data/llava_code_data/llava_midstage_code.json"  
output_path = "/mnt/lingjiejiang/multimodal_code/data/llava_code_data/llava_midstage_code_job.json"

with open(input_path, "r", encoding="utf-8") as f:  
    data = json.load(f)  
  
# Assuming data is a list of dictionaries  
for item in data:  
    if 'image' in item:  
        # Convert 'image' to 'images' list  
        item['images'] = [item.pop('image')]  
  
# If you want to save the modified data back to a file, uncomment the following lines:  
with open(output_path, "w", encoding="utf-8") as f:  
    json.dump(data, f, ensure_ascii=False, indent=0)  
  
# Print the modified data to verify  
# print(data)  

In [2]:
import json  
  
input_path = "/mnt/lingjiejiang/multimodal_code/data/llava_code_data/llava_midstage_code.json"  
output_path = "/mnt/lingjiejiang/multimodal_code/data/llava_code_data/llava_midstage_code_job.json"
with open(output_path, "r", encoding="utf-8") as f:  
    data = json.load(f)  
data[:2]

[{'id': '000004509',
  'conversations': [{'from': 'human',
    'value': '<image>\nPlease generate detailed descriptions of the given image.'},
   {'from': 'gpt',
    'value': 'The image displays a screenshot of a computer interface showing a SQL (Structured Query Language) command line example. The SQL command is written in a text box at the top of the image, which reads\n\n```sql\nSELECT Last_Name, First_Name\nFROM Addresses, First_Name\n```\n\nBelow the SQL command, there is a table with two columns, "Last_Name" and "First_Name". The table contains a list of names, with each row showing a combination of a last name and a first name. The names are presented in a structured format, with the last name on the left and the first name on the right.\n\nThe table includes names such as "Adams", "Aki", "Alleyson", "Angeles", "Aragon", "Armstrong", "Beckel", "Black", "Brennan", and others. Each name is separated by a line, indicating a new entry in the table.\n\nThe background of the image is 

In [3]:
for item in data:  
    if 'conversations' in item:  
        # Convert 'image' to 'images' list  
        item['messages'] = [item.pop('conversations')]  

In [5]:
with open(output_path, "w", encoding="utf-8") as f:  
    json.dump(data, f, ensure_ascii=False, indent=0)  

In [10]:
import json  
  
# 输入和输出文件路径  
input_path = "/mnt/lingjiejiang/multimodal_code/data/llava_code_data/llava_midstage_code_job.json"  
output_path = "/mnt/lingjiejiang/multimodal_code/data/llava_code_data/llava_midstage_code_job.json"  
  
# 从输入文件中读取数据  
with open(input_path, "r", encoding="utf-8") as f:  
    data = json.load(f)  
  
# 将 messages 转换为新格式的函数  
def convert_messages_format(messages):  
    new_format = []  
    for message_pair in messages:  
        for message in message_pair:  
            new_format.append({  
                "content": message['value'],  
                "role": "user" if message['from'] == 'human' else "assistant"  
            })  
    return new_format  
  
# 遍历数据并转换 messages  
for item in data:  
    item['messages'] = convert_messages_format(item['messages'])  
  
# # 将转换后的数据保存到输出文件  
with open(output_path, "w", encoding="utf-8") as f:  
    json.dump(data, f, ensure_ascii=False, indent=0)  
  
# print(f"转换后的数据已保存到 {output_path}")  

In [11]:
len(data)

37088

In [20]:
filtered_data = []  
for item in data:  
    # 检查 images 字段是否为空或包含空字符串  
    if 'images' not in item:  
        item['images'] = [""]  
    # if 'images' in item and (not item['images'] or all(img == "" for img in item['images'])):  
    #     continue  
    # if "images" not in item:
    #     continue
    # 转换 messages 格式  
    # item['messages'] = convert_messages_format(item['messages'])  
    filtered_data.append(item)  
  
# # 将转换后的数据保存到输出文件  
# with open(output_path, "w", encoding="utf-8") as f:  
#     json.dump(filtered_data, f, ensure_ascii=False, indent=2)  
  
# print(f"转换后的数据已保存到 {output_path}") 

In [18]:
len(filtered_data)

37088

In [21]:
output_path = "/mnt/lingjiejiang/multimodal_code/data/llava_code_data/llava_midstage_code_debug_job.json"  
# # 将转换后的数据保存到输出文件  
with open(output_path, "w", encoding="utf-8") as f:  
    json.dump(filtered_data, f, ensure_ascii=False, indent=0)    