In [5]:
import os
import json

# Path configuration
# DATA_PATH: JSON dataset whose entry count determines the per-machine index ranges.
DATA_PATH = "/mnt/lingjiejiang/multimodal_code/data/dpo/merged_html_chart_150k.json"
# OUTPUT_DIR: directory (relative to the working directory) that receives the run_X.sh files.
OUTPUT_DIR = "qwen_bash"
# PYTHON_SCRIPT: script invoked by every generated launcher.
PYTHON_SCRIPT = "data_process/dpo_openmodel/html_generate.py"
# MODEL_NAME is passed as --model_name and also names the log directory; MODEL_PATH is passed as --model_path.
MODEL_NAME = "Qwen2-VL-72B-Instruct"
MODEL_PATH = "/mnt/lingjiejiang/multimodal_code/checkpoints/llms/Qwen2-VL-72B-Instruct"
# Exported verbatim as CUDA_VISIBLE_DEVICES in every generated launcher.
CUDA_DEVICES = "0,1,2,3,4,5,6,7"

def get_total_lines(file_path):
    """Return the number of top-level entries in a JSON file.

    Despite the name, this does not count text lines: the whole file is
    parsed with ``json.load`` and ``len()`` of the decoded value is returned
    (list length, or number of keys for a JSON object).

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        int: ``len()`` of the decoded JSON value.
    """
    # JSON is UTF-8 by specification; decode explicitly so the result does not
    # depend on the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return len(data)

def generate_bash_scripts(machine_num):
    """Write one ``run_<k>.sh`` launcher per machine into ``OUTPUT_DIR``.

    The entries of the dataset at ``DATA_PATH`` are split into ``machine_num``
    contiguous index ranges. Every range covers ``total // machine_num``
    entries except the last one, which also takes the remainder. Each
    generated script exports ``CUDA_VISIBLE_DEVICES``, runs ``PYTHON_SCRIPT``
    with ``--start_index`` / ``--end_index`` while appending its stdout to a
    per-range log file via ``tee -a``, and then runs ``python run_gpu.py``.

    The printed summary reports the range as half-open ``[start, end)``; how
    ``PYTHON_SCRIPT`` actually interprets ``--end_index`` is not visible here.

    Args:
        machine_num: Number of machines, i.e. number of scripts to generate.
            Must be at least 1.

    Raises:
        ValueError: If ``machine_num`` is smaller than 1.
    """
    # Validate before touching the filesystem: 0 would otherwise surface as a
    # ZeroDivisionError and a negative value would silently generate nothing.
    if machine_num < 1:
        raise ValueError(f"machine_num must be >= 1, got {machine_num}")

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    log_dir = f"/mnt/lingjiejiang/multimodal_code/data/dpo/{MODEL_NAME}_log"
    os.makedirs(log_dir, exist_ok=True)  # create the log directory

    total_lines = get_total_lines(DATA_PATH)
    batch_size = total_lines // machine_num  # size of each machine's index range

    for i in range(machine_num):
        start_index = i * batch_size
        # The last machine also processes the remainder of the division.
        end_index = total_lines if i == machine_num - 1 else (i + 1) * batch_size

        log_file = f"{log_dir}/dpo_{start_index}_{end_index}.log"
        script_name = os.path.join(OUTPUT_DIR, f"run_{i+1}.sh")

        with open(script_name, "w") as f:
            f.write(f"""#!/bin/bash

export CUDA_VISIBLE_DEVICES={CUDA_DEVICES}

python {PYTHON_SCRIPT} \\
    --model_name {MODEL_NAME} \\
    --model_path {MODEL_PATH} \\
    --start_index {start_index} \\
    --end_index {end_index} | tee -a {log_file}

python run_gpu.py
""")
        # os.chmod(script_name, 0o755)  # make the script executable
        print(f"Generated {script_name} with index range [{start_index}, {end_index}) and logging to {log_file}")

# Entry point: prompt for the number of machines, then write one launcher script per machine.
if __name__ == "__main__":
    machine_num = int(input("Enter the number of machines: "))
    generate_bash_scripts(machine_num)


Generated qwen_bash/run_1.sh with index range [0, 12539) and logging to /mnt/lingjiejiang/multimodal_code/data/dpo/Qwen2-VL-72B-Instruct_log/dpo_0_12539.log
Generated qwen_bash/run_2.sh with index range [12539, 25078) and logging to /mnt/lingjiejiang/multimodal_code/data/dpo/Qwen2-VL-72B-Instruct_log/dpo_12539_25078.log
Generated qwen_bash/run_3.sh with index range [25078, 37617) and logging to /mnt/lingjiejiang/multimodal_code/data/dpo/Qwen2-VL-72B-Instruct_log/dpo_25078_37617.log
Generated qwen_bash/run_4.sh with index range [37617, 50156) and logging to /mnt/lingjiejiang/multimodal_code/data/dpo/Qwen2-VL-72B-Instruct_log/dpo_37617_50156.log
Generated qwen_bash/run_5.sh with index range [50156, 62695) and logging to /mnt/lingjiejiang/multimodal_code/data/dpo/Qwen2-VL-72B-Instruct_log/dpo_50156_62695.log
Generated qwen_bash/run_6.sh with index range [62695, 75234) and logging to /mnt/lingjiejiang/multimodal_code/data/dpo/Qwen2-VL-72B-Instruct_log/dpo_62695_75234.log
Generated qwen_bas

In [7]:
import os
import json

# Path configuration
# DATA_PATH: JSON dataset whose entry count determines the per-GPU index ranges.
DATA_PATH = "/mnt/lingjiejiang/multimodal_code/data/dpo/merged_html_chart_150k.json"
# OUTPUT_DIR: directory that receives the generated run_multi_gpu.sh.
OUTPUT_DIR = "qwen_bash_7b"
# PYTHON_SCRIPT: script invoked once per GPU by the generated launcher.
PYTHON_SCRIPT = "data_process/dpo_openmodel/html_generate_7b.py"
# MODEL_NAME is passed as --model_name and also names the log directory; MODEL_PATH is passed as --model_path.
MODEL_NAME = "Qwen2-VL-7B-Instruct"
MODEL_PATH = "/mnt/lingjiejiang/textual_aesthetics/model_checkpoint/vlm_checkpoints/Qwen2-VL-7B-Instruct"
CUDA_DEVICES = ["0", "1", "2", "3"]  # GPU ids to use; one background job is generated per entry

def get_total_lines(file_path):
    """Return the number of top-level entries in a JSON file.

    Despite the name, this does not count text lines: the whole file is
    parsed with ``json.load`` and ``len()`` of the decoded value is returned
    (list length, or number of keys for a JSON object).

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        int: ``len()`` of the decoded JSON value.
    """
    # JSON is UTF-8 by specification; decode explicitly so the result does not
    # depend on the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return len(data)

def generate_multi_gpu_script():
    """Write `run_multi_gpu.sh`, which runs one background job per GPU on a single machine.

    The entries of the dataset at ``DATA_PATH`` are cut into
    ``len(CUDA_DEVICES)`` contiguous index ranges of equal size, with the
    last range absorbing the remainder. For every GPU id the script gets one
    ``python PYTHON_SCRIPT ...`` command (fixed ``--batch_size 64``) pinned
    through ``CUDA_VISIBLE_DEVICES``, piped into ``tee -a`` on a per-range
    log file and sent to the background; a final ``wait`` blocks until all
    jobs have finished.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    log_dir = f"/mnt/lingjiejiang/multimodal_code/data/dpo/{MODEL_NAME}_log"
    os.makedirs(log_dir, exist_ok=True)  # make sure the log directory exists

    total_lines = get_total_lines(DATA_PATH)
    num_gpus = len(CUDA_DEVICES)
    batch_size = total_lines // num_gpus  # entries handled by each GPU

    # Range boundaries: 0, b, 2b, ..., (n-1)b, total. Consecutive pairs give
    # each GPU's (start, end); the final pair carries the remainder.
    bounds = [slot * batch_size for slot in range(num_gpus)]
    bounds.append(total_lines)

    script_name = os.path.join(OUTPUT_DIR, "run_multi_gpu.sh")

    with open(script_name, "w") as f:
        f.write("#!/bin/bash\n\n")

        for gpu_id, start_index, end_index in zip(CUDA_DEVICES, bounds, bounds[1:]):
            log_file = f"{log_dir}/dpo_{start_index}_{end_index}.log"

            # Trailing "&" sends the pipeline to the background.
            f.write(f"""CUDA_VISIBLE_DEVICES={gpu_id} python {PYTHON_SCRIPT} \\
    --model_name {MODEL_NAME} \\
    --model_path {MODEL_PATH} \\
    --batch_size 64 \\
    --start_index {start_index} \\
    --end_index {end_index} | tee -a {log_file} &\n""")

        # Block until every background job has exited.
        f.write("\nwait\n")

    # os.chmod(script_name, 0o755)  # make the script executable
    print(f"Generated {script_name} for multi-GPU execution.")

# Entry point: write the single-machine multi-GPU launcher script.
if __name__ == "__main__":
    generate_multi_gpu_script()


Generated qwen_bash_7b/run_multi_gpu.sh for multi-GPU execution.


In [9]:
import json
import os

# Path configuration
# SAVE_PATH: JSON dataset whose entry count determines the per-GPU index ranges.
SAVE_PATH = "/mnt/lingjiejiang/multimodal_code/data/dpo/code/code_95k.json"
# OUTPUT_DIR: directory that receives the generated run_multi_gpu.sh.
OUTPUT_DIR = "code_bash"
# PYTHON_SCRIPT: script invoked once per GPU by the generated launcher.
PYTHON_SCRIPT = "data_process/dpo_openmodel/code_generate.py"
# MODEL_NAME is passed as --model_name and also names the log directory; MODEL_PATH is passed as --model_path.
MODEL_NAME = "Meta-Llama-3.1-8B-Instruct"
MODEL_PATH = "/mnt/lingjiejiang/textual_aesthetics/model_checkpoint/sft_merge_checkpoints/Meta-Llama-3.1-8B-Instruct"
NUM_GPUS = 8  # number of GPUs; one job is generated for each index 0..NUM_GPUS-1
BATCH_SIZE = 256  # value passed as --batch_size to every job
LOG_DIR = f"/mnt/lingjiejiang/multimodal_code/data/dpo/{MODEL_NAME}_log"

# Make sure the log directory and the script output directory exist
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

def get_total_lines(file_path):
    """Return the number of top-level entries in a JSON file.

    Despite the name, this does not count text lines: the whole file is
    parsed with ``json.load`` and ``len()`` of the decoded value is returned
    (list length, or number of keys for a JSON object).

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        int: ``len()`` of the decoded JSON value.
    """
    # JSON is UTF-8 by specification; decode explicitly so the result does not
    # depend on the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return len(data)

def generate_multi_gpu_script():
    """Write an executable `run_multi_gpu.sh` that runs ``NUM_GPUS`` jobs in parallel.

    The entries of the dataset at ``SAVE_PATH`` are divided into ``NUM_GPUS``
    contiguous index ranges of ``total // NUM_GPUS`` entries, the last range
    also taking the remainder. Job ``i`` is pinned to GPU ``i`` through
    ``CUDA_VISIBLE_DEVICES``, appends its stdout to a per-range log file in
    ``LOG_DIR`` via ``tee -a`` and runs in the background; a trailing
    ``wait`` blocks until all jobs are done. The script is chmod'ed to 0o755.
    """
    total_lines = get_total_lines(SAVE_PATH)
    chunk_size = total_lines // NUM_GPUS  # entries handled by each GPU
    last_gpu = NUM_GPUS - 1

    script_name = os.path.join(OUTPUT_DIR, "run_multi_gpu.sh")

    with open(script_name, "w") as f:
        f.write("#!/bin/bash\n\n")

        # Each range starts where the previous one ended.
        start_index = 0
        for i in range(NUM_GPUS):
            if i == last_gpu:
                # The final GPU picks up whatever the integer division left over.
                end_index = total_lines
            else:
                end_index = start_index + chunk_size

            log_file = f"{LOG_DIR}/dpo_{start_index}_{end_index}.log"

            # Trailing "&" sends the pipeline to the background.
            f.write(f"""CUDA_VISIBLE_DEVICES={i} python {PYTHON_SCRIPT} \\
    --model_name {MODEL_NAME} \\
    --model_path {MODEL_PATH} \\
    --batch_size {BATCH_SIZE} \\
    --start_index {start_index} \\
    --end_index {end_index} | tee -a {log_file} &\n""")

            start_index = end_index

        # Block until every background job has exited.
        f.write("\nwait\n")

    os.chmod(script_name, 0o755)  # make the script executable
    print(f"Generated {script_name} for multi-GPU execution.")

# Entry point: write the multi-GPU launcher script.
if __name__ == "__main__":
    generate_multi_gpu_script()


Generated code_bash/run_multi_gpu.sh for multi-GPU execution.


In [2]:
import os
import json

# Path configuration
# DATA_PATH: JSON dataset whose entry count determines the per-machine index ranges.
DATA_PATH = "/mnt/lingjiejiang/multimodal_code/data/chart_data/ChartBench/chartbench_images_46k_code_dpo.json"
# OUTPUT_DIR: directory (relative to the working directory) that receives the run_X.sh files.
OUTPUT_DIR = "qwen_chartbench"
# PYTHON_SCRIPT: script invoked by every generated launcher.
PYTHON_SCRIPT = "data_process/dpo_openmodel/html_generate_chart_bench.py"
# MODEL_NAME is passed as --model_name and also names the log directory; MODEL_PATH is passed as --model_path.
MODEL_NAME = "Qwen2-VL-72B-Instruct"
MODEL_PATH = "/mnt/lingjiejiang/multimodal_code/checkpoints/llms/Qwen2-VL-72B-Instruct"
# Exported verbatim as CUDA_VISIBLE_DEVICES in every generated launcher.
CUDA_DEVICES = "0,1,2,3,4,5,6,7"

def get_total_lines(file_path):
    """Return the number of top-level entries in a JSON file.

    Despite the name, this does not count text lines: the whole file is
    parsed with ``json.load`` and ``len()`` of the decoded value is returned
    (list length, or number of keys for a JSON object).

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        int: ``len()`` of the decoded JSON value.
    """
    # JSON is UTF-8 by specification; decode explicitly so the result does not
    # depend on the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return len(data)

def generate_bash_scripts(machine_num):
    """Write one ``run_<k>.sh`` launcher per machine into ``OUTPUT_DIR``.

    The entries of the dataset at ``DATA_PATH`` are split into ``machine_num``
    contiguous index ranges. Every range covers ``total // machine_num``
    entries except the last one, which also takes the remainder. Each
    generated script exports ``CUDA_VISIBLE_DEVICES``, runs ``PYTHON_SCRIPT``
    with ``--start_index`` / ``--end_index`` while appending its stdout to a
    per-range ``*_chartbench46k.log`` file via ``tee -a``, and then runs
    ``python run_gpu.py``.

    The printed summary reports the range as half-open ``[start, end)``; how
    ``PYTHON_SCRIPT`` actually interprets ``--end_index`` is not visible here.

    Args:
        machine_num: Number of machines, i.e. number of scripts to generate.
            Must be at least 1.

    Raises:
        ValueError: If ``machine_num`` is smaller than 1.
    """
    # Validate before touching the filesystem: 0 would otherwise surface as a
    # ZeroDivisionError and a negative value would silently generate nothing.
    if machine_num < 1:
        raise ValueError(f"machine_num must be >= 1, got {machine_num}")

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    log_dir = f"/mnt/lingjiejiang/multimodal_code/data/dpo/{MODEL_NAME}_log"
    os.makedirs(log_dir, exist_ok=True)  # create the log directory

    total_lines = get_total_lines(DATA_PATH)
    batch_size = total_lines // machine_num  # size of each machine's index range

    for i in range(machine_num):
        start_index = i * batch_size
        # The last machine also processes the remainder of the division.
        end_index = total_lines if i == machine_num - 1 else (i + 1) * batch_size

        log_file = f"{log_dir}/dpo_{start_index}_{end_index}_chartbench46k.log"
        script_name = os.path.join(OUTPUT_DIR, f"run_{i+1}.sh")

        with open(script_name, "w") as f:
            f.write(f"""#!/bin/bash

export CUDA_VISIBLE_DEVICES={CUDA_DEVICES}

python {PYTHON_SCRIPT} \\
    --model_name {MODEL_NAME} \\
    --model_path {MODEL_PATH} \\
    --start_index {start_index} \\
    --end_index {end_index} | tee -a {log_file}

python run_gpu.py
""")
        # os.chmod(script_name, 0o755)  # make the script executable
        print(f"Generated {script_name} with index range [{start_index}, {end_index}) and logging to {log_file}")

# Entry point: write launcher scripts for a fixed number of machines.
if __name__ == "__main__":
    machine_num = 4
    generate_bash_scripts(machine_num)


Generated qwen_chartbench/run_1.sh with index range [0, 11744) and logging to /mnt/lingjiejiang/multimodal_code/data/dpo/Qwen2-VL-72B-Instruct_log/dpo_0_11744_chartbench46k.log
Generated qwen_chartbench/run_2.sh with index range [11744, 23488) and logging to /mnt/lingjiejiang/multimodal_code/data/dpo/Qwen2-VL-72B-Instruct_log/dpo_11744_23488_chartbench46k.log
Generated qwen_chartbench/run_3.sh with index range [23488, 35232) and logging to /mnt/lingjiejiang/multimodal_code/data/dpo/Qwen2-VL-72B-Instruct_log/dpo_23488_35232_chartbench46k.log
Generated qwen_chartbench/run_4.sh with index range [35232, 46977) and logging to /mnt/lingjiejiang/multimodal_code/data/dpo/Qwen2-VL-72B-Instruct_log/dpo_35232_46977_chartbench46k.log


In [4]:
import os
import json

# Path configuration
# DATA_PATH: JSON dataset whose entry count determines the per-job index ranges.
DATA_PATH = "/mnt/lingjiejiang/multimodal_code/data/chart_data/ChartBench/chartbench_images_46k_code_dpo.json"
# OUTPUT_DIR: directory that receives the generated launcher script.
OUTPUT_DIR = "qwen_bash_7b"
# PYTHON_SCRIPT: script invoked once per job by the generated launcher.
PYTHON_SCRIPT = "data_process/dpo_openmodel/html_generate_7b_chart_bench.py"
# MODEL_NAME is passed as --model_name and also names the log directory; MODEL_PATH is passed as --model_path.
MODEL_NAME = "Qwen2-VL-7B-Instruct"
MODEL_PATH = "/mnt/lingjiejiang/textual_aesthetics/model_checkpoint/vlm_checkpoints/Qwen2-VL-7B-Instruct"
# One CUDA_VISIBLE_DEVICES value per job. Each entry is a plain string: the
# surrounding parentheses do not create tuples.
CUDA_PAIRS = [("0,1"), ("2,3"), ("4,5"), ("6,7")]
OUTPUT_DIR_PARAM = "/mnt/lingjiejiang/multimodal_code/data/dpo/Qwen2-VL-7B-Instruct_generate_chartbench46k"  # newly added: value passed as --output_dir

def get_total_lines(file_path):
    """Return the number of top-level entries in a JSON file.

    Despite the name, this does not count text lines: the whole file is
    parsed with ``json.load`` and ``len()`` of the decoded value is returned
    (list length, or number of keys for a JSON object).

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        int: ``len()`` of the decoded JSON value.
    """
    # JSON is UTF-8 by specification; decode explicitly so the result does not
    # depend on the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return len(data)

def generate_multi_gpu_script():
    """Write `run_multi_gpu_chart_bench.sh`, running one background job per ``CUDA_PAIRS`` entry.

    The entries of the dataset at ``DATA_PATH`` are cut into
    ``len(CUDA_PAIRS)`` contiguous index ranges of equal size, with the last
    range absorbing the remainder. Each job runs ``PYTHON_SCRIPT`` (fixed
    ``--batch_size 64``, ``--output_dir OUTPUT_DIR_PARAM``) with
    ``CUDA_VISIBLE_DEVICES`` set to its entry, appends stdout to a per-range
    log file via ``tee -a`` and is sent to the background; a final ``wait``
    blocks until all jobs have finished.

    NOTE(review): log file names depend only on ``MODEL_NAME`` and the index
    range, so any other run with the same split appends to the same files.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    log_dir = f"/mnt/lingjiejiang/multimodal_code/data/dpo/{MODEL_NAME}_log"
    os.makedirs(log_dir, exist_ok=True)  # create the log directory

    total_lines = get_total_lines(DATA_PATH)
    num_jobs = len(CUDA_PAIRS)
    batch_size = total_lines // num_jobs  # size of each job's index range

    script_name = os.path.join(OUTPUT_DIR, "run_multi_gpu_chart_bench.sh")

    with open(script_name, "w") as f:
        f.write("#!/bin/bash\n\n")

        # There is exactly one job per CUDA_PAIRS entry, so iterate the
        # entries directly instead of indexing with a modulo.
        for i, cuda_devices in enumerate(CUDA_PAIRS):
            start_index = i * batch_size
            # The last job also processes the remainder of the division.
            end_index = total_lines if i == num_jobs - 1 else (i + 1) * batch_size

            log_file = f"{log_dir}/dpo_{start_index}_{end_index}_chart_bench.log"

            # "\\" emits a literal backslash so the generated script keeps one
            # argument per line; a single "\" before the newline would be
            # swallowed by Python as a string line-continuation and collapse
            # the whole command onto one line.
            f.write(f"""CUDA_VISIBLE_DEVICES={cuda_devices} python {PYTHON_SCRIPT} \\
    --model_name {MODEL_NAME} \\
    --model_path {MODEL_PATH} \\
    --batch_size 64 \\
    --start_index {start_index} \\
    --end_index {end_index} \\
    --output_dir {OUTPUT_DIR_PARAM} | tee -a {log_file} &\n""")  # trailing "&": run in the background

        f.write("\nwait\n")  # wait for every background job to finish

    print(f"Generated {script_name} for multi-GPU execution.")

# Entry point: write the ChartBench multi-GPU launcher script.
if __name__ == "__main__":
    generate_multi_gpu_script()


Generated qwen_bash_7b/run_multi_gpu_chart_bench.sh for multi-GPU execution.


In [7]:
import os
import json

# Path configuration
# DATA_PATH: JSON dataset whose entry count determines the per-job index ranges.
DATA_PATH = "/mnt/lingjiejiang/multimodal_code/data/chart_data/ChartBench/chartbench_images_46k_code_dpo.json"
# OUTPUT_DIR: directory that receives the generated launcher script.
OUTPUT_DIR = "qwen_bash_7b"
# PYTHON_SCRIPT: script invoked once per job by the generated launcher.
PYTHON_SCRIPT = "data_process/dpo_openmodel/html_generate_7b_chart_bench.py"
# MODEL_NAME is passed as --model_name and also names the log directory; MODEL_PATH is passed as --model_path.
MODEL_NAME = "Qwen2-VL-7B-Instruct"
MODEL_PATH = "/mnt/lingjiejiang/textual_aesthetics/model_checkpoint/vlm_checkpoints/Qwen2-VL-7B-Instruct"
# One CUDA_VISIBLE_DEVICES value per job. Each entry is a plain string: the
# surrounding parentheses do not create tuples.
CUDA_PAIRS = [("0,1"), ("2,3"), ("4,5"), ("6,7")]
OUTPUT_DIR_PARAM = "/mnt/lingjiejiang/multimodal_code/data/dpo/Qwen2-VL-7B-Instruct_generate_chartbench46k_4"  # newly added: value passed as --output_dir

def get_total_lines(file_path):
    """Return the number of top-level entries in a JSON file.

    Despite the name, this does not count text lines: the whole file is
    parsed with ``json.load`` and ``len()`` of the decoded value is returned
    (list length, or number of keys for a JSON object).

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        int: ``len()`` of the decoded JSON value.
    """
    # JSON is UTF-8 by specification; decode explicitly so the result does not
    # depend on the platform's default locale encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return len(data)

def generate_multi_gpu_script():
    """Write `run_multi_gpu_chart_bench_4.sh`, running one background job per ``CUDA_PAIRS`` entry.

    The entries of the dataset at ``DATA_PATH`` are cut into
    ``len(CUDA_PAIRS)`` contiguous index ranges of equal size, with the last
    range absorbing the remainder. Each job runs ``PYTHON_SCRIPT`` (fixed
    ``--batch_size 64``, ``--output_dir OUTPUT_DIR_PARAM``) with
    ``CUDA_VISIBLE_DEVICES`` set to its entry, appends stdout to a per-range
    log file via ``tee -a`` and is sent to the background; a final ``wait``
    blocks until all jobs have finished.

    NOTE(review): unlike the script name, the log file names carry no ``_4``
    suffix; they depend only on ``MODEL_NAME`` and the index range, so any
    other run with the same split appends to the same files.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    log_dir = f"/mnt/lingjiejiang/multimodal_code/data/dpo/{MODEL_NAME}_log"
    os.makedirs(log_dir, exist_ok=True)  # create the log directory

    total_lines = get_total_lines(DATA_PATH)
    num_jobs = len(CUDA_PAIRS)
    batch_size = total_lines // num_jobs  # size of each job's index range

    script_name = os.path.join(OUTPUT_DIR, "run_multi_gpu_chart_bench_4.sh")

    with open(script_name, "w") as f:
        f.write("#!/bin/bash\n\n")

        # There is exactly one job per CUDA_PAIRS entry, so iterate the
        # entries directly instead of indexing with a modulo.
        for i, cuda_devices in enumerate(CUDA_PAIRS):
            start_index = i * batch_size
            # The last job also processes the remainder of the division.
            end_index = total_lines if i == num_jobs - 1 else (i + 1) * batch_size

            log_file = f"{log_dir}/dpo_{start_index}_{end_index}_chart_bench.log"

            # "\\" emits a literal backslash so the generated script keeps one
            # argument per line; a single "\" before the newline would be
            # swallowed by Python as a string line-continuation and collapse
            # the whole command onto one line.
            f.write(f"""CUDA_VISIBLE_DEVICES={cuda_devices} python {PYTHON_SCRIPT} \\
    --model_name {MODEL_NAME} \\
    --model_path {MODEL_PATH} \\
    --batch_size 64 \\
    --start_index {start_index} \\
    --end_index {end_index} \\
    --output_dir {OUTPUT_DIR_PARAM} | tee -a {log_file} &\n""")  # trailing "&": run in the background

        f.write("\nwait\n")  # wait for every background job to finish

    print(f"Generated {script_name} for multi-GPU execution.")

# Entry point: write the ChartBench multi-GPU launcher script.
if __name__ == "__main__":
    generate_multi_gpu_script()


Generated qwen_bash_7b/run_multi_gpu_chart_bench_4.sh for multi-GPU execution.
