In [None]:
import json
import random
import os

def resample_jsonl(file_path, output_dir, target_size):
    """Resample a JSONL file to ``target_size`` lines and save the result.

    Files with more than ``target_size`` lines are down-sampled without
    replacement. Files with fewer (or exactly as many) lines are up-sampled:
    the whole file is repeated as many times as fits, and the remainder is
    drawn without replacement. The result is written to ``output_dir`` under
    the same base name as ``file_path``.

    Args:
        file_path: Path of the JSONL file to read.
        output_dir: Directory for the resampled file; created if missing.
        target_size: Number of lines the output file should contain.

    An empty input file cannot be resampled; an empty output file is written
    and a warning is printed instead of failing with a division by zero.
    """
    # Re-seed on every call so each file is resampled reproducibly.
    random.seed(3407)

    # Read the JSONL file.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # A final record without a trailing newline would be glued onto the next
    # record once lines are repeated or reordered, corrupting the output.
    if lines and not lines[-1].endswith('\n'):
        lines[-1] += '\n'

    current_size = len(lines)

    if current_size == 0:
        # Nothing to sample from; the up-sampling arithmetic would divide by 0.
        print(f'Warning: {file_path} is empty; writing an empty output file.')
        resampled_lines = []
    elif current_size > target_size:
        # Down-sample.
        resampled_lines = random.sample(lines, target_size)
    else:
        # Up-sample: whole copies of the file plus a random partial copy.
        full_copies, remainder = divmod(target_size, current_size)
        resampled_lines = lines * full_copies + random.sample(lines, remainder)

    # Create the output directory if it does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    # Save under the original file name inside the output directory.
    output_file = os.path.join(output_dir, os.path.basename(file_path))
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(resampled_lines)

    print(f'Resampling complete. Saved to: {output_file}')

def get_middle_value(input_dir):
    """Return the midpoint between the largest and smallest JSONL line counts.

    Every file in ``input_dir`` whose name ends with ``.jsonl`` is counted
    line by line. The midpoint is ``(max + min) // 2``. The sizes are printed
    as a side effect. Returns ``None`` (after printing a notice) when the
    directory contains no ``.jsonl`` files.
    """
    line_counts = []

    # Count the lines of each JSONL file in the directory.
    for entry in os.listdir(input_dir):
        if not entry.endswith('.jsonl'):
            continue
        with open(os.path.join(input_dir, entry), 'r', encoding='utf-8') as handle:
            line_counts.append(sum(1 for _ in handle))

    if not line_counts:
        print("No JSONL files found.")
        return None

    # Midpoint between the largest and the smallest file.
    largest = max(line_counts)
    smallest = min(line_counts)
    midpoint = (largest + smallest) // 2
    print(f'Max size: {largest}, Min size: {smallest}, Middle value: {midpoint}')
    return midpoint

# Batch-process every JSONL file in a directory
def resample_multiple_jsonl_files(input_dir, output_dir):
    """Resample every ``.jsonl`` file in ``input_dir`` to a common size.

    The common size is whatever ``get_middle_value(input_dir)`` returns.
    When that value is falsy (``None`` or ``0``) nothing is processed.
    Each file is passed to ``resample_jsonl`` together with ``output_dir``.
    """
    target = get_middle_value(input_dir)
    if not target:
        return

    jsonl_names = [name for name in os.listdir(input_dir) if name.endswith('.jsonl')]
    for name in jsonl_names:
        resample_jsonl(os.path.join(input_dir, name), output_dir, target)

# Run the batch resampling
input_directory = '/path/to/CoSec2/data_train_val/train_org'  # replace with the path of your JSONL folder
output_directory = '/path/to/CoSec2/data_train_val/train'
resample_multiple_jsonl_files(
    input_dir=input_directory,
    output_dir=output_directory,
)


Max size: 184, Min size: 38, Middle value: 111
Resampling complete. Saved to: /home/liuchao/shushanfu/CoSec2/data_train_val/train/cwe-476.jsonl
Resampling complete. Saved to: /home/liuchao/shushanfu/CoSec2/data_train_val/train/cwe-079.jsonl
Resampling complete. Saved to: /home/liuchao/shushanfu/CoSec2/data_train_val/train/cwe-078.jsonl
Resampling complete. Saved to: /home/liuchao/shushanfu/CoSec2/data_train_val/train/cwe-022.jsonl
Resampling complete. Saved to: /home/liuchao/shushanfu/CoSec2/data_train_val/train/cwe-787.jsonl
Resampling complete. Saved to: /home/liuchao/shushanfu/CoSec2/data_train_val/train/cwe-089.jsonl
Resampling complete. Saved to: /home/liuchao/shushanfu/CoSec2/data_train_val/train/cwe-125.jsonl
Resampling complete. Saved to: /home/liuchao/shushanfu/CoSec2/data_train_val/train/cwe-416.jsonl
Resampling complete. Saved to: /home/liuchao/shushanfu/CoSec2/data_train_val/train/cwe-190.jsonl
