In [None]:
import json
import os

In [None]:
# Path of the evaluation-results JSON produced by the eval step.
# NOTE(review): this is an absolute, machine-specific path — adjust it for your environment.
eval_file_path = '/root/xm/HistoryTrans/eval/data/eval_results/eval_resu_20231217215841.json'
# Directory that receives the filtered datasets
output_file_path = '../dataset_clean/'
# BLEU threshold: samples scoring below it are treated as low quality
bleu_threshold = 0.1

# Create the output directory if it is missing (no separate existence check needed)
os.makedirs(output_file_path, exist_ok=True)

# Load the evaluation results; decode explicitly as UTF-8 so the result does not
# depend on the platform's default locale encoding.
with open(eval_file_path, 'r', encoding='utf-8') as file:
    eval_results = json.load(file)

# Total number of evaluated samples (reused by the statistics cell further down)
total_num = len(eval_results['samples'])
print('总共有{}个样本'.format(total_num))

In [None]:
# Keep the samples whose BLEU score reaches the threshold
high_quality_samples = [
    sample for sample in eval_results['samples'] if sample['BLEU'] >= bleu_threshold
]

# Optional: collect the samples below the threshold. Each one is a shallow copy
# carrying an extra `low_quality` field (default 1), so the dicts held by
# `eval_results` are not mutated as a side effect of running this cell.
low_quality_samples = [
    {**sample, 'low_quality': 1}
    for sample in eval_results['samples']
    if sample['BLEU'] < bleu_threshold
]

In [None]:
# Save the high-quality dataset. The text is written unescaped (ensure_ascii=False),
# so the file encoding is pinned to UTF-8 instead of relying on the locale default.
with open(os.path.join(output_file_path, 'high_quality_data.json'), 'w', encoding='utf-8') as outfile:
    json.dump(high_quality_samples, outfile, ensure_ascii=False, indent=4)

# Optional: save the low-quality dataset for later review or analysis.
# WARNING: the file is opened in 'w' mode, so re-running this cell overwrites any
# manual edits already made to the `low_quality` labels in low_quality_data.json.
with open(os.path.join(output_file_path, 'low_quality_data.json'), 'w', encoding='utf-8') as outfile:
    json.dump(low_quality_samples, outfile, ensure_ascii=False, indent=4)

print(f"高质量数据集大小：{len(high_quality_samples)}")
print(f"低质量数据集大小：{len(low_quality_samples)}")

### 分析低质量数据集

**重要提示**：在运行以下步骤之前，请先手动检查上一步生成的 `low_quality_data.json`，并逐条更新其中的 `low_quality` 字段（所有样本默认为 `1`）。注意：重新运行上面的保存单元会覆盖该文件中已做的人工标注。
- 使用 `1` 标记那些确实质量较低的数据集样本。
- 使用 `0` 标记那些数据集质量本身没有问题，但模型预测结果未达预期的样本。

In [None]:
file_path = os.path.join(output_file_path, 'low_quality_data.json')
# Read the low-quality dataset back from disk, decoding explicitly as UTF-8
# rather than with the platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    low_quality_data = json.load(file)

In [None]:
# Initialise the counters
total_low_quality_samples = len(low_quality_data)
low_quality_count = 0
low_quality_bleu_sum = 0
model_issue_count = 0
model_issue_bleu_sum = 0
unlabeled_count = 0  # samples whose `low_quality` value is missing or not 0/1

# Tally the samples by label:
#   1 -> counted as a low-quality sample
#   0 -> counted as a model-prediction issue
for sample in low_quality_data:
    # .get() keeps a sample that lost its label during manual editing from
    # aborting the whole run with a KeyError; it is reported below instead.
    label = sample.get('low_quality')
    if label == 1:
        low_quality_count += 1
        low_quality_bleu_sum += sample['BLEU']
    elif label == 0:
        model_issue_count += 1
        model_issue_bleu_sum += sample['BLEU']
    else:
        unlabeled_count += 1

# Ratios are relative to the full evaluation set (total_num); averages are per
# category. Every division is guarded so empty inputs yield 0 instead of raising.
low_quality_ratio = low_quality_count / total_num if total_num > 0 else 0
low_quality_avg_bleu = low_quality_bleu_sum / low_quality_count if low_quality_count > 0 else 0
model_issue_ratio = model_issue_count / total_num if total_num > 0 else 0
model_issue_avg_bleu = model_issue_bleu_sum / model_issue_count if model_issue_count > 0 else 0

# Report the results
print(f"总样本数: {total_num}")
print(f"低质量总样本数: {total_low_quality_samples}, 质量低的样本数: {low_quality_count}, 模型预测问题样本数: {model_issue_count}")
print(f"质量低的样本比例: {low_quality_ratio:.2f}, 平均BLEU分数: {low_quality_avg_bleu:.2f}")
print(f"模型预测问题样本比例: {model_issue_ratio:.2f}, 平均BLEU分数: {model_issue_avg_bleu:.2f}")
# Only printed when something was skipped, so the normal output is unchanged
if unlabeled_count:
    print(f"警告: 有 {unlabeled_count} 个样本的 low_quality 字段缺失或取值不是 0/1，未计入统计")