In [1]:
import os
import json
import re
import glob
from typing import Dict, List

# Signed decimal number with an optional exponent, e.g. "3", "-0.5", ".25", "5e-05".
# Deliberately excludes "nan"/"inf" so parsed values always serialize as valid JSON.
_NUMBER = r"[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?"
_NUMBER_RE = re.compile(_NUMBER)

# Epoch header such as "INFO - Epoch 2/5"; the total epoch count is not fixed.
_EPOCH_START_RE = re.compile(r"INFO - Epoch (\d+)/\d+")

# (section, key, pattern) for every single-number metric line, checked in order.
_SCALAR_METRIC_PATTERNS = (
    ("time_metrics", "data_transfer", re.compile(r"INFO -   数据传输总耗时: (" + _NUMBER + r")秒")),
    ("time_metrics", "forward", re.compile(r"INFO -   前向传播总耗时: (" + _NUMBER + r")秒")),
    ("time_metrics", "backward", re.compile(r"INFO -   反向传播总耗时: (" + _NUMBER + r")秒")),
    ("time_metrics", "optimizer_step", re.compile(r"INFO -   优化器步骤总耗时: (" + _NUMBER + r")秒")),
    ("time_metrics", "epoch_duration", re.compile(r"INFO - Epoch \d+ 完成，耗时 (" + _NUMBER + r") 秒。")),
    ("evaluation", "duration", re.compile(r"INFO - 评估完成，耗时 (" + _NUMBER + r") 秒。")),
)

# Evaluation result line, e.g. "INFO - 评估结果: {'pearson': 0.88}".
_EVAL_RESULT_RE = re.compile(r"INFO - 评估结果: (\{.*\})")
# One "'name': number" entry inside the evaluation dict.
_EVAL_PAIR_RE = re.compile(r"'([^']+)':\s*(" + _NUMBER + r")(?=\s*[,}])")


def _coerce_config_value(value: str) -> object:
    """Convert a raw config string to bool, list of str, int or float when possible."""
    if value == "True":
        return True
    if value == "False":
        return False
    if len(value) >= 2 and value.startswith("[") and value.endswith("]"):
        inner = value[1:-1]
        if not inner.strip():
            # "[]" is an empty list, not a list holding one empty string.
            return []
        return [item.strip("'\" ") for item in inner.split(",")]
    try:
        return int(value)
    except ValueError:
        pass
    if _NUMBER_RE.fullmatch(value.strip()):
        return float(value)
    # Anything else (model names, paths, ...) stays a string.
    return value


def _parse_experiment_config(lines: List[str]) -> Dict:
    """Collect the "key: value" lines between the config header and the next "===" line."""
    config = {}
    in_config = False
    for line in lines:
        if "实验参数配置:" in line:
            in_config = True
            continue
        if not in_config:
            continue
        if "===" in line:
            in_config = False
            continue
        _, marker, payload = line.partition("INFO - ")
        if not marker:
            continue
        key, separator, raw_value = payload.partition(": ")
        if separator:
            config[key] = _coerce_config_value(raw_value)
    return config


def _parse_training_metrics(lines: List[str]) -> List[Dict]:
    """Build one record per epoch header and attach the metric lines that follow it."""
    records = []
    # First record seen for each epoch number; metric lines are attached to it.
    first_record_by_epoch = {}
    current = None

    for line in lines:
        epoch_match = _EPOCH_START_RE.search(line)
        if epoch_match:
            epoch = int(epoch_match.group(1))
            record = {"epoch": epoch, "time_metrics": {}, "evaluation": {}}
            records.append(record)
            current = first_record_by_epoch.setdefault(epoch, record)
            continue

        if current is None:
            # Metric lines before the first epoch header have nowhere to go.
            continue

        for section, key, pattern in _SCALAR_METRIC_PATTERNS:
            metric_match = pattern.search(line)
            if metric_match:
                current[section][key] = float(metric_match.group(1))
                break
        else:
            eval_match = _EVAL_RESULT_RE.search(line)
            if eval_match:
                for name, number in _EVAL_PAIR_RE.findall(eval_match.group(1)):
                    current["evaluation"][name] = float(number)

    return records


def parse_log_to_json(log_content: str) -> Dict:
    """Parse one training log into a JSON-serializable dictionary.

    Args:
        log_content: Full text of a log file.

    Returns:
        A dict with two keys:
        "experiment_config": key/value pairs read from the lines between the
            "实验参数配置:" header and the next line containing "===". Values are
            converted to bool, list of str, int or float (including negative
            and scientific-notation numbers) when they look like one, and are
            kept as str otherwise.
        "training_metrics": one entry per "Epoch N/M" header line, each with
            "epoch", "time_metrics" and "evaluation" dicts. Every numeric
            entry of an evaluation result line is stored under its own name
            in "evaluation".
    """
    lines = log_content.split('\n')
    return {
        "experiment_config": _parse_experiment_config(lines),
        "training_metrics": _parse_training_metrics(lines),
    }

In [2]:
def process_log_files(folder_path: str, output_file: str) -> None:
    """
    Parse every .txt and .log file in a folder and save the results as one JSON array.

    Files are processed in sorted path order so the output is reproducible.
    A file that cannot be read or parsed is reported and skipped; it does not
    stop the remaining files. A file whose parse yields no experiment config
    is reported as failed and left out of the output.

    Args:
        folder_path: Folder containing the log files (not searched recursively).
        output_file: Path of the JSON file to write. Missing parent
            directories are created. Nothing is written when no file
            produced a usable result.
    """
    # glob returns entries in arbitrary order, so sort for a stable output.
    file_paths = sorted(
        glob.glob(os.path.join(folder_path, "*.txt"))
        + glob.glob(os.path.join(folder_path, "*.log"))
    )

    all_results = []

    for file_path in file_paths:
        # Best effort per file: one bad log must not abort the whole batch.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                log_content = f.read()

            result = parse_log_to_json(log_content)

            # The parser always returns the "experiment_config" key, so test
            # that it is non-empty rather than merely present.
            if result and result.get("experiment_config"):
                result["file_path"] = file_path
                all_results.append(result)
                print(f"成功解析 {file_path}")
            else:
                print(f"解析 {file_path} 失败，未找到有效的实验配置")
        except Exception as e:
            print(f"处理文件 {file_path} 时出错: {str(e)}")

    if not all_results:
        print("没有找到可解析的文件，未生成输出")
        return

    # dirname is "" for a bare file name, which makedirs would reject.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, ensure_ascii=False, indent=2)
    print(f"成功将 {len(all_results)} 个实验结果保存到 {output_file}")

In [3]:
# Merge the logs found in the stsb_lora_rank3 folder into a single JSON file.
folder_path, output_file = "./data/stsb_lora_rank3", "./data/stsb_lora_rank3.json"
process_log_files(folder_path=folder_path, output_file=output_file)

成功解析 ./data/stsb_lora_rank3\stsb_lora_rank3_20250323_122038.txt
成功解析 ./data/stsb_lora_rank3\stsb_lora_rank3_20250323_123635.txt
成功解析 ./data/stsb_lora_rank3\stsb_lora_rank3_20250323_125253.txt
成功解析 ./data/stsb_lora_rank3\stsb_lora_rank3_20250323_130904.txt
成功解析 ./data/stsb_lora_rank3\stsb_lora_rank3_seed42.txt
成功将 5 个实验结果保存到 ./data/stsb_lora_rank3.json


In [4]:
# Merge the logs found in the stsb_adapter folder into a single JSON file.
folder_path, output_file = "./data/stsb_adapter", "./data/stsb_adapter.json"
process_log_files(folder_path=folder_path, output_file=output_file)

成功解析 ./data/stsb_adapter\stsb_adapter_rank_20250323_154600.log
成功解析 ./data/stsb_adapter\stsb_adapter_rank_20250323_160057.log
成功解析 ./data/stsb_adapter\stsb_adapter_rank_20250323_161607.log
成功解析 ./data/stsb_adapter\stsb_adapter_rank_20250323_163105.log
成功解析 ./data/stsb_adapter\stsb_adapter_rank_20250323_164602.log
成功将 5 个实验结果保存到 ./data/stsb_adapter.json


In [5]:
# Merge the logs found in the ag_news_lora folder into a single JSON file.
folder_path, output_file = "./data/ag_news_lora", "./data/ag_news_lora.json"
process_log_files(folder_path=folder_path, output_file=output_file)

成功解析 ./data/ag_news_lora\ag_news_lora_rank3_20250324_001933.log
成功将 1 个实验结果保存到 ./data/ag_news_lora.json


In [6]:
# Merge the logs found in the ag_news_adapter folder into a single JSON file.
folder_path, output_file = "./data/ag_news_adapter", "./data/ag_news_adapter.json"
process_log_files(folder_path=folder_path, output_file=output_file)

成功解析 ./data/ag_news_adapter\ag_news_adapter_rank_20250323_002101.log
成功将 1 个实验结果保存到 ./data/ag_news_adapter.json
