In [9]:
import sys
import os
import argparse

# 添加项目路径
sys.path.append('/Users/haha/Story')  # 你的项目根目录

# 导入所有必要模块
from src.constant import output_dir
from src.utils.utils import save_md, save_json, load_json, extract_plot_list
from src.generation.outline_generator import generate_outline
from src.generation.chapter_reorder import reorder_chapters
from src.generation.generate_characters import generate_characters_v1
from src.generation.expand_story import expand_story_v1
from src.compile_story import compile_full_story_by_sentence, compile_full_story_by_chapter
from src.enhance_story import enhance_story_with_transitions, polish_dialogues_in_story
from src.generation.dialogue_inserter import analyze_dialogue_insertions, run_dialogue_insertion, analyze_dialogue_insertions_v2
from src.utils.utils import extract_behavior_llm, convert_dialogue_dict_to_list
from src.sync.plot_sync_manager import sync_plot_and_dialogue_from_behavior
from src.sync.auto_propagate_plot_update import auto_propagate_plot_update
from src.analysis.character_state_tracker import run_character_state_tracker
from src.utils.logger import append_log, build_log_record, build_simple_log, init_log_path
from src.version_namer import build_version_name 

print("✅ 所有模块导入成功")

#

✅ 所有模块导入成功


In [None]:
import json
import re

# Load story.json of the version folder under inspection.
# NOTE(review): the folder is a hard-coded absolute local path; it must be edited per machine.
version_folder = "/Users/haha/Story/data/output/小红帽_科幻_linear_T0.7_s1"  # replace with your own path
story_path = f"{version_folder}/story.json"

with open(story_path, 'r', encoding='utf-8') as f:
    story_data = json.load(f)

print("=== 检查story.json中的plot内容 ===")
print(f"总章节数: {len(story_data)}")

# Scan every chapter's plot for a character name written twice in a row.
for i, chapter in enumerate(story_data):
    chapter_id = chapter.get("chapter_id", f"第{i+1}章")
    plot = chapter.get("plot", "")
    
    # Only the four names hard-coded in this pattern are detected.
    duplicates = re.findall(r'小红帽小红帽|机械狼机械狼|外婆外婆|小蓝小蓝', plot)
    
    if duplicates:
        print(f"\n❌ {chapter_id} 发现重复:")
        print(f"   重复内容: {duplicates}")
        print(f"   片段: {plot[:200]}...")
    else:
        print(f"✅ {chapter_id} 无重复")

# Print the first chapter's complete plot as a sample.
print(f"\n=== 第一章完整plot示例 ===")
if story_data:
    print(story_data[0].get("plot", "无plot内容"))

In [None]:
# Inspect sentence_dialogues.json for doubled character names.
# Relies on `version_folder`, `json` and `re` defined by an earlier cell.
sentence_path = f"{version_folder}/sentence_dialogues.json"

try:
    with open(sentence_path, 'r', encoding='utf-8') as f:
        sentence_data = json.load(f)
    
    print("=== 检查sentence_dialogues.json ===")
    print(f"总句子数: {len(sentence_data)}")
    
    # Counters: sentences that carry dialogue, and fields containing a doubled name.
    dialogue_count = 0
    duplicate_count = 0
    
    for i, sentence in enumerate(sentence_data):
        if sentence.get("dialogue") and len(sentence["dialogue"]) > 0:
            dialogue_count += 1
            
            # Check every dialogue entry of this sentence.
            # NOTE(review): entries are treated as dicts; a plain-string entry would raise
            # AttributeError here and be reported by the generic handler below as a read failure.
            for dlg in sentence["dialogue"]:
                speaker = dlg.get("speaker", "")
                dialogue_text = dlg.get("dialogue", "")
                action = dlg.get("action", "")
                
                # Each field is checked independently, so one entry can add up to 3 to the count.
                if re.search(r'小红帽小红帽|机械狼机械狼|外婆外婆|小蓝小蓝', speaker):
                    print(f"❌ 句子{i} speaker有重复: {speaker}")
                    duplicate_count += 1
                
                if re.search(r'小红帽小红帽|机械狼机械狼|外婆外婆|小蓝小蓝', dialogue_text):
                    print(f"❌ 句子{i} dialogue有重复: {dialogue_text}")
                    duplicate_count += 1
                
                if re.search(r'小红帽小红帽|机械狼机械狼|外婆外婆|小蓝小蓝', action):
                    print(f"❌ 句子{i} action有重复: {action}")
                    duplicate_count += 1
                
                # Print samples for the first 3 sentences that have dialogue.
                # This sits inside the per-entry loop, so a sentence with several
                # entries prints several samples under the same sample number.
                if dialogue_count <= 3:
                    print(f"\n对话示例{dialogue_count}:")
                    print(f"  章节: {sentence['chapter_id']}")
                    print(f"  句子: {sentence['sentence'][:50]}...")
                    print(f"  speaker: {speaker}")
                    print(f"  dialogue: {dialogue_text}")
                    print(f"  action: {action}")
    
    print(f"\n统计:")
    print(f"  有对话的句子: {dialogue_count}")
    print(f"  发现重复的次数: {duplicate_count}")
    
except FileNotFoundError:
    print("❌ sentence_dialogues.json 文件不存在")
except Exception as e:
    # Catch-all: any error raised while scanning (not only I/O) ends up here.
    print(f"❌ 读取失败: {e}")

In [None]:
# Inspect the compiled novel_story.md for doubled character names.
# Relies on `version_folder` defined by an earlier cell.
novel_path = f"{version_folder}/novel_story.md"

with open(novel_path, 'r', encoding='utf-8') as f:
    novel_content = f.read()

print("=== 检查novel_story.md中的重复问题 ===")

# Dialogue lines with an action are expected to look like: <speaker><action>，"<dialogue>" ——<speaker>
import re
duplicates = re.findall(r'小红帽小红帽|机械狼机械狼|外婆外婆|小蓝小蓝', novel_content)

print(f"发现的重复: {len(duplicates)} 个")
if duplicates:
    print(f"重复类型: {set(duplicates)}")

# Collect the (1-based line number, stripped text) of every line containing a doubled name.
lines = novel_content.split('\n')
duplicate_lines = []
for i, line in enumerate(lines):
    if re.search(r'小红帽小红帽|机械狼机械狼|外婆外婆|小蓝小蓝', line):
        duplicate_lines.append((i+1, line.strip()))

print(f"\n重复出现的行数: {len(duplicate_lines)}")
for line_num, line in duplicate_lines[:5]:  # show only the first 5
    print(f"第{line_num}行: {line}")

# Find action+dialogue lines attributed to 小红帽 whose action part contains the name
# with at least one character on each side of it (the pattern is specific to this one speaker).
action_pattern = r'([^，\n]+小红帽[^，\n]+)，"([^"]+)" ——小红帽'
action_matches = re.findall(action_pattern, novel_content)

print(f"\n发现的action拼接模式: {len(action_matches)} 个")
if action_matches:
    print("示例:")
    for i, (action_part, dialogue) in enumerate(action_matches[:3]):
        print(f"  {i+1}. {action_part}，\"{dialogue}\" ——小红帽")

In [None]:
# 修复compile_story.py中的format_dialogue_with_action函数
def fixed_format_dialogue_with_action(speaker, dialogue_text, action):
    """Format one dialogue line, optionally led by the speaker's action.

    Args:
        speaker: name of the speaking character.
        dialogue_text: the spoken text; surrounding whitespace is stripped.
        action: optional action description; ``None``, empty or
            whitespace-only means "no action".

    Returns:
        ``"<dialogue>" ——<speaker>`` followed by a blank line when there is no
        action, otherwise ``<action>，"<dialogue>" ——<speaker>`` followed by a
        blank line. The speaker name is prepended to the action only when the
        action does not already start with it, which avoids output such as
        "小红帽小红帽…".
    """
    # The quoted part is identical in both output shapes, so build it once.
    spoken = f'"{dialogue_text.strip()}" ——{speaker}\n\n'

    lead_in = action.strip() if action else ""
    if not lead_in:
        return spoken

    # Key fix: never duplicate a speaker name the action already begins with.
    if not lead_in.startswith(speaker):
        lead_in = speaker + lead_in

    return f'{lead_in}，{spoken}'

# Demonstrate the fixed formatter on a few hand-written cases.
print("=== 测试修复效果 ===")

# Each case is (speaker, dialogue, action).
test_cases = [
    ("小红帽", "我准备好了", "小红帽启动飞船,输入目标坐标"),  # action already starts with the speaker name
    ("小红帽", "收到指令", "点头确认"),  # action does not contain the speaker name
    ("机械狼", "交出芯片", "机械狼冷笑一声,步步逼近"),  # another action that starts with the speaker name
]

for speaker, dialogue, action in test_cases:
    result = fixed_format_dialogue_with_action(speaker, dialogue, action)
    print(f"原始: speaker='{speaker}', action='{action}'")
    print(f"结果: {result.strip()}")
    print()

In [None]:
# 直接在notebook中定义修复后的编译函数
def compile_full_story_by_sentence_FIXED(story_json, sentence_dialogues):
    """Compile chapters plus sentence-level dialogue into one markdown string.

    Every chapter is emitted as a ``# <chapter_id>：<title>`` heading, then its
    plot sentence by sentence, each followed by the dialogue lines attached to
    that sentence index, and finally a 40-dash separator.

    Args:
        story_json: iterable of chapter dicts; ``chapter_id``, ``title`` and
            ``plot`` are read with ``.get``.
        sentence_dialogues: iterable of dicts. An entry is used only when
            ``need_to_action == 1`` and ``dialogue`` is non-empty; it must then
            provide ``chapter_id`` and ``sentence_index``.

    Returns:
        The compiled story text.
    """
    from src.utils.utils import split_plot_into_sentences

    def fixed_format_dialogue_with_action(speaker, dialogue_text, action):
        """Format one dialogue line without duplicating the speaker name."""
        spoken = f'"{dialogue_text.strip()}" ——{speaker}\n\n'
        lead_in = action.strip() if action else ""
        if not lead_in:
            return spoken
        # Key fix: only prepend the speaker when the action lacks it.
        if not lead_in.startswith(speaker):
            lead_in = speaker + lead_in
        return f'{lead_in}，{spoken}'

    # chapter_id -> {sentence_index -> dialogue list}
    dialogue_map = {}
    for entry in sentence_dialogues:
        if entry.get("need_to_action") != 1 or not entry.get("dialogue"):
            continue
        chapter_key = entry["chapter_id"]
        sentence_key = entry["sentence_index"]
        dialogue_map.setdefault(chapter_key, {})[sentence_key] = entry["dialogue"]

    pieces = []

    for chapter in story_json:
        chapter_id = chapter.get("chapter_id", "Unknown")
        title = chapter.get("title", "Unknown")
        plot = chapter.get("plot", "").strip()

        pieces.append(f"# {chapter_id}：{title}\n\n")

        chapter_dialogues = dialogue_map.get(chapter_id, {})

        # Walk the plot sentence by sentence, inserting dialogue after its sentence.
        for sent_idx, sentence in enumerate(split_plot_into_sentences(plot)):
            pieces.append(sentence + "\n\n")

            for line in chapter_dialogues.get(sent_idx) or ():
                if isinstance(line, dict):
                    speaker = line.get("speaker", "")
                    dialogue_text = line.get("dialogue", line.get("line", ""))
                    action = line.get("action", "")
                    # Entries lacking a speaker or text are dropped silently.
                    if speaker and dialogue_text:
                        pieces.append(
                            fixed_format_dialogue_with_action(speaker, dialogue_text, action)
                        )
                elif isinstance(line, str):
                    pieces.append(line.strip() + "\n\n")

        pieces.append("-" * 40 + "\n\n")

    return "".join(pieces)

# Recompile the novel with the fixed sentence-level compiler.
print("🔧 使用修复后的编译函数重新生成小说...")

# Reload both inputs from disk (relies on `version_folder` and `json` from an earlier cell).
with open(f"{version_folder}/story.json", 'r', encoding='utf-8') as f:
    story_data = json.load(f)

with open(f"{version_folder}/sentence_dialogues.json", 'r', encoding='utf-8') as f:
    sentence_data = json.load(f)

# Compile the fixed version of the novel.
fixed_novel = compile_full_story_by_sentence_FIXED(story_data, sentence_data)

# Save it next to the original output under a separate file name.
with open(f"{version_folder}/novel_story_FIXED.md", 'w', encoding='utf-8') as f:
    f.write(fixed_novel)

print("✅ 修复版小说已生成: novel_story_FIXED.md")

# Verify the fix: count doubled names (only the four hard-coded names are detected).
duplicates_fixed = re.findall(r'小红帽小红帽|机械狼机械狼|外婆外婆|小蓝小蓝', fixed_novel)
print(f"修复后发现的重复: {len(duplicates_fixed)} 个")

if len(duplicates_fixed) == 0:
    print("🎉 角色名重复问题已完全修复！")
else:
    print(f"⚠️ 仍有 {len(duplicates_fixed)} 个重复，需要进一步调试")

# Preview the first 800 characters of the fixed output.
print(f"\n📖 修复版预览（前800字符）:")
print(fixed_novel[:800])

In [None]:
# 完善的修复函数，同时处理角色名重复和标点符号问题
def compile_full_story_by_sentence_COMPLETE_FIX(story_json, sentence_dialogues):
    """Compile chapters plus sentence-level dialogue, fixing names and punctuation.

    On top of the plain sentence-level compilation this variant
    (1) never prepends the speaker name to an action that already starts with
    it, and (2) normalizes punctuation in narration, actions and dialogue.

    Args:
        story_json: iterable of chapter dicts; ``chapter_id``, ``title`` and
            ``plot`` are read with ``.get``.
        sentence_dialogues: iterable of dicts. An entry is used only when
            ``need_to_action == 1`` and ``dialogue`` is non-empty; it must then
            provide ``chapter_id`` and ``sentence_index``.

    Returns:
        The compiled story text: per chapter a ``# <chapter_id>：<title>``
        heading, the cleaned plot sentences with their dialogue lines, and a
        40-dash separator.
    """
    from src.utils.utils import split_plot_into_sentences
    import re

    def clean_punctuation(text):
        """Convert ASCII punctuation to full-width and collapse bad combinations."""
        if not text:
            return text

        # Unify ASCII and full-width punctuation. A '.' that sits between two
        # ASCII digits is a decimal point ("3.5") and must be left alone.
        text = re.sub(r'(?<![0-9])\.|\.(?![0-9])', '。', text)
        for ascii_mark, wide_mark in ((',', '，'), ('!', '！'), ('?', '？'), (';', '；')):
            text = text.replace(ascii_mark, wide_mark)

        # Repair mixed period/comma runs. The order of these substitutions matters.
        text = re.sub(r'。，+', '，', text)  # period + commas -> comma
        text = re.sub(r'，。+', '。', text)  # comma + periods -> period
        text = re.sub(r'。+，', '，', text)  # periods + comma -> comma
        text = re.sub(r'，+。', '。', text)  # commas + period -> period

        # Collapse repeated marks.
        text = re.sub(r'，{2,}', '，', text)
        text = re.sub(r'。{2,}', '。', text)

        # Drop whitespace in front of punctuation.
        text = re.sub(r'\s+([，。！？；])', r'\1', text)

        return text.strip()

    def fixed_format_dialogue_with_action(speaker, dialogue_text, action):
        """Format one dialogue line; punctuation is cleaned with or without an action."""
        # Clean the spoken text up front so the no-action branch gets the same
        # punctuation treatment as every other piece of output.
        spoken = clean_punctuation(dialogue_text.strip())

        if not action or not action.strip():
            return f'"{spoken}" ——{speaker}\n\n'

        action_clean = action.strip()

        # Fix 1: only prepend the speaker when the action does not start with it.
        if action_clean.startswith(speaker):
            formatted_action = action_clean
        else:
            formatted_action = f'{speaker}{action_clean}'

        # Fix 2: normalize the action's punctuation.
        formatted_action = clean_punctuation(formatted_action)

        # Fix 3: the action must flow into the quote, so it ends with a comma
        # unless it already ends with ！？；. A trailing period becomes a comma.
        if formatted_action and not formatted_action.endswith(('，', '。', '！', '？', '；')):
            formatted_action += '，'
        elif formatted_action.endswith('。'):
            formatted_action = formatted_action[:-1] + '，'

        return f'{formatted_action}"{spoken}" ——{speaker}\n\n'

    # chapter_id -> {sentence_index -> dialogue list}
    dialogue_map = {}
    for item in sentence_dialogues:
        if item.get("need_to_action") == 1 and item.get("dialogue"):
            chapter_id = item["chapter_id"]
            sentence_idx = item["sentence_index"]
            dialogue_map.setdefault(chapter_id, {})[sentence_idx] = item["dialogue"]

    # Collect the pieces and join once instead of growing a string with +=.
    pieces = []

    for chapter in story_json:
        chapter_id = chapter.get("chapter_id", "Unknown")
        title = chapter.get("title", "Unknown")
        plot = chapter.get("plot", "").strip()

        pieces.append(f"# {chapter_id}：{title}\n\n")

        chapter_dialogues = dialogue_map.get(chapter_id, {})

        # Walk the plot sentence by sentence, inserting dialogue after its sentence.
        for sent_idx, sentence in enumerate(split_plot_into_sentences(plot)):
            pieces.append(clean_punctuation(sentence) + "\n\n")

            for line in chapter_dialogues.get(sent_idx) or ():
                if isinstance(line, dict):
                    speaker = line.get("speaker", "")
                    dialogue_text = line.get("dialogue", line.get("line", ""))
                    action = line.get("action", "")

                    # Entries lacking a speaker or text are dropped silently.
                    if speaker and dialogue_text:
                        pieces.append(
                            fixed_format_dialogue_with_action(speaker, dialogue_text, action)
                        )
                elif isinstance(line, str):
                    pieces.append(clean_punctuation(line) + "\n\n")

        pieces.append("-" * 40 + "\n\n")

    return "".join(pieces)

# Demonstrate the punctuation clean-up.
print("=== 测试标点修复效果 ===")

# Stand-alone copy of the punctuation cleaner, defined at top level so it can be called directly.
def clean_punctuation(text):
    """Convert ASCII punctuation to full-width and collapse bad combinations.

    Args:
        text: the string to clean; falsy values (``None``, ``""``) are
            returned unchanged.

    Returns:
        The cleaned string with surrounding whitespace stripped. ``, ! ? ;``
        become ``， ！ ？ ；``; ``.`` becomes ``。`` except when it is a decimal
        point between two ASCII digits (``3.5`` stays ``3.5``). Mixed or
        repeated period/comma runs are collapsed to a single mark, and
        whitespace in front of full-width punctuation is removed.
    """
    if not text:
        return text

    # A '.' with an ASCII digit on both sides is a decimal point; keep it.
    text = re.sub(r'(?<![0-9])\.|\.(?![0-9])', '。', text)
    for ascii_mark, wide_mark in ((',', '，'), ('!', '！'), ('?', '？'), (';', '；')):
        text = text.replace(ascii_mark, wide_mark)

    # Repair mixed period/comma runs. The order of these substitutions matters.
    text = re.sub(r'。，+', '，', text)
    text = re.sub(r'，。+', '。', text)
    text = re.sub(r'。+，', '，', text)
    text = re.sub(r'，+。', '。', text)

    # Collapse repeated marks, then drop whitespace before punctuation.
    text = re.sub(r'，{2,}', '，', text)
    text = re.sub(r'。{2,}', '。', text)
    text = re.sub(r'\s+([，。！？；])', r'\1', text)
    return text.strip()

# Sample inputs mixing ASCII and full-width punctuation, including stray trailing marks.
test_punctuation = [
    "小红帽启动飞船,输入目标坐标,开始升空前往医疗中心.，",
    "机械狼冷笑一声，.准备攻击",
    "外婆虚弱地说道...，，",
]

# Print each input next to its cleaned form for visual comparison.
for test in test_punctuation:
    fixed = clean_punctuation(test)
    print(f"原始: {test}")
    print(f"修复: {fixed}")
    print()

In [None]:
# Recompile the novel with the complete fix (speaker names + punctuation).
print("🔧 使用完整修复版（角色名+标点）重新生成小说...")

# Recompile; relies on `story_data` and `sentence_data` loaded by an earlier cell.
complete_fixed_novel = compile_full_story_by_sentence_COMPLETE_FIX(story_data, sentence_data)

# Save the result under its own file name.
with open(f"{version_folder}/novel_story_COMPLETE_FIXED.md", 'w', encoding='utf-8') as f:
    f.write(complete_fixed_novel)

print("✅ 完整修复版小说已生成: novel_story_COMPLETE_FIXED.md")

# Verify the outcome.
print("\n=== 修复效果验证 ===")

# 1. Doubled character names (only the four hard-coded names are detected).
duplicates_fixed = re.findall(r'小红帽小红帽|机械狼机械狼|外婆外婆|小蓝小蓝', complete_fixed_novel)
print(f"角色名重复: {len(duplicates_fixed)} 个")

# 2. Punctuation problems: each check records a category once, however many times it occurs.
punct_issues = []
if re.search(r'[,，][.。]', complete_fixed_novel):
    punct_issues.append("逗号+句号")
if re.search(r'[.。][,，]', complete_fixed_novel):
    punct_issues.append("句号+逗号")
if re.search(r'[,，]{2,}', complete_fixed_novel):
    punct_issues.append("重复逗号")
if re.search(r'[.。]{2,}', complete_fixed_novel):
    punct_issues.append("重复句号")

print(f"标点符号问题: {len(punct_issues)} 种")
if punct_issues:
    print(f"  问题类型: {punct_issues}")

# 3. Count dialogue lines by their closing-quote + attribution marker.
dialogue_count = complete_fixed_novel.count('" ——')
print(f"对话数量: {dialogue_count} 条")

if len(duplicates_fixed) == 0 and len(punct_issues) == 0:
    print("\n🎉 所有问题已完全修复！")
else:
    print(f"\n⚠️ 仍有问题需要处理")

# 4. Compare against the originally compiled novel_story.md.
print(f"\n📊 修复前后对比:")
with open(f"{version_folder}/novel_story.md", 'r', encoding='utf-8') as f:
    original_novel = f.read()

original_duplicates = len(re.findall(r'小红帽小红帽|机械狼机械狼|外婆外婆|小蓝小蓝', original_novel))
print(f"  角色名重复: {original_duplicates} -> {len(duplicates_fixed)}")

# 5. Preview the first 1000 characters of the fixed output.
print(f"\n📖 完整修复版预览:")
print(complete_fixed_novel[:1000])
print("...")

In [None]:
# Quick re-test inside the notebook using the project's own compiler.
import sys
sys.path.append('/Users/haha/Story')

# Re-import the compiler and JSON loader for the re-test.
from src.compile_story import compile_full_story_by_sentence
from src.utils.utils import load_json

# NOTE(review): hard-coded absolute local path; must be edited per machine.
version_folder = "/Users/haha/Story/data/output/小红帽_科幻_linear_T0.7_s1"

story_data = load_json(f"{version_folder}/story.json")
sentence_data = load_json(f"{version_folder}/sentence_dialogues.json")

# Compile with the project function.
# presumably src/compile_story.py has been patched by now; an already-imported module is not reloaded by this import — verify.
test_novel = compile_full_story_by_sentence(story_data, sentence_data)

# Check whether doubled names remain (only the four hard-coded names are detected).
import re
duplicates = re.findall(r'小红帽小红帽|机械狼机械狼|外婆外婆|小蓝小蓝', test_novel)
print(f"角色名重复: {len(duplicates)} 个")

# Count dialogue lines by their closing-quote + attribution marker.
dialogue_count = test_novel.count('" ——')
print(f"对话数量: {dialogue_count} 条")

if len(duplicates) == 0:
    print("🎉 修复成功！")
    # Save the test output only when no doubled name was found.
    with open(f"{version_folder}/novel_story_FINAL_TEST.md", 'w', encoding='utf-8') as f:
        f.write(test_novel)
else:
    print("⚠️ 还有问题需要检查")

In [None]:
# Run parameters for the pipeline cells below.
topic = "小红帽"
style = "科幻改写"
reorder_mode = "linear"  # the reordering cell accepts "linear" or "nonlinear"
use_cache = False  # when True, later cells load previously saved artifacts instead of regenerating
behavior_model = "gpt-4.1"
temperature = 0.7
seed = 1

# Echo the parameters (use_cache and behavior_model are not printed).
print(f"📋 测试参数:")
print(f"   Topic: {topic}")
print(f"   Style: {style}")
print(f"   Reorder mode: {reorder_mode}")
print(f"   Temperature: {temperature}")
print(f"   Seed: {seed}")

In [None]:
# Build the version name from the run parameters; it is used below as the output sub-folder name.
version = build_version_name(
    topic=topic,
    style=style,
    temperature=temperature,
    seed=seed,
    order_mode=reorder_mode
)

print(f"🏷️ 生成版本名: {version}")

# 创建输出文件夹
def ensure_output_dir(version):
    """Create the output folder for ``version`` if needed and return its path.

    The folder is ``output_dir/<version>``; an existing folder is not an error.
    """
    version_dir = os.path.join(output_dir, version)
    os.makedirs(version_dir, exist_ok=True)
    return version_dir

def step_file(version, filename):
    """Return the path of ``filename`` inside the output folder of ``version``.

    Nothing is created on disk; this only joins path components.
    """
    version_dir = os.path.join(output_dir, version)
    return os.path.join(version_dir, filename)

# Create the version's output folder and start with an empty role-state map.
folder = ensure_output_dir(version)
role_state = {}

# One log path each for plot and dialogue generation records.
plot_log_path = init_log_path(folder, "plot")
dialogue_log_path = init_log_path(folder, "dialogue")

print(f"📁 输出文件夹: {folder}")
print(f"📝 日志路径: plot={plot_log_path}, dialogue={dialogue_log_path}")

# Step 1 - Outline Generation

In [None]:
print("=== Step 1: Outline Generation ===")

# Mirrors the outline-generation logic of main_pipeline.py.
# The cache file name is built once and used for both the existence check and
# the save; previously the checked path lacked the "_" before "outline.json",
# so it never matched the saved file and the cache was never hit.
outline_filename = f"{topic}_{style}_T{temperature}_s{seed}_outline.json"
outline_base_path = os.path.join(output_dir, "reference_outline", outline_filename)
os.makedirs(os.path.dirname(outline_base_path), exist_ok=True)

if os.path.exists(outline_base_path) and use_cache:
    outline = load_json(outline_base_path)
    print(f"📖 已加载共享 outline：{outline_base_path}")
else:
    print(f"🔄 生成新的outline...")
    outline = generate_outline(topic=topic, style=style, custom_instruction="")
    # presumably save_json(data, subdir, name) writes to output_dir/subdir/name,
    # i.e. exactly outline_base_path — verify against src.utils.utils.
    save_json(outline, "reference_outline", outline_filename)
    print(f"💾 生成并保存共享 outline：{outline_base_path}")

print(f"✅ Outline生成完成")
print(f"   章节数: {len(outline)}")
# Show each chapter with the first 60 characters of its summary.
for i, ch in enumerate(outline):
    print(f"   {i+1}. {ch.get('chapter_id', 'Unknown')}: {ch.get('title', 'Unknown')}")
    print(f"      Summary: {ch.get('summary', '')[:60]}...")

# Step 2 - Chapter Reordering

In [None]:
print("=== Step 2: Chapter Reordering ===")

# Mirrors the chapter-reordering logic of main_pipeline.py.
if reorder_mode == "linear":
    # Linear mode keeps the outline order as is.
    reorder_outline_raw = outline
    save_json(outline, version, "test_outline.json")
    print("✅ 使用 linear 顺序（直接来自 outline）")

elif reorder_mode == "nonlinear":
    save_json(outline, version, "test_outline_linear.json")
    reorder_filename = f"{topic}_{style}_T{temperature}_s{seed}_nonlinear.json"
    reorder_path = os.path.join(output_dir, "reference_reorder", reorder_filename)
    os.makedirs(os.path.dirname(reorder_path), exist_ok=True)

    if os.path.exists(reorder_path) and use_cache:
        reorder_outline_raw = load_json(reorder_path)
        print(f"📖 已加载 cached nonlinear 顺序：{reorder_path}")
    else:
        print(f"🔄 生成nonlinear顺序...")
        reorder_outline_raw = reorder_chapters(outline, mode="nonlinear")

        # Log the raw result, including a failed one, for later inspection.
        reorder_log_path = init_log_path(folder, "reorder")
        reorder_log = build_simple_log(
            module="chapter_reorder",
            task_name=version,
            input_data={"outline": outline},
            output_data={"reorder_result": reorder_outline_raw}
        )
        append_log(reorder_log_path, reorder_log)

        # A result without any new_order field counts as a failed reorder.
        if not any("new_order" in ch for ch in reorder_outline_raw):
            print("⚠️ LLM 重排失败：未检测到任何 new_order 字段，回退为原始顺序")
            # Actually fall back, as the message promises, and do not cache the
            # failed result so a later cached run cannot silently reuse it.
            reorder_outline_raw = outline
        else:
            print("✅ reorder_chapters 成功生成非线性顺序")
            # presumably save_json(data, subdir, name) writes to output_dir/subdir/name,
            # i.e. exactly reorder_path — verify against src.utils.utils.
            save_json(reorder_outline_raw, "reference_reorder", reorder_filename)
            print(f"💾 生成 nonlinear 顺序并缓存：{reorder_path}")

else:
    # The notebook variable is reorder_mode, so the message names that.
    raise ValueError("reorder_mode 必须为 'linear' 或 'nonlinear'")

# Normalize the structure: take chapter_id/title (and new_order, if any) from the
# reordered list and the summary from the matching outline chapter.
# Chapters whose chapter_id is not found in the outline are skipped.
reorder_outline = []
for reordered_ch in reorder_outline_raw:
    match = next((x for x in outline if x["chapter_id"] == reordered_ch["chapter_id"]), None)
    if match:
        merged = {
            "chapter_id": reordered_ch["chapter_id"],
            "title": reordered_ch["title"],
            "summary": match.get("summary", "")
        }
        if "new_order" in reordered_ch:
            merged["new_order"] = reordered_ch["new_order"]
        reorder_outline.append(merged)

save_json(reorder_outline, version, "test_reorder_outline.json")
print("✅ 章节顺序处理完成（已保留 summary）")

# Show the final chapter order.
print(f"📋 最终章节顺序:")
for i, ch in enumerate(reorder_outline):
    order_info = f" (原顺序: {ch.get('new_order', i+1)})" if 'new_order' in ch else ""
    print(f"   {i+1}. {ch['chapter_id']}: {ch['title']}{order_info}")

# Step 3 - Character Generation

In [None]:
print("=== Step 3: Character Generation ===")

# Load cached characters when allowed and present, otherwise generate and save them.
character_path = step_file(version, "characters.json")
if use_cache and os.path.exists(character_path):
    characters = load_json(character_path)
    print("📖 已加载角色设定")
else:
    print(f"🔄 生成角色设定...")
    characters = generate_characters_v1(reorder_outline)
    save_json(characters, version, "characters.json")
    print("💾 生成角色设定完成")

print(f"✅ 角色生成完成")
print(f"   角色数: {len(characters)}")
# Show each character with the first 50 items of its traits and background.
# assumes traits/background are strings (the slice also works on lists) — TODO confirm
for i, char in enumerate(characters):
    print(f"   {i+1}. {char.get('name', 'Unknown')}")
    print(f"      特征: {char.get('traits', 'Unknown')[:50]}...")
    print(f"      背景: {char.get('background', 'Unknown')[:50]}...")

# Step 4 - Story Expansion

In [None]:
print("=== Step 4: Story Expansion ===")

# Load the cached story when allowed and present, otherwise expand the outline.
plot_path = step_file(version, "story.json")
if use_cache and os.path.exists(plot_path):
    story = load_json(plot_path)
    print("📖 已加载故事内容")
else:
    print(f"🔄 生成故事内容...")
    story = expand_story_v1(reorder_outline, characters, custom_instruction="")
    
    # Fill in missing chapter_id/title from the outline entry at the same position.
    # NOTE(review): indexing reorder_outline[idx] assumes story is not longer than reorder_outline.
    for idx, ch in enumerate(story):
        ch.setdefault("chapter_id", reorder_outline[idx]["chapter_id"])
        ch.setdefault("title", reorder_outline[idx]["title"])

        # Append one log record per generated chapter (not done when loading from cache).
        log = build_log_record(
            module="expand_story", step="plot",
            task_name=version, chapter_id=ch["chapter_id"],
            model=behavior_model, 
            input_data={"outline": reorder_outline[idx]},
            output_data={"plot": ch["plot"]},
            temperature=temperature, 
            seed=seed
        )
        append_log(plot_log_path, log)

    save_json(story, version, "story.json")
    print("💾 故事内容生成完成")

print(f"✅ 故事生成完成")
print(f"   章节数: {len(story)}")
# Show per-chapter lengths and the first 80 characters of each plot.
for i, ch in enumerate(story):
    print(f"   {i+1}. {ch.get('chapter_id', 'Unknown')}: {ch.get('title', 'Unknown')}")
    print(f"      Plot长度: {len(ch.get('plot', ''))} 字符")
    print(f"      Scene长度: {len(ch.get('scene', ''))} 字符")
    print(f"      Plot预览: {ch.get('plot', '')[:80]}...")

# Step 5&6 - 新版对话生成（关键测试）

In [None]:
print("=== Step 5&6: 新版对话生成（句子级分析 + 章节级兼容） ===")

print(f"🔄 开始分析 {len(story)} 个章节...")

try:
    # Call the v2 analyzer; it returns chapter-level results, sentence-level
    # results and the behavior timeline.
    chapter_results, sentence_results, behavior_timeline = analyze_dialogue_insertions_v2(story, characters)

    print(f"✅ 对话分析完成！")
    print(f"   Chapter results: {len(chapter_results)} 个章节")
    print(f"   Sentence results: {len(sentence_results)} 个句子")
    print(f"   Behavior timeline: {len(behavior_timeline)} 条记录")

    # Save all three data formats.
    save_json(chapter_results, version, "dialogue_marks.json")        # chapter-level (compatibility format)
    save_json(sentence_results, version, "sentence_dialogues.json")    # detailed sentence-level analysis
    save_json(behavior_timeline, version, "behavior_timeline_raw.json")  # raw behavior data

    # Later cells read dialogue_result, so alias the chapter-level results.
    dialogue_result = chapter_results
    print("💾 三种格式数据已保存")

    # Quick statistics: per chapter, total sentences and sentences flagged need_to_action == 1.
    print(f"\n📊 句子分析统计:")
    chapter_stats = {}
    dialogue_count = 0

    for sentence in sentence_results:
        chapter_id = sentence['chapter_id']
        if chapter_id not in chapter_stats:
            chapter_stats[chapter_id] = {'total': 0, 'dialogue': 0}
        chapter_stats[chapter_id]['total'] += 1
        if sentence['need_to_action'] == 1:
            chapter_stats[chapter_id]['dialogue'] += 1
            dialogue_count += 1

    print(f"   总句子数: {len(sentence_results)}")
    print(f"   需要对话的句子: {dialogue_count}")

    for chapter_id, stats in chapter_stats.items():
        print(f"   {chapter_id}: {stats['total']} 句，{stats['dialogue']} 句需对话")

    # Behavior statistics: number of timeline records per character.
    if behavior_timeline:
        behavior_chars = {}
        for behavior in behavior_timeline:
            char = behavior['character']
            if char not in behavior_chars:
                behavior_chars[char] = 0
            behavior_chars[char] += 1

        print(f"\n👥 Behavior统计:")
        for char, count in behavior_chars.items():
            print(f"   {char}: {count} 个行为")

    print(f"✅ 新版对话生成完成")

except Exception as e:
    print(f"❌ 对话生成失败: {e}")
    # Stop here. Every later cell needs chapter_results / sentence_results /
    # behavior_timeline; swallowing the error would let "Run All" continue with
    # missing names or with stale values from a previous run. Re-raising also
    # shows the full traceback.
    raise

In [None]:
# Added after Step 5&6: verify the sentence-level dialogue file that was just saved.
print("\n=== 验证句子级对话数据 ===")

# Check the saved file inside this version's output folder.
sentence_dialogues_path = os.path.join(folder, "sentence_dialogues.json")
if os.path.exists(sentence_dialogues_path):
    saved_sentence_data = load_json(sentence_dialogues_path)
    print(f"✅ sentence_dialogues.json 存在，包含 {len(saved_sentence_data)} 个句子")
    
    # Count sentences that have at least one dialogue entry with a non-empty action.
    dialogues_with_action = 0
    for sentence in saved_sentence_data:
        if sentence.get("dialogue"):
            for d in sentence["dialogue"]:
                if d.get("action"):
                    dialogues_with_action += 1
                    print(f"   找到action: {d['speaker']}: {d['action']}")
                    break  # count and print only the first action of each sentence
    
    print(f"📊 有 {dialogues_with_action} 个句子包含带action的对话")
else:
    print("❌ sentence_dialogues.json 不存在！")

# Step 6（续）- 检查长度匹配和记录日志

In [None]:
print("=== 数据一致性检查和日志记录 ===")

# Check that there is one chapter-level dialogue record per story chapter.
if len(story) != len(dialogue_result):
    print(f"⚠️ 警告：story 有 {len(story)} 章，但 dialogue_result 只有 {len(dialogue_result)} 条对白")
else:
    print(f"✅ 数据长度匹配：{len(story)} 章节")

# Append one dialogue log record per (chapter, dialogue) pair.
# zip stops at the shorter list, so on a length mismatch the surplus items are not logged.
print(f"📝 记录对话生成日志...")
for ch, dlg in zip(story, dialogue_result):
    log = build_log_record(
        module="dialogue_inserter", step="dialogue",
        task_name=version, chapter_id=ch["chapter_id"],
        model=behavior_model,
        input_data={"plot": ch["plot"]},
        output_data={"dialogue": dlg["dialogue"]},
        temperature=temperature, seed=seed
    )
    append_log(dialogue_log_path, log)

print(f"✅ 日志记录完成")

# Step 6.5 - 新版Behavior处理

In [None]:
print("=== Step 6.5: 新版Behavior处理 ===")

# Group the behavior timeline by character into per-character arcs.
character_arcs = {}
for item in behavior_timeline:
    char = item["character"]
    if char not in character_arcs:
        character_arcs[char] = []
    character_arcs[char].append({
        "chapter": item["chapter_id"],
        "sentence": item["sentence_index"],
        "behavior": item["behavior"],
        # scene context is truncated to 30 characters plus "..." when longer
        "scene": item["scene_context"][:30] + "..." if len(item["scene_context"]) > 30 else item["scene_context"]
    })

# Assemble the full behavior trace: raw timeline, arcs, counts and a flat "character：behavior" list.
behavior_trace = {
    "timeline": behavior_timeline,
    "character_arcs": character_arcs,
    "statistics": {
        "total_dialogue_moments": len(behavior_timeline),
        "characters_behavior_count": {char: len(arcs) for char, arcs in character_arcs.items()}
    },
    "legacy_behaviors": [f"{item['character']}：{item['behavior']}" for item in behavior_timeline]
}

save_json(behavior_trace, version, "behavior_trace.json")

# Rebuild role_state (replacing the empty dict created earlier):
# character -> list of distinct behaviors in first-seen order.
role_state = {}
for item in behavior_timeline:
    role = item["character"]
    behavior_item = item["behavior"]
    role_state.setdefault(role, [])
    if behavior_item not in role_state[role]:
        role_state[role].append(behavior_item)

print(f"✅ 新版behavior trace生成完成")
print(f"📊 角色弧线统计:")
for char, arcs in character_arcs.items():
    print(f"   {char}: {len(arcs)} 个行为节点")

# Show the first 3 behaviors of every character; sentence numbers are printed 1-based.
print(f"\n👤 角色弧线示例（每个角色前3个行为）:")
for char, arcs in character_arcs.items():
    print(f"   {char}:")
    for i, arc in enumerate(arcs[:3]):
        print(f"      {i+1}. {arc['chapter']}第{arc['sentence']+1}句: {arc['behavior']}")
    if len(arcs) > 3:
        print(f"      ... 还有 {len(arcs)-3} 个行为")

# Step 6.7 - 联动机制

In [None]:
# Step 6.7 - plot/dialogue synchronization
print("=== Step 6.7: 联动机制 ===")

print(f"🔄 运行plot和dialogue联动机制...")
try:
    # Synchronize using the chapter-level data; on success `story` is rebound to the returned value.
    story, chapter_results_updated, revision_log = sync_plot_and_dialogue_from_behavior(
        story, chapter_results, characters, model=behavior_model)
    
    print(f"✅ 联动机制完成")
    print(f"   修订记录数: {len(revision_log) if revision_log else 0}")
    
except Exception as e:
    # Best-effort step: on failure keep the un-synced chapter results and an empty revision log
    # so the following cells still have the names they read.
    print(f"⚠️ 联动机制出错: {e}")
    chapter_results_updated = chapter_results  # fallback so later cells can use this name
    revision_log = []

# Step 7 - 保存所有输出

In [None]:
# Step 7 - save all outputs
print("=== Step 7: 保存所有输出 ===")

# Save the core data.
save_json(role_state, version, "role_state.json")
save_json(story, version, "story_updated.json")
save_json(sentence_results, version, "dialogue_updated.json")  # sentence-level data is saved under this name
save_json(revision_log, version, "revision_log.json")

print(f"💾 核心数据已保存")

# Generate the novel files.
print(f"📚 生成小说文件...")

# Chapter-level compilation (kept for comparison).
compiled_story = compile_full_story_by_chapter(story, chapter_results_updated)
save_md(compiled_story, os.path.join(folder, "novel_story_chapter.md"))
print(f"   ✅ novel_story_chapter.md 已生成（章节级）")

# Sentence-level compilation (main output).
# NOTE(review): `story` may have been revised by Step 6.7 while `sentence_results` predates it —
# confirm the sentence indices still line up with the updated plots.
compiled_updated = compile_full_story_by_sentence(story, sentence_results)
save_md(compiled_updated, os.path.join(folder, "novel_story.md"))  # saved as the main file
print(f"   ✅ novel_story.md 已生成（句子级精确）")

In [None]:
# Added after the Step 7 compilation: verify the compiled result.
print("\n=== 验证编译结果 ===")

# Read back novel_story.md from this version's output folder.
novel_path = os.path.join(folder, "novel_story.md")
if os.path.exists(novel_path):
    with open(novel_path, 'r', encoding='utf-8') as f:
        novel_content = f.read()

    # Look for dialogue lines that carry an action.
    import re
    # Matches: <speaker><action>，"<dialogue>" ——<speaker>
    # The action part excludes newlines so a match cannot start in an earlier
    # paragraph and swallow unrelated text; the backreference \1 requires the
    # leading word to equal the attributed speaker.
    action_pattern = r'(\w+)([^，\n]+)，"([^"]+)" ——\1'
    action_matches = re.findall(action_pattern, novel_content)

    print(f"✅ 找到 {len(action_matches)} 个带action的对话")
    if action_matches:
        print("示例：")
        for i, (speaker, action, dialogue) in enumerate(action_matches[:3]):
            print(f"   {i+1}. {speaker}{action}，\"{dialogue}\" ——{speaker}")

# Step 8 - 增强处理

In [None]:
print("=== Step 8: 增强处理 ===")

# Each enhancement below is best-effort: a failure is printed and the next step still runs.
print(f"🔄 运行故事增强...")
try:
    enhance_story_with_transitions(task_name=version, input_story_file="story_updated.json")
    print(f"   ✅ 故事过渡增强完成")
except Exception as e:
    print(f"   ⚠️ 故事增强失败: {e}")

print(f"🔄 运行对话润色...")
try:
    polish_dialogues_in_story(task_name=version, input_dialogue_file="dialogue_updated.json")
    print(f"   ✅ 对话润色完成")
except Exception as e:
    print(f"   ⚠️ 对话润色失败: {e}")

print(f"🔄 运行角色状态追踪...")
try:
    run_character_state_tracker(version=version, dialogue_file="dialogue_updated.json", model=behavior_model)
    print(f"   ✅ 角色状态追踪完成")
except Exception as e:
    print(f"   ⚠️ 角色状态追踪失败: {e}")

# Printed unconditionally, even when one or more of the steps above failed.
print(f"✅ 增强处理完成")

In [None]:
# Added after Step 8: verify the enhanced version.
print("\n=== 验证增强版本 ===")

# presumably written by the Step 8 enhancement functions — verify the file name against src.enhance_story
enhanced_path = os.path.join(folder, "enhanced_story_dialogue_updated.md")
if os.path.exists(enhanced_path):
    with open(enhanced_path, 'r', encoding='utf-8') as f:
        enhanced_content = f.read()
    
    # Count dialogue lines still in the raw `"<dialogue>" ——<speaker>` format.
    original_format_count = enhanced_content.count('" ——')
    print(f"📊 增强版本中原始格式对话数: {original_format_count}")
    
    if original_format_count == 0:
        print("✅ 所有对话都已自然化！")
    else:
        print("⚠️ 还有部分对话未被润色")
    
    # Preview the first 800 characters of the enhanced output.
    print("\n📖 增强版本预览（前800字符）：")
    print(enhanced_content[:800])

In [None]:
def debug_action_flow(version):
    """Trace dialogue "action" data through the output files of one version.

    Looks at three files under ``os.path.join(output_dir, version)``:

    1. ``dialogue_updated.json`` - counts dialogue entries with a truthy
       ``action`` field and prints up to three examples.
    2. ``novel_story.md`` - counts spans shaped like
       ``<text>，"<dialogue>" ——<speaker>`` and prints, among the first three,
       those whose leading text contains the speaker name.
    3. ``enhanced_story_dialogue_updated.md`` - only reports that it exists.

    Args:
        version: Name of the version sub-folder inside ``output_dir``.

    Returns:
        bool: True when at least one action entry was found in
        ``dialogue_updated.json``; False otherwise, including when that file
        does not exist.
    """
    import re

    print("🔍 调试ACTION信息流程")
    
    folder = os.path.join(output_dir, version)
    
    # Bound before the existence check so the final `return` is valid even
    # when dialogue_updated.json is missing (previously an UnboundLocalError).
    action_count = 0
    action_examples = []
    
    # 1. Inspect dialogue_updated.json
    dialogue_path = os.path.join(folder, "dialogue_updated.json")
    if os.path.exists(dialogue_path):
        dialogue_data = load_json(dialogue_path)
        
        for sentence in dialogue_data:
            if sentence.get("dialogue"):
                for d in sentence["dialogue"]:
                    if d.get("action"):
                        action_count += 1
                        # Keep only the first three hits as printable examples.
                        if len(action_examples) < 3:
                            action_examples.append({
                                "chapter": sentence["chapter_id"],
                                "sentence": sentence["sentence_index"],
                                "speaker": d["speaker"],
                                "action": d["action"],
                                "dialogue": d["dialogue"]
                            })
        
        print(f"\n📊 Action统计：")
        print(f"   总action数: {action_count}")
        print(f"\n🎯 Action示例：")
        for ex in action_examples:
            # sentence_index is shown 1-based.
            print(f"   {ex['chapter']} 句{ex['sentence']+1}:")
            print(f"   {ex['speaker']}: {ex['action']}")
            print(f'   对话: "{ex["dialogue"]}"')  # single-quoted f-string so the inner double quotes are legal
            print()
    
    # 2. Inspect action-style lines in novel_story.md
    novel_path = os.path.join(folder, "novel_story.md")
    if os.path.exists(novel_path):
        with open(novel_path, 'r', encoding='utf-8') as f:
            novel = f.read()
        
        # Text up to a full-width comma, a double-quoted dialogue, then ` ——speaker`.
        pattern = r'([^，\n]+)，"([^"]+)" ——(\w+)'
        matches = re.findall(pattern, novel)
        
        print(f"\n📖 小说中的Action格式：")
        print(f"   找到 {len(matches)} 个可能的action格式")
        if matches:
            print(f"\n示例：")
            for i, (action_part, dialogue, speaker) in enumerate(matches[:3]):
                # Only show matches whose leading text mentions the speaker.
                if speaker in action_part:
                    print(f'   {i+1}. {action_part}，"{dialogue}" ——{speaker}')
    
    # 3. Report whether the enhanced version exists
    enhanced_path = os.path.join(folder, "enhanced_story_dialogue_updated.md")
    if os.path.exists(enhanced_path):
        print(f"\n📚 增强版本存在：{enhanced_path}")
        print(f"   可以检查润色效果")
    
    return action_count > 0

# Run the action-flow check for the current `version` and report the verdict.
if debug_action_flow(version):
    print("\n✅ Action信息流程正常！")
else:
    print("\n❌ 未检测到Action信息")

# 最终结果检查

In [None]:
print("=== 最终结果检查 ===")

# Files checked for presence in the output folder.
# NOTE(review): `folder`, `version`, `story`, `characters`, `sentence_results`
# and `behavior_timeline` must already exist in the kernel.
output_files = [
    "story.json", "characters.json", "dialogue_marks.json",
    "sentence_analysis.json", "behavior_timeline_raw.json", "behavior_trace.json",
    "story_updated.json", "dialogue_updated.json", "role_state.json",
    "novel_story.md", "novel_story_updated.md"
]

print(f"📁 检查输出文件:")
for filename in output_files:
    filepath = os.path.join(folder, filename)
    if os.path.exists(filepath):
        size = os.path.getsize(filepath)
        print(f"   ✅ {filename}: {size} bytes")
    else:
        print(f"   ❌ {filename}: 缺失")

# Key statistics of the run.
print(f"\n📊 最终统计:")
print(f"   版本名称: {version}")
print(f"   输出目录: {folder}")
print(f"   故事章节数: {len(story)}")
print(f"   角色数量: {len(characters)}")
print(f"   句子分析数量: {len(sentence_results)}")
print(f"   行为记录数量: {len(behavior_timeline)}")

print(f"\n🎉 完整流程执行完毕！")
print(f"📂 所有文件保存在: {folder}")

# Preview the generated novel (first 500 characters).
try:
    with open(os.path.join(folder, "novel_story.md"), 'r', encoding='utf-8') as f:
        novel_content = f.read()
    print(f"\n📖 生成小说预览（前500字符）:")
    print(f"{novel_content[:500]}...")
except (OSError, UnicodeDecodeError):
    # Narrowed from a bare `except:` so that KeyboardInterrupt/SystemExit and
    # programming errors are no longer swallowed; only a missing, unreadable
    # or undecodable file produces the message below.
    print(f"❌ 无法读取生成的小说文件")

In [None]:
# Debugging cell: run inside the notebook after `sentence_results` was built.

print("🔍 调试sentence_results数据...")

# 1. Inspect the structure of sentence_results.
# `locals()` at cell level is the notebook namespace, so this doubles as an
# "was the earlier cell executed?" guard.
if 'sentence_results' in locals() and sentence_results:
    print(f"✅ sentence_results存在，数量: {len(sentence_results)}")
    
    # Show the structure of the first few sentences.
    print(f"\n📊 前3个句子的结构:")
    for i, sentence in enumerate(sentence_results[:3]):
        print(f"句子{i+1}:")
        print(f"  章节: {sentence.get('chapter_id', 'Unknown')}")
        print(f"  句子索引: {sentence.get('sentence_index', 'Unknown')}")
        print(f"  需要对话: {sentence.get('need_to_action', 0)}")
        print(f"  演员: {sentence.get('actor_list', [])}")
        print(f"  有dialogue字段: {'dialogue' in sentence}")
        
        if sentence.get('dialogue'):
            print(f"  对话数量: {len(sentence['dialogue'])}")
            if sentence['dialogue']:
                print(f"  第一个对话: {sentence['dialogue'][0]}")
        else:
            print(f"  对话: 空")
        print()
    
    # Sentences flagged need_to_action == 1 that also carry dialogue data.
    dialogue_sentences = [s for s in sentence_results if s.get('need_to_action') == 1 and s.get('dialogue')]
    print(f"📈 统计:")
    print(f"  总句子数: {len(sentence_results)}")
    print(f"  需要对话的句子: {len([s for s in sentence_results if s.get('need_to_action') == 1])}")
    print(f"  实际有对话数据的句子: {len(dialogue_sentences)}")
    
    if dialogue_sentences:
        print(f"\n✅ 有对话的句子示例:")
        sample = dialogue_sentences[0]
        print(f"  章节: {sample['chapter_id']}")
        print(f"  句子索引: {sample['sentence_index']}")
        print(f"  句子内容: {sample['sentence'][:50]}...")
        print(f"  对话数量: {len(sample['dialogue'])}")
        print(f"  对话示例: {sample['dialogue'][0] if sample['dialogue'] else 'None'}")
    else:
        print(f"❌ 没有找到有对话数据的句子！")
        
else:
    print(f"❌ sentence_results不存在或为空")

# 2. Check the chapter -> sentence -> dialogue mapping.
print(f"\n🔍 测试对话映射逻辑...")

if 'sentence_results' in locals() and sentence_results:
    # Build {chapter_id: {sentence_index: dialogue}} from the flagged sentences.
    # NOTE(review): meant to mirror compile_full_story_by_sentence, whose code
    # is not visible here — confirm the two stay in sync.
    dialogue_map = {}
    for item in sentence_results:
        if item.get("need_to_action") == 1 and item.get("dialogue"):
            chapter_id = item["chapter_id"]
            sentence_idx = item["sentence_index"]
            
            if chapter_id not in dialogue_map:
                dialogue_map[chapter_id] = {}
            dialogue_map[chapter_id][sentence_idx] = item["dialogue"]
    
    print(f"📋 对话映射结果:")
    for chapter_id, chapter_dialogues in dialogue_map.items():
        print(f"  {chapter_id}: {len(chapter_dialogues)} 个句子有对话")
        for sent_idx, dialogues in list(chapter_dialogues.items())[:2]:  # show only the first two
            print(f"    句子{sent_idx}: {len(dialogues)} 条对话")
    
    if not dialogue_map:
        print(f"❌ 对话映射为空！这就是问题所在")
    else:
        print(f"✅ 对话映射正常")

In [None]:

# ===============================================
# 方案1: 重新分配对话到句子级（在notebook中运行）
# ===============================================

def redistribute_dialogues_to_sentences(story, chapter_results, sentence_results):
    """Spread each chapter's dialogue list over that chapter's sentences.

    Chapters are paired by position: ``chapter_results[i]`` belongs to
    ``story[i]``. Within a chapter, the sentences flagged
    ``need_to_action == 1`` receive consecutive slices of the chapter's
    ``dialogue`` list, as evenly as possible (earlier sentences get the
    extra entry when the split is uneven). Every other sentence gets an
    empty list.

    The sentence dicts are modified in place (their ``dialogue`` key is
    overwritten) and the same objects are returned.

    Args:
        story: List of chapter dicts; ``chapter_id`` is read from each.
        chapter_results: List of per-chapter dicts; ``dialogue`` is read from each.
        sentence_results: List of sentence dicts with at least ``chapter_id``.

    Returns:
        list: The sentence dicts of the chapters covered by
        ``chapter_results``, in chapter order.
    """
    print("🔄 重新分配对话到句子级...")

    # Bucket the sentences by their chapter, preserving input order.
    by_chapter = {}
    for entry in sentence_results:
        by_chapter.setdefault(entry['chapter_id'], []).append(entry)

    redistributed = []

    for position, result in enumerate(chapter_results):
        chapter_key = story[position].get('chapter_id', f'Chapter {position+1}')
        pool = result.get('dialogue', [])
        members = by_chapter.get(chapter_key, [])
        wanting = sum(1 for member in members if member.get('need_to_action') == 1)

        print(f"  {chapter_key}: {len(pool)} 条对话，{wanting} 个句子需要对话")

        # Nothing to hand out: blank every sentence of this chapter.
        if not (pool and wanting):
            for member in members:
                member['dialogue'] = []
            redistributed.extend(members)
            continue

        base, extra = divmod(len(pool), wanting)
        cursor = 0
        for member in members:
            if member.get('need_to_action') != 1 or cursor >= len(pool):
                member['dialogue'] = []
            else:
                # The first `extra` eligible sentences take one more entry.
                take = base
                if extra > 0:
                    take += 1
                    extra -= 1

                share = pool[cursor:cursor + take]
                member['dialogue'] = share
                cursor += take

                print(f"    句子{member['sentence_index']}: 分配了 {len(share)} 条对话")

            redistributed.append(member)

    return redistributed

# Run the redistribution, but only when both inputs exist in the notebook namespace.
# NOTE(review): `story`, `version` and `folder` must also exist in the kernel.
if 'sentence_results' in locals() and 'chapter_results' in locals():
    print("🔧 开始重新分配对话...")
    fixed_sentence_results = redistribute_dialogues_to_sentences(story, chapter_results, sentence_results)
    
    # Verify: count sentences that ended up with dialogue data.
    dialogue_count = len([s for s in fixed_sentence_results if s.get('dialogue')])
    print(f"✅ 重新分配完成，{dialogue_count} 个句子有对话数据")
    
    # Persist the corrected sentence data.
    save_json(fixed_sentence_results, version, "sentence_dialogues_fixed.json")
    
    # Recompile the novel from the corrected data.
    print("📖 使用修正后的数据重新编译小说...")
    compiled_fixed = compile_full_story_by_sentence(story, fixed_sentence_results)
    save_md(compiled_fixed, os.path.join(folder, "novel_story_SENTENCE_FIXED.md"))
    
    # Check the result: count occurrences of the `" ——` dialogue marker.
    dialogue_in_novel = compiled_fixed.count('" ——')
    print(f"✅ 修正版小说生成完成，检测到 {dialogue_in_novel} 条对话")
    
    if dialogue_in_novel > 0:
        print("🎉 成功！对话已按句子精确插入")
        # Preview (first 800 characters).
        print(f"\n📖 预览（前800字符）:")
        print(compiled_fixed[:800])
    else:
        print("❌ 仍然没有对话，需要进一步调试")


In [None]:
import json

# Read the story.json file.
with open("/Users/haha/Story/data/output/小红帽_科幻_linear_T0.7_s1/story.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Collect the plot of every entry that has one.
plots = [item["plot"] for item in data if "plot" in item]

# Print to the screen; numbering is 1-based over the collected plots.
for i, plot in enumerate(plots, 1):
    print(f"Chapter {i}:\n{plot}\n{'-'*50}\n")

# Also write the same listing to plots.txt (path is relative to the current working directory).
with open("plots.txt", "w", encoding="utf-8") as f:
    for i, plot in enumerate(plots, 1):
        f.write(f"Chapter {i}:\n{plot}\n{'-'*50}\n")


In [2]:
# ========== Cell 1: 加载数据并分析现状 ==========
import json
from collections import defaultdict

version_folder = "/Users/haha/Story/data/output/小红帽_科幻_linear_T0.7_s1"

# Load the existing output files.
with open(f"{version_folder}/behavior_trace.json", 'r', encoding='utf-8') as f:
    behavior_data = json.load(f)

with open(f"{version_folder}/role_state.json", 'r', encoding='utf-8') as f:
    current_role_state = json.load(f)

print("=== behavior_trace.json 分析 ===")
# `timeline` is reused as a notebook global by later cells.
timeline = behavior_data.get("timeline", [])
print(f"timeline 记录数: {len(timeline)}")

# Show the first few timeline entries.
for i, item in enumerate(timeline[:5]):
    print(f"{i}: {item['chapter_id']}, {item['character']}: {item['behavior']}")

print(f"\n=== role_state.json 分析 ===")
print(f"章节数: {len(current_role_state)}")
print("章节列表:", list(current_role_state.keys())[:10], "...")

=== behavior_trace.json 分析 ===
timeline 记录数: 311
0: Chapter 1, 小红帽: 谨慎
1: Chapter 1, 小红帽: 服从
2: Chapter 1, 小红帽: 积极准备
3: Chapter 1, 殖民星球管理员（林博士）: 冷静
4: Chapter 1, 殖民星球管理员（林博士）: 指导

=== role_state.json 分析 ===
章节数: 31
章节列表: ['Chapter 3', 'Chapter 5', 'Chapter 10', 'Chapter 14', 'Chapter 15', 'Chapter 20', 'Chapter 21', 'Chapter 22', 'Chapter 23', 'Chapter 24'] ...


In [3]:
# ========== Cell 2: check whether the chapter IDs are correct ==========
# Collect the chapter IDs actually present in behavior_trace and in role_state.
chapters_in_behavior = set(item['chapter_id'] for item in timeline)
chapters_in_role_state = set(current_role_state.keys())

# NOTE: sorted() on these strings is lexical, so "Chapter 10" prints before "Chapter 3".
print("behavior_trace 中的章节ID:")
for ch in sorted(chapters_in_behavior):
    print(f"  {ch}")

print(f"\nrole_state 中的章节ID:")
for ch in sorted(chapters_in_role_state):
    print(f"  {ch}")

print(f"\n章节ID是否匹配: {chapters_in_behavior == chapters_in_role_state}")

# On mismatch, print the IDs missing from role_state and the extra ones found only there.
if chapters_in_behavior != chapters_in_role_state:
    print(f"缺失: {chapters_in_behavior - chapters_in_role_state}")
    print(f"多余: {chapters_in_role_state - chapters_in_behavior}")

behavior_trace 中的章节ID:
  Chapter 1
  Chapter 2
  Chapter 3
  Chapter 4
  Chapter 5
  Chapter 6
  Chapter 7

role_state 中的章节ID:
  Chapter 10
  Chapter 14
  Chapter 15
  Chapter 20
  Chapter 21
  Chapter 22
  Chapter 23
  Chapter 24
  Chapter 25
  Chapter 26
  Chapter 27
  Chapter 28
  Chapter 29
  Chapter 3
  Chapter 30
  Chapter 31
  Chapter 32
  Chapter 34
  Chapter 35
  Chapter 36
  Chapter 37
  Chapter 38
  Chapter 39
  Chapter 41
  Chapter 42
  Chapter 43
  Chapter 44
  Chapter 45
  Chapter 46
  Chapter 47
  Chapter 5

章节ID是否匹配: False
缺失: {'Chapter 7', 'Chapter 6', 'Chapter 4', 'Chapter 1', 'Chapter 2'}
多余: {'Chapter 29', 'Chapter 21', 'Chapter 41', 'Chapter 42', 'Chapter 24', 'Chapter 47', 'Chapter 39', 'Chapter 15', 'Chapter 37', 'Chapter 36', 'Chapter 43', 'Chapter 10', 'Chapter 35', 'Chapter 44', 'Chapter 26', 'Chapter 32', 'Chapter 31', 'Chapter 20', 'Chapter 14', 'Chapter 38', 'Chapter 27', 'Chapter 30', 'Chapter 34', 'Chapter 25', 'Chapter 23', 'Chapter 22', 'Chapter 28', 'C

In [4]:
# ========== Cell 10: inspect the exact format of dialogue_updated.json ==========
with open(f"{version_folder}/dialogue_updated.json", 'r', encoding='utf-8') as f:
    dialogue_data = json.load(f)

print("=== dialogue_updated.json 详细分析 ===")
print(f"顶层数据类型: {type(dialogue_data)}")
print(f"顶层数据长度: {len(dialogue_data)}")

# Describe the first three top-level elements: dicts show their keys and
# chapter_id, lists show their length and first element.
print(f"\n前3个元素的类型和结构:")
for i in range(min(3, len(dialogue_data))):
    item = dialogue_data[i]
    print(f"  [{i}] 类型: {type(item)}")
    if isinstance(item, dict):
        print(f"      键: {list(item.keys())}")
        print(f"      chapter_id: {item.get('chapter_id', '无')}")
    elif isinstance(item, list):
        print(f"      列表长度: {len(item)}")
        if len(item) > 0:
            print(f"      第一个元素类型: {type(item[0])}")
            if isinstance(item[0], dict):
                print(f"      第一个元素的chapter_id: {item[0].get('chapter_id', '无')}")

# This is the problem: if dialogue_updated.json is in sentence_results format,
# it holds one record per sentence — dozens of sentences rather than 7 chapters.

=== dialogue_updated.json 详细分析 ===
顶层数据类型: <class 'list'>
顶层数据长度: 48

前3个元素的类型和结构:
  [0] 类型: <class 'dict'>
      键: ['chapter_id', 'sentence_index', 'sentence', 'need_to_action', 'actor_list', 'dialogue', 'scene_context']
      chapter_id: Chapter 1
  [1] 类型: <class 'dict'>
      键: ['chapter_id', 'sentence_index', 'sentence', 'need_to_action', 'actor_list', 'dialogue', 'scene_context']
      chapter_id: Chapter 1
  [2] 类型: <class 'dict'>
      键: ['chapter_id', 'sentence_index', 'sentence', 'need_to_action', 'actor_list', 'dialogue', 'scene_context']
      chapter_id: Chapter 1


In [5]:
# ========== Cell 11: confirm the run_character_state_tracker bug ==========
print("=== 模拟 run_character_state_tracker 的错误行为 ===")

# Simulate the wrong mapping: one "Chapter N" key per top-level record,
# numbered by position in dialogue_data.
# NOTE(review): this is a hypothesis about run_character_state_tracker, whose
# code is not visible here — confirm against its implementation.
simulated_chapters = []
for idx, item in enumerate(dialogue_data):
    wrong_chapter_key = f"Chapter {idx + 1}"
    simulated_chapters.append(wrong_chapter_key)

print(f"错误映射会生成 {len(simulated_chapters)} 个章节:")
print(f"前10个: {simulated_chapters[:10]}")
print(f"后5个: {simulated_chapters[-5:]}")

# This would explain the odd chapter keys in role_state.json.
# NOTE(review): the simulation yields one key per record while role_state.json
# has fewer keys, so some records presumably produced no entry — verify.
print(f"\n这就解释了为什么 role_state.json 有 {len(current_role_state)} 个章节")
print("它把每个句子都当成了一个章节！")

=== 模拟 run_character_state_tracker 的错误行为 ===
错误映射会生成 48 个章节:
前10个: ['Chapter 1', 'Chapter 2', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Chapter 6', 'Chapter 7', 'Chapter 8', 'Chapter 9', 'Chapter 10']
后5个: ['Chapter 44', 'Chapter 45', 'Chapter 46', 'Chapter 47', 'Chapter 48']

这就解释了为什么 role_state.json 有 31 个章节
它把每个句子都当成了一个章节！


In [6]:
# ========== Cell 12: 生成正确的 role_state.json ==========
def generate_correct_role_state_from_behavior_trace():
    """Build the role state directly from the notebook-global ``timeline``.

    Every timeline item contributes its ``behavior`` to the set kept for
    its ``(chapter_id, character)`` pair.

    Returns:
        dict: ``{chapter_id: {character: sorted list of distinct behaviors}}``.
        Chapters are inserted in natural order, so "Chapter 2" comes before
        "Chapter 10" (a plain string sort would reverse them).
    """
    import re
    from collections import defaultdict

    def _natural_key(chapter_id):
        # re.split with a capturing group alternates text (even positions)
        # and digit runs (odd positions); digit runs compare as integers.
        pieces = re.split(r'(\d+)', str(chapter_id))
        return [int(piece) if idx % 2 else piece for idx, piece in enumerate(pieces)]
    
    # Group the behaviors by chapter, then by character.
    role_state_by_chapter = defaultdict(lambda: defaultdict(set))
    
    for item in timeline:
        chapter_id = item["chapter_id"]
        character = item["character"]
        behavior = item["behavior"]
        
        role_state_by_chapter[chapter_id][character].add(behavior)
    
    # Convert to plain dicts and sorted lists.
    result = {}
    for chapter_id in sorted(role_state_by_chapter.keys(), key=_natural_key):
        result[chapter_id] = {}
        for character, behaviors in role_state_by_chapter[chapter_id].items():
            result[chapter_id][character] = sorted(list(behaviors))
    
    return result

# Build the corrected role state.
correct_role_state = generate_correct_role_state_from_behavior_trace()

print("=== 正确的 role_state ===")
print(f"章节数: {len(correct_role_state)} (应该是7个)")
print("章节ID:", sorted(correct_role_state.keys()))

# Show the per-character data of every chapter.
for chapter_id in sorted(correct_role_state.keys()):
    # Named `chapter_characters` rather than `characters` so this loop does not
    # overwrite the notebook-global `characters` that the final-statistics cell
    # reads with len(characters).
    chapter_characters = correct_role_state[chapter_id]
    print(f"\n{chapter_id} ({len(chapter_characters)} 个角色):")
    for character, behaviors in chapter_characters.items():
        print(f"  {character}: {len(behaviors)} 个行为状态")

=== 正确的 role_state ===
章节数: 7 (应该是7个)
章节ID: ['Chapter 1', 'Chapter 2', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Chapter 6', 'Chapter 7']

Chapter 1 (3 个角色):
  小红帽: 3 个行为状态
  殖民星球管理员（林博士）: 3 个行为状态
  飞船人工智能（小智）: 3 个行为状态

Chapter 2 (4 个角色):
  小红帽: 3 个行为状态
  飞船人工智能（小智）: 3 个行为状态
  祖母: 3 个行为状态
  医疗机器人（小白）: 3 个行为状态

Chapter 3 (3 个角色):
  机械狼: 6 个行为状态
  小红帽: 9 个行为状态
  飞船人工智能（小智）: 4 个行为状态

Chapter 4 (5 个角色):
  机械狼: 12 个行为状态
  祖母: 13 个行为状态
  医疗机器人（小白）: 9 个行为状态
  小红帽: 18 个行为状态
  飞船人工智能（小智）: 3 个行为状态

Chapter 5 (7 个角色):
  祖母: 11 个行为状态
  小白: 7 个行为状态
  机械狼: 9 个行为状态
  小红帽: 7 个行为状态
  飞船人工智能（小智）: 10 个行为状态
  黑客主脑: 4 个行为状态
  殖民星球管理员（林博士）: 5 个行为状态

Chapter 6 (5 个角色):
  小红帽: 17 个行为状态
  祖母: 10 个行为状态
  医疗机器人（小白）: 15 个行为状态
  机械狼: 13 个行为状态
  飞船人工智能（小智）: 11 个行为状态

Chapter 7 (7 个角色):
  小红帽: 13 个行为状态
  小白: 9 个行为状态
  祖母: 13 个行为状态
  林博士: 6 个行为状态
  机械狼: 8 个行为状态
  黑客主脑: 5 个行为状态
  小智: 9 个行为状态


In [7]:
# ========== Cell 1: 生成正确的 role_state.json 文件 ==========
import json
from collections import defaultdict

version_folder = "/Users/haha/Story/data/output/小红帽_科幻_linear_T0.7_s1"

# Load behavior_trace.json from the version folder.
with open(f"{version_folder}/behavior_trace.json", 'r', encoding='utf-8') as f:
    behavior_data = json.load(f)

def generate_role_state_from_behavior_trace(behavior_trace_data):
    """Build the role state directly from behavior-trace data.

    Args:
        behavior_trace_data: Dict whose ``timeline`` entry (default: empty)
            is a list of items with ``chapter_id``, ``character`` and
            ``behavior`` keys.

    Returns:
        dict: ``{chapter_id: {character: sorted list of distinct behaviors}}``.
        Chapters are inserted in natural order, so "Chapter 2" comes before
        "Chapter 10" (a plain string sort would reverse them).
    """
    import re

    def _natural_key(chapter_id):
        # re.split with a capturing group alternates text (even positions)
        # and digit runs (odd positions); digit runs compare as integers.
        pieces = re.split(r'(\d+)', str(chapter_id))
        return [int(piece) if idx % 2 else piece for idx, piece in enumerate(pieces)]

    timeline = behavior_trace_data.get("timeline", [])
    role_state_by_chapter = defaultdict(lambda: defaultdict(set))
    
    # A set per (chapter, character) pair de-duplicates repeated behaviors.
    for item in timeline:
        chapter_id = item["chapter_id"]
        character = item["character"]
        behavior = item["behavior"]
        role_state_by_chapter[chapter_id][character].add(behavior)
    
    # Convert to plain dicts and sorted lists.
    result = {}
    for chapter_id in sorted(role_state_by_chapter.keys(), key=_natural_key):
        result[chapter_id] = {}
        for character, behaviors in role_state_by_chapter[chapter_id].items():
            result[chapter_id][character] = sorted(list(behaviors))
    
    return result

# Build the corrected role_state.
correct_role_state = generate_role_state_from_behavior_trace(behavior_data)

print(f"✅ 生成了正确的 role_state")
print(f"章节数: {len(correct_role_state)}")
print(f"章节ID: {sorted(correct_role_state.keys())}")

# Save to a temporary file next to the original; `temp_file` is read by the
# backup/replace cell.
temp_file = f"{version_folder}/role_state_fixed_temp.json"
with open(temp_file, 'w', encoding='utf-8') as f:
    json.dump(correct_role_state, f, ensure_ascii=False, indent=4)

print(f"✅ 已保存到临时文件: {temp_file}")

✅ 生成了正确的 role_state
章节数: 7
章节ID: ['Chapter 1', 'Chapter 2', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Chapter 6', 'Chapter 7']
✅ 已保存到临时文件: /Users/haha/Story/data/output/小红帽_科幻_linear_T0.7_s1/role_state_fixed_temp.json


In [8]:
# ========== Cell 2: 备份原文件并替换 ==========
import shutil
from datetime import datetime

# Back up the original file; the backup name carries a timestamp.
original_file = f"{version_folder}/role_state.json"
backup_file = f"{version_folder}/role_state_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

# Create the backup.
shutil.copy(original_file, backup_file)
print(f"✅ 已备份原文件到: {backup_file}")

# Overwrite role_state.json with the corrected version.
# NOTE: `temp_file` comes from the cell that generated the corrected role state.
shutil.copy(temp_file, original_file)
print(f"✅ 已替换 role_state.json")

# Read the replaced file back and report its chapter count and IDs.
with open(original_file, 'r', encoding='utf-8') as f:
    new_data = json.load(f)

print(f"✅ 验证成功！新的 role_state.json:")
print(f"   章节数: {len(new_data)}")
print(f"   章节ID: {sorted(new_data.keys())}")

# Delete the temporary file; after this the cell cannot be re-run without
# regenerating `temp_file` first.
import os
os.remove(temp_file)
print("✅ 已删除临时文件")

✅ 已备份原文件到: /Users/haha/Story/data/output/小红帽_科幻_linear_T0.7_s1/role_state_backup_20250812_182435.json
✅ 已替换 role_state.json
✅ 验证成功！新的 role_state.json:
   章节数: 7
   章节ID: ['Chapter 1', 'Chapter 2', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Chapter 6', 'Chapter 7']
✅ 已删除临时文件


In [11]:
import os
import json
import numpy as np
from collections import Counter
from src.utils.utils import generate_response, save_json, load_json


In [4]:
# 简化版：直接加载story.json并测试事件提取

import json
import os

## 1. 检查和加载story.json文件
def load_story_json(file_path="story.json"):
    """Read a JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON data, or None when the file does not exist or cannot
        be read/parsed. When the file is missing, the current working
        directory and the ``.json`` files found in it are printed as a hint.
    """
    print(f"🔍 正在查找文件: {file_path}")

    if os.path.exists(file_path):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
            print(f"✅ 成功加载文件: {file_path}")
            return payload
        except Exception as e:
            print(f"❌ 加载失败: {e}")
            return None

    # Missing file: help the user locate it.
    print(f"❌ 文件不存在: {file_path}")
    print(f"📍 当前工作目录: {os.getcwd()}")
    print("📁 当前目录下的文件:")
    for candidate in os.listdir('.'):
        if not candidate.endswith('.json'):
            continue
        print(f"  - {candidate}")
    return None

## 2. 解析story数据格式
def parse_story_data(data):
    """Extract the chapter list from loaded story data.

    Args:
        data: Parsed JSON content; a list, a dict, or None.

    Returns:
        list: ``data`` itself when it is a list; otherwise the first
        list-valued entry found under the keys ``story``, ``chapters``,
        ``data``, ``content``, ``episodes`` (checked in that order);
        otherwise an empty list. For an unrecognised dict the available
        fields are printed.
    """
    if data is None:
        return []

    print("🔄 正在解析数据格式...")

    # Case 1: the data already is the chapter list.
    if isinstance(data, list):
        print("✅ 检测到格式: 直接章节列表")
        return data

    # Case 2: a dict wrapping the chapter list under a well-known key.
    if isinstance(data, dict):
        known_keys = ('story', 'chapters', 'data', 'content', 'episodes')
        hit = next(
            (name for name in known_keys if name in data and isinstance(data[name], list)),
            None,
        )
        if hit is not None:
            print(f"✅ 检测到格式: 从'{hit}'字段提取章节")
            return data[hit]

        # Nothing matched: list every field to help diagnose the layout.
        print("🔍 可用字段:")
        for field_name, field_value in data.items():
            print(f"  - {field_name}: {type(field_value)}")
            if isinstance(field_value, list) and len(field_value) > 0:
                print(f"    (列表长度: {len(field_value)})")

    print("⚠️ 无法自动识别格式，请检查数据结构")
    return []

## 3. 验证章节数据
def validate_chapters(chapters):
    """Check that the chapter list is usable, judged by its first chapter.

    Prints the chapter count, the first chapter's fields, and which content
    field (``plot``, ``content``, ``text``, ``story``) and ID field
    (``chapter_id``, ``id``, ``title``, ``name``) were detected.

    Args:
        chapters: List of chapter dicts.

    Returns:
        bool: True when the first chapter has one of the content fields;
        False when the list is empty or no content field is present. A
        missing ID field only produces a warning.
    """
    if not chapters:
        print("❌ 没有找到章节数据")
        return False

    print(f"📊 找到 {len(chapters)} 个章节")

    # Only the first chapter is inspected.
    head = chapters[0]
    print("🔍 第一章结构:")
    for name, value in head.items():
        print(f"  - {name}: {type(value).__name__}")

    # First matching candidate wins, in the listed order.
    content_field = next(
        (name for name in ('plot', 'content', 'text', 'story') if name in head), None
    )
    id_field = next(
        (name for name in ('chapter_id', 'id', 'title', 'name') if name in head), None
    )

    if not content_field:
        print("⚠️ 未找到内容字段，支持的字段名: plot, content, text, story")
    else:
        print(f"✅ 找到内容字段: {content_field}")

    if not id_field:
        print("⚠️ 未找到ID字段，支持的字段名: chapter_id, id, title, name")
    else:
        print(f"✅ 找到ID字段: {id_field}")

    return content_field is not None

## 4. 标准化章节格式
def standardize_chapters(chapters):
    """Normalise chapter dicts so each has ``chapter_id`` and ``plot`` keys.

    The source fields are detected on the first chapter only: the first of
    ``plot``/``content``/``text``/``story`` becomes ``plot`` and the first of
    ``chapter_id``/``id``/``title``/``name`` becomes ``chapter_id``. Both
    values are converted with ``str``. A chapter lacking the ID field gets
    ``第N章`` (1-based position); one lacking the content field gets an empty
    plot. All remaining fields are copied over unchanged.

    Args:
        chapters: List of chapter dicts.

    Returns:
        list: New dicts in the same order; an empty list for empty input.
    """
    if not chapters:
        return []

    # Field detection uses the first chapter as the sample.
    probe = chapters[0]
    content_field = next(
        (name for name in ('plot', 'content', 'text', 'story') if name in probe), None
    )
    id_field = next(
        (name for name in ('chapter_id', 'id', 'title', 'name') if name in probe), None
    )

    print(f"🔄 标准化格式: {id_field} -> chapter_id, {content_field} -> plot")

    normalized = []
    for position, raw in enumerate(chapters, start=1):
        has_id = bool(id_field) and id_field in raw
        has_content = bool(content_field) and content_field in raw

        entry = {
            'chapter_id': str(raw[id_field]) if has_id else f"第{position}章",
            'plot': str(raw[content_field]) if has_content else "",
        }

        # Carry over everything except the two source fields.
        for name, value in raw.items():
            if name == id_field or name == content_field:
                continue
            entry[name] = value

        normalized.append(entry)

    print(f"✅ 标准化完成: {len(normalized)} 个章节")
    return normalized

## 5. 预览数据
def preview_data(chapters, max_show=3):
    """Print a short preview of the first chapters.

    For each of the first ``max_show`` chapters the ID, the plot length and
    the plot itself are printed; plots longer than 100 characters are cut to
    100 and followed by ``...``. A trailing line reports how many chapters
    were not shown.

    Args:
        chapters: List of chapter dicts with ``chapter_id`` and ``plot``.
        max_show: Maximum number of chapters to preview.
    """
    print("\n📖 数据预览:")
    print("=" * 50)

    for number, chapter in enumerate(chapters[:max_show], start=1):
        print(f"\n章节 {number}:")
        print(f"  ID: {chapter.get('chapter_id', 'N/A')}")

        body = chapter.get('plot', '')
        print(f"  内容长度: {len(body)} 字符")

        if len(body) <= 100:
            print(f"  内容: {body}")
        else:
            print(f"  内容预览: {body[:100]}...")

    hidden = len(chapters) - max_show
    if hidden > 0:
        print(f"\n... 还有 {hidden} 个章节")

## 6. 主要加载流程
def load_and_prepare_story(file_path="story.json"):
    """Load a story file and return its chapters in standard form.

    Runs load -> parse -> validate -> standardize -> preview, stopping at the
    first stage that fails.

    Args:
        file_path: Path of the story JSON file.

    Returns:
        list | None: The standardized chapter dicts, or None when loading,
        parsing or validation fails.
    """
    separator = "=" * 50
    print("🚀 开始加载story.json...")
    print(separator)

    # Stage 1: read the file.
    loaded = load_story_json(file_path)
    if loaded is None:
        return None

    # Stage 2: locate the chapter list.
    parsed = parse_story_data(loaded)
    if not parsed:
        return None

    # Stage 3: make sure a content field exists.
    if not validate_chapters(parsed):
        return None

    # Stages 4 and 5: normalise, then show a preview.
    prepared = standardize_chapters(parsed)
    preview_data(prepared)

    print(separator)
    print("✅ story.json加载完成!")
    return prepared

## 7. Usage example
# Load your story.json file (path is relative to the current working directory).
story_data = load_and_prepare_story("story.json")

# If the file lives elsewhere, change the path:
# story_data = load_and_prepare_story("/path/to/your/story.json")
# story_data = load_and_prepare_story("data/story.json")

## 8. If loading succeeded, event extraction can be tested next
if story_data:
    print(f"\n🎯 准备测试事件提取 ({len(story_data)} 个章节)")
    print("如果要继续测试事件提取，运行:")
    print("result = extract_events_no_hallucination(story_data)")
else:
    # Loading failed: print troubleshooting hints and the accepted JSON layouts.
    print("\n💡 常见解决方案:")
    print("1. 检查文件路径是否正确")
    print("2. 检查JSON格式是否有效")
    print("3. 检查文件编码是否为UTF-8")
    print("\n📋 支持的JSON格式示例:")
    print("""
{
  "story": [
    {"chapter_id": "第一章", "plot": "故事内容..."},
    {"chapter_id": "第二章", "plot": "故事内容..."}
  ]
}

或者:

[
  {"chapter_id": "第一章", "plot": "故事内容..."},
  {"chapter_id": "第二章", "plot": "故事内容..."}
]
""")
# Load again from the absolute output path; this rebinds `story_data`, which
# later cells use.
story_data = load_and_prepare_story("/Users/haha/Story/data/output/小红帽_科幻_linear_T0.7_s1/story.json")

🚀 开始加载story.json...
🔍 正在查找文件: story.json
❌ 文件不存在: story.json
📍 当前工作目录: /Users/haha/Story
📁 当前目录下的文件:
  - run_loop_status.json

💡 常见解决方案:
1. 检查文件路径是否正确
2. 检查JSON格式是否有效
3. 检查文件编码是否为UTF-8

📋 支持的JSON格式示例:

{
  "story": [
    {"chapter_id": "第一章", "plot": "故事内容..."},
    {"chapter_id": "第二章", "plot": "故事内容..."}
  ]
}

或者:

[
  {"chapter_id": "第一章", "plot": "故事内容..."},
  {"chapter_id": "第二章", "plot": "故事内容..."}
]

🚀 开始加载story.json...
🔍 正在查找文件: /Users/haha/Story/data/output/小红帽_科幻_linear_T0.7_s1/story.json
✅ 成功加载文件: /Users/haha/Story/data/output/小红帽_科幻_linear_T0.7_s1/story.json
🔄 正在解析数据格式...
✅ 检测到格式: 直接章节列表
📊 找到 7 个章节
🔍 第一章结构:
  - scene: str
  - characters: list
  - plot: str
  - chapter_id: str
  - title: str
✅ 找到内容字段: plot
✅ 找到ID字段: chapter_id
🔄 标准化格式: chapter_id -> chapter_id, plot -> plot
✅ 标准化完成: 7 个章节

📖 数据预览:

章节 1:
  ID: Chapter 1
  内容长度: 206 字符
  内容预览: 清晨,小红帽身穿红色智能制服,背着密封医疗舱,神情坚定地走入太空港主控大厅.林博士已等候在服务台旁,他目光关切,递上最后的许可文件.两人低声交谈,林博士叮嘱小红帽注意途中安全,尤其要小心近期频发的黑客...

章节 2:
  ID: Chapter 2
  内容长度

In [13]:
# Join all chapters into one text blob: "<chapter_id>\n<plot>\n\n" per chapter.
concat_story_plot = ""
for i in story_data :
    concat_story_plot += i['chapter_id'] + "\n" + i['plot'] + "\n\n"  

In [29]:
concat_story_plot

'Chapter 1\n清晨,小红帽身穿红色智能制服,背着密封医疗舱,神情坚定地走入太空港主控大厅.林博士已等候在服务台旁,他目光关切,递上最后的许可文件.两人低声交谈,林博士叮嘱小红帽注意途中安全,尤其要小心近期频发的黑客袭击.小红帽点头,眼中闪烁着坚定与焦虑,为救祖母而踏上的星际任务即将启程.大厅窗外,专属飞船的银色身影静静停泊,飞船人工智能小智已启动欢迎程序,蓝色光芒在舱门边流转.气氛中弥漫着期待与一丝紧张,新的冒险即将开始.\n\nChapter 2\n小红帽驾驶着红星号飞船,载着病重的祖母和医疗机器人小白,穿行在幽蓝的太空跃迁通道.飞船内部一片有序,祖母面色虚弱却温柔地微笑,小白在旁调试医疗设备.小红帽与飞船AI小智协作,时刻关注导航与飞船防御系统,却未察觉机械狼已潜伏在外部,通过黑客手段试图突破飞船屏障.小智发出警报,舱内气氛骤然紧张,小红帽当机立断,启动防御协议并与祖母和小白迅速沟通,准备应对即将到来的危机.\n\nChapter 3\n小红帽驾驶悬浮滑板,谨慎地穿梭在暮色森林间.她正准备加速赶往祖母的居所,却突然被一道银色残影拦住去路.机械狼潜伏在树根阴影下,双眼泛起蓝色冷光,身躯金属部件在夜色中反射着冰冷光芒.它以低沉的机械声调发出威胁,要求小红帽交出医疗芯片.小红帽警觉地按下通讯耳机,呼叫飞船AI小智支援,同时利用树木和滑板灵活周旋,试图摆脱机械狼的追击.森林深处不断传来机械狼的脚步声和电子干扰信号,小红帽运用自己的黑客技能,试图破解机械狼的控制信号,为自己争取逃脱时间.空气中弥漫着紧张与对峙的气息,危机一触即发.\n\nChapter 4\n夜幕降临,祖母依靠着医疗床静静休息,医疗机器人小白在旁细心检查生命体征.突然,智能居住系统发出警报,灯光闪烁,门锁自动解除.机械狼以维修员伪装闯入舱内,试图黑入祖母的医疗芯片.小红帽接到飞船人工智能小智的紧急通知,火速赶回居住舱.她熟练地操作舱内终端,与机械狼展开激烈的数据对抗.祖母惊醒后,坚强地指导小红帽并协助小白采取防御措施.机械狼步步紧逼,试图窃取医疗数据,但在小红帽的机智和团队协作下,最终被迫撤退.危机暂时解除,祖母安慰着小红帽,屋内恢复平静,星空下的智能舱再次守护着她们的安全.\n\nChapter 5\n夜色下,祖母的智能医疗舱被未知黑客势力入侵,警报灯闪烁.机械狼利用高级黑客程序试图劫持医疗系

In [23]:
sys_prompt = """
请从以下故事中提取所有关键事件。

要求：
1. 每个事件用10-20字简洁描述
2. 按时间顺序排列

重要限制：
1. 只提取每个章节内明确描述发生的事件, 不要添加任何假设或未明确描述的事件, 不要推断章节之间发生了什么
2. 准确描述动作的性质, 保持动作描述的准确性，不要夸大或改变其性质,比如不要把"准备做X"理解为"已经完成X"也不要把"威胁要做Y"理解为"已经做了Y"
3. 提取事件的同时需要标注事件来源

输出格式：
[
  {"description":"事件描述1","reference":"chapter 1:文本原文"},
  {"description":"事件描述2","reference":"chapter 2:文本原文"},
  ...
]
"""

In [24]:
# Ask the model to extract key events from the concatenated story plot.
# NOTE(review): `sys_prompt` is sent with role "user", not "system" — confirm
# this is intended.
response1 = generate_response([
        {"role": "user", "content":sys_prompt},
    {"role": "user", "content": f"extract the key events base on the following story plot:\n\n{concat_story_plot}"}
    ], model="gpt-4.1", temperature=0.1)

In [30]:
import ast

# SECURITY: `response1` is raw model output. The original code ran it through
# eval(), which executes arbitrary expressions; ast.literal_eval only accepts
# Python literals (lists, dicts, strings, numbers, ...) and raises on anything else.
parsed_events = ast.literal_eval(response1)

# Build a numbered (1-based) listing of the event descriptions, one per line.
event = ""
for i, j in enumerate(parsed_events):
    event += f"{i+1}. {j['description']}\n"

1. 小红帽身穿红色智能制服走入太空港主控大厅
2. 林博士递上最后的许可文件并叮嘱小红帽注意安全
3. 小红帽点头,准备踏上星际任务
4. 飞船AI小智启动欢迎程序
5. 小红帽驾驶飞船载祖母和小白穿行太空跃迁通道
6. 小白调试医疗设备,祖母微笑
7. 小红帽与小智协作关注导航与防御系统
8. 机械狼潜伏外部试图黑入飞船
9. 小智发出警报,小红帽启动防御协议并沟通应对危机
10. 小红帽驾驶悬浮滑板穿梭暮色森林
11. 机械狼拦住小红帽,威胁要求交出医疗芯片
12. 小红帽呼叫小智支援,利用滑板周旋机械狼
13. 小红帽尝试破解机械狼控制信号争取逃脱时间
14. 祖母休息,小白检查生命体征
15. 智能居住系统警报,门锁自动解除
16. 机械狼伪装维修员闯入舱内,试图黑入医疗芯片
17. 小红帽接到小智通知,赶回居住舱与机械狼数据对抗
18. 祖母协助小红帽和小白防御机械狼
19. 机械狼试图窃取医疗数据,最终被迫撤退
20. 祖母安慰小红帽,屋内恢复平静
21. 祖母医疗舱被黑客入侵,警报灯闪烁
22. 机械狼试图劫持医疗系统,夺取医疗芯片
23. 小红帽和小智反制,调动安保系统与虚拟空间作战工具
24. 小白守在祖母身侧准备应急护理
25. 机械狼与小智在虚拟空间激烈对抗,黑客主脑指挥
26. 林博士带安保小队赶来增援
27. 小红帽利用智能防御与AI协同反击,逐步夺回控制权
28. 小红帽驾驶飞船降落祖母居所外,步入舱门
29. 祖母迎接小红帽,小白忙碌在旁
30. 机械狼伪装维修工闯入,试图窃取芯片
31. 小红帽和小智识破机械狼伪装,协同作战
32. 小白守护祖母,准备应急医疗处理
33. 小红帽成功将芯片植入祖母体内,机械狼被击退
34. 居所恢复平静,祖母安心微笑
35. 小红帽踏入医疗站,与祖母相拥,递交医疗芯片
36. 林博士检查安全,察觉异常
37. 小白准备手术时,机械狼暴露身份企图夺取芯片
38. 小智启动安保程序,与机械狼虚拟空间对抗
39. 林博士与小红帽封锁医疗站出口
40. 祖母激励小红帽利用黑客技能逆转局势
41. 小红帽与小智合力击退机械狼,芯片成功植入祖母体内
42. 医疗站恢复安全,众人重获希望



In [31]:
event

'1. 小红帽身穿红色智能制服走入太空港主控大厅\n2. 林博士递上最后的许可文件并叮嘱小红帽注意安全\n3. 小红帽点头,准备踏上星际任务\n4. 飞船AI小智启动欢迎程序\n5. 小红帽驾驶飞船载祖母和小白穿行太空跃迁通道\n6. 小白调试医疗设备,祖母微笑\n7. 小红帽与小智协作关注导航与防御系统\n8. 机械狼潜伏外部试图黑入飞船\n9. 小智发出警报,小红帽启动防御协议并沟通应对危机\n10. 小红帽驾驶悬浮滑板穿梭暮色森林\n11. 机械狼拦住小红帽,威胁要求交出医疗芯片\n12. 小红帽呼叫小智支援,利用滑板周旋机械狼\n13. 小红帽尝试破解机械狼控制信号争取逃脱时间\n14. 祖母休息,小白检查生命体征\n15. 智能居住系统警报,门锁自动解除\n16. 机械狼伪装维修员闯入舱内,试图黑入医疗芯片\n17. 小红帽接到小智通知,赶回居住舱与机械狼数据对抗\n18. 祖母协助小红帽和小白防御机械狼\n19. 机械狼试图窃取医疗数据,最终被迫撤退\n20. 祖母安慰小红帽,屋内恢复平静\n21. 祖母医疗舱被黑客入侵,警报灯闪烁\n22. 机械狼试图劫持医疗系统,夺取医疗芯片\n23. 小红帽和小智反制,调动安保系统与虚拟空间作战工具\n24. 小白守在祖母身侧准备应急护理\n25. 机械狼与小智在虚拟空间激烈对抗,黑客主脑指挥\n26. 林博士带安保小队赶来增援\n27. 小红帽利用智能防御与AI协同反击,逐步夺回控制权\n28. 小红帽驾驶飞船降落祖母居所外,步入舱门\n29. 祖母迎接小红帽,小白忙碌在旁\n30. 机械狼伪装维修工闯入,试图窃取芯片\n31. 小红帽和小智识破机械狼伪装,协同作战\n32. 小白守护祖母,准备应急医疗处理\n33. 小红帽成功将芯片植入祖母体内,机械狼被击退\n34. 居所恢复平静,祖母安心微笑\n35. 小红帽踏入医疗站,与祖母相拥,递交医疗芯片\n36. 林博士检查安全,察觉异常\n37. 小白准备手术时,机械狼暴露身份企图夺取芯片\n38. 小智启动安保程序,与机械狼虚拟空间对抗\n39. 林博士与小红帽封锁医疗站出口\n40. 祖母激励小红帽利用黑客技能逆转局势\n41. 小红帽与小智合力击退机械狼,芯片成功植入祖母体内\n42. 医疗站恢复安全,众人重获希望\n'

In [28]:
import ast

# SECURITY: `response1` is raw model output; parse it with ast.literal_eval
# (literals only) instead of eval(), which would execute arbitrary expressions.
len(ast.literal_eval(response1))


42

In [7]:


def extract_events_no_hallucination(story_data, model="gpt-4.1", temperature=None):
    """
    Extract key events from a story and anchor each one to a numbered source sentence.

    The story is first split into globally numbered sentences. The model is then
    asked (1) to list the key events and (2) to pick, for every event, the number
    of the best-matching sentence. Events that cannot be tied to a known sentence
    number are dropped, and the survivors are passed to
    `validate_events_against_source`.

    Args:
        story_data: iterable of chapter dicts; the 'chapter_id' and 'plot' keys
            are read (both default to '').
        model: model name forwarded to `generate_response` and to the validator.
        temperature: None leaves the temperature argument off the
            `generate_response` calls; any other value is forwarded as-is.

    Returns:
        The value returned by `validate_events_against_source` for the matched
        events (each a dict with 'event', 'source', 'chapter', 'confidence'),
        or [] when extraction, matching or validation fails, or when no events
        were extracted.
    """
    from src.utils.utils import convert_json, split_plot_into_sentences

    # Step 1: split every chapter plot into sentences and give each a global index.
    sentence_map = {}
    for ch in story_data:
        chapter_id = ch.get('chapter_id', '')
        for sent in split_plot_into_sentences(ch.get('plot', '')):
            sentence_map[len(sentence_map)] = {
                "sentence": sent,
                "chapter": chapter_id
            }

    numbered_sentences = "\n".join(
        f"{idx}: {info['sentence']}" for idx, info in sentence_map.items()
    )
    story_text = "\n\n".join(
        f"【{ch.get('chapter_id', '')}】{ch.get('plot', '')}" for ch in story_data
    )

    # Only forward `temperature` when the caller supplied one, so that
    # generate_response keeps its own default otherwise.
    llm_kwargs = {"model": model}
    if temperature is not None:
        llm_kwargs["temperature"] = temperature

    def _ask(prompt):
        return generate_response([{"role": "user", "content": prompt}], **llm_kwargs)

    print("  第一步：提取事件...")
    # Step 2: ask the model for the list of events.
    step1_prompt = f"""
请从以下故事中提取所有关键事件。

要求：
1. 每个事件用10-20字简洁描述
2. 按时间顺序排列

重要限制：
1. 只提取每个章节内明确描述发生的事件, 不要添加任何假设或未明确描述的事件, 不要推断章节之间发生了什么
2. 准确描述动作的性质, 保持动作描述的准确性，不要夸大或改变其性质,比如不要把"准备做X"理解为"已经完成X"也不要把"威胁要做Y"理解为"已经做了Y"

输出格式：
[
  "事件描述1",
  "事件描述2"
]

故事文本：
{story_text}
"""

    response1 = _ask(step1_prompt)

    try:
        events_only = convert_json(response1)
    except Exception as e:
        print(f"⚠️ 事件提取失败：{e}")
        return []
    if not isinstance(events_only, list):
        print(f"⚠️ 事件提取失败")
        return []

    print(f"  ✅ 提取到 {len(events_only)} 个事件")
    if not events_only:
        # Nothing to match: skip the second model call entirely.
        return []
    print("  第二步：匹配原文句子...")

    # Step 3: ask the model to choose a sentence number for every event.
    step2_prompt = f"""
给定事件列表和编号的原文句子，请为每个事件选择最匹配的句子编号。

事件列表：
{json.dumps(events_only, ensure_ascii=False, indent=2)}

编号的原文句子：
{numbered_sentences}

要求：
1. 为每个事件选择一个最匹配的句子编号
2. 如果找不到匹配的句子，编号填写-1
3. 只能选择已给出的编号，不能填写其他数字

输出格式：
[
  {{
    "event": "事件描述",
    "sentence_number": 编号数字,
    "confidence": "high/medium/low"
  }}
]
"""

    response2 = _ask(step2_prompt)

    try:
        matches = convert_json(response2)
    except Exception as e:
        print(f"⚠️ 匹配失败：{e}")
        return []
    if not isinstance(matches, list):
        print(f"⚠️ 匹配失败")
        return []

    # Build the final result, dropping anything not tied to a known sentence.
    final_events = []
    filtered_count = 0

    for match in matches:
        # A malformed entry must only cost that entry, not the whole batch.
        if not isinstance(match, dict):
            filtered_count += 1
            print(f"  ⚠️ 过滤事件（无匹配句子）：{match}")
            continue

        sentence_num = _to_sentence_index(match.get("sentence_number", -1))

        # -1 (the prompt's "no match" marker) is never a key of sentence_map,
        # so a single membership test covers both rejection cases.
        if sentence_num is None or sentence_num not in sentence_map:
            filtered_count += 1
            print(f"  ⚠️ 过滤事件（无匹配句子）：{match.get('event', '')}")
            continue

        sentence_info = sentence_map[sentence_num]
        final_events.append({
            "event": match.get("event", ""),
            "source": sentence_info["sentence"],
            "chapter": sentence_info["chapter"],
            "confidence": match.get("confidence", "unknown")
        })

    print(f"  ✅ 匹配完成，过滤了 {filtered_count} 个无匹配事件")

    # Validation is kept best-effort (returns [] on error), but is reported
    # separately so a validator problem is not mislabelled as a matching failure.
    try:
        return validate_events_against_source(final_events, model=model, temperature=temperature)
    except Exception as e:
        print(f"⚠️ 事件验证失败：{e}")
        return []


def _to_sentence_index(value):
    """Coerce a model-supplied sentence number to int; return None if it is not one."""
    if isinstance(value, bool):
        # bool is an int subclass; True/False are not sentence numbers.
        return None
    if isinstance(value, int):
        return value
    if isinstance(value, float) and value.is_integer():
        return int(value)
    if isinstance(value, str):
        # Models sometimes quote the number ("12"); accept that form too.
        try:
            return int(value.strip())
        except ValueError:
            return None
    return None


SyntaxError: invalid syntax (story_evaluator.py, line 723)

In [6]:
# Run the event extraction on `story_data` with model "gpt-4.1" and temperature=0.1;
# the returned list is shown as the cell output.
extract_events_no_hallucination(story_data, model="gpt-4.1", temperature=0.1)

  第一步：提取事件...
原始 content: [
  "小红帽走入太空港主控大厅",
  "林博士递上许可文件并叮嘱小红帽注意黑客袭击",
  "小红帽与林博士低声交谈",
  "小红帽驾驶飞船穿越太空跃迁通道",
  "祖母在飞船内病重休息，小白调试医疗设备",
  "小红帽与小智协作关注导航与防御系统",
  "机械狼潜伏在飞船外试图黑入飞船屏障",
  "小智发出警报，小红帽启动防御协议并沟通应对危机",
  "小红帽驾驶悬浮滑板穿梭暮色森林",
  "机械狼拦住小红帽，威胁要求交出医疗芯片",
  "小红帽呼叫小智支援并与机械狼周旋",
  "小红帽尝试破解机械狼控制信号争取逃脱时间",
  "祖母在医疗床休息，小白检查生命体征",
  "智能居住系统发出警报，门锁自动解除",
  "机械狼伪装维修员闯入舱内试图黑入医疗芯片",
  "小红帽接到小智通知赶回居住舱",
  "小红帽与机械狼展开数据对抗",
  "祖母协助小白采取防御措施",
  "机械狼试图窃取医疗数据后被迫撤退",
  "祖母安慰小红帽，屋内恢复平静",
  "祖母的医疗舱被黑客势力入侵",
  "机械狼试图劫持医疗系统夺取医疗芯片",
  "小红帽与小智协作反制黑客攻击",
  "小白守在祖母身侧准备应急护理",
  "机械狼与小智在虚拟空间对抗",
  "林博士带安保小队赶来增援",
  "小红帽利用智能防御与AI协同反击",
  "机械狼被迫撤退，祖母安然无恙",
  "小红帽驾驶飞船降落祖母居所外",
  "小红帽带着医疗芯片步入舱门",
  "机械狼伪装维修工闯入试图窃取芯片",
  "小红帽和小智识破机械狼伪装并协同作战",
  "小白守护祖母准备应急医疗处理",
  "小红帽成功将芯片植入祖母体内",
  "机械狼被击退，居所恢复平静",
  "小红帽踏入医疗站与祖母相拥",
  "小红帽将医疗芯片递交小白准备植入",
  "林博士检查安全并察觉异常",
  "机械狼暴露身份企图夺取医疗芯片",
  "小智启动安保程序与机械狼对抗",
  "林博士与小红帽封锁医疗站出口",
  "祖母激励小红帽利用黑客技能逆转局势",
  "小红帽与小智合力击退机械狼",
  "医疗芯片成功植入祖母体内"
]...
  ✅ 提取到 44 个事件
  第二步：匹配原文句子...
原始 c

[]