In [77]:
# Sample LLM reply: a pretty-printed JSON object inside a ```problog fence whose
# "Code" string contains raw (unescaped) newlines between the pop/2 facts.
model = (
    'Here is the completed code segment with all population data filled in:\n\n'
    '```problog\n'
    '{\n'
    '  "HASH": "23C2CC6E",\n'
    '  "Code": "pop(china, 8250).\n'
    'pop(india, 5863).\n'
    'pop(ussr, 2521).\n'
    'pop(usa, 2119).\n'
    'pop(indonesia, 1276).\n'
    'pop(japan, 1097).\n'
    'pop(brazil, 1042).\n'
    'pop(bangladesh, 750).\n'
    'pop(pakistan, 682).\n'
    'pop(w_germany, 613).\n'
    'pop(nigeria, 613).\n'
    'pop(mexico, 581).\n'
    'pop(uk, 559).\n'
    'pop(italy, 554).\n'
    'pop(france, 525).\n'
    'pop(philippines, 415).\n'
    'pop(thailand, 410).\n'
    'pop(turkey, 383).\n'
    'pop(egypt, 364).\n'
    'pop(spain, 352).\n'
    'pop(poland, 337).\n'
    'pop(s_korea, 335).\n'
    'pop(iran, 320).\n'
    'pop(ethiopia, 272).\n'
    'pop(argentina, 251)."\n'
    '}\n'
    '```'
)


In [85]:
import re
import json
import uuid
import hashlib
from problog.program import PrologString, Clause, AnnotatedDisjunction, Term
from typing import Literal, List, Union, Tuple, Dict, Any
from langgraph.graph import END, StateGraph
from state import BasicState, Mode
from config import paths
def _find_all_blocks(type:Literal["report","code","other"], text:str, ext_mark:str="") -> List[dict]:
    """
    find block with certain patterns in json form.
    """
    blocks:List[dict] = []
    
    if type == "other" or type == "report":
        pattern = r"```(?:report|[a-z]*)?\n(.*?)```"
    elif type == "code":
        pattern = r"```(?:problog|[a-z]*)?\n(.*?)```"
    else:
        raise ValueError("you must choose from ['report','code','other']")
    
    matches = re.findall(pattern, text, re.DOTALL)
    if not matches:
        pattern = r"```(?:json|[a-z]*)?\n(.*?)```"
        matches = re.findall(pattern, text, re.DOTALL)

    if matches:
        for match in matches:
            try:
                # 直接尝试解析 JSON
                match_json = json.loads(match)
            except json.JSONDecodeError:
                # 对于 ProbLog 代码，需要特殊处理转义
                # 保护 ProbLog 特殊操作符
                protected_match = match
                
                # 创建特殊操作符的映射
                special_patterns = {
                    r'\\\\=': '<<<BACKSLASH_BACKSLASH_EQUALS>>>',
                    r'\\\\\\\\=': '<<<QUAD_BACKSLASH_EQUALS>>>',  # 处理已经有四个反斜杠的情况
                    r'\\n'        : '<<<ESCAPED_NEWLINE>>>',      # 新增这一行
                }
                # 临时替换特殊操作符
                for pattern, placeholder in special_patterns.items():
                    protected_match = re.sub(pattern, placeholder, protected_match)
                
                # 修复其他无效的转义序列
                fixed_code = re.sub(r'\\(?!["\\/bfnrtu])', r'\\\\', protected_match)

                # 还原特殊操作符
                for pattern, placeholder in special_patterns.items():
                    fixed_code = fixed_code.replace(placeholder, pattern)
                
                try:
                    match_json = json.loads(fixed_code)
                except json.JSONDecodeError as e:
                    print(f"Failed to parse JSON: {e}")
                    print(f"Original match: {match}")
                    print(f"Fixed code: {fixed_code}")
                    continue

            # 处理不同类型的块
            if type == "other":
                try:
                    blocks.append({ext_mark:match_json})
                except ValueError:
                    blocks.append({ext_mark:None})

            elif type == "code":
                try:
                    blocks.append({match_json["HASH"]:match_json["Code"]})
                except (ValueError, KeyError):
                    blocks.append({match_json.get("HASH", "unknown"):f"could not parse the code block of {match_json}"})
            
            elif type == "report":
                try:
                    blocks.append({match_json["HASH"]:match_json})
                except (ValueError, KeyError):
                    blocks.append({match_json.get("HASH", "unknown"):f"could not parse the report block of {match_json}"})

        return blocks
    else:
        return None

In [84]:
import json
# Parse the `model` reply with _find_all_blocks and show what was extracted.
matches = _find_all_blocks("code",model)
print(matches)
# data is now a plain dict and can be used directly
# NOTE(review): the result is bound to `matches`, not `data`, and is a list of
# dicts rather than a single dict — confirm which name/shape was intended.


fixed_code {
  "HASH": "23C2CC6E",
  "Code": "pop(china, 8250).
pop(india, 5863).
pop(ussr, 2521).
pop(usa, 2119).
pop(indonesia, 1276).
pop(japan, 1097).
pop(brazil, 1042).
pop(bangladesh, 750).
pop(pakistan, 682).
pop(w_germany, 613).
pop(nigeria, 613).
pop(mexico, 581).
pop(uk, 559).
pop(italy, 554).
pop(france, 525).
pop(philippines, 415).
pop(thailand, 410).
pop(turkey, 383).
pop(egypt, 364).
pop(spain, 352).
pop(poland, 337).
pop(s_korea, 335).
pop(iran, 320).
pop(ethiopia, 272).
pop(argentina, 251)."
}

Failed to parse JSON: Invalid control character at: line 3 column 29 (char 52)
Original match: {
  "HASH": "23C2CC6E",
  "Code": "pop(china, 8250).
pop(india, 5863).
pop(ussr, 2521).
pop(usa, 2119).
pop(indonesia, 1276).
pop(japan, 1097).
pop(brazil, 1042).
pop(bangladesh, 750).
pop(pakistan, 682).
pop(w_germany, 613).
pop(nigeria, 613).
pop(mexico, 581).
pop(uk, 559).
pop(italy, 554).
pop(france, 525).
pop(philippines, 415).
pop(thailand, 410).
pop(turkey, 383).
pop(egypt, 364).

In [65]:
# Debug loop: print each entry of `matches`, decode it with json.loads, print the result.
# NOTE(review): json.loads only accepts str/bytes, so this presumably expects `matches`
# to hold raw JSON text from an earlier experiment. The `matches` assigned from
# _find_all_blocks holds dicts, which would raise TypeError here — confirm or drop this cell.
for match in matches:
    print(match)
    res = json.loads(match)
    print(res)

In [6]:
def _find_all_blocks(type: Literal["report", "code", "other"], text: str, ext_mark: str = "") -> List[dict]:
    """
    修复版的解析函数，正确处理ProbLog代码中的转义序列
    """
    blocks: List[dict] = []
    
    # 根据类型选择正则表达式
    if type == "other" or type == "report":
        pattern = r"```(?:report|[a-z]*)?\n(.*?)```"
    elif type == "code":
        pattern = r"```(?:problog|[a-z]*)?\n(.*?)```"
    else:
        raise ValueError("you must choose from ['report','code','other']")
    
    matches = re.findall(pattern, text, re.DOTALL)
    
    if not matches:
        pattern = r"```(?:json|[a-z]*)?\n(.*?)```"
        matches = re.findall(pattern, text, re.DOTALL)
    
    for match in matches:
        match = match.strip()
        
        try:
            # 直接尝试解析JSON
            match_json = json.loads(match)
        except json.JSONDecodeError:
            # 智能处理转义序列
            try:
                # 创建一个映射来临时替换特殊序列
                replacements = {
                    r'\\\\=': '___DOUBLE_BACKSLASH_EQUALS___',
                    r'\\=': '___BACKSLASH_EQUALS___',
                    r'\\n': '___BACKSLASH_N___',
                    '\n': '\\n',  # 将实际的换行符替换为转义的换行符
                }
                
                processed_match = match
                
                # 首先处理实际的换行符
                processed_match = processed_match.replace('\n', '\\n')
                
                # 然后处理其他特殊序列（按长度降序，确保先处理长的）
                for pattern, replacement in sorted(replacements.items(), key=lambda x: len(x[0]), reverse=True):
                    if pattern != '\n':  # 跳过已处理的换行符
                        processed_match = processed_match.replace(pattern, replacement)
                
                # 尝试解析JSON
                match_json = json.loads(processed_match)
                
                # 还原特殊序列
                if isinstance(match_json, dict) and "Code" in match_json:
                    code = match_json["Code"]
                    # 还原顺序也很重要
                    for pattern, replacement in replacements.items():
                        if pattern != '\n':  # 不需要还原换行符
                            code = code.replace(replacement, pattern)
                    match_json["Code"] = code
                
            except json.JSONDecodeError as e:
                print(f"JSON解析失败: {e}")
                print(f"原始内容: {repr(match)}")
                print(f"处理后内容: {repr(processed_match)}")
                continue
        
        # 根据类型处理解析结果
        if type == "other":
            blocks.append({ext_mark: match_json})
        elif type == "code":
            if isinstance(match_json, dict) and "HASH" in match_json and "Code" in match_json:
                blocks.append({match_json["HASH"]: match_json["Code"]})
            else:
                blocks.append({"unknown": f"code: invalid structure - {match_json}"})
        elif type == "report":
            if isinstance(match_json, dict) and "HASH" in match_json:
                blocks.append({match_json["HASH"]: match_json})
            else:
                blocks.append({"unknown": f"report: invalid structure - {match_json}"})
    
    return blocks

In [7]:

# Sample LLM reply: a compact JSON object inside a ```problog fence whose "Code"
# string contains raw newlines and a doubled-backslash `=` operator, followed by
# free-text explanation after the fence.
model = (
    'Here is the completed code segment for the {LANGDA} placeholder:\n\n'
    '```problog\n'
    '{"HASH": "CD6DE9CB","Code": "query_pop([C1,D1,C2,D2]) :- \n'
    '    density(C1,D1),\n'
    '    density(C2,D2),\n'
    '    C1 \\\\= C2,\n'
    '    abs(D1 - D2) =< max(D1,D2)*0.05."}\n'
    '```\n\n'
    'This implementation:\n'
    '1. Uses the `density/2` predicate to get population densities for two countries\n'
    '2. Ensures they are different countries with `C1 \\= C2`\n'
    '3. Checks if their densities are within 5% of each other using arithmetic comparison\n'
    '4. Returns the country names and densities in the format [C1,D1,C2,D2]\n\n'
    'The 5% threshold can be adjusted by changing the 0.05 factor as needed.'
)

# Run the parser on the sample reply held in `model` and print the extracted blocks.
rep = _find_all_blocks("code", model)
print(rep)

[{'CD6DE9CB': 'query_pop([C1,D1,C2,D2]) :- \n    density(C1,D1),\n    density(C2,D2),\n    C1 \\\\= C2,\n    abs(D1 - D2) =< max(D1,D2)*0.05.'}]
