In [None]:
import re
import yaml

In [None]:
def find_statistical_method(pages_text, method_keywords):
    """Locate mentions of statistical methods in page text, with context.

    Each page is split into sentences at whitespace that follows ``.``, ``!``
    or ``?``. Every sentence is then tested against every keyword,
    case-insensitively, allowing an optional single separator (whitespace,
    ``-`` or ``_``) and an optional plural ``s`` after the keyword.

    Args:
        pages_text: Iterable of page texts. Pages are numbered from 1 in
            iteration order. A page that is ``None`` or empty is treated as
            having no text (it still consumes a page number).
        method_keywords: Iterable of keyword strings to search for. Blank
            (empty or whitespace-only) keywords are ignored; duplicate
            keywords are searched only once.

    Returns:
        A list with one dict per (sentence, keyword) hit, ordered by page,
        then sentence, then keyword. Each dict has the keys ``"page"``,
        ``"method"``, ``"sentence"``, ``"prev_sentence"`` and
        ``"next_sentence"``. The neighbouring sentences come from the same
        page only and are ``""`` when the hit is the first/last sentence of
        its page. An empty list is returned when nothing matches.
    """
    # One compiled, case-insensitive pattern per distinct keyword.
    patterns = {}
    for keyword in method_keywords:
        # A blank keyword would compile to a pattern that matches at any word
        # boundary, flagging every sentence as a hit, so it is skipped.
        # Non-string keywords are deliberately not filtered here: they still
        # reach re.escape and raise TypeError as before.
        if isinstance(keyword, str) and not keyword.strip():
            continue
        patterns[keyword] = re.compile(
            rf"\b{re.escape(keyword)}(?:[\s\-_])?s?\b", re.IGNORECASE
        )

    results = []
    for page_num, page_text in enumerate(pages_text, start=1):
        # Split right after sentence-ending punctuation; `or ""` tolerates
        # pages for which no text could be extracted.
        sentences = re.split(r'(?<=[.!?])\s+', page_text or "")
        last_index = len(sentences) - 1

        for i, sentence in enumerate(sentences):
            for keyword, pattern in patterns.items():
                if not pattern.search(sentence):
                    continue

                # Context: the sentences immediately before and after the hit.
                prev_sentence = sentences[i - 1].strip() if i > 0 else ""
                next_sentence = sentences[i + 1].strip() if i < last_index else ""

                results.append({
                    "page": page_num,
                    "method": keyword,
                    "sentence": sentence.strip(),
                    "prev_sentence": prev_sentence,
                    "next_sentence": next_sentence
                })

    return results

In [None]:
def extract_all_strings(yaml_data):
    """Collect every term string found in a nested YAML structure.

    Dicts (both keys and values) and lists are walked recursively; any other
    non-string value is ignored. Each string contributes:

    * its main term: the text before the first parenthesised group, stripped
      of surrounding whitespace (the whole string when it has no such group);
    * every comma-separated piece inside each ``(...)`` group, stripped.

    Text that follows a parenthesised group is not collected. Blank terms are
    dropped, so inputs such as ``"Name ()"`` or ``"Name (a, )"`` never yield
    an empty string.

    Args:
        yaml_data: The parsed YAML value (str, dict, list, or anything else).

    Returns:
        A list of non-empty strings in traversal order; duplicates are kept.
    """
    strings = []

    def recursive_extract(value):
        # Depth-first walk that appends terms to the enclosing `strings` list.
        if isinstance(value, str):
            # Main term: everything before the first "(...)" group.
            main_term = re.split(r'\s*\(.*?\)', value)[0].strip()
            if main_term:
                strings.append(main_term)

            # Terms inside parentheses, split on commas; blank pieces are
            # filtered out just like a blank main term is.
            for content in re.findall(r'\((.*?)\)', value):
                for piece in content.split(','):
                    term = piece.strip()
                    if term:
                        strings.append(term)

        elif isinstance(value, dict):
            for k, v in value.items():
                recursive_extract(k)  # Keys might also be strings
                recursive_extract(v)
        elif isinstance(value, list):
            for item in value:
                recursive_extract(item)

    recursive_extract(yaml_data)
    return strings

# Load the ontology from 'ontology.yaml' (opened as UTF-8, path relative to
# the current working directory) and parse it with the safe YAML loader.
with open('ontology.yaml', 'r', encoding='utf-8') as file:
    yaml_content = yaml.safe_load(file)

# Flatten the parsed ontology into a flat list of term strings; `all_strings`
# is used later in the notebook as the keyword list for the PDF search.
all_strings = extract_all_strings(yaml_content)

# Print the extracted terms for a quick visual check.
print(all_strings)


In [None]:
def find_result(pdf_path, method_keywords):
    """Search a PDF for the given method keywords.

    The PDF is converted to text with ``extract_text_from_pdf`` (not defined
    in the visible part of this notebook; presumably returns one string per
    page -- confirm), and the result is passed to ``find_statistical_method``.

    Args:
        pdf_path: Path of the PDF, forwarded to ``extract_text_from_pdf``.
        method_keywords: Keywords forwarded to ``find_statistical_method``.

    Returns:
        The non-empty list of matches, or the string ``"null"`` when the
        search produced nothing.
    """
    matches = find_statistical_method(extract_text_from_pdf(pdf_path), method_keywords)
    # Callers get the "null" string sentinel rather than an empty result.
    return matches or "null"

In [None]:
# Run the keyword search on one PDF and print whatever find_result returns.
pdf_path = "neuro-1.pdf"  # replace with the path to your PDF file
method_keywords = all_strings  # replace with the statistical-method keywords you want to search for
result = find_result(pdf_path, method_keywords)
print(result)