# In [None]:
import xml.etree.ElementTree as ET
import html
import re
import pandas as pd

def load_target_keywords(csv_path):
    """Return the keywords flagged as targets in a CSV file.

    The CSV is read with pandas and must provide a ``target`` column and a
    ``keyword`` column. Rows whose ``target`` equals 1 are kept, missing
    keywords are dropped, and the remaining values are returned in file
    order.

    Args:
        csv_path: Anything ``pandas.read_csv`` accepts as its input.

    Returns:
        A list of the selected ``keyword`` values.
    """
    frame = pd.read_csv(csv_path)
    # Boolean mask selecting only the rows marked as targets.
    is_target = frame['target'] == 1
    selected = frame['keyword'][is_target]
    return selected.dropna().tolist()

def extract_content_by_keywords(xml_path, keywords):
    """Collect the CDATA payloads of pages whose tag mentions a keyword.

    Every ``<page>`` child of the document root is inspected. A page is
    selected when the HTML-unescaped text of its ``<tag>`` child contains at
    least one of ``keywords`` as a substring. For a selected page, the text
    of its ``<content>`` child is HTML-unescaped and the body of the first
    ``<![CDATA[...]]>`` wrapper found in it is kept, with surrounding
    whitespace stripped.

    Pages that lack a ``<tag>`` or ``<content>`` child, whose tag or content
    text is empty, or whose content holds no CDATA wrapper are skipped
    instead of raising.

    Args:
        xml_path: Source handed to ``xml.etree.ElementTree.parse``.
        keywords: Iterable of substrings to look for in each tag text.

    Returns:
        The collected payloads joined by a blank line, or ``""`` when no
        page qualifies.
    """
    # Compiled once: the pattern does not change between pages.
    # NOTE(review): this looks for a literal CDATA marker inside the parsed
    # text, so presumably the input carries the wrapper in escaped form --
    # confirm against the real data files.
    cdata_pattern = re.compile(r'<!\[CDATA\[(.*?)\]\]>', re.DOTALL)

    # Materialise once so a one-shot iterable can be re-scanned per page.
    keywords = tuple(keywords)

    root = ET.parse(xml_path).getroot()
    collected_text = []

    for page in root.findall('page'):
        tag_elem = page.find('tag')
        content_elem = page.find('content')

        # A page needs both children to be usable.
        if tag_elem is None or content_elem is None:
            continue

        # ``.text`` is None for empty elements; html.unescape(None) would
        # raise, so such pages are skipped up front.
        if not tag_elem.text or not content_elem.text:
            continue

        # Tags may contain HTML entities; decode before substring matching.
        tag_text = html.unescape(tag_elem.text)
        if not any(k in tag_text for k in keywords):
            continue

        decoded_text = html.unescape(content_elem.text)
        cdata_match = cdata_pattern.search(decoded_text)
        if cdata_match:
            collected_text.append(cdata_match.group(1).strip())

    return '\n\n'.join(collected_text)

# --- Usage example ---
# Hard-coded input locations for one company's 2025 IR data set.
xml_file = "ir/2025/20020_日清製粉グループ本社/Gx5z.xml"
csv_file = "ir/2025/20020_日清製粉グループ本社/info/target_tag.csv"

# Load the target keywords from the CSV
keywords = load_target_keywords(csv_file)

# Extract the matching text from the XML
output_text = extract_content_by_keywords(xml_file, keywords)

# Print the result
print(output_text)

# Optionally write the result to a file
#with open("extracted_text.txt", "w", encoding="utf-8") as f:
#    f.write(output_text)
