In [8]:
import MeCab
import re
import os
import json


def extract_proper_nouns(text):
    """Return the unique proper nouns that MeCab finds in ``text``.

    Ruby readings written as ``《...》``, annotations written as ``［＃...］``
    and every ``｜`` character are removed before analysis so that the markup
    does not end up inside the extracted tokens.

    Args:
        text: The text to analyse.

    Returns:
        A list with the surface form of every token whose first two feature
        fields are ``名詞`` and ``固有名詞``. Duplicates are removed and the
        remaining entries keep the order of their first appearance, so the
        result is identical from run to run for the same input.
    """
    # Initialise the MeCab tagger.
    tagger = MeCab.Tagger()

    # Strip ruby readings and annotations.
    text = re.sub(r'《.*?》', '', text)
    text = re.sub(r'［＃.*?］', '', text)
    text = re.sub(r'｜', '', text)

    # Surfaces of the proper nouns, in the order they are encountered.
    proper_nouns = []

    # Walk the linked list of nodes produced by the analysis.
    node = tagger.parseToNode(text)
    while node:
        features = node.feature.split(',')
        # Check the length first so a feature string with fewer than two
        # fields cannot raise IndexError.
        if (
            len(features) > 1
            and features[0] == '名詞'
            and features[1] == '固有名詞'
        ):
            proper_nouns.append(node.surface)
        node = node.next

    # dict.fromkeys de-duplicates while preserving insertion order; going
    # through a set would make the output order vary between runs.
    return list(dict.fromkeys(proper_nouns))

# Extract proper nouns from every .txt file in the input folder and save one
# result file (same file name) per input in the output folder.
input_folder = "../data/translated_claude"
output_folder = "keyword_output"
os.makedirs(output_folder, exist_ok=True)


for filename in os.listdir(input_folder):
    # Skip anything that is not a .txt file.
    if not filename.endswith(".txt"):
        continue

    file_path = os.path.join(input_folder, filename)
    output_file_path = os.path.join(output_folder, filename)

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    proper_nouns = extract_proper_nouns(text)

    # Write the nouns as a single-line JSON list, keeping non-ASCII
    # characters unescaped.
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        json.dump(proper_nouns, output_file, ensure_ascii=False)