In [1]:
!pip install pandas mecab-python3 ipadic rank_bm25 sentence-transformers scikit-learn tqdm japanize-matplotlib




[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import MeCab
import ipadic
import re
import os
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
base_df = pd.read_csv(r'C:\Users\管理\Documents\GitHub\Data-Analysis_competition\Analysis\sigante_anime\Data\base_stories.tsv', sep='\t')
practice_df = pd.read_csv(r'C:\Users\管理\Documents\GitHub\Data-Analysis_competition\Analysis\sigante_anime\Data\fiction_stories_practice.tsv', sep='\t')
# --- 1. テストデータの読み込み ---
test_df = pd.read_csv(r'C:\Users\管理\Documents\GitHub\Data-Analysis_competition\Analysis\sigante_anime\Data\fiction_stories_test.tsv', sep='\t')

In [4]:
# ==========================================
# 3. 形態素解析（MeCab）の設定
# ==========================================
tagger = MeCab.Tagger(ipadic.MECAB_ARGS)

def extract_nouns(text):
    """文章から名詞・固有名詞（2文字以上）を抽出する"""
    if pd.isna(text): return []
    node = tagger.parseToNode(text)
    nouns = []
    while node:
        features = node.feature.split(',')
        if features[0] == '名詞' and features[1] in ['一般', '固有名詞', 'サ変接続']:
            if len(node.surface) > 1:
                nouns.append(node.surface)
        node = node.next
    return nouns

In [5]:
print("Initializing search engines...")

# --- BM25 ---
tokenized_base = [extract_nouns(s) for s in base_df['story']]
bm25 = BM25Okapi(tokenized_base)

# --- Vector (E5-baseへアップグレード: 施策1) ---
model_name = 'intfloat/multilingual-e5-base' 
model = SentenceTransformer(model_name)
base_embeddings = model.encode(["passage: " + s for s in base_df['story']], normalize_embeddings=True)

# --- TF-IDF (必殺キーワード抽出用) ---
vectorizer = TfidfVectorizer(tokenizer=extract_nouns, token_pattern=None)
tfidf_matrix = vectorizer.fit_transform(base_df['story'])
feature_names = vectorizer.get_feature_names_out()

Initializing search engines...


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mXLMRobertaModel LOAD REPORT[0m from: intfloat/multilingual-e5-base
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [6]:
def get_signature_keywords(df, tfidf_mat, feat_names, top_n=5):
    signatures = {}
    for i, row in df.iterrows():
        row_data = tfidf_mat.getrow(i).toarray().flatten()
        top_indices = row_data.argsort()[-top_n:]
        signatures[row['id']] = [feat_names[idx] for idx in top_indices]
    return signatures

signatures = get_signature_keywords(base_df, tfidf_matrix, feature_names)

In [7]:
# ==========================================
# 6. 施策(3): 窓分割と検索ロジックの実装
# ==========================================
def get_sliding_windows(text, window_size=2):
    sentences = [s.strip() + "。" for s in re.split(r'(?<=。)', text) if s.strip()]
    if len(sentences) <= window_size: return [text]
    return ["".join(sentences[i:i + window_size]) for i in range(len(sentences) - window_size + 1)]

def improved_hybrid_search(query_story, k=60, w_bm25=1.0, w_vec=1.2):
    num_docs = len(base_df)
    
    # --- 施策(3-1): 窓サイズの多重化 ---
    all_windows = []
    for size in [1, 2, 3]:
        all_windows.extend(get_sliding_windows(query_story, window_size=size))
    all_windows.append(query_story) # 全文も考慮

    best_bm25_ranks = np.full(num_docs, num_docs)
    best_vec_ranks = np.full(num_docs, num_docs)
    keyword_hits = np.zeros(num_docs)

    for window in all_windows:
        # --- BM25 ---
        q_tokens = extract_nouns(window)
        if q_tokens:
            scores = bm25.get_scores(q_tokens)
            ranks = np.argsort(np.argsort(scores)[::-1]) + 1
            best_bm25_ranks = np.minimum(best_bm25_ranks, ranks)
            
            # --- 施策(3-2): 特徴語ブースト ---
            for idx, b_id in enumerate(base_df['id']):
                if any(kw in window for kw in signatures[b_id]):
                    keyword_hits[idx] += 1 # ヒットした窓の数をカウント

        # --- Vector ---
        q_emb = model.encode(["query: " + window], normalize_embeddings=True)
        sims = cosine_similarity(q_emb, base_embeddings)[0]
        ranks = np.argsort(np.argsort(sims)[::-1]) + 1
        best_vec_ranks = np.minimum(best_vec_ranks, ranks)

    # --- 施策(4): RRF統合とブースト ---
    # 基本のRRFスコア
    rrf_scores = (w_bm25 / (k + best_bm25_ranks)) + (w_vec / (k + best_vec_ranks))
    
    # 特徴語加点
    rrf_scores += (keyword_hits * 0.02) 

    # カテゴリ・ブースト (簡易的な洋画・アニメ判定)
    anime_kws = ["アニメ", "作画", "声優", "マンガ", "異世界", "魔法"]
    if any(kw in query_story for kw in anime_kws):
        rrf_scores[base_df['category'] == 'アニメ'] *= 1.1

    top_indices = np.argsort(rrf_scores)[::-1]
    results = base_df.iloc[top_indices].copy()
    results['search_score'] = rrf_scores[top_indices]
    return results

In [8]:
# 7. 正解率の算出 (Validation)
# ==========================================
def evaluate_performance(df, top_n=10):
    perfect_matches = 0
    both_in_top_n = 0
    total = len(df)

    print(f"Evaluating metrics for {total} cases...")
    for _, row in tqdm(df.iterrows(), total=total):
        res = improved_hybrid_search(row['story'])
        top_ids = res['id'].values
        
        # 完全一致判定 (Top 2)
        if set(top_ids[:2]) == {row['id_a'], row['id_b']}:
            perfect_matches += 1
            
        # 包含率判定 (Top N)
        if {row['id_a'], row['id_b']}.issubset(set(top_ids[:top_n])):
            both_in_top_n += 1

    print(f"\n--- 最終評価結果 ---")
    print(f"完全一致正解率 (Perfect Match Accuracy): {perfect_matches/total:.2%} ({perfect_matches}/{total})")
    print(f"Top-{top_n} 両方包含率 (Recall@{top_n}): {both_in_top_n/total:.2%} ({both_in_top_n}/{total})")



In [9]:
# 実行
evaluate_performance(practice_df)

Evaluating metrics for 20 cases...


100%|██████████| 20/20 [00:22<00:00,  1.14s/it]


--- 最終評価結果 ---
完全一致正解率 (Perfect Match Accuracy): 10.00% (2/20)
Top-10 両方包含率 (Recall@10): 65.00% (13/20)



