In [41]:
import pandas as pd
import numpy as np
import MeCab
import ipadic
import re
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import time

### 前準備

In [2]:
# Initialise the MeCab tagger with the arguments provided by the ipadic
# package (ipadic.MECAB_ARGS). `tagger` is the module-level object that
# extract_nouns() uses for tokenisation.
tagger = MeCab.Tagger(ipadic.MECAB_ARGS)

In [3]:
def extract_nouns(text):
    """Return the nouns in ``text`` that are kept for analysis.

    The text is tokenised with the module-level MeCab ``tagger``. A token is
    kept when its first feature field is '名詞' (noun), its second field is
    one of '一般', '固有名詞' or 'サ変接続', and its surface form is longer than
    one character. Missing input (``pd.isna``) yields an empty list.
    """
    if pd.isna(text):
        return []

    wanted_subtypes = ('一般', '固有名詞', 'サ変接続')
    kept = []
    token = tagger.parseToNode(text)
    while token:
        pos_fields = token.feature.split(',')
        # The sub-type field is only inspected once the token is known to be a noun.
        is_target_noun = pos_fields[0] == '名詞' and pos_fields[1] in wanted_subtypes
        if is_target_noun and len(token.surface) > 1:
            kept.append(token.surface)
        token = token.next
    return kept

In [4]:
# Directory that holds the competition TSV files. The default is the path the
# notebook was originally run with; set SIGNATE_ANIME_DATA_DIR to point the
# notebook at another location without editing this cell.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get(
    'SIGNATE_ANIME_DATA_DIR',
    r'C:\Users\管理\Documents\GitHub\Data-Analysis_competition\Analysis\sigante_anime\Data',
))

# All three files are tab-separated.
base_df = pd.read_csv(DATA_DIR / 'base_stories.tsv', sep='\t')
practice_df = pd.read_csv(DATA_DIR / 'fiction_stories_practice.tsv', sep='\t')
test_df = pd.read_csv(DATA_DIR / 'fiction_stories_test.tsv', sep='\t')

In [5]:
# Tokenise every base story into its noun list, then build the BM25 index
# over those token lists.
tokenized_base = list(map(extract_nouns, base_df['story']))
bm25 = BM25Okapi(tokenized_base)

In [6]:
# Load the sentence encoder and embed every base story once up front.
# Base stories are encoded with the "passage: " prefix (queries use "query: "
# in hybrid_window_search) and the embeddings are normalised.
model = SentenceTransformer('intfloat/multilingual-e5-small')
base_passages = ["passage: " + story for story in base_df['story']]
base_embeddings = model.encode(base_passages, normalize_embeddings=True)



Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: intfloat/multilingual-e5-small
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [7]:
def _split_into_windows(story, size=2):
    """Split ``story`` into overlapping windows of ``size`` consecutive sentences."""
    sentences = []
    # The look-behind split leaves each "。" attached to its sentence, so the
    # terminator is appended only to a trailing fragment that lacks one
    # (appending unconditionally produced "。。" in every window).
    for piece in re.split(r'(?<=。)', story):
        piece = piece.strip()
        if not piece:
            continue
        sentences.append(piece if piece.endswith("。") else piece + "。")

    windows = ["".join(sentences[i:i + size]) for i in range(len(sentences) - size + 1)]
    # Stories with fewer than ``size`` sentences are searched as a whole.
    return windows or [story]


def _descending_ranks(scores):
    """Return 1-based ranks for ``scores``; the highest score gets rank 1."""
    return np.argsort(np.argsort(scores)[::-1]) + 1


def hybrid_window_search(query_story, k=60):
    """Rank the base stories against ``query_story`` with a hybrid search.

    The query is cut into sliding windows of two sentences. Every window is
    scored against all base stories with BM25 (on extracted nouns) and with
    cosine similarity of sentence embeddings. For each base story the best
    (smallest) rank over all windows is kept per method, and the two ranks
    are merged with Reciprocal Rank Fusion (RRF).

    Args:
        query_story: Text of the story to search for.
        k: RRF constant added to each rank before taking the reciprocal.

    Returns:
        The rows of ``base_df`` ordered from highest to lowest RRF score,
        with an extra ``search_score`` column holding that score.
    """
    windows = _split_into_windows(query_story)

    num_docs = len(base_df)
    # Start from the worst possible rank; a method that never scores a
    # document (e.g. no window produced BM25 tokens) leaves it there.
    best_bm25_ranks = np.full(num_docs, num_docs)
    best_vec_ranks = np.full(num_docs, num_docs)

    for window in windows:
        # BM25 scoring; skipped when the window contains no usable nouns.
        q_tokens = extract_nouns(window)
        if q_tokens:
            bm25_ranks = _descending_ranks(bm25.get_scores(q_tokens))
            best_bm25_ranks = np.minimum(best_bm25_ranks, bm25_ranks)

        # Embedding scoring.
        q_emb = model.encode(["query: " + window], normalize_embeddings=True)
        vec_ranks = _descending_ranks(cosine_similarity(q_emb, base_embeddings)[0])
        best_vec_ranks = np.minimum(best_vec_ranks, vec_ranks)

    # Merge both rankings with Reciprocal Rank Fusion.
    rrf_scores = (1.0 / (k + best_bm25_ranks)) + (1.0 / (k + best_vec_ranks))
    top_indices = np.argsort(rrf_scores)[::-1]

    return base_df.iloc[top_indices].assign(search_score=rrf_scores[top_indices])

In [8]:
def calculate_metrics(df, top_n=10):
    """Print retrieval metrics of ``hybrid_window_search`` on labelled cases.

    Two rates are printed: how often the top two results are exactly the
    true pair, and how often both true ids appear within the top ``top_n``.

    Args:
        df: Cases with ``story``, ``id_a`` and ``id_b`` columns.
        top_n: Cut-off used for the "both in top-N" rate.

    Returns:
        None. Results are printed. An empty ``df`` prints a notice instead
        of raising ZeroDivisionError.
    """
    total_cases = len(df)
    if total_cases == 0:
        # Nothing to score; the rates below would divide by zero.
        print("No cases to evaluate.")
        return

    perfect_matches = 0  # top two results are exactly the true pair
    both_in_top_n = 0    # both true ids appear in the top N results

    for _, row in df.iterrows():
        top_ids = hybrid_window_search(row['story'])['id'].values
        true_ids = {row['id_a'], row['id_b']}

        if set(top_ids[:2]) == true_ids:
            perfect_matches += 1

        if len(set(top_ids[:top_n]).intersection(true_ids)) == 2:
            both_in_top_n += 1

    print(f"Perfect Match Accuracy (Top 2): {perfect_matches/total_cases:.2%}")
    print(f"Both in Top-{top_n}: {both_in_top_n/total_cases:.2%}")

In [9]:
# Predict the two source works for every test story.
print("Starting predictions for test data...")
results = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    ranked = hybrid_window_search(row['story'])
    # Keep the two best-scoring ids and report them in ascending order.
    best_pair = sorted(ranked['id'].head(2).values)
    results.append([row['id'], best_pair[0], best_pair[1]])

# Build the submission DataFrame (writing it to disk is left disabled).
submit_df = pd.DataFrame(results)
# submit_df.to_csv('submission.csv', index=False, header=False)
# print("Submission file created: submission.csv")

Starting predictions for test data...


100%|██████████| 340/340 [01:35<00:00,  3.54it/s]


### 練習用データでの評価

In [10]:
# --- Final evaluation: exact-pair accuracy on the practice data ---

def calculate_final_accuracy(df):
    """Print how often the top two search results equal the true pair.

    Args:
        df: Cases with ``story``, ``id_a`` and ``id_b`` columns.

    Returns:
        None. The report is printed. An empty ``df`` prints a notice
        instead of raising ZeroDivisionError.
    """
    total_cases = len(df)

    print(f"Total cases: {total_cases}")
    if total_cases == 0:
        # Nothing to score; the accuracy below would divide by zero.
        print("No cases to evaluate.")
        return
    print("Evaluating...")

    perfect_matches = 0
    for _, row in df.iterrows():
        # 1. Run the search.
        search_results = hybrid_window_search(row['story'])

        # 2. Take the ids of the top two results; sets make the comparison
        #    independent of order.
        predicted_ids = set(search_results['id'].head(2).values)
        true_ids = {row['id_a'], row['id_b']}

        # 3. Count the case only when both ids match.
        if predicted_ids == true_ids:
            perfect_matches += 1

    accuracy = perfect_matches / total_cases

    print("\n" + "="*30)
    print("【最終評価結果】")
    print(f"完全一致数: {perfect_matches} / {total_cases}")
    print(f"正解率 (Accuracy): {accuracy:.2%}")
    print("="*30)

# Run the evaluation on the practice data.
calculate_final_accuracy(practice_df)

Total cases: 20
Evaluating...

【最終評価結果】
完全一致数: 6 / 20
正解率 (Accuracy): 30.00%


### 追加分析

In [11]:
# --- Find the cases where only one of the two source works was retrieved ---

def analyze_error_patterns(df):
    """Classify every case by how many true ids appear in the top two results.

    Args:
        df: Cases with ``story``, ``id_a`` and ``id_b`` columns.

    Returns:
        A DataFrame with one row per case: ``practice_idx`` (the index label
        of the case), ``match_type`` ('Perfect', 'Partial' or 'Zero'),
        ``found_ids``, ``missed_ids`` and ``top_10_recall`` (number of true
        ids within the top ten results).
    """
    label_by_hits = {2: 'Perfect', 1: 'Partial'}
    records = []

    for idx, case in df.iterrows():
        ranked_ids = hybrid_window_search(case['story'])['id']
        truth = {case['id_a'], case['id_b']}

        # True ids found among the top two results.
        hits = set(ranked_ids.head(2).values).intersection(truth)
        # True ids found among the top ten results.
        hits_in_top_10 = set(ranked_ids.head(10).values).intersection(truth)

        records.append({
            'practice_idx': idx,
            'match_type': label_by_hits.get(len(hits), 'Zero'),
            'found_ids': list(hits),
            'missed_ids': list(truth - hits),
            'top_10_recall': len(hits_in_top_10),
        })

    return pd.DataFrame(records)

In [12]:
# Run the error analysis on the practice data and print how many cases fall
# into each match type.
error_analysis_df = analyze_error_patterns(practice_df)
print("--- マッチングタイプの分布 ---")
print(error_analysis_df['match_type'].value_counts())

--- マッチングタイプの分布 ---
match_type
Partial    10
Perfect     6
Zero        4
Name: count, dtype: int64


In [13]:
# Keep one subset of the analysis frame per match type.
partial_cases = error_analysis_df.loc[error_analysis_df['match_type'].eq('Partial')]
Perfect_cases = error_analysis_df.loc[error_analysis_df['match_type'].eq('Perfect')]
Zero_cases = error_analysis_df.loc[error_analysis_df['match_type'].eq('Zero')]

In [14]:
# Show which columns are available on the partial-match subset.
partial_cases.columns

Index(['practice_idx', 'match_type', 'found_ids', 'missed_ids',
       'top_10_recall'],
      dtype='object')

In [15]:
# Work on a copy of the practice data with the match type attached.
# `.values` attaches the labels by position, ignoring index alignment.
df_analysis = practice_df.assign(match_type=error_analysis_df['match_type'].values)

In [17]:
# Compare story length (number of characters) across match types.
df_analysis['story_len'] = df_analysis['story'].str.len()
print("--- マッチタイプ別の平均文字数 ---")
df_analysis.groupby('match_type')['story_len'].agg(['mean', 'std', 'count'])

--- マッチタイプ別の平均文字数 ---


Unnamed: 0_level_0,mean,std,count
match_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Partial,358.0,35.724253,10
Perfect,341.333333,25.437505,6
Zero,330.0,28.401878,4


In [19]:
# 2. Keyword retention rate (hypothesis: the more source keywords survive in
#    the story, the more likely the case is a Perfect match?)
def get_retention_rate(row):
    """Return the mean share of source keywords that occur in ``row['story']``.

    For each of the two source works (``id_a`` and ``id_b``) the keyword
    collection is read from ``base_df['keywords']`` and turned into a set; the
    share of those keywords that occur as substrings of the story is computed
    (0 when the set is empty), and the two shares are averaged.
    """
    story_text = row['story']

    def _keyword_set(work_id):
        # Keywords of the base work with this id, de-duplicated.
        return set(base_df.loc[base_df['id'] == work_id, 'keywords'].iloc[0])

    def _share_present(vocab):
        if not vocab:
            return 0
        hits = sum(1 for word in vocab if word in story_text)
        return hits / len(vocab)

    # Both lookups happen before any matching.
    vocab_a = _keyword_set(row['id_a'])
    vocab_b = _keyword_set(row['id_b'])

    return (_share_present(vocab_a) + _share_present(vocab_b)) / 2

# Create the `keywords` column when base_df does not have one yet (it is
# assumed to have been created by earlier code otherwise).
# NOTE: this adds the column to base_df in place, and it must run before the
# apply below because get_retention_rate reads base_df['keywords'].
if 'keywords' not in base_df.columns:
    base_df['keywords'] = base_df['story'].apply(extract_nouns)

# Average keyword retention rate for every analysed case (row-wise apply).
df_analysis['avg_retention'] = df_analysis.apply(get_retention_rate, axis=1)

In [22]:
# Mean keyword retention per match type, scaled to a percentage.
print("\n--- マッチタイプ別のキーワード残存率(%) ---")
df_analysis.groupby('match_type')['avg_retention'].mean()*100


--- マッチタイプ別のキーワード残存率(%) ---


match_type
Partial    12.255139
Perfect    11.359562
Zero        9.447753
Name: avg_retention, dtype: float64

In [25]:
# Flatten the per-case lists of missed ids into a single list.
missed_ids = [
    missed_id
    for case_missed in error_analysis_df['missed_ids']
    for missed_id in case_missed
]

if missed_ids:
    print("\n--- 見逃された回数が多いベース作品 ID ---")
    # value_counts() sorts by frequency, so the first five index entries are
    # the most frequently missed ids. The table itself is shown in base_df
    # row order.
    missed_series = pd.Series(missed_ids).value_counts()
    most_missed = missed_series.index[:5]
    display(base_df.loc[base_df['id'].isin(most_missed), ['id', 'title']])


--- 見逃された回数が多いベース作品 ID ---


Unnamed: 0,id,title
1,2,マトリックス
4,5,インターステラー
22,23,フルメタル・ジャケット
23,24,マッドマックス 怒りのデス・ロード
34,35,七人の侍


### LLMを導入してリランキングを行う

In [26]:
!pip install -U google-generativeai

Collecting google-generativeai
  Downloading google_generativeai-0.8.6-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.29.0-py3-none-any.whl.metadata (3.3 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.189.0-py3-none-any.whl.metadata (7.0 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Downloading google_auth-2.48.0-py3-none-any.whl.metadata (6.2 kB)
  Downloading google_auth-2.49.0.dev0-py3-none-any.whl.metadata (6.0 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading proto_plus-1.27.1-py3-none-any.whl.metadata (2.2 kB)
Collecting googleapis-common-protos<2.0.0,>=1.56.2 (from google-api-core->google-generativea


[notice] A new release of pip is available: 25.0.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [42]:
import google.generativeai as genai
import time
from google.api_core import exceptions

In [50]:
# Read the Gemini API key from the environment instead of embedding it in the
# notebook. SECURITY: the key that was previously hardcoded in this cell has
# been exposed and should be revoked and replaced.
import os

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError("Set the GEMINI_API_KEY environment variable before running this cell.")

genai.configure(api_key=GEMINI_API_KEY)
model_gemini = genai.GenerativeModel('models/gemini-1.5-flash')  # fast model suited to the free tier

In [51]:
# 利用可能なモデルの一覧を表示
#print("--- 利用可能なモデル一覧 ---")
#for m in genai.list_models():
#    if 'generateContent' in m.supported_generation_methods:
#        print(m.name)

In [52]:
def gemini_reranker_minimal(query_story, candidates_df):
    """Ask Gemini which two candidate works were combined into ``query_story``.

    The prompt is kept deliberately small to save tokens: each candidate is
    listed with its id, title and the first 50 characters of its story, and
    only the first 200 characters of the query story are sent.

    Args:
        query_story: Text of the story to explain.
        candidates_df: Candidate works with ``id``, ``title`` and ``story``
            columns. The fallback takes the first two rows, so the frame is
            expected to be ordered best-first.

    Returns:
        A list of two ids from ``candidates_df``. If the request fails, or
        the reply does not name two distinct candidate ids, the first two ids
        of ``candidates_df`` are returned. A failed request also emits a
        RuntimeWarning, so a silent fallback cannot be mistaken for a rerank.
    """
    import warnings

    candidate_ids = candidates_df['id'].tolist()
    fallback_ids = candidate_ids[:2]

    # One short line per candidate: id, title and a 50-character excerpt.
    candidate_list = "".join(
        f"ID:{row['id']}, タイトル:{row['title']}, 概要:{row['story'][:50]}...\n"
        for _, row in candidates_df.iterrows()
    )

    # Short prompt; mixing in English can lower the token count.
    prompt = f"""
Combine 2 movies from the list to make this story. Return ONLY 2 IDs.
Story: {query_story[:200]}
List:
{candidate_list}
Answer: (ID1, ID2)
"""

    try:
        response = model_gemini.generate_content(prompt)
        reply_text = response.text
    except Exception as exc:
        # Fall back immediately (e.g. on HTTP 429) rather than waiting, but
        # report the failure.
        warnings.warn(
            f"Gemini rerank failed; using retrieval order instead: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return fallback_ids

    # Accept only numbers that are candidate ids, once each. This drops the
    # digits of an echoed "ID1"/"ID2", duplicates and invented ids.
    allowed_ids = set(candidate_ids)
    picked = []
    for token in re.findall(r'\d+', reply_text):
        candidate_id = int(token)
        if candidate_id in allowed_ids and candidate_id not in picked:
            picked.append(candidate_id)

    return picked[:2] if len(picked) >= 2 else fallback_ids

In [53]:
def run_ultra_stable_eval(df, top_n=10, sleep_seconds=60):
    """Evaluate retrieval plus Gemini reranking with a pause between requests.

    For every case the top ``top_n`` results of ``hybrid_window_search`` are
    passed to ``gemini_reranker_minimal``, and the returned pair is compared
    with the true pair regardless of order.

    Args:
        df: Cases with ``story``, ``id_a`` and ``id_b`` columns.
        top_n: Number of retrieved candidates handed to the reranker.
        sleep_seconds: Pause after every case except the last, used to stay
            within the free-tier rate limit. The default keeps the original
            60 seconds; 0 disables the pause.

    Returns:
        A DataFrame with one row per case: ``idx`` (the index label of the
        case) and ``is_perfect``. It is empty when ``df`` is empty.
    """
    total_cases = len(df)

    print("Starting Ultra-Stable Evaluation (1 request per minute)...")
    if total_cases == 0:
        # Nothing to score; the accuracy below would divide by zero.
        print("No cases to evaluate.")
        return pd.DataFrame(columns=['idx', 'is_perfect'])

    perfect_matches = 0
    results = []

    for position, (idx, row) in enumerate(tqdm(df.iterrows(), total=total_cases)):
        # Candidate retrieval.
        search_candidates = hybrid_window_search(row['story']).head(top_n)

        # Reranking.
        pred_ids = gemini_reranker_minimal(row['story'], search_candidates)

        # Scoring: the pair must match regardless of order.
        is_perfect = set(pred_ids) == {row['id_a'], row['id_b']}
        if is_perfect:
            perfect_matches += 1

        results.append({'idx': idx, 'is_perfect': is_perfect})

        # Pause after every case except the last. The loop position is used,
        # not the index label, so frames whose index is not 0..n-1 pause
        # correctly.
        if sleep_seconds > 0 and position < total_cases - 1:
            time.sleep(sleep_seconds)

    accuracy = perfect_matches / total_cases
    print(f"\n最終正解率: {accuracy:.2%} ({perfect_matches}/{total_cases})")
    return pd.DataFrame(results)

In [54]:
# Evaluate the Gemini reranker on the practice data and display the per-case
# results (the call pauses between requests, so it takes a while).
eval_results_df = run_ultra_stable_eval(practice_df)
eval_results_df

Starting Ultra-Stable Evaluation (1 request per minute)...


100%|██████████| 20/20 [19:04<00:00, 57.24s/it]


最終正解率: 30.00% (6/20)





Unnamed: 0,idx,is_perfect
0,0,False
1,1,False
2,2,True
3,3,False
4,4,False
5,5,True
6,6,True
7,7,False
8,8,False
9,9,False
