In [1]:
!pip -q install janome sentence-transformers tqdm

In [2]:
import pandas as pd
import numpy as np
import torch
import itertools
from janome.tokenizer import Tokenizer
from sentence_transformers import SentenceTransformer, util
from tqdm.auto import tqdm




In [3]:
torch.manual_seed(0)
np.random.seed(0)

In [4]:
base_df = pd.read_csv(r"C:\GitHub\Data-Analysis_competition\Data\base_stories.tsv" , sep='\t')
test_df  = pd.read_csv(r"C:\GitHub\Data-Analysis_competition\Data\fiction_stories_test.tsv", sep='\t')
practice_df  = pd.read_csv(r"C:\GitHub\Data-Analysis_competition\Data\fiction_stories_practice.tsv", sep='\t')

In [5]:
t = Tokenizer()

def get_keywords(text: str) -> str:
    if pd.isna(text):
        return ""
    tokens = t.tokenize(str(text))
    words = [
        token.surface for token in tokens
        if token.part_of_speech.split(",")[0] in ["名詞", "動詞", "形容詞"]
    ]
    return " ".join(words)

In [6]:
tqdm.pandas()

In [7]:
base_df["processed_story"] = base_df["story"].progress_apply(get_keywords)
test_df["processed_story"] = test_df["story"].progress_apply(get_keywords)
practice_df["processed_story"] = practice_df["story"].progress_apply(get_keywords)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/340 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device =", device)

model = SentenceTransformer("intfloat/multilingual-e5-base", device=device)

device = cpu


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [9]:
def embed_e5(text_list, prefix: str, batch_size: int = 64):
    """
    prefix: "query: " or "passage: "
    """
    texts = [prefix + (txt if isinstance(txt, str) else "") for txt in text_list]
    # normalize_embeddings=True にするとコサイン類似度計算が安定しやすいです
    emb = model.encode(
        texts,
        convert_to_tensor=True,
        batch_size=batch_size,
        show_progress_bar=True,
        normalize_embeddings=True
    )
    return emb

In [10]:
base_embeddings = embed_e5(base_df["processed_story"].tolist(), prefix="passage: ")
test_embeddings = embed_e5(test_df["processed_story"].tolist(), prefix="query: ")
practice_embeddings = embed_e5(practice_df["processed_story"].tolist(), prefix="query: ")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
base_indices = list(range(len(base_df)))
all_pairs = list(itertools.combinations(base_indices, 2))

# 2つの作品のベクトルを平均
pair_embeddings = torch.stack([
    (base_embeddings[i] + base_embeddings[j]) / 2
    for (i, j) in all_pairs
])

# normalize_embeddings=Trueでbaseを作っているので、平均後も一応正規化しておきます
pair_embeddings = torch.nn.functional.normalize(pair_embeddings, p=2, dim=1)

In [12]:
def predict_best_pairs(query_embeddings, pair_list, base_data, pair_emb):
    """
    query_embeddings: (N, D)
    pair_emb: (1225, D)
    """
    # コサイン類似度 (N, 1225)
    cos_sim = util.cos_sim(query_embeddings, pair_emb)

    best_pair_idx = torch.argmax(cos_sim, dim=1).cpu().numpy()

    preds = []
    for idx in best_pair_idx:
        i, j = pair_list[idx]
        id_a = int(base_data.iloc[i]["id"])
        id_b = int(base_data.iloc[j]["id"])
        a, b = sorted([id_a, id_b])
        preds.append((a, b))
    return preds

In [13]:
print("テストデータの予測を実行中...")
test_results = predict_best_pairs(test_embeddings, all_pairs, base_df, pair_embeddings)

submission = pd.DataFrame({
    "id": test_df["id"].astype(int),
    "id_a": [a for a, b in test_results],
    "id_b": [b for a, b in test_results],
})

テストデータの予測を実行中...


In [14]:
OUT_PATH = submission.to_csv('C:\GitHub\Data-Analysis_competition\Analysis\sigante_anime\output\output_1.csv', index=False)
submission.to_csv(OUT_PATH, index=False)
print(f"完了: {OUT_PATH}")
display(submission.head())

完了: None


  OUT_PATH = submission.to_csv('C:\GitHub\Data-Analysis_competition\Analysis\sigante_anime\output\output_1.csv', index=False)


Unnamed: 0,id,id_a,id_b
0,1,11,27
1,2,34,49
2,3,14,49
3,4,9,29
4,5,10,50


In [15]:
print("\n--- 練習データによる自己採点 ---")
practice_results = predict_best_pairs(practice_embeddings, all_pairs, base_df, pair_embeddings)

true_pairs = [
    {int(practice_df.iloc[i]["id_a"]), int(practice_df.iloc[i]["id_b"])}
    for i in range(len(practice_df))
]
pred_pairs = [set(p) for p in practice_results]

correct = sum(tp == pp for tp, pp in zip(true_pairs, pred_pairs))
acc = correct / len(practice_df)

print(f"練習データのペア完全一致率: {acc:.4f} ({correct}/{len(practice_df)})")


--- 練習データによる自己採点 ---
練習データのペア完全一致率: 0.2000 (4/20)


In [16]:
# 失敗ケースを確認（改善の糸口になります）
if correct < len(practice_df):
    miss = []
    for i, (tp, pp) in enumerate(zip(true_pairs, pred_pairs)):
        if tp != pp:
            miss.append({
                "row": i,
                "true": sorted(list(tp)),
                "pred": sorted(list(pp)),
                "story": practice_df.iloc[i]["story"][:120] + "..."
            })
    miss_df = pd.DataFrame(miss)
    print("\n--- 不一致ケース（先頭）---")
    display(miss_df.head(10))


--- 不一致ケース（先頭）---


Unnamed: 0,row,true,pred,story
0,0,"[23, 29]","[9, 29]",大都市で相次いだ爆発により通信と電力が断たれ、交通網と物流は停止する。インフラの弱点が露わに...
1,1,"[3, 35]","[3, 23]",泥に沈む前線で、崩壊寸前の共同体を守る部隊に、敵地に取り残された通信員の救出命令が下る。だが...
2,2,"[11, 43]","[11, 14]",景気後退が続き失業者が増える街で、主人公は過去の事故が原因で夢を諦め、家族とも距離を置いてい...
3,3,"[5, 19]","[9, 19]",重力異常が連鎖する未知の惑星へ、調査隊は自動運転車両で降下する。基地では「将来の映像を分析す...
4,4,"[2, 19]","[11, 19]",街中や店の天井に据え付けられた小型の撮像装置が、人々の行動を絶えず記録する社会。集まった映像...
5,7,"[18, 24]","[11, 19]",統制が行き届いた巨大な工業都市では、貧しい女性の身体が「供給源」として契約に縛られ、出産や治...
6,8,"[13, 28]","[11, 19]",管理が行き届きすぎた都市では、住民の行動や感情まで数値化され、規範に合わない者は「矯正プログ...
7,9,"[26, 30]","[9, 30]",戦乱が続く国で、都を守る評議機構は敵の動きを過小評価し、民の避難を遅らせる致命的な判断ミスを...
8,10,"[10, 41]","[28, 41]",景気の冷え込みと孤独が漂う街で、男子高校生は「強くあれ」という同調圧力に押し潰され、乱暴な衝...
9,11,"[8, 34]","[34, 35]",小さな町の図書館で働く男は、持病の悪化を隠しながら、病と死に向き合う日々を送っていた。ある日...
