In [3]:
import json
import os
import datasets
import tqdm
import MeCab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import random

In [6]:
# Dataset source: https://github.com/ku-nlp/bert-based-faqir/tree/master

# Every file downloaded or written by this notebook lives under OUTPUT_DIR.
# NOTE(review): the directory name is spelled "expreiment4"; it is left as-is
# because it is a runtime path — confirm before renaming.
OUTPUT_DIR = "output/expreiment4"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Download the localgovfaq archive into OUTPUT_DIR and extract it in place.
!curl https://tulip.kuee.kyoto-u.ac.jp/localgovfaq/localgovfaq.zip > {OUTPUT_DIR}/localgovfaq.zip
!unzip {OUTPUT_DIR}/localgovfaq.zip -d {OUTPUT_DIR}

Archive:  output/expreiment4/localgovfaq.zip
   creating: output/expreiment4/localgovfaq/
  inflating: output/expreiment4/localgovfaq/testset_segmentation.txt  
  inflating: output/expreiment4/localgovfaq/testset.txt  
   creating: output/expreiment4/localgovfaq/qas/
  inflating: output/expreiment4/localgovfaq/qas/answers_in_Amagasaki.txt  
  inflating: output/expreiment4/localgovfaq/qas/questions_in_Amagasaki.txt  
   creating: output/expreiment4/localgovfaq/samples/
  inflating: output/expreiment4/localgovfaq/samples/tsubaki.txt  
  inflating: output/expreiment4/localgovfaq/samples/bert.txt  
  inflating: output/expreiment4/localgovfaq/samples/joint.txt  
  inflating: output/expreiment4/localgovfaq/README.md  


In [8]:
# Shared MeCab tagger; with "-Owakati" the parse() output is split on
# whitespace below to obtain the individual tokens.
mecab = MeCab.Tagger("-Owakati")


def tokenize_jp(text: str) -> list[str]:
    """Split Japanese text into a list of tokens using the shared MeCab tagger.

    Args:
        text: Raw, unsegmented text.

    Returns:
        The tokens obtained by splitting MeCab's output on whitespace.
    """
    return mecab.parse(text).split()

def _load_tokenized_lines(path: str) -> list:
    """Read a localgovfaq text file and return one token list per line.

    Each line is split on whitespace, its first field is dropped, the remaining
    fields are concatenated without separators, and the result is tokenized
    again with tokenize_jp.
    """
    # Explicit encoding: without it open() uses the platform locale, which can
    # break on Japanese text. Assumes the files are UTF-8 — TODO confirm.
    with open(path, "r", encoding="utf-8") as f:
        return [tokenize_jp("".join(line.split()[1:])) for line in tqdm.tqdm(f)]


# Answers and questions are loaded by the same helper; calc_recall compares
# them by position, so the order within each file is preserved.
amagasaki_answers = _load_tokenized_lines(f"{OUTPUT_DIR}/localgovfaq/qas/answers_in_Amagasaki.txt")
amagasaki_questions = _load_tokenized_lines(f"{OUTPUT_DIR}/localgovfaq/qas/questions_in_Amagasaki.txt")

# Documents are already token lists, so the analyzer is the identity function.
amagasaki_vectorizer = TfidfVectorizer(analyzer=lambda x: x)
amagasaki_vectorizer.fit(amagasaki_answers)


1786it [00:01, 1108.75it/s]
1786it [00:00, 16340.79it/s]


In [24]:
def calc_recall(vectorizer, n = 5):
    """Return recall@n on the Amagasaki question/answer lists.

    Both module-level lists (amagasaki_questions, amagasaki_answers) are
    vectorized with `vectorizer`. For question i, the answer at the same
    index i is the one that counts as correct.

    Args:
        vectorizer: Fitted object exposing transform().
        n: Cut-off; a question counts as a hit when its answer is ranked
            strictly below position n (0-based).

    Returns:
        Fraction of questions whose answer is within the top n.
    """
    answer_vecs = vectorizer.transform(amagasaki_answers)
    question_vecs = vectorizer.transform(amagasaki_questions)

    # Row i holds the answer indices for question i, most similar first.
    scores = cosine_similarity(question_vecs, answer_vecs)
    order = np.argsort(scores, axis=1)[:, ::-1]

    # Position of answer i inside the ranking of question i.
    gold_ranks = []
    for row, ranked in enumerate(order):
        gold_ranks.append(np.where(ranked == row)[0][0])
    gold_ranks = np.array(gold_ranks)

    hits = np.sum(gold_ranks < n)
    return hits / len(gold_ranks)

# Recall@5 using amagasaki_vectorizer (the vectorizer fitted on the Amagasaki answers).
recall_at_5 = calc_recall(amagasaki_vectorizer, n=5)
print(recall_at_5)

0.6114221724524076


In [26]:
# Tokenized random subset of the Japanese MIRACL corpus, cached on disk as JSON.
CORPUS_PATH = f"{OUTPUT_DIR}/miracl_subset.json"
MIRACL_SUBSET_SIZE = 2000000

# Loaded unconditionally: later cells sample from full_corpus, so it must exist
# even when the tokenized subset is served from the cache.
full_corpus = datasets.load_dataset("miracl/miracl-corpus", "ja")

if os.path.exists(CORPUS_PATH):
    with open(CORPUS_PATH, "r", encoding="utf-8") as f:
        # Same variable name as the else branch (was misspelled "miracle_subset").
        miracl_subset = json.load(f)
else:
    subset_indices = random.sample(range(len(full_corpus["train"])), MIRACL_SUBSET_SIZE)
    miracl_subset = [
        tokenize_jp(item["text"])
        for item in tqdm.tqdm(full_corpus["train"].select(subset_indices))
    ]
    # Persist the result so the cache check above can succeed next time.
    # Written to a temporary file first so an interrupted dump never leaves a
    # truncated cache behind.
    tmp_path = CORPUS_PATH + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(miracl_subset, f, ensure_ascii=False)
    os.replace(tmp_path, CORPUS_PATH)

  8%|▊         | 153028/2000000 [02:44<41:38, 739.10it/s]  

KeyboardInterrupt: 

  8%|▊         | 153028/2000000 [02:56<41:38, 739.10it/s]

In [27]:
# Recall@5 values collected per run.
# amagasaki_idf_recalls is initialised here but not filled in this cell.
amagasaki_idf_recalls = []
miracl_idf_recalls = []

for _ in tqdm.tqdm(range(1)):
    # Fit the TF-IDF weights on 100000 randomly sampled documents of
    # full_corpus["train"], then evaluate them with calc_recall.
    corpus = full_corpus["train"].select(random.sample(range(len(full_corpus["train"])), 100000))
    corpus_texts = [tokenize_jp(item["text"]) for item in corpus]
    # Documents are already token lists, so the analyzer is the identity function.
    vectorizer = TfidfVectorizer(analyzer=lambda x: x)
    vectorizer.fit(corpus_texts)

    recall_at_5 = calc_recall(vectorizer, n=5)
    miracl_idf_recalls.append(recall_at_5)

# Compute and print the mean and standard deviation
miracl_idf_recall_mean = np.mean(miracl_idf_recalls)
miracl_idf_recall_std = np.std(miracl_idf_recalls)
print(f"miracl_idf_recall: {miracl_idf_recall_mean} ± {miracl_idf_recall_std}")



In [17]:
# Draw 1000 random rows from full_corpus["train"].
# Earlier variant kept for reference (shuffle with a random seed, take the first 1000):
#corpus = full_corpus["train"].shuffle(seed=random.randint(0, 1000)).select(range(1000))
corpus = full_corpus["train"].select(random.sample(range(len(full_corpus["train"])), 1000))

In [24]:
def get_corpus(query_size, corpus_size):
    """Build one random retrieval experiment: queries plus a train and a test corpus.

    Args:
        query_size: Number of queries to sample.
        corpus_size: Number of documents in each of the two returned corpora.

    Returns:
        A tuple (ds, query_texts, train_corpus, test_corpus):
        ds is the sampled query dataset, query_texts the tokenized queries in
        the same order, train_corpus a list of {"docid", "text"} dicts that
        contains no positive passage, and test_corpus a list of the same shape
        that starts with every positive passage and is padded with other
        random documents.

    Raises:
        AssertionError: if either corpus does not end up with corpus_size entries.
    """
    # Randomly select the queries. Previously query_size was unused and ds was
    # read from whatever global happened to exist.
    # NOTE(review): assumes full_ds is the query dataset defined elsewhere in
    # the notebook and supports len()/select() like full_corpus["train"] — confirm.
    ds = full_ds.select(random.sample(range(len(full_ds)), query_size))

    positive_corpus_json = []
    query_texts = []
    positive_docids = set()
    for item in ds:
        query_texts.append(tokenize_jp(item["query"]))
        for pp in item["positive_passages"]:
            if pp["docid"] in positive_docids:
                continue
            # Record the docid so a passage shared by several queries is added
            # once (the original never updated this set).
            positive_docids.add(pp["docid"])
            positive_corpus_json.append({
                "text": tokenize_jp(pp["text"]),
                "docid": pp["docid"]
            })

    # Randomly select the remaining corpus. Over-sample by the number of
    # positives so that at least 2 * corpus_size documents survive the filter.
    max_corpus_size = corpus_size*2 + len(positive_docids)
    corpus_without_positive = full_corpus["train"].select(random.sample(range(len(full_corpus["train"])), max_corpus_size)).filter(lambda x: x["docid"] not in positive_docids)
    corpus_without_positive_json = [{"docid": doc["docid"], "text": tokenize_jp(doc["text"])} for doc in corpus_without_positive]

    # Train and test take disjoint slices of the non-positive documents.
    train_corpus = corpus_without_positive_json[:corpus_size]
    test_corpus = positive_corpus_json + corpus_without_positive_json[corpus_size:corpus_size*2-len(positive_corpus_json)]
    assert len(test_corpus) == corpus_size
    assert len(train_corpus) == corpus_size
    return ds, query_texts, train_corpus, test_corpus


In [36]:
# Look up the corpus rows for the positive passages of the first query in full_ds.
item = full_ds[0]
docids = set([x["docid"] for x in item["positive_passages"]])
# filter() is applied to the whole full_corpus["train"] split, keeping rows whose docid is in docids.
docs = full_corpus["train"].filter(lambda x: x["docid"] in docids)

Filter:   0%|          | 0/6953614 [00:00<?, ? examples/s]

In [64]:
CORPUS_SIZE = 500
QUERY_SIZE = 30


def calc_result(test_corpus, vectorizer, n = 5):
    """Return the mean per-query recall@n of the positive passages in test_corpus.

    Reads the module-level `query_texts` and `ds` produced by the most recent
    get_corpus() call; both must be in the same order.

    Args:
        test_corpus: List of {"docid", "text"} dicts to rank.
        vectorizer: Fitted object exposing transform().
        n: Cut-off; a positive counts as a hit when its 0-based rank is below n.

    Returns:
        Mean over queries of the fraction of their positives ranked within the
        top n. Queries with no positive in test_corpus are skipped; NaN is
        returned when no query can be scored.
    """
    test_matrix = vectorizer.transform([doc["text"] for doc in test_corpus])
    query_matrix = vectorizer.transform(query_texts)

    # Similarity matrix, then per-query ranking of documents (most similar first).
    similarity_matrix = cosine_similarity(query_matrix, test_matrix)
    ranking_matrix = np.argsort(similarity_matrix, axis=1)[:, ::-1]
    # Each row of ranking_matrix is a permutation, so its argsort is the
    # inverse permutation: rank_of_doc[q, j] is the rank of document j for query q.
    rank_of_doc = np.argsort(ranking_matrix, axis=1)

    test_docid2indice = {item["docid"]: i for i, item in enumerate(test_corpus)}

    query_result = []
    for item, doc_ranks in zip(ds, rank_of_doc):
        # Ranks of this query's positive passages that are present in test_corpus.
        ranks = [
            int(doc_ranks[test_docid2indice[pp["docid"]]])
            for pp in item["positive_passages"]
            if pp["docid"] in test_docid2indice
        ]
        query_result.append({
            "query_id": item["query_id"],
            "ranks": ranks
        })

    # recall@n; a query without ranks would contribute np.mean([]) == NaN and
    # poison the average, so it is skipped.
    per_query_recall = [
        np.mean([1 if rank < n else 0 for rank in item["ranks"]])
        for item in query_result
        if item["ranks"]
    ]
    if not per_query_recall:
        return float("nan")
    return np.mean(per_query_recall)


train_recalls = []
test_recalls = []
for _ in tqdm.tqdm(range(10)):
    ds, query_texts, train_corpus, test_corpus = get_corpus(QUERY_SIZE, CORPUS_SIZE)

    # Shared vocabulary so both vectorizers index tokens identically.
    full_vocabulary = set()
    for query_text in query_texts:
        full_vocabulary.update(query_text)
    for doc in train_corpus + test_corpus:
        full_vocabulary.update(doc["text"])

    # min_df is not passed: scikit-learn ignores it when a fixed vocabulary is
    # supplied, so the previous min_df=10 had no effect on the results.
    train_vectorizer = TfidfVectorizer(analyzer=lambda x: x, vocabulary=full_vocabulary)
    train_vectorizer.fit([x["text"] for x in train_corpus])

    test_vectorizer = TfidfVectorizer(analyzer=lambda x: x, vocabulary=full_vocabulary)
    test_vectorizer.fit([x["text"] for x in test_corpus])

    # Both vectorizers are evaluated on the same test corpus; only the corpus
    # the IDF weights were fitted on differs.
    train_recall = calc_result(test_corpus, train_vectorizer, 5)
    test_recall = calc_result(test_corpus, test_vectorizer, 5)

    train_recalls.append(train_recall)
    test_recalls.append(test_recall)

# Compute the mean and standard deviation
train_recall_mean = np.mean(train_recalls)
train_recall_std = np.std(train_recalls)
test_recall_mean = np.mean(test_recalls)
test_recall_std = np.std(test_recalls)
# Display
print(f"train recall@5: {train_recall_mean} ± {train_recall_std}")
print(f"test recall@5: {test_recall_mean} ± {test_recall_std}")

  0%|          | 0/10 [00:00<?, ?it/s]

Filter:   0%|          | 0/1061 [00:00<?, ? examples/s]

 10%|█         | 1/10 [00:01<00:12,  1.44s/it]

Filter:   0%|          | 0/1058 [00:00<?, ? examples/s]

 20%|██        | 2/10 [00:02<00:11,  1.49s/it]

Filter:   0%|          | 0/1058 [00:00<?, ? examples/s]

 30%|███       | 3/10 [00:04<00:10,  1.44s/it]

Filter:   0%|          | 0/1054 [00:00<?, ? examples/s]

 40%|████      | 4/10 [00:05<00:08,  1.41s/it]

Filter:   0%|          | 0/1059 [00:00<?, ? examples/s]

 50%|█████     | 5/10 [00:06<00:06,  1.35s/it]

Filter:   0%|          | 0/1064 [00:00<?, ? examples/s]

 60%|██████    | 6/10 [00:08<00:05,  1.49s/it]

Filter:   0%|          | 0/1069 [00:00<?, ? examples/s]

 70%|███████   | 7/10 [00:10<00:04,  1.49s/it]

Filter:   0%|          | 0/1050 [00:00<?, ? examples/s]

 80%|████████  | 8/10 [00:12<00:03,  1.65s/it]

Filter:   0%|          | 0/1064 [00:00<?, ? examples/s]

 90%|█████████ | 9/10 [00:13<00:01,  1.48s/it]

Filter:   0%|          | 0/1063 [00:00<?, ? examples/s]

100%|██████████| 10/10 [00:14<00:00,  1.43s/it]

train recall@5: 0.8698722943722943 ± 0.04472819471100963
test recall@5: 0.863038961038961 ± 0.03786960883767521





In [46]:
# Compare the two recall lists with a paired t-test
from scipy import stats
t, p = stats.ttest_rel(train_recalls, test_recalls)
print(f"t検定: t={t}, p={p}")

t検定: t=1.6999624307531316, p=0.12335490730783027


In [62]:
# Print the first query, then the IDF of two of its tokens under each vectorizer.
print(query_texts[0])
for token in ("サッカー", "発祥"):
    # Named token_id rather than id so the built-in id() is not shadowed for
    # the rest of the kernel session. The index taken from train_vectorizer is
    # also used for test_vectorizer, as both were built with the same vocabulary.
    token_id = train_vectorizer.vocabulary_[token]
    print(token)
    print(train_vectorizer.idf_[token_id])
    print(test_vectorizer.idf_[token_id])

['サッカー', 'の', '発祥', '地', 'は', 'どこ']
サッカー
5.830311739964975
5.270695952029551
発祥
7.2166061010848646
5.607168188650765
