## 키워드 중심 정보 검색 
- BM25(Best Matching 25)는 정보 검색에 널리 사용되는 랭킹 함수이다.

In [42]:
import math
import numpy as np
from typing import List
from transformers import PreTrainedTokenizer
from collections import defaultdict


In [43]:

class BM25:
    """BM25 keyword ranker over a corpus of raw text documents.

    Documents and queries are turned into token IDs by the supplied tokenizer,
    which is called as ``tokenizer(texts, add_special_tokens=False)['input_ids']``
    with ``texts`` being a list of strings.

    Attributes:
        tokenizer: The callable used to tokenize documents and queries.
        corpus: The raw documents exactly as passed in.
        tokenized_corpus: One list of token IDs per document.
        n_docs: Number of documents.
        avg_doc_lens: Mean document length in tokens (0.0 for an empty corpus).
        idf: Mapping ``token_id -> IDF`` for every token seen in the corpus.
        term_freqs: One mapping ``token_id -> count`` per document.
    """

    def __init__(self, corpus: List[str], tokenizer: "PreTrainedTokenizer"):
        """Tokenize *corpus* and precompute the statistics BM25 needs.

        Args:
            corpus: Raw text documents (one string per document).
            tokenizer: Callable returning a mapping with an ``'input_ids'``
                entry holding one list of token IDs per input text.
        """
        self.tokenizer = tokenizer
        self.corpus = corpus

        # Skip the tokenizer call for an empty corpus so construction never
        # depends on how the tokenizer treats an empty batch.
        if len(corpus) == 0:
            self.tokenized_corpus = []
        else:
            self.tokenized_corpus = self.tokenizer(corpus, add_special_tokens=False)['input_ids']

        self.n_docs = len(self.tokenized_corpus)

        # Guard the division: an empty corpus has no average length.
        total_tokens = sum(len(doc) for doc in self.tokenized_corpus)
        self.avg_doc_lens = total_tokens / self.n_docs if self.n_docs else 0.0

        self.idf = self._calculate_idf()
        self.term_freqs = self._calculate_term_freqs()

    def _calculate_idf(self):
        """Return ``token_id -> log((N - df + 0.5) / (df + 0.5) + 1)``.

        ``N`` is the number of documents and ``df`` the number of documents
        that contain the token at least once.
        """
        doc_freqs = defaultdict(int)
        for doc in self.tokenized_corpus:
            # set() so a token repeated inside one document counts once.
            for token_id in set(doc):
                doc_freqs[token_id] += 1

        idf = defaultdict(float)
        for token_id, doc_frequency in doc_freqs.items():
            idf[token_id] = math.log(
                ((self.n_docs - doc_frequency + 0.5) / (doc_frequency + 0.5)) + 1
            )
        return idf

    def _calculate_term_freqs(self):
        """Return, for each document, a mapping ``token_id -> occurrence count``."""
        term_freqs = []
        for doc in self.tokenized_corpus:
            counts = defaultdict(int)
            for token_id in doc:
                counts[token_id] += 1
            term_freqs.append(counts)
        return term_freqs

    def get_scores(self, query: str, k1: float = 1.2, b: float = 0.75):
        """Score every document against *query*.

        Args:
            query: Raw query text; tokenized with the same tokenizer as the corpus.
            k1: Term-frequency saturation parameter.
            b: Strength of document-length normalization.

        Returns:
            A float ``numpy`` array of length ``n_docs``; entry ``i`` is the sum
            over query tokens of
            ``idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * len_i / avg_len))``.
            Repeated query tokens contribute once per occurrence.
        """
        query_ids = self.tokenizer([query], add_special_tokens=False)['input_ids'][0]
        scores = np.zeros(self.n_docs)

        # The length-normalization term depends only on the document, so it is
        # built once per query. A zero average (all documents empty) would
        # otherwise divide by zero.
        avg_len = self.avg_doc_lens
        length_norms = [
            k1 * (1 - b + b * (len(doc) / avg_len if avg_len else 0.0))
            for doc in self.tokenized_corpus
        ]

        for token_id in query_ids:
            # .get() instead of [] so unseen query tokens are not inserted
            # into the defaultdicts; the index must not grow with queries.
            idf = self.idf.get(token_id)
            if idf is None:
                continue

            for i, term_freq in enumerate(self.term_freqs):
                q_frequency = term_freq.get(token_id, 0)
                if q_frequency == 0:
                    # A document without the token contributes exactly zero.
                    continue
                scores[i] += idf * (q_frequency * (k1 + 1)) / (q_frequency + length_norms[i])

        return scores

    def get_top_k(self, query: str, k: int, k1: float = 1.2, b: float = 0.75):
        """Return the *k* best-scoring documents for *query*.

        Args:
            query: Raw query text.
            k: Number of documents to return; values above ``n_docs`` return
                every document, values ``<= 0`` return empty arrays.
            k1: Forwarded to :meth:`get_scores`.
            b: Forwarded to :meth:`get_scores`.

        Returns:
            ``(top_k_scores, top_k_indices)`` as ``numpy`` arrays ordered by
            descending score.
        """
        scores = self.get_scores(query, k1=k1, b=b)

        if k <= 0:
            # scores[-0:] would select every document, so handle this explicitly.
            top_k_indices = np.empty(0, dtype=np.intp)
        else:
            top_k_indices = np.argsort(scores)[-k:][::-1]

        top_k_scores = scores[top_k_indices]
        return top_k_scores, top_k_indices

### 데이터셋

In [44]:
from transformers import AutoTokenizer
# Load the tokenizer registered under the 'klue/roberta-base' name; it is
# reused below for both the corpus and the queries.
tokenizer = AutoTokenizer.from_pretrained('klue/roberta-base')

In [52]:
# Bare expression: the notebook displays the tokenizer's repr.
tokenizer

BertTokenizerFast(name_or_path='klue/roberta-base', vocab_size=32000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [45]:
# Four short documents used as a toy corpus to inspect the index by hand.
text_list = ['안녕하세요','반갑습니다','안녕 서울 안녕', '부산 안녕 안녕']

# Build the BM25 index over the toy corpus; the bare name displays the object.
index_bm25 = BM25(text_list,tokenizer)
index_bm25

<__main__.BM25 at 0x7a54d4296320>

In [46]:
# List the attribute names stored on the index instance.
vars(index_bm25).keys()

dict_keys(['tokenizer', 'corpus', 'tokenized_corpus', 'n_docs', 'avg_doc_lens', 'idf', 'term_freqs'])

In [47]:
# Show the token IDs the tokenizer produced for each toy document.
vars(index_bm25)['tokenized_corpus']

[[5891, 2205, 5971],
 [9927, 2219, 3606],
 [5891, 3671, 5891],
 [3902, 5891, 5891]]

In [48]:
# Inspect the precomputed statistics: document count, average document length,
# the IDF table, and the per-document term-frequency maps.
vars(index_bm25)['n_docs'], vars(index_bm25)['avg_doc_lens'], vars(index_bm25)['idf'], vars(index_bm25)['term_freqs']


(4,
 3.0,
 defaultdict(float,
             {5971: 1.2039728043259361,
              5891: 0.3566749439387324,
              2205: 1.2039728043259361,
              2219: 1.2039728043259361,
              3606: 1.2039728043259361,
              9927: 1.2039728043259361,
              3671: 1.2039728043259361,
              3902: 1.2039728043259361}),
 [defaultdict(int, {5891: 1, 2205: 1, 5971: 1}),
  defaultdict(int, {9927: 1, 2219: 1, 3606: 1}),
  defaultdict(int, {5891: 2, 3671: 1}),
  defaultdict(int, {3902: 1, 5891: 2})])

In [49]:
# Score a two-word query against the toy corpus, then fetch the 2 best documents
# (scores first, document indices second).
index_bm25.get_scores('안녕 인천'), index_bm25.get_top_k('안녕 인천',2)

(array([0.35667494, 0.        , 0.49042805, 0.49042805]),
 (array([0.49042805, 0.49042805]), array([3, 2])))

In [50]:
# Dataset viewer: https://huggingface.co/datasets/klue/klue/viewer/mrc
from datasets import load_dataset

# Load the 'train' split of the 'mrc' configuration of the 'klue' dataset.
klue_mrc_dataset = load_dataset('klue', 'mrc', split='train')

README.md:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17554 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5841 [00:00<?, ? examples/s]

In [51]:
# Rebuild the index over the dataset's 'context' column, replacing the toy index.
# The tokenizer is used only to produce token IDs for counting here.
index_bm25 = BM25(klue_mrc_dataset['context'], tokenizer)
index_bm25 

Token indices sequence length is longer than the specified maximum sequence length for this model (965 > 512). Running this sequence through the model will result in indexing errors


<__main__.BM25 at 0x7a54af826590>

In [61]:
# Alternative query, left disabled (roughly: "When will it rain a lot this year?"):
# query = '이번 연도에는 언제 비가 많이 올까 ?'
# Active query (roughly: "What did Robert Henry Dicke develop at the Massachusetts
# laboratory in 1946?").
query = '로버트 헨리 딕이 1946년에 매사추세츠 연구소에서 개발한 것은 무엇인가?'
# One BM25 score per document in the indexed corpus.
index_bm25.get_scores(query)

array([1.34057261, 7.91141135, 7.91141135, ..., 4.83792464, 0.35935409,
       0.76558927])

In [62]:
# Retrieve the 5 highest-scoring documents: scores first, document indices second.
top_scores, top_indices = index_bm25.get_top_k(query,5)
top_scores, top_indices

(array([33.54588925, 21.3944501 , 15.91892909, 15.86530076, 14.96643802]),
 array([    3,  8289,  1079, 14462, 11915]))

In [63]:
# Look the 'context' column up once instead of once per retrieved index, then
# preview the first 50 characters of each top-ranked passage.
contexts = klue_mrc_dataset['context']
[contexts[int(idx)][:50] for idx in top_indices]

['미국 세인트루이스에서 태어났고, 프린스턴 대학교에서 학사 학위를 마치고 1939년에 로체스',
 '잭슨은 영국의 컴벌랜드 카운티에서 태어나 부모가 죽은 후에, 사우스캐롤라이나의 찰스턴에 이',
 '영국과 북미 식민지 간의 관계가 악화되어 1775년 4월에 뉴잉글랜드에서 렉싱턴 콩코드 전',
 '케네디는 1962년 11월 7일 테드 케네디는 연방 상원으로 선서되었다. 그는 그가 처음 ',
 '윌리 딕슨의 블루스 천국은 미국 일리노이주 시카고에 있는 전시관 겸 공연장이다. 세계 2차']