# Handle Data

In [1]:
!pip install pyvi
!pip install pytrec_eval
!pip install rank_bm25

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.10 pyvi-0.1.1 sklearn-crfsui

In [2]:
import re
import numpy as np
import string
from pyvi.ViTokenizer import tokenize
def remove_stopword(text):
    filename = '/kaggle/input/law-qa/vietnamese.txt'
    with open(filename, 'r', encoding='utf-8') as file:
        list_stopwords = file.read().splitlines()
    pre_text = []
    words = text.split()
    for word in words:
        if word not in list_stopwords:
            pre_text.append(word)
    text2 = ' '.join(pre_text)

    return text2

def clean_text(text):
    text = re.sub('<.*?>', '', text).strip()        # Remove HTML tags
    text = re.sub('(\s)+', r'\1', text)            # Remove extra spaces
    return text

def normalize_text(text):
    listpunctuation = string.punctuation.replace('_', '')
    for i in listpunctuation:
        text = text.replace(i, ' ')         # Remove punctuation
    return text.lower()

def word_segment(text):
    text = tokenize(text.encode('utf-8').decode('utf-8'))
    return text

def clean_title(title):
    for i, char in enumerate(title):
        if char.isupper():
            return title[i:]
    return title

def isArticle(text):
    count = text.count('_')
    if count == 1:
        return True
    elif count == 2:
        return False
     
    return False

In [3]:
import json
import pickle

with open('/kaggle/input/law-train-evalute/chunk_id_mapping', 'rb') as f:
    idx_mapping = pickle.load(f)
with open('/kaggle/input/law-train-evalute/chunk_corpus', 'rb') as f:
    segmented_chunk_corpus = pickle.load(f)
with open('/kaggle/input/law-train-evalute/queries_relevant_chunks', 'rb') as f:
    segmented_queries_relevant_chunk = pickle.load(f)

In [4]:
no_segment_queries = []

with open("/kaggle/input/law-qa/query_set_evaluate.json", "r") as file:
    for line in file:
        data = json.loads(line)
        no_segment_queries.append(data)

In [5]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(no_segment_queries, test_size=0.2, random_state=42)

In [6]:
test_set[100]

{'query_id': '66635754c2f544363eef170a',
 'query': 'Khi nào người nghiện ma túy từ đủ 12 tuổi đến dưới 18 tháng tuổi bị đưa vào trung tâm cai nghiện bắt buộc?',
 'relevant_docs': ['73/2021/qh14_33']}

In [7]:
segmented_chunk_corpus[10]

{'doc_id': '32/2024/qh15_4',
 'chunk_id': '32/2024/qh15_4_3',
 'title': 'Điều 4 .Giải_thích từ_ngữ',
 'chunk': '3 . Can_thiệp sớm là việc Ngân_hàng Nhà_nước Việt_Nam ( sau đây gọi là Ngân_hàng Nhà_nước ) áp_dụng các yêu_cầu , biện_pháp hạn_chế đối_với tổ_chức tín_dụng , chi_nhánh ngân_hàng nước_ngoài và yêu_cầu tổ_chức tín_dụng , chi_nhánh ngân_hàng nước_ngoài đó thực_hiện phương_án khắc_phục dưới sự giám_sát của Ngân_hàng Nhà_nước nhằm khắc_phục tình_trạng theo quy_định tại khoản 1 Điều 156 của Luật này .',
 'text': 'Điều 4 .Giải_thích từ_ngữ :3 . Can_thiệp sớm là việc Ngân_hàng Nhà_nước Việt_Nam ( sau đây gọi là Ngân_hàng Nhà_nước ) áp_dụng các yêu_cầu , biện_pháp hạn_chế đối_với tổ_chức tín_dụng , chi_nhánh ngân_hàng nước_ngoài và yêu_cầu tổ_chức tín_dụng , chi_nhánh ngân_hàng nước_ngoài đó thực_hiện phương_án khắc_phục dưới sự giám_sát của Ngân_hàng Nhà_nước nhằm khắc_phục tình_trạng theo quy_định tại khoản 1 Điều 156 của Luật này .'}

In [8]:
queries = []
for query in test_set:
    if (len(query['relevant_docs']) == 1):
        queries.append(query)
        
# queries = queries[:100] # Split to test

In [9]:
query_test_relevant_chunks = []
for query in queries:
    add_record = True
    relevant_chunks = []
    
    for relevant_doc in query['relevant_docs']:
        try:
            if isArticle(relevant_doc):
                relevant_chunks += idx_mapping[relevant_doc]
            else:
                relevant_chunks.append(relevant_doc)
        except KeyError:
            add_record = False
            
    if add_record:
        query_test_relevant_chunks.append({
            'query_id' : query['query_id'],
            'query' : query['query'],
            'relevant_docs' : query['relevant_docs'],
            'relevant_chunks' : relevant_chunks
        })

In [10]:
query_test_relevant_chunks[2]

{'query_id': '66635669c2f544363eeb5d6a',
 'query': 'Các hành vi nào bị nghiêm cấm trong lĩnh vực giá, thẩm định giá đối với doanh nghiệp thẩm định giá?',
 'relevant_docs': ['16/2023/qh15_7_3'],
 'relevant_chunks': ['16/2023/qh15_7_3']}

In [11]:
def process_query(query):
#     query = clean_title(query)
    query = clean_text(query)
    query = word_segment(query)
    query = normalize_text(query)
    # query = remove_stopword()
    
    return query

In [12]:
corpus_text = [doc['text'] for doc in segmented_chunk_corpus]

In [13]:
texts = [
    [word for word in record['text'].lower().split()]
    for record in segmented_chunk_corpus
]

In [14]:
relevant_docs_list = {}

for query in query_test_relevant_chunks:
    relevant_docs_list[query['query_id']] = { rc:1 for rc in query['relevant_chunks']}

# Evaluate TF/IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf_vectorizer = TfidfVectorizer()

In [16]:
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus_text)

In [17]:
predicted_docs_list_tfidf = {}

def get_relevant_predict_tfidf(tfidf_matrix, query_tfidf, query_id, k):
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    top_k_indices = np.argsort(cosine_similarities)[::-1][:k]
    top_k_scores = cosine_similarities[top_k_indices]
    
    top_n_documents = [segmented_chunk_corpus[idx] for idx in top_k_indices]
    
    predicted_results = []
    for doc_dict, score in zip(top_n_documents, top_k_scores):
        doc_id = doc_dict['chunk_id']
        predicted_results.append(doc_id)
        
        if query_id not in predicted_docs_list_tfidf:
            predicted_docs_list_tfidf[query_id] = {}
    
        predicted_docs_list_tfidf[query_id][doc_id] = score
    
    
    return predicted_results

In [18]:
all_relevant_docs_tfidf = []

for query in query_test_relevant_chunks:
    query_text = process_query(query['query'])
    query_tfidf = tfidf_vectorizer.transform([query_text])
    predicted_results = get_relevant_predict_tfidf(tfidf_matrix, query_tfidf, query['query_id'], 10)
    all_relevant_docs_tfidf.append(predicted_results)

In [None]:
from sklearn.metrics import recall_score, precision_score
sum_recall_tfidf = 0.0
sum_pre_tfidf = 0.0

k = 10
predicted_labels = [1] * k

for query, predicted_results in zip(query_test_relevant_chunks, all_relevant_docs_tfidf):
    true_labels = [1 if doc in query['relevant_chunks'] else 0 for doc in predicted_results[:k]]
    
    recall_at_k = recall_score(true_labels, predicted_labels)
    sum_recall_tfidf += recall_at_k
    
    pre_at_k = precision_score(true_labels, predicted_labels)
    sum_pre_tfidf += pre_at_k

In [20]:
print(sum_recall_tfidf/len(query_test_relevant_chunks))
print(sum_pre_tfidf/len(query_test_relevant_chunks))

0.5943992773261066
0.1020475760313162


In [21]:
import pytrec_eval
metrics = {'map', 'recip_rank', 'ndcg'}

evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs_list, metrics)

In [22]:
results = evaluator.evaluate(predicted_docs_list_tfidf)

In [23]:
sum_measures = {measure: 0.0 for measure in metrics}

for query_id, query_measures in results.items():
    for measure, value in query_measures.items():
        sum_measures[measure] += value

average_measures = {measure: value / len(query_test_relevant_chunks) for measure, value in sum_measures.items()}

In [24]:
# k = 10
for measure, value in average_measures.items():
    print(f'{measure}: {value}')

ndcg: 0.324528139517165
map: 0.24992675742703177
recip_rank: 0.3631972823255562


# Evaluate BM25

In [25]:
from rank_bm25 import BM25Plus
bm25plus = BM25Plus(texts)

In [26]:
predicted_docs_list_bm25 = {}

def get_relevant_predict_bm25(bm25plus, query_bm25, query_id, k):
    bm25_scores = bm25plus.get_scores(query_bm25)
    top_n_indices = np.argsort(bm25_scores)[::-1][:100]
    top_n_score = bm25_scores[top_n_indices]

    top_n_documents = [segmented_chunk_corpus[idx] for idx in top_n_indices]

    predicted_results = []
    for doc_dict, score in zip(top_n_documents, top_n_score):
        doc_id = doc_dict['chunk_id']
        predicted_results.append(doc_id)
        
        if query_id not in predicted_docs_list_bm25:
            predicted_docs_list_bm25[query_id] = {}
    
        predicted_docs_list_bm25[query_id][doc_id] = score
    return predicted_results

In [27]:
all_relevant_docs_bm25 = []

for query in query_test_relevant_chunks:
    query_bm25 = process_query(query['query'])
    query_bm25 = query_bm25.split()
    predicted_results = get_relevant_predict_bm25(bm25plus, query_bm25, query['query_id'], 10)
    all_relevant_docs_bm25.append(predicted_results)

In [None]:
from sklearn.metrics import recall_score, precision_score
sum_recall = 0.0
sum_pre_bm25 = 0.0

k = 10
predicted_labels = [1] * k

for query, predicted_results in zip(query_test_relevant_chunks, all_relevant_docs_bm25):
    true_labels = [1 if doc in query['relevant_chunks'] else 0 for doc in predicted_results[:k]]
    
    recall_at_k = recall_score(true_labels, predicted_labels)
    sum_recall += recall_at_k
    
    pre_at_k = precision_score(true_labels, predicted_labels)
    sum_pre_bm25 += pre_at_k

In [29]:
# k = 10
print(sum_recall/len(query_test_relevant_chunks))
print(sum_pre_bm25/len(query_test_relevant_chunks))

0.634447455585667
0.10828063836194024


In [30]:
import pytrec_eval
metrics = {'map', 'recip_rank', 'ndcg'}

evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs_list, metrics)

In [31]:
results = evaluator.evaluate(predicted_docs_list_bm25)

In [32]:
sum_measures = {measure: 0.0 for measure in metrics}

for query_id, query_measures in results.items():
    for measure, value in query_measures.items():
        sum_measures[measure] += value

average_measures = {measure: value / len(query_test_relevant_chunks) for measure, value in sum_measures.items()}

In [33]:
# k = 10
for measure, value in average_measures.items():
    print(f'{measure}: {value}')

ndcg: 0.4380703874417419
map: 0.3077745874329615
recip_rank: 0.40507816214014436
