# Dump BM25

In [1]:
!pip install pyvi
!pip install rank_bm25

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m48.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.10 pyvi-0.1.1 sklearn-crfsui

In [2]:
from rank_bm25 import BM25Plus
import json
import pickle
import re, string
from pyvi.ViTokenizer import tokenize

In [3]:
# Stopword list (one entry per line) used when the caller does not pass its own.
_STOPWORD_FILE = '/kaggle/input/law-qa/vietnamese.txt'
# Already-loaded stopword files: path -> frozenset of entries.
_STOPWORD_CACHE = {}


def _load_stopwords(file_path=_STOPWORD_FILE):
    """Return the entries of ``file_path`` as a frozenset, reading the file only once."""
    if file_path not in _STOPWORD_CACHE:
        with open(file_path, 'r', encoding='utf-8') as file:
            _STOPWORD_CACHE[file_path] = frozenset(file.read().splitlines())
    return _STOPWORD_CACHE[file_path]


def remove_stopword(text, stopwords=None):
    """Drop stopwords from a whitespace-separated string.

    Args:
        text: String whose whitespace-separated tokens are filtered.
        stopwords: Optional iterable of tokens to remove. When omitted, the
            stopword file at ``_STOPWORD_FILE`` is used; it is read on the
            first call and cached for all later calls.

    Returns:
        The remaining tokens joined by single spaces ('' if none remain).

    Raises:
        OSError: If the default stopword file is needed but cannot be opened.
    """
    if stopwords is None:
        stopwords = _load_stopwords()
    elif not isinstance(stopwords, (set, frozenset)):
        # Hash-based membership keeps the per-token check O(1).
        stopwords = frozenset(stopwords)

    return ' '.join(word for word in text.split() if word not in stopwords)

In [4]:
def clean_text(text):
    """Remove HTML-like tags from ``text`` and collapse whitespace runs.

    Args:
        text: Input string.

    Returns:
        The string with every ``<...>`` span removed, leading/trailing
        whitespace stripped, and each run of whitespace replaced by the last
        whitespace character of that run.
    """
    # Non-greedy so each tag is removed on its own; '.' does not match a
    # newline, so a tag that spans several lines is left in place.
    text = re.sub(r'<.*?>', '', text).strip()
    # Raw strings: '\s' inside a plain literal is an invalid escape sequence
    # and makes the interpreter emit a warning when the cell is compiled.
    text = re.sub(r'(\s)+', r'\1', text)
    return text

In [5]:
def normalize_text(text):
    """Replace ASCII punctuation with spaces and lowercase the result.

    The underscore is deliberately left untouched.
    NOTE(review): presumably because the tokenizer marks multi-syllable words
    with '_' - confirm.
    """
    to_space = {ord(mark): ' ' for mark in string.punctuation if mark != '_'}
    return text.translate(to_space).lower()

In [6]:
def word_segment(text):
    """Return the result of running pyvi's ``tokenize`` on ``text``.

    The text is round-tripped through UTF-8 before being handed to the
    tokenizer, so a string that cannot be encoded as UTF-8 raises here.
    """
    utf8_text = text.encode('utf-8').decode('utf-8')
    return tokenize(utf8_text)

In [7]:
def process_record(record):
    """Turn one raw question record into an ``{'id', 'title'}`` document.

    The title is cleaned, word-segmented, normalised and stripped of
    stopwords, in that order. The id is taken from ``record['_id']['$oid']``.
    """
    raw_title = record.get('title', '')
    oid_holder = record.get('_id', '')
    # NOTE(review): the '' fallback has no .get(), so a record without '_id'
    # raises AttributeError on the next line instead of yielding an empty id.
    doc_id = oid_holder.get('$oid', '')

    segmented = word_segment(clean_text(raw_title))
    prepared = remove_stopword(normalize_text(segmented))

    return {'id': doc_id, 'title': prepared}

In [8]:
def get_docs(file_path):
    """Load a JSON array of records and convert each one with ``process_record``.

    Args:
        file_path: Path of a UTF-8 JSON file whose top level is a list of records.

    Returns:
        A list with one processed document per record that could be
        converted. Records for which ``process_record`` raises are skipped
        (best effort) and their number is reported. An empty list is
        returned when the file is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    try:
        records = json.loads(content)
    except json.JSONDecodeError as e:
        # The exception must be bound to a name; without `as e` the message
        # below raised NameError instead of reporting the decoding problem.
        print(f"Error decoding JSON: {e}")
        return []

    docs = []
    skipped = 0
    for record in records:
        try:
            docs.append(process_record(record))
        except Exception:
            # Best effort: a malformed record must not abort the whole load,
            # but keep count so the loss is visible.
            skipped += 1

    if skipped:
        print(f"Skipped {skipped} record(s) that could not be processed")

    return docs

In [9]:
# Load and preprocess every question record, then report how many documents were kept.
docs = get_docs("/kaggle/input/law-qa/lawlaboratory.questions_new.json")
print(len(docs))

65624


In [10]:
import os
def save_docs_to_file(docs, output_file):
    """Write ``docs`` to ``output_file`` as JSON Lines (one JSON object per line).

    Args:
        docs: Iterable of JSON-serialisable objects.
        output_file: Destination path. Missing parent directories are created.
    """
    parent_dir = os.path.dirname(output_file)
    # A bare file name has an empty dirname, and os.makedirs('') raises.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as file:
        for doc in docs:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(doc, file, ensure_ascii=False)
            file.write('\n')

In [11]:
# Persist the processed documents as JSON Lines in the Kaggle working directory.
save_docs_to_file(docs, '/kaggle/working/bm25/docs.jsonl')

In [12]:
# Load the stopword file into a module-level list, one entry per line.
with open("/kaggle/input/law-qa/vietnamese.txt", 'r', encoding='utf-8') as file:
    list_stopwords = file.read().splitlines()

In [13]:
# Build the BM25 corpus: one list of lowercase, non-stopword tokens per document title.
# Membership is tested against a set; testing against the raw list scanned
# the whole stopword list once for every token of every title.
_stopword_set = set(list_stopwords)
texts = [
    [word for word in record.get('title', '').lower().split() if word not in _stopword_set]
    for record in docs
]

In [14]:
# Fit the BM25+ ranker on the tokenised titles; corpus positions follow the order of `docs`.
bm25plus = BM25Plus(texts)

In [15]:
# Serialise the fitted ranker next to docs.jsonl. The target directory was
# already created when the documents were saved.
# NOTE(review): pickle files must only be loaded from trusted sources.
with open('/kaggle/working/bm25/bm25plus.pkl', 'wb') as f:
    pickle.dump(bm25plus, f)

# Embedding titles with the BKAI Vietnamese bi-encoder

In [1]:
! pip install -U sentence-transformers
! pip install pyvi
! pip install faiss-cpu

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1
Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sklea

In [2]:
import json
import pandas as pd
import numpy as np
from pyvi.ViTokenizer import tokenize

In [3]:
# Load the raw question records into a DataFrame and display its (rows, columns) shape.
df = pd.read_json('/kaggle/input/law-qa/lawlaboratory.questions_new.json')
df.shape

(65624, 9)

In [4]:
# Flatten the nested {'$oid': ...} identifier into a plain 'id' column and
# discard the original '_id' column.
df['id'] = df['_id'].apply(lambda oid_holder: oid_holder['$oid'])
df = df.drop(columns=['_id'])

In [5]:
def merge_quote(quote):
    """Render a quote dict as ``"<name>. <content>"``.

    Args:
        quote: Expected to be a dict with optional 'name' and 'content'
            entries. 'content' is normally a list of strings.

    Returns:
        The formatted string, or None when ``quote`` is not a dict. A missing
        or None 'name' is rendered as a single space. A None 'content' is
        rendered as a single space, and a missing 'content' as an empty string.
    """
    if not isinstance(quote, dict):
        return None

    name = quote.get('name', ' ')
    if name is None:
        name = ' '

    content = quote.get('content', [])
    if content is None:
        # Must be checked before joining: ' '.join(None) raises TypeError.
        content = ' '
    elif not isinstance(content, str):
        content = ' '.join(content)
    # A plain string is kept as-is; joining it would put a space between
    # every character.

    return f"{name}. {content}"


# Rows whose conclusion is an empty list get the text built from their quote
# instead (None when the quote is not a dict); all other rows keep their conclusion.
df['conclusion'] = df.apply(lambda row: merge_quote(row['quote']) if row['conclusion'] == [] else row['conclusion'], axis=1)

In [6]:
def is_invalid_conclusion(conclusion):
    """Return True when ``conclusion`` is None, a float NaN, or an empty list."""
    is_none = conclusion is None
    is_nan_float = isinstance(conclusion, float) and bool(np.isnan(conclusion))
    is_empty_list = isinstance(conclusion, list) and len(conclusion) == 0
    return is_none or is_nan_float or is_empty_list

# Keep only rows with a usable conclusion. The explicit copy makes the result
# an independent frame, so assigning new columns to it later does not trigger
# pandas' SettingWithCopyWarning.
df = df[~df['conclusion'].apply(is_invalid_conclusion)].copy()

In [7]:
def create_answer(row):
    """Build the answer text for one row.

    Args:
        row: Mapping-like object with 'conclusion', 'quote' and 'reference'.

    Returns:
        ``"<reference> <merged quote>"`` when the row has a quote that
        ``merge_quote`` can format; otherwise the conclusion (list items
        joined by spaces, missing values rendered as '').
    """
    def _is_missing(value):
        # pd.isna() works element-wise on lists, which has no single truth
        # value; containers are never treated as missing here.
        return not isinstance(value, (list, dict)) and pd.isna(value)

    conclusion_value = row['conclusion']
    if isinstance(conclusion_value, list):
        conclusion_value = ' '.join(conclusion_value)
    elif _is_missing(conclusion_value):
        conclusion_value = ''

    quote = row['quote']
    merged_quote = None if _is_missing(quote) else merge_quote(quote)
    if merged_quote is None:
        # No quote, or one merge_quote cannot format: fall back to the
        # conclusion rather than emitting the literal text "None".
        return conclusion_value

    reference = '' if _is_missing(row['reference']) else row['reference']
    return f"{reference} {merged_quote}"
    
# Compute the answer text row by row and store it in a new 'answer' column.
df['answer'] = df.apply(create_answer, axis=1)

In [8]:
# Discard every column that is no longer needed and renumber the rows from 0.
df = (
    df
    .drop(columns=['date_answer', 'field', 'description', 'reference', 'quote', 'conclusion', 'source_url'])
    .reset_index(drop=True)
)

In [9]:
# Sanity check: column summary, then five random rows.
# sample() is called without a seed, so the displayed rows differ between runs.
df.info()
df.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65624 entries, 0 to 65623
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   65624 non-null  object
 1   id      65624 non-null  object
 2   answer  65624 non-null  object
dtypes: object(3)
memory usage: 1.5+ MB


Unnamed: 0,title,id,answer
63984,Người nước ngoài đang tạm trú tại HCM thì đến ...,6663561fc2f544363eeae46b,"Dạ, thưa luật sư: Em có người bạn là người nướ..."
48362,Thời gian nghỉ hè hằng năm của nhà giáo giáo d...,66635713c2f544363eed8128,"Tại Điều 7 Thông tư 07/2017/TT-BLĐTBXH, được s..."
7910,Tiền lương đi làm ngày 30 tháng 4 và 01 tháng ...,6663571ac2f544363eedf800,Căn cứ Điều 98 Bộ luật Lao động 2019 quy định ...
6301,Tiêu chí nào để người lao động có thể kê khai ...,666357a2c2f544363ef06a5a,Căn cứ điểm b khoản 1 Điều 9 Nghị định 126/202...
1318,Học sinh nào được Nhà nước cấp học bổng khuyến...,667aa98f1766e26488471557,Tại Điều 85 Luật Giáo dục 2019 có quy định về ...


In [10]:
# Map each row position to its document id.
# NOTE(review): despite its name, `id_to_index` is keyed by position
# (index -> id), not by id. The name is kept because later cells use it.
ids = df['id'].values
id_to_index = dict(enumerate(ids))

In [11]:
import os
# Save the position -> id mapping under ./sbert, creating the directory if needed.
# NOTE(review): np.save stores a dict as a pickled object array, so reading it
# back presumably needs np.load(..., allow_pickle=True).item() - confirm in the consumer.
output_file = 'sbert/id_to_index.npy'
os.makedirs(os.path.dirname(output_file), exist_ok=True)
np.save(output_file, id_to_index)

In [12]:
def word_segment(text):
    """Return the result of running pyvi's ``tokenize`` on ``text``.

    The text is round-tripped through UTF-8 before being handed to the
    tokenizer, so a string that cannot be encoded as UTF-8 raises here.
    """
    utf8_text = text.encode('utf-8').decode('utf-8')
    return tokenize(utf8_text)

In [13]:
# Word-segment every title in place, then collect the titles in row order.
# Re-running this cell segments titles that were already segmented.
df['title'] = df['title'].apply(word_segment)
sentences = df['title'].tolist()
# NOTE(review): leftover switch, presumably for embedding only a subset - confirm or remove.
# sentences = sentences[:40000]

In [14]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by ``attention_mask``.

    Args:
        model_output: Sequence whose first element holds the token embeddings.
        attention_mask: Mask that is broadcast over the last embedding
            dimension; positions with value 0 do not contribute.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total,
        which is clamped to at least 1e-9 to avoid division by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total

In [15]:
import torch
from transformers import AutoTokenizer, AutoModel
# Load the tokenizer and encoder weights from the same checkpoint.
MODEL_NAME = 'bkai-foundation-models/vietnamese-bi-encoder'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/540M [00:00<?, ?B/s]

In [16]:
# Use the GPU when one is available, otherwise the CPU, and move the model there.
# As the last expression of the cell, model.to() also displays the module structure.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 768, padding_idx=1)
    (position_embeddings): Embedding(258, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [17]:
import faiss
# Flat (exhaustive) inner-product index sized to the encoder's hidden size.
# The vectors added later are L2-normalised first, so inner product ranks by cosine similarity.
dimension = model.config.hidden_size 
index = faiss.IndexFlatIP(dimension)

In [18]:
# Embed the titles batch by batch and add the normalised vectors to the index.
batch_size = 5000
# Ceiling division: `len(sentences) // batch_size + 1` reports one batch too
# many whenever the number of sentences is an exact multiple of batch_size.
total_batches = (len(sentences) + batch_size - 1) // batch_size

for batch_number, start_idx in enumerate(range(0, len(sentences), batch_size), start=1):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    batch_sentences = sentences[start_idx:start_idx + batch_size]

    encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt', max_length=256)
    # Inputs must live on the same device as the model.
    encoded_input = {key: value.to(device) for key, value in encoded_input.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(**encoded_input)
        embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        embeddings = embeddings.cpu().numpy()

    # Normalise in place so the inner-product index ranks by cosine similarity.
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    # Release cached GPU memory before the next batch.
    torch.cuda.empty_cache()
    print(f"Processed batch {batch_number}/{total_batches}")

Processed batch 1/14
Processed batch 2/14
Processed batch 3/14
Processed batch 4/14
Processed batch 5/14
Processed batch 6/14
Processed batch 7/14
Processed batch 8/14
Processed batch 9/14
Processed batch 10/14
Processed batch 11/14
Processed batch 12/14
Processed batch 13/14
Processed batch 14/14


In [19]:
# Sanity check: the index should hold one vector per title.
print("Number of vectors in the index:", index.ntotal)

Number of vectors in the index: 65624


In [29]:
# Persist the index under ./sbert. That directory is created by the earlier
# cell that saves id_to_index.npy, so that cell must have run first.
faiss.write_index(index, "sbert/embeddings_65k_index.index")