In [1]:
import pandas as pd
df = pd.read_csv('./question_answer.csv')
df.head(5)

Unnamed: 0,Question,Context
0,"Bị sâu răng số 5 , đã đi điều trị tuỷ được một...",Chào bạn! Bạn điều trị tủy 1 năm mới thấy đau ...
1,Bệnh ung thư vú có bị di truyền không thưa Bs?,Ước tính có khoảng 5 - 10 % bệnh nhân...
2,"Em đã từng nội soi dạ dày, kết luận có túi thừ...","""Chào bạn!\nCác biến chứng túi thừa dạ dày vỡ,..."
3,"Xin chào bác sĩ, Con tôi lúc sinh đc 3kg,...",- Con đi ngoài trước hay sau khi ăn dặm mẹ?\n-...
4,"Em chào bác sĩ, khoảng 1 năm gần đây ngón tay ...",Chào bạn! nếu xương đã chụp rồi ko sao thì có ...


In [2]:
len(df.Question.unique())

210571

In [3]:
len(df.Context.unique())

239449

In [4]:
import re
def clean(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'[\n\r\t]+', ' ', text)      
    text = re.sub(r'\s{2,}', ' ', text)        
    return text.strip()


In [5]:
df['Context'] = df['Context'].map(clean)
df['Question'] = df['Question'].map(clean)


In [6]:
df.head(5)

Unnamed: 0,Question,Context
0,"Bị sâu răng số 5 , đã đi điều trị tuỷ được một...",Chào bạn! Bạn điều trị tủy 1 năm mới thấy đau ...
1,Bệnh ung thư vú có bị di truyền không thưa Bs?,Ước tính có khoảng 5 - 10 % bệnh nhân...
2,"Em đã từng nội soi dạ dày, kết luận có túi thừ...","""Chào bạn! Các biến chứng túi thừa dạ dày vỡ, ..."
3,"Xin chào bác sĩ, Con tôi lúc sinh đc 3kg,...",- Con đi ngoài trước hay sau khi ăn dặm mẹ? - ...
4,"Em chào bác sĩ, khoảng 1 năm gần đây ngón tay ...",Chào bạn! nếu xương đã chụp rồi ko sao thì có ...


In [7]:
df['context_length_word'] = df['Context'].astype(str).str.split().apply(len)
df['question_length_word'] = df['Question'].astype(str).str.split().apply(len)


In [8]:
df = df[df['context_length_word'] >= 30]
df = df[df['question_length_word'] <= 100]

In [9]:
df.describe()

Unnamed: 0,context_length_word,question_length_word
count,197379.0,197379.0
mean,70.602222,19.598331
std,65.93752,16.444316
min,30.0,1.0
25%,43.0,10.0
50%,54.0,14.0
75%,72.0,21.0
max,1966.0,100.0


In [10]:
from sentence_transformers import SentenceTransformer
from pyvi.ViTokenizer import tokenize

model = SentenceTransformer('bkai-foundation-models/vietnamese-bi-encoder', trust_remote_code=True, device="cuda")


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
def filter_by_token_length(sentences, min_len=10, max_len=256):
    filtered_sentences = []
    for sent in sentences:
        tokenized = tokenize(sent)
        token_count = len(tokenized.split())
        if min_len <= token_count <= max_len:
            filtered_sentences.append(sent.strip())
    return filtered_sentences

In [12]:
import pandas as pd
import re

def split_sentence(context):
    if pd.isna(context):
        return None
    
    context = re.sub(r'[\r\n]+', ' ', context)
    context = re.sub(r'(?<=[.!?…])(?=\S)', ' ', context)
    sentences = re.split(r'(?<=[.!?…])\s+', context.strip())
    filtered = filter_by_token_length(sentences)

    return filtered

In [13]:
# def embeddings_sentences(question, sentences):
#     if sentences:
#         q_c = [question] + sentences
#         tokenizer_sent = [tokenize(sent) for sent in q_c]
#         embeddings = model.encode(tokenizer_sent)

#         return embeddings
#     return []


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_score(embeddings):
    if embeddings is not None and len(embeddings) > 1:
        first_vector = embeddings[0].reshape(1, -1)
        rest_vectors = embeddings[1:]
        cos_scores = cosine_similarity(first_vector, rest_vectors)[0]

        return cos_scores
    return []

In [15]:
import numpy as np

def get_top_k_span(sentences, cos_scores, context, k=2):
    if cos_scores is not None and len(cos_scores) > 1:
        top_k_idx = np.argsort(cos_scores)[-k:][::-1]
        
        top_k_idx_sorted = sorted(top_k_idx)

        sentence_start = context.find(sentences[top_k_idx_sorted[0]])
        sentence_end = context.find(sentences[top_k_idx_sorted[-1]]) + len(sentences[top_k_idx_sorted[-1]])

        return sentence_start, sentence_end
    
    return -1, -1


In [16]:
# results = []

# for idx, row in df.iterrows():
#     context = row['Context']
#     question = row['Question']
#     question = re.sub(r'[\r\n]+', ' ', question)
#     sentences = split_sentence(context)
#     embeddings = embeddings_sentences(question, sentences)

#     cos_scores = cosine_similarity_score(embeddings)

#     sentence_start, sentence_end = get_top_k_span(sentences, cos_scores, context, k=2)
#     if sentence_start:
#         answer = context[sentence_start:sentence_end]
#         print(answer)
        
#     results.append(answer)

In [27]:
batch_size = 50
answers = []
sentence_starts = []
sentence_ends = []


batch_questions = []
batch_sentences = []
batch_sentence_lists = []
batch_contexts = []
batch_indices = []

In [28]:
for idx, (i, row) in enumerate(df.iterrows()):
    print(f"Processing row {idx}")
    context = row['Context']
    question = re.sub(r'[\r\n]+', ' ', row['Question'])
    sentences = split_sentence(context)

    batch_questions.append(question)
    batch_sentences.append(sentences)
    batch_sentence_lists.append([question] + sentences)
    batch_contexts.append(context)
    batch_indices.append(idx)

    # Khi đủ batch_size hoặc dòng cuối cùng
    if len(batch_questions) == batch_size or idx == len(df) - 1:
        all_inputs = []
        sentence_counts = []

        for q_and_sents in batch_sentence_lists:
            tokenized = [tokenize(sent) for sent in q_and_sents]
            all_inputs.extend(tokenized)
            sentence_counts.append(len(q_and_sents))

        # Gọi encode 1 lần cho tất cả
        embeddings = model.encode(all_inputs)

        pointer = 0
        for i in range(len(batch_questions)):
            count = sentence_counts[i]
            emb_slice = embeddings[pointer:pointer + count]
            pointer += count

            cos_scores = cosine_similarity_score(emb_slice)

            context = batch_contexts[i]
            sentences = batch_sentences[i]

            sentence_start, sentence_end = get_top_k_span(sentences, cos_scores, context, k=2)

            if sentence_start != -1 and sentence_end != -1:
                answer = context[sentence_start:sentence_end]
            else:
                answer = ""

            answers.append(answer)
            sentence_starts.append(sentence_start)
            sentence_ends.append(sentence_end)  

        # Reset batch
        batch_questions.clear()
        batch_sentences.clear()
        batch_sentence_lists.clear()
        batch_contexts.clear()
        batch_indices.clear()


Processing row 0
Processing row 1
Processing row 2
Processing row 3
Processing row 4
Processing row 5
Processing row 6
Processing row 7
Processing row 8
Processing row 9
Processing row 10
Processing row 11
Processing row 12
Processing row 13
Processing row 14
Processing row 15
Processing row 16
Processing row 17
Processing row 18
Processing row 19
Processing row 20
Processing row 21
Processing row 22
Processing row 23
Processing row 24
Processing row 25
Processing row 26
Processing row 27
Processing row 28
Processing row 29
Processing row 30
Processing row 31
Processing row 32
Processing row 33
Processing row 34
Processing row 35
Processing row 36
Processing row 37
Processing row 38
Processing row 39
Processing row 40
Processing row 41
Processing row 42
Processing row 43
Processing row 44
Processing row 45
Processing row 46
Processing row 47
Processing row 48
Processing row 49
Processing row 50
Processing row 51
Processing row 52
Processing row 53
Processing row 54
Processing row 55
Pr

In [29]:
print(len(df), len(answers), len(sentence_starts), len(sentence_ends))

197379 197379 197379 197379


In [30]:
df['Answer'] = answers
df['Answer_Start'] = sentence_starts
df['Answer_End'] = sentence_ends

In [None]:
df_to_save = df[['Question', 'Context', 'Answer', 'Answer_Start', 'Answer_End']].copy()
df_to_save.to_csv("QA_data.csv", index=False)

In [33]:
non_empty_count = df[df['Answer'].str.strip() != ''].shape[0]
print("Số dòng Answer không rỗng:", non_empty_count)


Số dòng Answer không rỗng: 159501
