## 1. Cài đặt và Import các thư viện cần thiết

In [None]:
import numpy as np
import collections
import torch
import faiss
import evaluate

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from tqdm.auto import tqdm

# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## 2. Tải bộ dữ liệu

In [None]:
# Dataset identifier passed to `load_dataset`.
DATASET_NAME = "squad_v2"

# The '+' in the split string concatenates the train and validation splits
# into a single dataset.
raw_datasets = load_dataset(
    DATASET_NAME,
    split='train+validation',
)
raw_datasets

## 3. Loại bỏ các mẫu không có đáp án

In [None]:
def _has_answer(example):
    """Return True when the example carries at least one answer text."""
    answer_texts = example['answers']['text']
    return len(answer_texts) > 0


# Keep only the examples that have an answer.
raw_datasets = raw_datasets.filter(_has_answer)

## 4. Khởi tạo mô hình

In [None]:
# Checkpoint used to embed questions.
MODEL_NAME = 'distilbert-base-uncased'

# Tokenizer and encoder are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the weights, then move the module to the selected device.
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)

## 5. Xây dựng hàm lấy vector embedding

In [None]:
def cls_pooling(model_output):
    """Select the hidden state at sequence position 0 for every batch item.

    Args:
        model_output: Object exposing a ``last_hidden_state`` attribute that
            supports ``[:, 0]`` indexing.

    Returns:
        ``last_hidden_state`` sliced to its first position along axis 1.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position

In [None]:
def get_embeddings(text_list):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    Args:
        text_list: A string or a list of strings; it is handed to
            ``tokenizer`` unchanged.

    Returns:
        The result of ``cls_pooling`` applied to the model output, i.e. the
        hidden state at sequence position 0 for each input.
    """
    # Pad and truncate so inputs of different lengths can share one batch of
    # PyTorch tensors.
    encoded_input = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    # The model lives on `device`, so its inputs must be moved there too.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)

    return cls_pooling(model_output)

## 6. Xây dựng vector database

In [None]:
# Name of the dataset column that stores each question's embedding vector.
EMBEDDING_COLUMN = 'question_embedding'


def _embed_question(example):
    """Return a one-key dict holding the embedding of the example's question."""
    question_embedding = get_embeddings(example['question'])
    # Drop autograd tracking, move to host memory, convert to NumPy, then take
    # row 0 of the result.
    as_numpy = question_embedding.detach().cpu().numpy()
    return {EMBEDDING_COLUMN: as_numpy[0]}


# Add the embedding column by mapping over the dataset one example at a time.
embeddings_dataset = raw_datasets.map(_embed_question)

In [None]:
# Build a FAISS index over the embedding column so it can be queried with
# `get_nearest_examples`.
embeddings_dataset.add_faiss_index(column=EMBEDDING_COLUMN)

In [None]:
input_question = 'When did Beyonce start becoming popular?'

# Embed the query with the same function that embedded the dataset questions,
# then convert it to a NumPy array for the index lookup.
input_ques_embedding = get_embeddings([input_question])
input_ques_embedding = input_ques_embedding.cpu().detach().numpy()

# Number of nearest neighbours to retrieve.
TOP_K = 5
scores, samples = embeddings_dataset.get_nearest_examples(
    EMBEDDING_COLUMN, input_ques_embedding, k=TOP_K
)

# `samples` is column-oriented: samples[column][idx] is the idx-th hit.
for idx, score in enumerate(scores):
    # Show the 1-based rank and this hit's own score (not the whole array).
    print(f'Top {idx + 1}\tScore: {score}')
    print(f'Question: {samples["question"][idx]}')
    print(f'Context: {samples["context"][idx]}')
    print()

## 7. Áp dụng mô hình hỏi - đáp để trả lời câu hỏi

In [None]:
# The package is named `transformers`; `transformer` does not provide
# `pipeline`.
from transformers import pipeline

PIPELINE_NAME = 'question-answering'
# Name of the fine-tuned model on the Hugging Face Hub. Replace the
# placeholder before running.
# NOTE: this rebinds MODEL_NAME, which earlier held the embedding checkpoint.
MODEL_NAME = '........'
pipe = pipeline(PIPELINE_NAME, model=MODEL_NAME)

In [None]:
# Run the question-answering pipeline on every retrieved (question, context)
# pair and print the result next to its retrieval score.
print(f'Input question: {input_question}')
for idx, score in enumerate(scores):
    # NOTE(review): the pipeline is asked the *retrieved* question, not
    # `input_question` — confirm this is the intended behaviour.
    question = samples["question"][idx]
    context = samples["context"][idx]
    answer = pipe(
        question=question,
        context=context,
    )
    # Show the 1-based rank and this hit's own score (not the whole array).
    print(f'Top {idx + 1}\tScore: {score}')
    print(f'Context: {context}')
    print(f'Answer: {answer}')
    print()