### 1. Import Libraries

In [1]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments,Trainer, EarlyStoppingCallback
import numpy as np
import collections
from tqdm.auto import tqdm
from datasets import Dataset, DatasetDict
import evaluate
import wandb

### 2. Data Preprocessing

In [3]:


def read_dataset(file_path):
    """Load the QA dataset from a CSV file.

    The ``context``, ``question`` and ``answer`` columns are cast to ``str``;
    any other columns are returned as parsed by ``pd.read_csv``.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    frame = pd.read_csv(file_path)
    for column in ("context", "question", "answer"):
        frame[column] = frame[column].astype(str)
    return frame

def find_start_index(context, answer):
    """Return the character offset of the first occurrence of ``answer`` in ``context``.

    Both arguments are converted with ``str()`` before searching.
    The result is ``-1`` when the answer does not occur in the context.
    """
    haystack = str(context)
    needle = str(answer)
    return haystack.find(needle)

def prepare_dataset(df, train_ratio=0.8, val_ratio=0.1, seed=42):
    """Build train / validation / test splits in SQuAD-style format.

    Rows whose answer text cannot be located in the context are discarded.
    Each kept row becomes a record with ``context``, ``question`` and an
    ``answer`` dict of the form ``{"text": [...], "answer_start": [...]}``.
    The input frame is only read; it is not modified.

    Args:
        df: DataFrame with ``context``, ``question`` and ``answer`` columns.
        train_ratio: Fraction of the kept rows used for training.
        val_ratio: Fraction of the kept rows used for validation. The rows
            left after both draws form the test split.
        seed: ``random_state`` used for both sampling steps.

    Returns:
        A ``(train, validation, test)`` tuple of ``Dataset`` objects created
        with ``Dataset.from_pandas``.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    # Build the records directly from the columns. This avoids writing a helper
    # column into the caller's frame and also works when the frame has no rows.
    records = []
    for context, question, answer in zip(df["context"], df["question"], df["answer"]):
        start_index = find_start_index(context=context, answer=answer)
        if start_index == -1:
            continue  # answer text is not a substring of the context
        records.append(
            {
                "context": context,
                "question": question,
                "answer": {"text": [answer], "answer_start": [start_index]},
            }
        )

    dataset = pd.DataFrame(records, columns=["context", "question", "answer"])

    num_of_total_sample = len(dataset)
    num_of_train_sample = int(train_ratio * num_of_total_sample)
    num_of_val_sample = int(val_ratio * num_of_total_sample)

    # Draw the training rows first, then the validation rows from what is left;
    # whatever remains afterwards is the test split.
    train_set = dataset.sample(n=num_of_train_sample, random_state=seed)
    remaining = dataset.drop(index=train_set.index)

    val_set = remaining.sample(n=num_of_val_sample, random_state=seed)
    test_set = remaining.drop(index=val_set.index)

    return (
        Dataset.from_pandas(train_set),
        Dataset.from_pandas(val_set),
        Dataset.from_pandas(test_set),
    )

def preprocess_training_validation_examples(examples, tokenizer, max_length, stride):
    """Tokenize a batch of examples and attach answer token positions.

    Each (question, context) pair is tokenized with overflow enabled, so one
    example may yield several features. For every feature the answer's
    character span is mapped to token indices. When the answer does not lie
    completely inside the feature's context window, both labels are set to 0.

    Args:
        examples: Batch with ``question``, ``context`` and ``answer`` entries,
            where each answer holds ``text`` and ``answer_start`` lists.
        tokenizer: Callable tokenizer whose result supports ``pop`` and
            ``sequence_ids``.
        max_length: Maximum feature length passed to the tokenizer.
        stride: Overlap between consecutive windows passed to the tokenizer.

    Returns:
        The tokenizer output with ``offset_mapping`` and
        ``overflow_to_sample_mapping`` removed and ``start_positions`` /
        ``end_positions`` added.
    """
    encoded = tokenizer(
        examples["question"],
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    all_offsets = encoded.pop("offset_mapping")
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    gold_answers = examples["answer"]
    starts, ends = [], []

    for feature_idx, offsets in enumerate(all_offsets):
        gold = gold_answers[feature_to_example[feature_idx]]
        answer_begin = gold["answer_start"][0]
        answer_finish = gold["answer_start"][0] + len(gold["text"][0])
        seq_ids = encoded.sequence_ids(feature_idx)

        # Locate the first and last token that belong to the context (sequence id 1).
        ctx_first = 0
        while seq_ids[ctx_first] != 1:
            ctx_first += 1
        ctx_last = ctx_first
        while seq_ids[ctx_last] == 1:
            ctx_last += 1
        ctx_last -= 1

        answer_inside = (
            offsets[ctx_first][0] <= answer_begin
            and offsets[ctx_last][1] >= answer_finish
        )
        if not answer_inside:
            # The answer is not fully contained in this window.
            starts.append(0)
            ends.append(0)
            continue

        # Walk forward to the last token starting at or before the answer start.
        cursor = ctx_first
        while cursor <= ctx_last and offsets[cursor][0] <= answer_begin:
            cursor += 1
        starts.append(cursor - 1)

        # Walk backward to the first token ending at or after the answer end.
        cursor = ctx_last
        while cursor >= ctx_first and offsets[cursor][1] >= answer_finish:
            cursor -= 1
        ends.append(cursor + 1)

    encoded["start_positions"] = starts
    encoded["end_positions"] = ends
    return encoded

def preprocess_test_examples(examples, tokenizer, max_length, stride):
    """Tokenize a batch of examples for inference-time evaluation.

    Questions are stripped of surrounding whitespace before tokenization.
    Every produced feature records, under ``example_id``, the original
    (unstripped) question text of the example it came from. Offsets of tokens
    that are not part of the context (sequence id other than 1) are replaced
    by ``None``.

    NOTE(review): the question text doubles as the example identifier, so two
    examples sharing the same question would share an id -- confirm questions
    are unique in the data.

    Args:
        examples: Batch with ``question`` and ``context`` entries.
        tokenizer: Callable tokenizer whose result supports ``pop`` and
            ``sequence_ids``.
        max_length: Maximum feature length passed to the tokenizer.
        stride: Overlap between consecutive windows passed to the tokenizer.

    Returns:
        The tokenizer output with ``overflow_to_sample_mapping`` removed,
        masked ``offset_mapping`` and an added ``example_id`` list.
    """
    encoded = tokenizer(
        [question.strip() for question in examples["question"]],
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    ids = []

    for feature_idx in range(len(encoded["input_ids"])):
        source_idx = feature_to_example[feature_idx]
        ids.append(examples["question"][source_idx])

        # Keep offsets only for context tokens so that answer spans can never
        # be read from the question or from special tokens.
        seq_ids = encoded.sequence_ids(feature_idx)
        encoded["offset_mapping"][feature_idx] = [
            span if seq_id == 1 else None
            for seq_id, span in zip(seq_ids, encoded["offset_mapping"][feature_idx])
        ]

    encoded["example_id"] = ids
    return encoded


### 3. Fine-Tuning the Model

> Evaluation function

In [4]:

# Load the "squad" metric through the `evaluate` library; this object is later
# passed to compute_metrics as its `metric` argument.
metric = evaluate.load("squad")

def compute_metrics(start_logits, end_logits, features, examples, n_best, max_answer_length, metric):
    """Turn start/end logits into text answers and score them with ``metric``.

    For every example, all features carrying its id are scanned. The
    ``n_best`` highest start and end logits of each feature are combined, and
    the valid span with the largest summed logit becomes the prediction. An
    empty string is predicted when no valid span exists. The first two
    prediction/reference pairs are printed for inspection.

    Args:
        start_logits: Per-feature start logits, indexable by feature position.
        end_logits: Per-feature end logits, indexable by feature position.
        features: Tokenized features with ``example_id`` and ``offset_mapping``
            (``None`` for tokens outside the context).
        examples: Original examples with ``question``, ``context`` and
            ``answer``; the question text is used as the example id.
        n_best: Number of top start / end candidates considered per feature.
        max_answer_length: Maximum allowed span length, counted in tokens.
        metric: Object exposing ``compute(predictions=..., references=...)``.

    Returns:
        Whatever ``metric.compute`` returns for the collected predictions.
    """
    # Group feature positions by the example they were produced from.
    feature_positions = collections.defaultdict(list)
    for position, feature in enumerate(features):
        feature_positions[feature["example_id"]].append(position)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["question"]
        context = example["context"]
        candidates = []

        for position in feature_positions[example_id]:
            start_scores = start_logits[position]
            end_scores = end_logits[position]
            offsets = features[position]["offset_mapping"]

            # Indices of the n_best largest logits, highest first.
            top_starts = np.argsort(start_scores)[-1 : -n_best - 1 : -1].tolist()
            top_ends = np.argsort(end_scores)[-1 : -n_best - 1 : -1].tolist()

            for start_token in top_starts:
                if offsets[start_token] is None:
                    continue  # start token is not part of the context
                for end_token in top_ends:
                    if offsets[end_token] is None:
                        continue  # end token is not part of the context
                    span_tokens = end_token - start_token + 1
                    if end_token < start_token or span_tokens > max_answer_length:
                        continue  # reversed or over-long span
                    candidates.append(
                        {
                            "text": context[offsets[start_token][0] : offsets[end_token][1]],
                            "logit_score": start_scores[start_token] + end_scores[end_token],
                        }
                    )

        if candidates:
            winner = max(candidates, key=lambda candidate: candidate["logit_score"])
            prediction_text = winner["text"]
        else:
            prediction_text = ""
        predicted_answers.append({"id": example_id, "prediction_text": prediction_text})

    theoretical_answers = [
        {"id": example["question"], "answers": example["answer"]} for example in examples
    ]

    # Show at most the first two prediction/reference pairs.
    for prediction, reference in zip(predicted_answers[:2], theoretical_answers[:2]):
        print("-" * 99)
        print(f"ID: {prediction['id']}")
        print(f"Predicted Answer: {prediction['prediction_text']}")
        print(f"Correct Answers: {', '.join(reference['answers']['text'])}")
        print("-" * 99)

    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

> Function to load the model and tokenizer

In [5]:
def load_model_and_tokenizer(model_checkpoint):
    """Load the tokenizer and the question-answering model for a checkpoint.

    Args:
        model_checkpoint: Identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # The tokenizer is loaded first, then the model.
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    qa_model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
    return qa_model, loaded_tokenizer

> Declare hyperparameters

In [6]:
# Authenticate with Weights & Biases without embedding the secret in the notebook.
# With no explicit key, wandb.login() uses the WANDB_API_KEY environment variable
# or previously stored credentials, so set WANDB_API_KEY (e.g. through the
# platform's secret store) before running this cell.
# SECURITY: an API key was previously hard-coded on this line; treat it as
# leaked and revoke it in the W&B account settings.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [7]:
# --- Model and tokenization settings -------------------------------------
MODEL_CHECKPOINT = "google-bert/bert-base-multilingual-cased"
MAX_LENGTH = 512   # maximum feature length passed to the tokenizer
STRIDE = 380       # overlap between consecutive context windows

# --- Answer decoding settings (consumed by compute_metrics) ---------------
N_BEST = 180               # top start/end logits considered per feature
MAX_ANSWER_LENGTH = 2000   # maximum answer span length, in tokens

# --- Early stopping --------------------------------------------------------
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=1,
    early_stopping_threshold=0.001,
)

# --- Training arguments ----------------------------------------------------
training_args = TrainingArguments(
    # Output and reporting
    output_dir="./bert_question_answer",
    report_to="wandb",
    disable_tqdm=False,
    # Evaluate, checkpoint and log once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    # Batching
    per_device_train_batch_size=34,
    per_device_eval_batch_size=34,
    gradient_accumulation_steps=12,
    group_by_length=True,
    # Optimisation
    num_train_epochs=10,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.2,
    weight_decay=0.25,
    max_grad_norm=0.6,
    optim="adamw_hf",
    fp16=True,
    # Label columns produced by preprocess_training_validation_examples
    label_names=['start_positions', 'end_positions'],
)




> Call the function that reads the dataset

In [8]:
# Load the raw question-answering data from the CSV file.
df = read_dataset('/kaggle/input/data-final-legal-full/final_train.csv')

In [9]:
# Display the (rows, columns) shape of the loaded frame.
df.shape


(32325, 4)

> Call the `prepare_dataset` function

In [10]:
# Split the data into train / validation / test datasets.
train_set, val_set, test_set = prepare_dataset(df)

In [11]:
# Report how many examples ended up in each split.
for label, split in (
    ('tập train ', train_set),
    ('tập val ', val_set),
    ('test_test', test_set),
):
    print(f'{label}{len(split)}')

tập train 24368
tập val 3046
test_test3046


> Call the function that loads the model and tokenizer

In [12]:
# Load the model and tokenizer for the checkpoint named by MODEL_CHECKPOINT.
model, tokenizer = load_model_and_tokenizer(MODEL_CHECKPOINT)
# Move the model to the 'cuda' device (requires a GPU runtime). As the last
# expression of the cell, the returned module is also displayed.
model.to('cuda')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

> Tokenization and feature preparation

In [13]:

# Tokenize every split. Train and validation use the preprocessing that adds
# start/end labels; the test split uses the variant that keeps example ids and
# offsets for answer decoding. Each split drops ITS OWN original columns
# (previously all three reused train_set.column_names, which only worked
# because the splits happen to share the same columns).
train_dataset = train_set.map(
    lambda examples: preprocess_training_validation_examples(examples, tokenizer, MAX_LENGTH, STRIDE),
    batched=True,
    remove_columns=train_set.column_names,
)
val_dataset = val_set.map(
    lambda examples: preprocess_training_validation_examples(examples, tokenizer, MAX_LENGTH, STRIDE),
    batched=True,
    remove_columns=val_set.column_names,
)
test_dataset = test_set.map(
    lambda examples: preprocess_test_examples(examples, tokenizer, MAX_LENGTH, STRIDE),
    batched=True,
    remove_columns=test_set.column_names,
)
    

Map:   0%|          | 0/24368 [00:00<?, ? examples/s]

Map:   0%|          | 0/3046 [00:00<?, ? examples/s]

Map:   0%|          | 0/3046 [00:00<?, ? examples/s]

> Training setup (build the Trainer)

In [14]:
# Assemble the Trainer from the model, the training arguments, the tokenized
# train/validation datasets and the early-stopping callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping_callback],
)


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


> Evaluate the pre-trained model on extractive QA (before fine-tuning)

In [15]:
# Run inference on the tokenized test split; the first element of the
# prediction output holds the (start_logits, end_logits) pair.
predictions = trainer.predict(test_dataset)[0]
start_logits, end_logits = predictions

# Decode the logits into answers and score them against the references.
results = compute_metrics(
    start_logits,
    end_logits,
    test_dataset,
    test_set,
    N_BEST,
    MAX_ANSWER_LENGTH,
    metric,
)
print(results)

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


[34m[1mwandb[0m: Currently logged in as: [33mhdang1696[0m ([33mhdang1696-no-work-experience[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250212_090325-2gwi9s6n[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m./bert_question_answer[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/hdang1696-no-work-experience/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/hdang1696-no-work-experience/huggingface/runs/2gwi9s6n[0m


  0%|          | 0/3046 [00:00<?, ?it/s]

---------------------------------------------------------------------------------------------------
ID: BHXH bắt buộc và tự nguyện khác nhau thế nào?
Predicted Answer: BHXH bao gồm BHXH bắt buộc và BHXH tự nguyện, vậy, làm sao để phân biệt hai loại hình BHXH này?Bảo hiểm xã hội Việt Nam trả lời vấn đề này như sau:Về giải thích từ ngữ, tại Điều 3
Correct Answers: Bảo hiểm xã hội Việt Nam trả lời vấn đề này như sau:Về giải thích từ ngữ, tại Điều 3Luật BHXH năm 2014quy định cụ thể như sau:- BHXH bắt buộc là loại hình BHXH do Nhà nước tổ chức mà người lao động và người sử dụng lao động phải tham gia.- BHXH tự nguyện là loại hình BHXH do Nhà nước tổ chức mà người tham gia được lựa chọn mức đóng, phương thức đóng phù hợp với thu nhập của mình và Nhà nước có chính sách hỗ trợ tiền đóng BHXH để người tham gia hưởng chế độ hưu trí và tử tuất.Về đối tượng áp dụng, tại Điều 2 Luật BHXH năm 2014 quy định, người lao động thuộc đối tượng tham gia BHXH bắt buộc quy định tại Khoản 1 và Khoản 2 bao gồm