In [1]:
#%pip install evaluate
#%pip install transformers[torch]
#%pip install accelerate -U


## 1. Import thư viện

In [2]:
import collections
import os

import numpy as np
import torch
from tqdm.auto import tqdm

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import evaluate

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


  from .autonotebook import tqdm as notebook_tqdm





## 2. Setup Config

In [3]:
# Pretrained checkpoint; used below to load both the tokenizer and the model.
MODEL_NAME = "distilbert-base-uncased"
# Passed to the tokenizer as `max_length` for every (question, context) feature.
MAX_LENGTH = 384
# Passed to the tokenizer as `stride` when a long context overflows into
# several features.
STRIDE = 128

## 3. Setup Dataset

> Download dataset

In [4]:
# Dataset identifier handed to `load_dataset`.
DATASET_NAME = "squad_v2"
# Later cells read the "train" and "validation" splits of this object.
raw_datasets = load_dataset(DATASET_NAME)

> Load tokenizer and run some examples

In [5]:
# Load the tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



## 4. Tokenize dataset

In [6]:
def preprocess_training_examples(examples):
    """Tokenize a batch of training examples and attach answer-span labels.

    Each (question, context) pair is encoded with the module-level
    ``tokenizer``; only the context is truncated, and overflowing context is
    emitted as extra features, so one example can yield several features.

    Args:
        examples: A batch with the keys ``"question"``, ``"context"`` and
            ``"answers"`` (each answer has ``"text"`` and ``"answer_start"``
            lists).

    Returns:
        The tokenizer output without ``offset_mapping`` and
        ``overflow_to_sample_mapping``, extended with ``start_positions`` and
        ``end_positions`` (one token index per feature). Features whose
        example has no answer, or whose context window does not fully contain
        the first answer, are labelled ``(0, 0)``.
    """
    # Encode the whitespace-stripped questions together with their contexts.
    encoded = tokenizer(
        [question.strip() for question in examples["question"]],
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # These two entries are only needed for labelling, so take them out of
    # the encoding that is returned.
    all_offsets = encoded.pop("offset_mapping")
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    gold_answers = examples["answers"]

    start_positions, end_positions = [], []

    for feature_idx, offsets in enumerate(all_offsets):
        seq_ids = encoded.sequence_ids(feature_idx)

        # Locate the first and last token that belong to the context
        # (sequence id 1).
        ctx_first = 0
        while seq_ids[ctx_first] != 1:
            ctx_first += 1
        ctx_last = ctx_first
        while seq_ids[ctx_last] == 1:
            ctx_last += 1
        ctx_last -= 1

        # Answer of the original example this feature was cut from.
        gold = gold_answers[feature_to_example[feature_idx]]

        # Default label, kept when there is no answer or the answer is not
        # fully inside this window.
        start_label, end_label = 0, 0

        if len(gold["text"]) != 0:
            # Character span of the first answer inside the context.
            answer_begin = gold["answer_start"][0]
            answer_finish = gold["answer_start"][0] + len(gold["text"][0])

            if offsets[ctx_first][0] <= answer_begin and offsets[ctx_last][1] >= answer_finish:
                # Walk forward to the last token starting at or before the answer.
                cursor = ctx_first
                while cursor <= ctx_last and offsets[cursor][0] <= answer_begin:
                    cursor += 1
                start_label = cursor - 1

                # Walk backward to the first token ending at or after the answer.
                cursor = ctx_last
                while cursor >= ctx_first and offsets[cursor][1] >= answer_finish:
                    cursor -= 1
                end_label = cursor + 1

        start_positions.append(start_label)
        end_positions.append(end_label)

    # Attach the labels to the encoding.
    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions

    return encoded


In [7]:
# Tokenize and label the training split in batches. The raw columns are
# dropped so only the outputs of preprocess_training_examples remain.
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [8]:
# Number of raw training examples vs. number of tokenized features.
len(raw_datasets["train"]), len(train_dataset)

(130319, 131754)

## 5. Tokenize val set

In [9]:
def preprocess_validation_examples(examples):
    """Tokenize a batch of validation examples for answer post-processing.

    The tokenizer call is the same as for training (module-level
    ``tokenizer``, ``MAX_LENGTH`` and ``STRIDE``), but no labels are built.
    Instead every feature keeps the id of the example it came from, and its
    ``offset_mapping`` keeps offsets only for context tokens.

    Args:
        examples: A batch with the keys ``"question"``, ``"context"`` and
            ``"id"``.

    Returns:
        The tokenizer output without ``overflow_to_sample_mapping``, with
        ``offset_mapping`` entries of non-context tokens replaced by ``None``
        and an added ``example_id`` list (one id per feature).
    """
    # Encode the whitespace-stripped questions together with their contexts.
    encoded = tokenizer(
        [question.strip() for question in examples["question"]],
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Maps each feature back to the index of the example it was cut from.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    example_ids = []

    for feature_idx in range(len(encoded["input_ids"])):
        example_ids.append(examples["id"][feature_to_example[feature_idx]])

        seq_ids = encoded.sequence_ids(feature_idx)
        feature_offsets = encoded["offset_mapping"][feature_idx]

        # Keep offsets only for context tokens (sequence id 1); every other
        # position becomes None.
        encoded["offset_mapping"][feature_idx] = [
            span if seq_id == 1 else None
            for span, seq_id in zip(feature_offsets, seq_ids)
        ]

    encoded["example_id"] = example_ids

    return encoded

In [10]:
# Tokenize the validation split in batches. The raw columns are dropped so
# only the outputs of preprocess_validation_examples remain.
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11873/11873 [00:07<00:00, 1553.35 examples/s]


In [11]:
# Number of raw validation examples vs. number of tokenized features.
len(raw_datasets["validation"]), len(validation_dataset)

(11873, 12134)

## 6. Train model

In [12]:
# Load the MODEL_NAME checkpoint as a question-answering model.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# SECURITY: never hardcode access tokens in a notebook. The token is read from
# the HF_TOKEN environment variable instead; when it is unset, None is passed
# and the credentials cached by `huggingface-cli login` are used.
# NOTE(review): the token that used to be written here has been exposed and
# should be revoked in the Hugging Face account settings.
args = TrainingArguments(
    output_dir="distilbert-finetuned-squadv2",  # Directory where outputs are saved
    evaluation_strategy="no",  # No automatic evaluation during training
    save_strategy="epoch",  # Save a checkpoint after every epoch
    learning_rate=2e-5,  # Learning rate
    num_train_epochs=3,  # Number of training epochs
    weight_decay=0.01,  # Weight decay, to limit overfitting
    # Half precision needs a CUDA device; disable it on the CPU fallback so
    # the notebook still runs without a GPU.
    fp16=torch.cuda.is_available(),
    push_to_hub=True,  # Push the training results to the Hugging Face Hub
    hub_token=os.environ.get("HF_TOKEN"),  # Hub token, supplied via the environment
)



In [None]:
# Build the Trainer from the model, the training arguments, the tokenized
# datasets and the tokenizer.
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset= validation_dataset,
    tokenizer = tokenizer,
)

# Run fine-tuning.
trainer.train()

In [None]:
# Push the trainer's results to the Hub with a final commit message.
trainer.push_to_hub(commit_message="Training complete")

## 7. Evaluate model

In [None]:
# Load the "squad_v2" metric; compute_metrics calls `metric.compute`.
metric = evaluate.load("squad_v2")

In [None]:
N_BEST = 20  # Number of top start/end logits considered for each feature
MAX_ANS_LENGTH = 30  # Maximum length, in tokens, of a predicted answer span


def compute_metrics(start_logits, end_logits, features, examples):
    """Turn start/end logits into answers and score them with ``metric``.

    For every example, all of its features are scanned for the highest-scoring
    valid span (score = start logit + end logit). A span is valid when both
    ends lie on context tokens, the end does not precede the start, and it is
    at most ``MAX_ANS_LENGTH`` tokens long. The span is compared with the
    "no answer" score, taken from position 0 of each feature (the position
    the training labels use for unanswerable questions); the empty string is
    predicted when the no-answer score is higher.

    Args:
        start_logits: Per-feature start logits, indexable by feature index.
        end_logits: Per-feature end logits, indexable by feature index.
        features: Tokenized features; each one provides ``example_id`` and
            ``offset_mapping`` (``None`` for non-context tokens).
        examples: Original examples with ``id``, ``context`` and ``answers``.

    Returns:
        The result of ``metric.compute`` for the predicted and reference
        answers. ``metric`` is the module-level metric object loaded in an
        earlier cell.
    """
    # Map every example id to the indices of the features cut from it.
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature['example_id']].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example['id']
        context = example['context']
        best_answer = None  # Best non-empty span found so far
        null_score = None  # Lowest no-answer score over the example's features

        # Go through every feature that belongs to this example.
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]['offset_mapping']

            # Score of the "no answer" span at position 0 of this feature.
            feature_null_score = float(start_logit[0] + end_logit[0])
            if null_score is None or feature_null_score < null_score:
                null_score = feature_null_score

            # Indices of the N_BEST largest start and end logits.
            start_indexes = np.argsort(start_logit)[-1:-N_BEST-1:-1].tolist()
            end_indexes = np.argsort(end_logit)[-1:-N_BEST-1:-1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip spans that are not fully inside the context.
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip spans that end before they start or are too long.
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > MAX_ANS_LENGTH
                    ):
                        continue

                    logit_score = float(start_logit[start_index] + end_logit[end_index])
                    if best_answer is None or logit_score > best_answer['logit_score']:
                        best_answer = {
                            'text': context[offsets[start_index][0]:offsets[end_index][1]],
                            'logit_score': logit_score,
                        }

        # Predict the best span unless "no answer" scores higher.
        has_answer = best_answer is not None and (
            null_score is None or best_answer['logit_score'] >= null_score
        )
        if has_answer:
            answer_dict = {
                'id': example_id,
                'prediction_text': best_answer['text'],
                # Logits are not probabilities; report a definite answer so
                # the metric's no-answer threshold does not override it.
                'no_answer_probability': 0.0
            }
        else:
            answer_dict = {
                'id': example_id,
                'prediction_text': '',
                'no_answer_probability': 1.0
            }
        predicted_answers.append(answer_dict)

    # Build the reference answers from the original examples.
    theoretical_answers = [
        {'id': ex['id'], 'answers': ex['answers']} for ex in examples
    ]
    # Let the metric compute the scores and return its result.
    return metric.compute(
        predictions=predicted_answers,
        references=theoretical_answers
    )


In [None]:
# Run the model on the tokenized validation features; only the first element
# of the returned triple is kept.
predictions, _, _= trainer.predict(validation_dataset)

# The predictions unpack into the start logits and the end logits.
start_logits, end_logits = predictions

# Score the predictions against the raw validation examples.
results = compute_metrics(
    start_logits,
    end_logits,
    validation_dataset,
    raw_datasets["validation"]
)

results

## 8. Load model from hub

In [None]:
# The package is named `transformers`; `transformer` is not the module the
# rest of this notebook imports from.
from transformers import pipeline

PIPELINE_NAME = 'question-answering'
# Placeholder: replace with the name of the fine-tuned model on the Hugging Face Hub.
# NOTE(review): this reassigns MODEL_NAME from the config cell, so re-running
# earlier cells afterwards would load this model instead of the base checkpoint.
MODEL_NAME = '........'
pipe = pipeline(PIPELINE_NAME, model=MODEL_NAME)

> Test


In [None]:
# Sample question and context passed to the pipeline as a quick manual check.
INPUT_QUESTION = 'What is my name?'
INPUT_CONTEXT = 'My name is Han and I live in Vietnam.'
pipe(question=INPUT_QUESTION, context=INPUT_CONTEXT)

In [None]:
from 