In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

os.environ["WANDB_API_KEY"] = "74cc30c0b87e7e7cf070cf9e2acc2061a2203c88"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/qa-dataset/train-v2.0.json
/kaggle/input/qa-dataset/dev-v2.0.json


In [2]:
import json

train_file_path = "/kaggle/input/qa-dataset/train-v2.0.json"
with open(train_file_path, "r", encoding="utf-8") as file:
    train_data = json.load(file)

test_file_path = "/kaggle/input/qa-dataset/dev-v2.0.json"
with open(test_file_path, "r", encoding="utf-8") as file:
    test_data = json.load(file)

In [3]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, DataCollatorWithPadding, pipeline
from datasets import Dataset
from sklearn.metrics import f1_score
import torch
import pandas as pd

2024-08-10 08:32:14.385965: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-10 08:32:14.386086: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-10 08:32:14.496507: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
def prepare_data(data):
    data_list = []
    for article in data["data"]:
        for para in article['paragraphs']:
            context = para['context']
            for qa in para['qas']:
                question = qa['question']
                if qa['answers']:
                    start = qa['answers'][0]['answer_start']
                    end = start + len(qa['answers'][0]['text'])
                else:
                    start = end = 0
                data_list.append({
                    'context': context,
                    'question': question,
                    'start': start,
                    'end': end
                })
    df = pd.DataFrame(data_list)
    return Dataset.from_pandas(df)

In [5]:
def preprocess_function(examples, tokenizer):
    inputs = tokenizer(
        examples['question'],
        examples['context'],
        max_length=512,
        truncation='only_second',
        padding='max_length',
        return_offsets_mapping=True
    )

    start_positions = []
    end_positions = []

    for i, offset_mapping in enumerate(inputs['offset_mapping']):
        start = examples['start'][i]
        end = examples['end'][i]

        if start == 0 and end == 0:
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_position = next((idx for idx, pair in enumerate(offset_mapping) if pair[0] == start), None)
            end_position = next((idx for idx, pair in enumerate(offset_mapping) if pair[1] == end), None)

            if start_position is None or end_position is None:
                start_positions.append(0)
                end_positions.append(0)
            else:
                start_positions.append(start_position)
                end_positions.append(end_position)

    assert len(start_positions) == len(end_positions) == len(inputs['input_ids']), \
        "Length of start_positions, end_positions and inputs['input_ids'] must be equal"

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions

    return inputs

In [6]:
def compute_metrics(pred):
    
    start_logits, end_logits = pred.predictions
    start_preds = start_logits.argmax(axis=-1)
    end_preds = end_logits.argmax(axis=-1)
    
    start_labels = pred.label_ids[0]
    end_labels = pred.label_ids[1]
    
    start_f1 = f1_score(start_labels, start_preds, average='weighted')

    end_f1 = f1_score(end_labels, end_preds, average='weighted')

    return {
        'start_f1': start_f1,
        'end_f1': end_f1,
    }

In [7]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = AutoTokenizer.from_pretrained("nguyenvulebinh/vi-mrc-large")
model = AutoModelForQuestionAnswering.from_pretrained("nguyenvulebinh/vi-mrc-large").to(device)

tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [8]:
train_dataset = prepare_data(train_data)
tokenized_train_dataset = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)

test_dataset = prepare_data(test_data)
tokenized_test_dataset = test_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)

Map:   0%|          | 0/2976 [00:00<?, ? examples/s]

Map:   0%|          | 0/478 [00:00<?, ? examples/s]

In [9]:
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch' if tokenized_test_dataset else 'no',
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    warmup_steps=100,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="start_f1",
    greater_is_better=True,
    save_total_limit=1,
)

data_collator = DataCollatorWithPadding(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer = tokenizer,
    compute_metrics=compute_metrics
)

In [10]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mleviethung251204[0m ([33mhunglv2512[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.17.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240810_083356-v124gjws[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m./results[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/hunglv2512/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/hunglv2512/huggingface/runs/v124gjws[0m


Epoch,Training Loss,Validation Loss,Start F1,End F1
1,0.3486,0.222895,0.948895,0.961069
2,0.1816,0.229772,0.959641,0.964184
3,0.3305,0.194482,0.962473,0.969323
4,0.002,0.237367,0.962775,0.966752
5,0.0,0.243688,0.9604,0.968001
6,0.1222,0.279488,0.965565,0.968521
7,0.0241,0.299151,0.962335,0.969794
8,0.0013,0.349379,0.965565,0.971515
9,0.3827,0.340212,0.966969,0.971515
10,0.0069,0.366374,0.969408,0.971515


TrainOutput(global_step=7440, training_loss=0.11986575733302565, metrics={'train_runtime': 6658.8808, 'train_samples_per_second': 4.469, 'train_steps_per_second': 1.117, 'total_flos': 2.763832008572928e+16, 'train_loss': 0.11986575733302565, 'epoch': 10.0})

In [11]:
from huggingface_hub import login
login("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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [12]:
model.push_to_hub("HungLV2512/Vietnamese-QA-fine-tuned")

README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HungLV2512/Vietnamese-QA-fine-tuned/commit/4b36a6ecd570560853f36a2b3923e7790ea3d056', commit_message='Upload RobertaForQuestionAnswering', commit_description='', oid='4b36a6ecd570560853f36a2b3923e7790ea3d056', pr_url=None, pr_revision=None, pr_num=None)

In [13]:
tokenizer.push_to_hub("HungLV2512/Vietnamese-QA-fine-tuned")

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/HungLV2512/Vietnamese-QA-fine-tuned/commit/c2edf8d78a63ea30910b2e6a89fb20c8d1e521eb', commit_message='Upload tokenizer', commit_description='', oid='c2edf8d78a63ea30910b2e6a89fb20c8d1e521eb', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
results = trainer.evaluate()
print("Evaluation results:", results)

Evaluation results: {'eval_loss': 0.3663741946220398, 'eval_start_f1': 0.9694084855172721, 'eval_end_f1': 0.9715146562470671, 'eval_runtime': 29.9236, 'eval_samples_per_second': 15.974, 'eval_steps_per_second': 4.01, 'epoch': 10.0}


In [15]:
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)

QA_input = {
  'question': "Hùng là chuyên gia về gì?",
  'context': "Hùng Lê là một người đam mê với lĩnh vực trí tuệ nhân tạo . Anh nhận chứng chỉ Google Developer Expert năm 2020."
}

res = nlp(QA_input)
print('pipeline: {}'.format(res))

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


pipeline: {'score': 0.9999879598617554, 'start': 41, 'end': 57, 'answer': 'trí tuệ nhân tạo'}
