In [1]:
import os

# Turn off NCCL peer-to-peer and InfiniBand transports for this process.
# NOTE(review): kept ahead of the ML imports as in the original; presumably
# these must be set before NCCL is initialised — confirm for your setup.
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

from datasets import load_dataset
from huggingface_hub import login
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    DefaultDataCollator,
    Trainer,
    TrainingArguments,
)

# Never hardcode credentials in a notebook: they end up in version control and
# in shared copies. Read the Hub token from the HF_TOKEN environment variable;
# when it is unset, `token=None` makes `login` ask for it interactively.
login(token=os.environ.get("HF_TOKEN"))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the first 100 SQuAD training examples and hold out 20% of them for
# evaluation. The split is seeded so that the sample shown below (and every
# downstream result) is the same after Restart Kernel -> Run All.
#
# Fields of a SQuAD example:
# id       : unique ID of the question
# title    : title of the document the question belongs to
# context  : passage that contains the answer
# question : the question
# answers  : answer annotation
#   - answer_start : start position of the answer inside the context
#   - text         : the answer text
squad = load_dataset("squad", split="train[:100]")
squad = squad.train_test_split(test_size=0.2, seed=42)
squad_0 = squad['train'][0]
squad_0

{'id': '5733b0fb4776f41900661043',
 'title': 'University_of_Notre_Dame',
 'context': "Father Joseph Carrier, C.S.C. was Director of the Science Museum and the Library and Professor of Chemistry and Physics until 1874. Carrier taught that scientific research and its promise for progress were not antagonistic to the ideals of intellectual and moral culture endorsed by the Church. One of Carrier's students was Father John Augustine Zahm (1851–1921) who was made Professor and Co-Director of the Science Department at age 23 and by 1900 was a nationally prominent scientist and naturalist. Zahm was active in the Catholic Summer School movement, which introduced Catholic laity to contemporary intellectual issues. His book Evolution and Dogma (1896) defended certain aspects of evolutionary theory as true, and argued, moreover, that even the great Church teachers Thomas Aquinas and Augustine taught something like it. The intervention of Irish American Catholics in Rome prevented Zahm's censure b

In [3]:
# Load the tokenizer for the same DistilBERT checkpoint that is fine-tuned later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs for inspection.

    Args:
        examples: batch mapping with "question", "context" and "answers"
            columns (one list entry per example).

    Returns:
        A plain dict holding every field produced by the tokenizer, followed
        by "offset_mapping" and the untouched "answers" column.
    """
    # Strip surrounding whitespace from every question.
    stripped_questions = list(map(str.strip, examples["question"]))

    encoded = tokenizer(
        stripped_questions,
        examples["context"],
        padding="max_length",
        truncation="only_second",  # only the second text (the context) is truncated
        max_length=384,
        return_offsets_mapping=True,  # keep each token's character span
    )

    # Pull the offsets out and append them again so they follow the other
    # tokenizer fields in the returned mapping.
    token_offsets = encoded.pop("offset_mapping")
    features = dict(encoded)
    features["offset_mapping"] = token_offsets
    features["answers"] = examples["answers"]
    return features

# Apply the preprocessing in batches. `remove_columns` is intentionally not
# passed here: the raw context/question/answers columns are kept so they can
# be printed next to the tokenizer output in the following cell.
tokenized_dataset = squad.map(
    preprocess_function,
    batched=True,  # process several examples per call
)

Map: 100%|██████████| 80/80 [00:00<00:00, 1664.40 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 1560.15 examples/s]


In [4]:
# Inspect the first preprocessed training example.
# The row is fetched once; indexing by row first avoids reading an entire
# column of the split for every single print below.
first_example = tokenized_dataset['train'][0]

print('Context length:', len(first_example['context']))
print('Input IDs length:', len(first_example['input_ids']))
print('Attention mask length:', len(first_example['attention_mask']))
print('Offset mapping length:', len(first_example['offset_mapping']))
# `answers` is a dict, so this is its number of keys, not the number of answers.
print('Answers length:', len(first_example['answers']))
print('*'*50)
print('Context:', first_example['context'])
# input ids: the id of each token in the tokenizer's vocabulary.
print('Input IDs:', first_example['input_ids'])
# attention mask: marks which tokens should actually be processed (1) and
# which should be ignored (0). Short inputs are filled with PAD tokens to reach
# the fixed input length, and the mask tells the model to ignore them.
print('Attention mask:', first_example['attention_mask'])
# offset mapping: the (start, end) position in the original text that each
# token produced by the tokenizer came from.
print('Offset mapping:', first_example['offset_mapping'])
print('Answers:', first_example['answers'])
print('Question:', first_example['question'])

Context length: 1033
Input IDs length: 384
Attention mask length: 384
Offset mapping length: 384
Answers length: 2
**************************************************
Context: Father Joseph Carrier, C.S.C. was Director of the Science Museum and the Library and Professor of Chemistry and Physics until 1874. Carrier taught that scientific research and its promise for progress were not antagonistic to the ideals of intellectual and moral culture endorsed by the Church. One of Carrier's students was Father John Augustine Zahm (1851–1921) who was made Professor and Co-Director of the Science Department at age 23 and by 1900 was a nationally prominent scientist and naturalist. Zahm was active in the Catholic Summer School movement, which introduced Catholic laity to contemporary intellectual issues. His book Evolution and Dogma (1896) defended certain aspects of evolutionary theory as true, and argued, moreover, that even the great Church teachers Thomas Aquinas and Augustine taught something

In [5]:
# Small demo: show which slice of the original text every token maps to.
context = "In January 2013, Destiny's Child released Love Songs"
encoding = tokenizer(context, return_offsets_mapping=True)
# Convert the ids back to tokens so special tokens are listed as well and the
# token list lines up one-to-one with the offsets.
tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'])
offsets = encoding['offset_mapping']

print(f"{'Index':<5} {'Token':<12} {'Offset':<15} {'Matched Text'}")
print("-" * 50)
for i, (tok, (start, end)) in enumerate(zip(tokens, offsets)):
    # Pad the offset to the same 15-character width as the header column so
    # the "Matched Text" column stays aligned.
    offset_text = f"({start}, {end})"
    matched_text = context[start:end]
    print(f"{i:<5} {tok:<12} {offset_text:<15} '{matched_text}'")

Index Token        Offset          Matched Text
--------------------------------------------------
0     [CLS]        (0, 0)   ''
1     in           (0, 2)   'In'
2     january      (3, 10)   'January'
3     2013         (11, 15)   '2013'
4     ,            (15, 16)   ','
5     destiny      (17, 24)   'Destiny'
6     '            (24, 25)   '''
7     s            (25, 26)   's'
8     child        (27, 32)   'Child'
9     released     (33, 41)   'released'
10    love         (42, 46)   'Love'
11    songs        (47, 52)   'Songs'
12    [SEP]        (0, 0)   ''


In [6]:
# Preprocessing: tokenize question/context pairs and attach answer token labels.
def _find_context_span(sequence_ids):
    """Return (first, last) token indices of the run whose sequence id is 1."""
    first = 0
    while sequence_ids[first] != 1:
        first += 1
    last = first
    while last + 1 < len(sequence_ids) and sequence_ids[last + 1] == 1:
        last += 1
    return first, last


def _locate_answer_tokens(offsets, context_start, context_end, start_char, end_char):
    """Map an answer's character span [start_char, end_char) to token indices.

    Returns (0, 0) unless the answer lies completely inside the context tokens
    `context_start..context_end`.
    """
    # The answer must be fully contained in the (possibly truncated) context.
    # Checking only for "entirely outside" would let a partially truncated
    # answer through and put its end label one past the context.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Last context token that starts at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token (scanning backwards) that ends at or after the answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def preprocess_function(examples):
    """Tokenize a batch of SQuAD-style examples and compute answer labels.

    Args:
        examples: batch mapping with "question", "context" and "answers"
            columns. Each answers entry holds "answer_start" and "text"
            lists; only the first answer of each example is used.

    Returns:
        The tokenizer output (without "offset_mapping") extended with
        "start_positions" and "end_positions": the token indices of the
        answer. Examples whose answer is missing or not fully inside the kept
        context are labelled (0, 0), i.e. they point at the first token.
    """
    # Strip surrounding whitespace from the questions.
    questions = [q.strip() for q in examples["question"]]

    # Encode each question together with its context.
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,               # maximum sequence length in tokens
        truncation="only_second",    # truncate only the context when too long
        return_offsets_mapping=True, # character span of every token
        padding="max_length",        # pad every example to max_length
    )

    # The offsets are only needed to build the labels, so they are removed
    # from the returned features.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]

    start_positions = []  # answer start token index per example
    end_positions = []    # answer end token index per example

    for i, offsets in enumerate(offset_mapping):
        answer = answers[i]

        # No annotated answer: label (0, 0) instead of failing on the [0] lookup.
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # sequence_ids tells, per token, which input text it belongs to;
        # the context tokens are the ones marked 1.
        context_start, context_end = _find_context_span(inputs.sequence_ids(i))

        start_token, end_token = _locate_answer_tokens(
            offsets, context_start, context_end, start_char, end_char
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    # Attach the label positions to the model inputs.
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Run the preprocessing over the dataset in batches and drop the original
# SQuAD columns, leaving only what preprocess_function returns.
tokenized_squad = squad.map(
    preprocess_function,
    remove_columns=squad["train"].column_names,
    batched=True,
)

Map: 100%|██████████| 80/80 [00:00<00:00, 3212.18 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 2471.45 examples/s]


In [7]:
# Load the pretrained DistilBERT checkpoint wrapped for question answering.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
# Collator used by the Trainer to assemble batches; the examples were already
# padded to a fixed length during preprocessing.
data_collator = DefaultDataCollator()

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Fine-tune the model.
# Training arguments
training_args = TrainingArguments(
    output_dir="../experiment/qa_bert",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    data_collator=data_collator,
)

train_output = trainer.train()

# Write the fine-tuned weights into `output_dir` explicitly. A later cell
# reloads the model from that directory with `from_pretrained`, and nothing
# else in this notebook puts a model there, so without this call a fresh
# Restart Kernel -> Run All has nothing (or only a stale model) to load.
# NOTE(review): with push_to_hub=True this presumably also uploads the model
# to the Hub — confirm that is intended.
trainer.save_model()

# Keep the training summary as the cell's displayed result.
train_output



Epoch,Training Loss,Validation Loss
1,No log,5.871279
2,No log,5.820331
3,No log,5.797934




TrainOutput(global_step=6, training_loss=5.8269602457682295, metrics={'train_runtime': 4.4481, 'train_samples_per_second': 53.956, 'train_steps_per_second': 1.349, 'total_flos': 23517558005760.0, 'train_loss': 5.8269602457682295, 'epoch': 3.0})

In [13]:
from transformers import pipeline

# Sanity-check the model held by the trainer on a hand-written example.
# It was fine-tuned on a 100-example slice only, so this checks that inference
# runs end to end rather than that the answer is right.
qa_pipeline = pipeline(
    "question-answering",
    model=trainer.model,
    tokenizer=tokenizer,
)
example = {
    "context": "The capital of France is Paris. It is known for the Eiffel Tower.",
    "question": "What is the capital of France?"
}
# Pass the two fields by keyword instead of handing over the dict positionally.
result = qa_pipeline(question=example["question"], context=example["context"])
print(f"Answer: {result['answer']}")

Device set to use cuda:0


Answer: Eiffel Tower.


In [15]:
# Reload the fine-tuned model from the training output directory and run the
# same example through it.
model = AutoModelForQuestionAnswering.from_pretrained("../experiment/qa_bert")
# Use the same tokenizer checkpoint id as the rest of the notebook so training
# and inference are guaranteed to share one vocabulary.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

example = {
    "context": "The capital of France is Paris. It is known for the Eiffel Tower.",
    "question": "What is the capital of France?"
}
# Keyword arguments are the documented way to pass a single question/context pair.
result = qa_pipeline(question=example["question"], context=example["context"])
print(f"Answer: {result['answer']}")

Device set to use cuda:0


Answer: Eiffel Tower.
