In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
raw_datasets = load_dataset("squad")

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [4]:
type(raw_datasets)

datasets.dataset_dict.DatasetDict

In [5]:
type(raw_datasets["train"][2:6])

dict

In [6]:
raw_datasets["train"][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [7]:
from transformers import AutoTokenizer

In [8]:
model_checkpoint = "bert-base-cased"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [10]:
context = raw_datasets["train"][0]["context"]
question = raw_datasets["train"][0]["question"]

In [11]:
inputs = tokenizer(
    question,
    context,
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True
)

In [12]:
context = raw_datasets["train"][2:6]["context"]
question = raw_datasets["train"][2:6]["question"]

In [13]:
inputs = tokenizer(
    question,
    context,
    max_length=100,
    truncation="only_second",
    stride=50,
    return_overflowing_tokens=True,
    return_offsets_mapping=True
)

In [14]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

In [15]:
answers = raw_datasets["train"][2:6]["answers"]


In [16]:
answers

[{'text': ['the Main Building'], 'answer_start': [279]},
 {'text': ['a Marian place of prayer and reflection'], 'answer_start': [381]},
 {'text': ['a golden statue of the Virgin Mary'], 'answer_start': [92]},
 {'text': ['September 1876'], 'answer_start': [248]}]

In [17]:
sample_datasets = raw_datasets["train"][2:6]
sample_datasets

{'id': ['5733be284776f41900661180',
  '5733be284776f41900661181',
  '5733be284776f4190066117e',
  '5733bf84d058e614000b61be'],
 'title': ['University_of_Notre_Dame',
  'University_of_Notre_Dame',
  'University_of_Notre_Dame',
  'University_of_Notre_Dame'],
 'context': ['Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
  'Architecturally, the school

In [18]:
start_positions = []
end_positions = []
for i, offset in enumerate(inputs["offset_mapping"]):
    sample_idx = inputs["overflow_to_sample_mapping"][i]
    answer = answers[sample_idx]
    start_char = answer["answer_start"][0]
    end_char = answer["answer_start"][0] + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)
"""
i  0 -> 18 , offset [(0, 0), (0, 3), (4, 12), (13, 15), (16, 19), (20, 26), (27, 32), (33, 35), (36, 41),...]
i [cls][question][sep][context][sep]      [cls] [sep] (0,0)
sample_idx 0000 1111 2222 3333333 và trong các feature của overflow thì có chứa các offset - các tupple là các subword từ 1 từ gốc
answer -> {'answer text'}, 'answer_start': [start_char]} x4 x4 x4 x7 tại vì có 4 câu
-> start_char  279 381 92 248
answer["text"] = ["The Main Building"], ... -> answer["text"][0] chỉ là cái str ở trong các cái list thôi -> len answer
end_char = start_char + len answer
sequece_ids -> [None,0,0,0,0,0,0,None,1,1,1,1,1,1,..,1,None] phần 0 là question phần 1 là context
"""
    # Find the start and the end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1
"""
sequence_ids[0] sẽ là ra 19 cái None, sequence_ids[idx] nó sẽ là slice qua các tokens trong cái sequence đấy và loop cho đủ tất cả các sequence của ta (ở đây là có 19 cái đấy)
idx += 1 là để ta nhích dần lên đến khi nào sequence_ids[idx] = 1 là bắt đầu phần context của sequence đấy, khi đấy cái while loop sẽ dừng và ta có context_start
còn khi mà đến token là None thì idx đấy -1 sẽ ra token cuối cùng của context
"""

     # If the answer is not fully inside the context, label is (0, 0)
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)
"""
if: nếu mà sequence đó mà thỏa mãn các điều kiện đó thì answer không nằm trong sequence đó -> label của sequence đó sẽ là (0,0)
context_start <= context_end nghĩa là while loop cho đến khi idx nhỏ hơn idx của context end (ta đi qua từng offset một cho đến cuối cái sequence, context_start và context_end chỉ là index của offset thôi, còn star/end char là inputs_ids)
- offset[idx] -> là trả lại cái offset tức cái tuple ở vị trí idx đó và offset[idx][0] là cái subword đầu tiên của cái cụm gốc đấy -> và ta loop (idx+=1) cho đến khi cái offset của chúng ta chạm đến start của answer
"""

start_positions, end_positions

IndentationError: unexpected indent (199559200.py, line 20)

In [19]:
answers = raw_datasets["train"][2:6]["answers"]
start_positions = []
end_positions = []

for i, offset in enumerate(inputs["offset_mapping"]):
    sample_idx = inputs["overflow_to_sample_mapping"][i]
    answer = answers[sample_idx]
    start_char = answer["answer_start"][0]
    end_char = answer["answer_start"][0] + len(answer["text"][0])
    sequence_ids = inputs.sequence_ids(i)

    # Find the start and end of the context
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    while sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # If the answer is not fully inside the context, label is (0, 0)
    if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
        start_positions.append(0)
        end_positions.append(0)
    else:
        # Otherwise it's the start and end token positions
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

start_positions, end_positions

"""
([83, 51, 19, 0, 0, 64, 27, 0, 34, 0, 0, 0, 67, 34, 0, 0, 0, 0, 0],
 [85, 53, 21, 0, 0, 70, 33, 0, 40, 0, 0, 0, 68, 35, 0, 0, 0, 0, 0])

đây chính là đang label cho train data của mình
nhìn theo cặp dọc nhé: (83, 85) là của sequence đầu tiên
"""

'\n([83, 51, 19, 0, 0, 64, 27, 0, 34, 0, 0, 0, 67, 34, 0, 0, 0, 0, 0],\n [85, 53, 21, 0, 0, 70, 33, 0, 40, 0, 0, 0, 68, 35, 0, 0, 0, 0, 0])\n\nđây chính là đang label cho train data của mình\nnhìn theo cặp dọc nhé: (83, 85) là của sequence đầu tiên\n'

In [20]:
idx = 0
sample_idx = inputs["overflow_to_sample_mapping"][idx]
answer = answers[sample_idx]["text"][0]

start = start_positions[idx]
end = end_positions[idx]
labeled_answer = tokenizer.decode(inputs["input_ids"][idx][start : end + 1])

print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")
"""
inputs["overflow_to_sample_mapping"] [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]
answers[sample_idx] {'text': ['the Main Building'], 'answer_start': [279]} -> answer: "the Main Building"
start -> 83
end -> 85
inputs["input_ids"][0] -> dãy ids của sequnce ở idx 0 -> [start : end + 1] là lấy ids của answers xong decode ra 
"""

Theoretical answer: the Main Building, labels give: the Main Building


'\ninputs["overflow_to_sample_mapping"] [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]\nanswers[sample_idx] {\'text\': [\'the Main Building\'], \'answer_start\': [279]} -> answer: "the Main Building"\nstart -> 83\nend -> 85\ninputs["input_ids"][0] -> dãy ids của sequnce ở idx 0 -> [start : end + 1] là lấy ids của answers xong decode ra \n'

In [21]:
print(inputs["input_ids"][0])

[101, 1109, 19349, 1104, 1103, 11373, 1762, 1120, 10360, 8022, 1110, 3148, 1106, 1134, 2401, 136, 102, 22182, 1193, 117, 1103, 1278, 1144, 170, 2336, 1959, 119, 1335, 4184, 1103, 4304, 4334, 112, 188, 2284, 10945, 1110, 170, 5404, 5921, 1104, 1103, 6567, 2090, 119, 13301, 1107, 1524, 1104, 1103, 4304, 4334, 1105, 4749, 1122, 117, 1110, 170, 7335, 5921, 1104, 4028, 1114, 1739, 1146, 14089, 5591, 1114, 1103, 7051, 107, 159, 21462, 1566, 24930, 2508, 152, 1306, 3965, 107, 119, 5893, 1106, 1103, 4304, 4334, 1110, 1103, 19349, 1104, 1103, 11373, 4641, 119, 13301, 1481, 1103, 171, 17506, 102]


In [22]:
car = {
  "brand": "Ford",
  "model": "Mustang",
  "year": 1964
}

model = car.pop("model")

print(model)

Mustang


In [23]:
sample_valid_datasets = raw_datasets["validation"][2:6]
sample_valid_datasets

{'id': ['56be4db0acb8001400a502ee',
  '56be4db0acb8001400a502ef',
  '56be4db0acb8001400a502f0',
  '56be8e613aeaaa14008c90d1'],
 'title': ['Super_Bowl_50', 'Super_Bowl_50', 'Super_Bowl_50', 'Super_Bowl_50'],
 'context': ['Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',


In [24]:
max_length = 384
stride = 128

In [25]:
inputs = tokenizer(
        sample_valid_datasets["question"],
        sample_valid_datasets["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

In [26]:
sample_map = inputs.pop("overflow_to_sample_mapping")
sample_map

#Tại ở đây mình để max_length lớn hơn nên mỗi sample sẽ là câu trọn vẹn chứ không bị truncate như khi mình set max_length = 100

[0, 1, 2, 3]

In [29]:
example_ids = []
for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

        inputs["example_id"] = example_ids

"""
inputs["input_ids"] -> inputs ids của 4 sample có dạng [[sample1],[sample2],[sample3],[sample4]]
len(inputs["input_ids"]) -> 4 
example_ids append 
examples là cái validation set của chúng ta có 1 cái id cùng sample index sẽ trả lại 1 cái id '56be8e613aeaaa14008c90d1'
sequence_ids -> sẽ ra dạng [None,0,0,..,0,None,1,1,..,1,None] -> đại diện cho [CLS,Question, context, answer,sep] answer sẽ trả về toàn None
offset sẽ trả về nguyên offset của cái sample đấy
enumerate offset
mỗi cái tuple offset có 1 index, nếu vị trí tương tự ở bên sequence_ids = 1, tức là vị trí của context -> thì sẽ thay vào offset mapping của trong inputs = o không thì là None, nghĩa là mình tạo offset var cho clean thôi chứ thực chất thì
là loop qua chính inputs["offset_mapping"] và với index trong sequence_ids là context thì return lại sang bên

They will contain offsets for the question and the context, but once we’re in the post-processing stage we won’t have any way to know which part of the input IDs corresponded to the context and which part was the question (the sequence_ids() method we used is available for the output of the tokenizer only).
So, we’ll set the offsets corresponding to the question to None
"""

NameError: name 'examples' is not defined

In [80]:
sample_idx = sample_map[5]
sample_idx

IndexError: list index out of range

In [88]:
sample_valid_datasets["id"][3]

'56be8e613aeaaa14008c90d1'

In [91]:
offset = inputs["offset_mapping"][0]

In [95]:
inputs.keys()


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])

In [30]:
max_length = 384
stride = 128


def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [31]:
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
len(raw_datasets["train"]), len(train_dataset)

Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 87599/87599 [00:55<00:00, 1580.07 examples/s]


(87599, 88729)

In [32]:
train_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 88729
})

In [33]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [34]:
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 10570/10570 [00:13<00:00, 791.52 examples/s]


In [35]:
validation_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 10822
})

In [98]:
raw_datasets = raw_datasets["train"][2:10]

In [100]:
train_dataset = raw_datasets.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
len(raw_datasets["train"]), len(train_dataset)

AttributeError: 'dict' object has no attribute 'map'

In [104]:
type(raw_datasets["train"])

dict